3 // $Id: cgreylag.cpp,v 1.9 2006/10/02 15:07:00 mkc Exp mkc $
6 #include "cgreylag.hpp"
20 //#define FAST_LOWER_READ_ACCURACY
27 // use non-threadsafe versions, since we don't use threads
28 #define fgetsX fgets_unlocked
29 #define ferrorX ferror_unlocked
31 #define fgetsX std::fgets
32 #define ferrorX std::ferror
35 #ifdef FAST_LOWER_READ_ACCURACY
36 // probably faster, at some cost in accuracy
37 #define strtodX std::strtof
39 #define strtodX std::strtod
43 parameters
parameters::the
;
47 peak::__repr__() const {
48 static char temp
[128];
49 sprintf(temp
, "<peak mz=%.4f intensity=%.4f>", mz
, intensity
);
54 int spectrum::next_id
= 1;
55 int spectrum::next_physical_id
= 1;
57 std::vector
<spectrum
> spectrum::searchable_spectra
;
58 // parent mass -> searchable_spectra index
60 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
64 spectrum::__repr__() const {
65 static char temp
[1024];
67 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
68 " [%d peaks, maxI=%f, sumI=%f]>",
69 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size(),
70 max_peak_intensity
, sum_peak_intensity
);
76 // This is an error exit for read_spectra.
78 read_error(FILE *f
, const char *message
="") {
80 message
= "I/O error while reading ms2 file";
81 throw std::ios_base::failure(message
);
85 // Read spectra from a file in ms2 format. Multiply charge spectra (e.g.,
86 // +2/+3) are split into separate spectra having the same physical id. The
87 // peak list is initially sorted by mz. Throws an exception on invalid
91 spectrum::read_spectra(FILE *f
, const int file_id
) {
92 std::vector
<spectrum
> spectra
;
94 const int bufsiz
= 1024;
98 char *result
= fgetsX(buf
, bufsiz
, f
);
100 std::vector
<std::string
> names
;
101 std::vector
<double> masses
;
102 std::vector
<int> charges
;
103 std::vector
<peak
> peaks
;
109 if (not result
or buf
[0] != ':')
111 names
.push_back(std::string(buf
+1, buf
+std::strlen(buf
)-1));
112 if (not fgetsX(buf
, bufsiz
, f
))
113 read_error(f
, "bad ms2 format: mass/charge line expected");
115 masses
.push_back(std::strtod(buf
, &endp
)); // need double accuracy here
116 if (errno
or endp
== buf
)
117 read_error(f
, "bad ms2 format: bad mass");
118 charges
.push_back(std::strtol(endp
, &endp
, 10));
119 if (errno
or endp
== buf
)
120 read_error(f
, "bad ms2 format: bad charge");
121 result
= fgetsX(buf
, bufsiz
, f
);
129 p
.mz
= strtodX(buf
, &endp
);
130 if (errno
or endp
== buf
)
131 read_error(f
, "bad ms2 format: bad peak mz");
132 p
.intensity
= strtodX(endp
, &endp
);
133 if (errno
or endp
== buf
)
134 read_error(f
, "bad ms2 format: bad peak intensity");
137 result
= fgetsX(buf
, bufsiz
, f
);
140 if (not result
or buf
[0] == ':')
144 // add spectra to vector
145 if (names
.empty() and not peaks
.empty())
146 read_error(f
, "bad ms2 format: missing header lines?");
148 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
150 for (std::vector
<double>::size_type i
=0; i
<names
.size(); i
++) {
151 spectrum
sp(masses
[i
], charges
[i
]);
154 sp
.file_id
= file_id
;
155 sp
.physical_id
= next_physical_id
;
156 sp
.calculate_intensity_statistics();
157 spectra
.push_back(sp
);
159 spectrum::next_physical_id
++;
165 // Sets max/sum_peak_intensity, according to peaks and normalization_factor.
167 spectrum::calculate_intensity_statistics() {
168 sum_peak_intensity
= 0;
169 max_peak_intensity
= -1;
171 for (std::vector
<peak
>::const_iterator it
=peaks
.begin(); it
!= peaks
.end();
173 sum_peak_intensity
+= it
->intensity
;
174 max_peak_intensity
= std::max
<double>(it
->intensity
, max_peak_intensity
);
176 max_peak_intensity
*= normalization_factor
;
177 sum_peak_intensity
*= normalization_factor
;
181 // For each *non-overlapping* group of peaks with mz within an interval of
182 // (less than) length 'limit', keep only the most intense peak. (The
183 // algorithm works from the right, which matter here.)
185 // FIX: Is this really the best approach? The obvious alternative would be to
186 // eliminate peaks so that no interval of length 'limit' has more than one
187 // peak (eliminating weak peaks to make this so).
190 remove_close_peaks(std::vector
<peak
> &peaks
, double limit
) {
191 for (std::vector
<peak
>::size_type i
=0; i
+1<peaks
.size(); i
++) {
192 double ub_mz
= peaks
[i
].mz
+ limit
;
193 while (i
+1 < peaks
.size() and peaks
[i
+1].mz
< ub_mz
)
194 peaks
.erase(peaks
.begin()
195 + (peaks
[i
+1].intensity
> peaks
[i
].intensity
201 // remove_close_peaks(std::vector<peak> &peaks, double limit) {
202 // for (std::vector<peak>::size_type i=0; i+1<peaks.size();)
203 // if (peaks[i+1].mz - peaks[i].mz < limit)
204 // if (peaks[i+1].intensity > peaks[i].intensity)
205 // peaks.erase(peaks.begin() + i);
207 // peaks.erase(peaks.begin() + i+1);
213 // Examine this spectrum to see if it should be kept. If so, return true and
214 // sort the peaks by mz, normalize them and limit their number.
216 spectrum::filter_and_normalize(double minimum_fragment_mz
,
217 double dynamic_range
, int minimum_peaks
,
219 // "spectrum, maximum parent charge", noise suppression check,
220 // "spectrum, minimum parent m+h" not implemented
222 if (minimum_fragment_mz
< 0 or dynamic_range
<= 0 or minimum_peaks
< 0
224 throw std::invalid_argument("invalid argument value");
227 // >>> removes multiple entries within 0.95 Da of each other, retaining the
228 // highest value. this is necessary because of the behavior of some peak
229 // finding routines in commercial software
231 // peaks already sorted by mz upon read
232 remove_close_peaks(peaks
, 0.95);
235 // >>> set up m/z regions to ignore: those immediately below the m/z of the
236 // parent ion which will contain uninformative neutral loss ions, and those
237 // immediately above the parent ion m/z, which will contain the parent ion
238 // and its isotope pattern
240 const double parent_mz
= 1.00727 + (mass
- 1.00727) / charge
;
241 for (std::vector
<peak
>::size_type i
=0; i
<peaks
.size();)
242 if (parent_mz
>= peaks
[i
].mz
243 ? std::abs(parent_mz
- peaks
[i
].mz
) < (50.0 / charge
)
244 : std::abs(parent_mz
- peaks
[i
].mz
) < (5.0 / charge
))
245 peaks
.erase(peaks
.begin() + i
);
251 std::vector
<peak
>::iterator pi
;
252 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= minimum_fragment_mz
;
254 peaks
.erase(peaks
.begin(), pi
);
257 // normalize spectrum
258 // >>> use the dynamic range parameter to set the maximum intensity value
259 // for the spectrum. then remove all peaks with a normalized intensity < 1
262 std::vector
<peak
>::iterator most_intense_peak
263 = std::max_element(peaks
.begin(), peaks
.end(), peak::less_intensity
);
265 = std::max
<double>(1.0, most_intense_peak
->intensity
) / dynamic_range
;
266 for (std::vector
<peak
>::iterator p
=peaks
.begin(); p
!= peaks
.end(); p
++)
267 p
->intensity
/= normalization_factor
;
268 for (std::vector
<peak
>::size_type i
=0; i
<peaks
.size();)
269 if (peaks
[i
].intensity
< 1.0)
270 peaks
.erase(peaks
.begin() + i
);
274 if (peaks
.size() < (unsigned int) minimum_peaks
)
277 // # check is_noise (NYI)
278 // # * is_noise attempts to determine if the spectrum is simply
279 // # noise. if the spectrum
280 // # * does not have any peaks within a window near the parent ion mass,
281 // # it is considered
283 // #limit = sp.mass / sp.charge
284 // #if sp.charge < 3:
285 // # limit = sp.mass - 600.0
286 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
287 // # return "looks like noise"
289 // clean_isotopes removes peaks that are probably C13 isotopes
290 remove_close_peaks(peaks
, 1.5);
292 // keep only the most intense peaks
293 if (peaks
.size() > (unsigned int) total_peaks
) {
294 std::sort(peaks
.begin(), peaks
.end(), peak::less_intensity
);
295 peaks
.erase(peaks
.begin(), peaks
.end() - total_peaks
);
296 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
299 // update statistics, now that we've removed some peaks
300 calculate_intensity_statistics();
306 // Store the list of spectra that search_peptide will search against, and also
307 // build spectrum_mass_index.
309 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
310 searchable_spectra
= spectra
;
311 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++)
312 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
316 // Multiply peak intensities by residue-dependent factors to generate a more
317 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
320 synthesize_ladder_intensities(std::vector
<peak
> &mass_ladder
,
321 const std::string
&peptide_seq
,
323 const parameters
&CP
= parameters::the
;
325 assert(mass_ladder
.size() == peptide_seq
.size() - 1);
326 assert(mass_ladder
.size() >= 2);
328 // An intensity boost is given for the second peptide bond/peak (*) if the
329 // N-terminal residue is 'P':
332 const int proline_bonus_position
= is_XYZ
? mass_ladder
.size()-2 : 1;
333 mass_ladder
[proline_bonus_position
].intensity
= 3.0;
334 if (peptide_seq
[1] == 'P')
335 mass_ladder
[proline_bonus_position
].intensity
= 10.0;
337 if (CP
.spectrum_synthesis
)
338 for (unsigned int i
=0; i
<mass_ladder
.size(); i
++) {
339 // FIX: there must be a better way
340 const char peptide_index
= is_XYZ
? mass_ladder
.size()-i
-1 : i
;
341 switch (peptide_seq
[peptide_index
]) {
343 mass_ladder
[i
].intensity
*= 5.0;
349 mass_ladder
[i
].intensity
*= 3.0;
353 mass_ladder
[i
].intensity
*= 2.0;
356 switch (peptide_seq
[peptide_index
+1]) {
358 mass_ladder
[i
].intensity
*= 5.0;
365 // Calculate peaks for a synthesized mass (not mz) ladder.
367 get_synthetic_Y_mass_ladder(std::vector
<peak
> &mass_ladder
,
368 const std::string
&peptide_seq
,
369 const std::vector
<double> &mass_list
,
370 const double C_terminal_mass
,
371 const unsigned mass_regime
=0) {
372 const parameters
&CP
= parameters::the
;
373 assert(peptide_seq
.size() == mass_list
.size());
374 double m
= (CP
.fragment_mass_regime
[mass_regime
].water_mass
375 + (CP
.cleavage_C_terminal_mass_change
376 - CP
.fragment_mass_regime
[mass_regime
].hydroxyl_mass
)
379 const int ladder_size
= mass_ladder
.size();
380 for (int i
=ladder_size
-1; i
>=0; i
--) {
382 mass_ladder
[ladder_size
-1-i
] = peak(m
, 1.0);
385 synthesize_ladder_intensities(mass_ladder
, peptide_seq
, true);
388 // Calculate peaks for a synthesized mass (not mz) ladder.
390 get_synthetic_B_mass_ladder(std::vector
<peak
> &mass_ladder
,
391 const std::string
&peptide_seq
,
392 const std::vector
<double> &mass_list
,
393 const double N_terminal_mass
,
394 const unsigned mass_regime
=0) {
395 const parameters
&CP
= parameters::the
;
396 double m
= (0.0 + (CP
.cleavage_N_terminal_mass_change
- CP
.hydrogen_mass
)
399 const int ladder_size
= mass_ladder
.size();
400 for (int i
=0; i
<=ladder_size
-1; i
++) {
402 mass_ladder
[i
] = peak(m
, 1.0);
405 synthesize_ladder_intensities(mass_ladder
, peptide_seq
, false);
409 // Construct a synthetic spectrum from these parameters.
410 spectrum::spectrum(double peptide_mod_mass
, int charge
,
411 const std::vector
<peak
> &mass_ladder
) {
412 init(peptide_mod_mass
, charge
);
415 for (std::vector
<peak
>::size_type i
=0; i
<mass_ladder
.size(); i
++)
416 peaks
[i
].mz
= peak::get_mz(peaks
[i
].mz
, charge
);
418 //std::cerr << "synth charge " << charge << std::endl;
419 //for (unsigned int i=0; i<mass_ladder.size(); i++)
420 // std::cerr << "peak mz " << peaks[i].mz << " i " << peaks[i].intensity << std::endl;
425 synthetic_spectra(spectrum synth_sp
[/* max_fragment_charge+1 */][ION_MAX
],
426 const std::string
&peptide_seq
, const double peptide_mass
,
427 const std::vector
<double> &mass_list
,
428 const double N_terminal_mass
, const double C_terminal_mass
,
429 const double max_fragment_charge
) {
430 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
431 std::vector
<peak
> mass_ladder(mass_list
.size()-1);
435 get_synthetic_Y_mass_ladder(mass_ladder
, peptide_seq
, mass_list
,
439 get_synthetic_B_mass_ladder(mass_ladder
, peptide_seq
, mass_list
,
446 //for (unsigned int i=0; i<mass_ladder.size(); i++)
447 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
449 for (int charge
=1; charge
<=max_fragment_charge
; charge
++)
450 synth_sp
[charge
][ion_type
] = spectrum(peptide_mass
, charge
,
456 // IDEA: could we combine all these spectra and just do the correlation once?
458 // Score the specified spectrum against a group of synthetic fragment
461 score_spectrum(double &hyper_score
, double &convolution_score
,
462 std::vector
<double> &ion_scores
, std::vector
<int> &ion_peaks
,
463 spectrum synth_sp
[/* max_fragment_charge+1 */][ION_MAX
],
464 const int spectrum_id
, const int max_fragment_charge
) {
465 const parameters
&CP
= parameters::the
;
466 const spectrum sp
= spectrum::searchable_spectra
[spectrum_id
];
467 const int spectrum_charge
= sp
.charge
;
469 convolution_score
= 0;
471 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
473 double i_scores
= 0.0;
474 const int charge_limit
= std::max
<int>(1, spectrum_charge
-1);
475 for (int charge
=1; charge
<=charge_limit
; charge
++) {
476 assert(charge
<= max_fragment_charge
);
478 // convolution score is just sum over charges/ions
479 // hyperscore is product of p! over charges/ions (where p is corr peak
480 // count) times the convolution score (clipped to FLT_MAX)
482 int common_peak_count
; // FIX!
483 double conv_score
= spectrum::score_similarity(synth_sp
[charge
][ion_type
],
484 sp
, &common_peak_count
);
485 i_peaks
+= common_peak_count
;
486 i_scores
+= conv_score
;
487 hyper_score
*= CP
.factorial
[common_peak_count
];
488 convolution_score
+= conv_score
;
491 ion_peaks
[ion_type
] = i_peaks
;
492 ion_scores
[ion_type
] = i_scores
;
494 hyper_score
*= convolution_score
;
504 // fx = &generator::generate_static_aa_mod;
506 typedef void generator::generator_fx(score_stats
&stats
, const match
&m
,
507 std::vector
<double> position_mass
,
508 const std::vector
<double> &terminal_mass
,
509 const std::vector
<generator
>::iterator g_it
);
511 generator() : fx(0) { }
513 generator_fx generate_static_aa_mod
;
515 generator_fx
generator::*fx
;
516 std::vector
<double> args
;
521 generator::generate_static_aa_mod(score_stats
&stats
, const match
&m
,
522 std::vector
<double> position_mass
,
523 const std::vector
<double> &terminal_mass
,
524 const std::vector
<generator
>::iterator g_it
) {
525 for (unsigned int i
=0; i
<m
.peptide_sequence
.size(); i
++)
526 position_mass
[i
] += args
[m
.peptide_sequence
[i
]];
528 ((*g_it
).*(g_it
->fx
))(stats
, m
, position_mass
, terminal_mass
, g_it
+1);
534 // FIX: currently only one mass_list is implemented, meaning that parent and
535 // fragment masses must be the same (e.g., mono/mono). A workaround is to
536 // just increase the parent error plus a bit (the xtandem default seems to
539 // Search for matches of this particular peptide modification variation
540 // against the spectra. Updates score_stats and returns the number of
541 // candidate spectra found.
543 evaluate_peptide_mod_variation(match
&m
,
544 const std::vector
<double> &mass_list
,
545 const double N_terminal_mass
,
546 const double C_terminal_mass
,
547 score_stats
&stats
) {
548 const parameters
&CP
= parameters::the
;
549 assert(m
.peptide_sequence
.size() == mass_list
.size());
551 m
.peptide_mass
= get_peptide_mass(mass_list
, N_terminal_mass
,
553 double sp_mass_lb
= m
.peptide_mass
+ CP
.parent_monoisotopic_mass_error_minus
;
554 double sp_mass_ub
= m
.peptide_mass
+ CP
.parent_monoisotopic_mass_error_plus
;
555 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
556 candidate_spectra_info_begin
557 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
558 // FIX: is there a better way?
559 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
560 candidate_spectra_info_end
561 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
562 if (candidate_spectra_info_begin
== candidate_spectra_info_end
)
565 int max_candidate_charge
= 0;
566 for (std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
567 it
= candidate_spectra_info_begin
;
568 it
!= candidate_spectra_info_end
; it
++)
570 = std::max
<int>(max_candidate_charge
,
571 spectrum::searchable_spectra
[it
->second
].charge
);
573 const int max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
574 assert(max_fragment_charge
<= spectrum::max_supported_charge
);
575 // e.g. +2 -> 'B' -> spectrum
576 // FIX: should this be a std::vector??
577 static spectrum synth_sp
[spectrum::max_supported_charge
+1][ION_MAX
];
578 synthetic_spectra(synth_sp
, m
.peptide_sequence
, m
.peptide_mass
, mass_list
,
579 N_terminal_mass
, C_terminal_mass
, max_fragment_charge
);
581 for (std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
582 candidate_it
= candidate_spectra_info_begin
;
583 candidate_it
!= candidate_spectra_info_end
; candidate_it
++) {
584 m
.spectrum_index
= candidate_it
->second
;
585 stats
.candidate_spectrum_count
++;
586 score_spectrum(m
.hyper_score
, m
.convolution_score
, m
.ion_scores
,
587 m
.ion_peaks
, synth_sp
, m
.spectrum_index
,
588 max_fragment_charge
);
589 if (m
.convolution_score
<= 2.0)
592 //std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
593 //<< " " << m.peptide_mass << " " <<
594 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
596 // update spectrum histograms
597 std::vector
<int> &hh
= stats
.hyperscore_histogram
[m
.spectrum_index
];
598 int binned_scaled_hyper_score
= int(0.5 + scale_hyperscore(m
.hyper_score
));
599 assert(binned_scaled_hyper_score
>= 0);
601 if (static_cast<int>(hh
.size())-1 < binned_scaled_hyper_score
) {
602 assert(hh
.size() < INT_MAX
/2);
603 hh
.resize(std::max
<unsigned int>(binned_scaled_hyper_score
+1,
606 hh
[binned_scaled_hyper_score
] += 1;
607 // incr m_tPeptideScoredCount
608 // nyi: permute stuff
610 const int sp_ion_count
= m
.ion_peaks
[ION_B
] + m
.ion_peaks
[ION_Y
];
611 if (sp_ion_count
< CP
.minimum_ion_count
)
613 const bool has_b_and_y
= m
.ion_peaks
[ION_B
] and m
.ion_peaks
[ION_Y
];
617 // check that parent masses are within error range (isotope ni)
618 // already done above (why does xtandem do it here?)
619 // if check fails, only eligible for 2nd-best record
621 // Remember all of the highest-hyper-scoring matches against each
622 // spectrum. These might be in multiple domains (pos, length, mods) in
623 // multiple proteins.
625 // To avoid the problems that xtandem has with inexact float-point
626 // calculations, we consider hyperscores within the "epsilon ratio" to
627 // be "equal". The best_match vector will need to be re-screened
628 // against this ratio, because it may finally contain entries which
629 // don't meet our criterion. (We don't take the time to get it exactly
630 // right here, because we may end up discarding it all anyway.)
632 const double current_ratio
= (m
.hyper_score
633 / stats
.best_score
[m
.spectrum_index
]);
635 if (CP
.quirks_mode
and (m
.hyper_score
< stats
.best_score
[m
.spectrum_index
])
636 or (not CP
.quirks_mode
637 and (current_ratio
< CP
.hyper_score_epsilon_ratio
))) {
638 // This isn't a best score, so see if it's a second-best score.
639 if (m
.hyper_score
> stats
.second_best_score
[m
.spectrum_index
])
640 stats
.second_best_score
[m
.spectrum_index
] = m
.hyper_score
;
643 if (CP
.quirks_mode
and (m
.hyper_score
> stats
.best_score
[m
.spectrum_index
])
644 or (not CP
.quirks_mode
645 and (current_ratio
> (1/CP
.hyper_score_epsilon_ratio
)))) {
646 // This score is significantly better, so forget previous matches.
647 stats
.best_score
[m
.spectrum_index
] = m
.hyper_score
;
648 stats
.best_match
[m
.spectrum_index
].clear();
650 // Now remember this match because it's at least as good as those
652 stats
.best_match
[m
.spectrum_index
].push_back(m
);
657 // Choose a possible modification to account for PCA (pyrrolidone carboxyl
658 // acid) circularization of the peptide N-terminal. This is excluded if a
659 // static N-terminal mod was specified. (FIX: Does a PCA mod exclude other
662 choose_PCA_mod(match
&m
, std::vector
<double> &mass_list
,
663 const double N_terminal_mass
, const double C_terminal_mass
,
664 score_stats
&stats
) {
665 const parameters
&CP
= parameters::the
;
666 evaluate_peptide_mod_variation(m
, mass_list
, N_terminal_mass
,
667 C_terminal_mass
, stats
);
669 const int mass_regime
=0; // FIX
671 if (CP
.fragment_mass_regime
[mass_regime
].modification_mass
['['] == 0) {
672 for (int bug
=0; bug
<(CP
.quirks_mode
? 2 : 1); bug
++) // FIX
673 switch (m
.peptide_sequence
[0]) {
675 evaluate_peptide_mod_variation(m
, mass_list
,
677 -CP
.fragment_mass_regime
[mass_regime
].water_mass
,
678 C_terminal_mass
, stats
);
681 // FIX: need better test for C+57? (symbolic?)
683 = CP
.fragment_mass_regime
[mass_regime
].modification_mass
['C'];
684 if (CP
.quirks_mode
? int(C_mod
) != 57 : std::abs(C_mod
- 57) > 0.5)
685 break; // skip unless C+57
688 evaluate_peptide_mod_variation(m
, mass_list
,
690 -CP
.fragment_mass_regime
[mass_regime
].ammonia_mass
,
691 C_terminal_mass
, stats
);
698 // Choose among the requested mass regimes (e.g., isotopes), and also choose
699 // the static mods (including N- and C-terminal mods), which are determined by
700 // the mass regime choice.
702 choose_mass_regime(match
&m
, std::vector
<double> &mass_list
,
703 const double N_terminal_mass
, const double C_terminal_mass
,
704 score_stats
&stats
) {
705 const parameters
&CP
= parameters::the
;
707 const int mass_regime
=0; // FIX
708 for (std::vector
<double>::size_type i
=0; i
<m
.peptide_sequence
.size(); i
++) {
709 const char res
= m
.peptide_sequence
[i
];
710 mass_list
[i
] = (CP
.fragment_mass_regime
[mass_regime
].residue_mass
[res
]
711 + CP
.fragment_mass_regime
[mass_regime
].modification_mass
[res
]);
713 choose_PCA_mod(m
, mass_list
,
715 + CP
.fragment_mass_regime
[mass_regime
].modification_mass
['['],
717 + CP
.fragment_mass_regime
[mass_regime
].modification_mass
[']'],
722 // Search for matches of all modification variations of this peptide against
723 // the spectra. Updates score_stats and returns the number of candidate
726 spectrum::search_peptide_all_mods(int idno
, int offset
, int begin
,
727 const std::string
&peptide_seq
,
728 int missed_cleavage_count
,
729 score_stats
&stats
) {
730 std::vector
<double> mass_list(peptide_seq
.size());
731 double N_terminal_mass
= 0.0;
732 double C_terminal_mass
= 0.0;
734 // This match will be passed inward and used to record information that we
735 // need to remember about a match when we finally see one. At that point, a
736 // copy of this match will be saved.
738 m
.sequence_index
= idno
;
739 m
.sequence_offset
= offset
;
740 m
.peptide_begin
= begin
;
741 m
.peptide_sequence
= peptide_seq
;
742 m
.missed_cleavage_count
= missed_cleavage_count
;
744 choose_mass_regime(m
, mass_list
, N_terminal_mass
, C_terminal_mass
, stats
);
748 // Return the similarity score between this spectrum and that, and also a
749 // count of common peaks in *peak_count.
751 // xtandem quantizes mz values into bins of width fragment_mass_error first,
752 // and adds a bin on either side. This makes the effective fragment mass
753 // error for each peak an arbitrary value between 1x and 2x the parameter,
754 // based on quantization of that peak. If quirks_mode is on, we'll try to
755 // emulate this behavior.
757 spectrum::score_similarity(const spectrum
&x
, const spectrum
&y
,
759 const double frag_err
= parameters::the
.fragment_mass_error
;
760 const bool quirks_mode
= parameters::the
.quirks_mode
;
764 std::vector
<peak
>::const_iterator x_it
= x
.peaks
.begin();
765 std::vector
<peak
>::const_iterator y_it
= y
.peaks
.begin();
767 while (x_it
!= x
.peaks
.end() and y_it
!= y
.peaks
.end()) {
768 double x_mz
= x_it
->mz
, y_mz
= y_it
->mz
;
770 const float ffe
= frag_err
;
771 x_mz
= std::floor(static_cast<float>(x_mz
) / ffe
) * ffe
;
772 y_mz
= std::floor(static_cast<float>(y_mz
) / ffe
) * ffe
;
774 // in quirks mode, must be within one frag_err-wide bin
775 if (std::abs(x_mz
- y_mz
) < (quirks_mode
? frag_err
*1.5 : frag_err
)) {
777 score
+= (x_it
->intensity
* y_it
->intensity
);
778 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
780 // assume peaks aren't "close" together
786 if (x_it
->mz
< y_it
->mz
)
798 get_parent_peptide_mass(const std::string
&peptide_seq
,
799 const unsigned mass_regime
) {
800 const parameters
&CP
= parameters::the
;
801 const bool is_N
=false, is_C
=false; // FIX
804 NC_mods
+= CP
.parent_mass_regime
[mass_regime
].modification_mass
['['];
806 NC_mods
+= CP
.parent_mass_regime
[mass_regime
].modification_mass
[']'];
808 double residue_sum
= 0;
809 for (std::string::const_iterator it
=peptide_seq
.begin();
810 it
!= peptide_seq
.end(); it
++)
811 residue_sum
+= (CP
.parent_mass_regime
[mass_regime
].residue_mass
[*it
]
812 + CP
.parent_mass_regime
[mass_regime
].modification_mass
[*it
]);
813 return (NC_mods
+ residue_sum
814 + CP
.cleavage_N_terminal_mass_change
815 + CP
.cleavage_C_terminal_mass_change
821 get_peptide_mass(const std::vector
<double> &mass_list
,
822 const double N_terminal_mass
, const double C_terminal_mass
) {
823 const parameters
&CP
= parameters::the
;
825 double m
= (N_terminal_mass
+ C_terminal_mass
826 + CP
.cleavage_N_terminal_mass_change
827 + CP
.cleavage_C_terminal_mass_change
829 return std::accumulate(mass_list
.begin(), mass_list
.end(), m
);
833 // returns log-scaled hyperscore
835 scale_hyperscore(double hyper_score
) {
836 assert(hyper_score
>= 0); // otherwise check for EDOM, ERANGE
837 if (hyper_score
== 0)
838 return -DBL_MAX
; // FIX: this shouldn't be reached?
839 return 4 * std::log10(hyper_score
);