whitespace cleanups
[greylag.git] / cgreylag.cpp
blob6a4475cb4599dce2cdbd5d072c3f285f956b1747
1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
23 #include <cerrno>
24 #include <cfloat>
25 #include <climits>
26 #include <cstdlib>
27 #include <cmath>
28 #include <numeric>
29 #include <set>
30 #include <stdexcept>
32 #include <iostream>
35 #ifdef __GNU__
36 #define NOTHROW __attribute__ ((nothrow))
37 // use faster, non-threadsafe versions, since we don't use threads
38 #define fgetsX fgets_unlocked
39 #define fputsX fputs_unlocked
40 #define fprintfX __builtin_fprintf_unlocked
41 #define ferrorX ferror_unlocked
42 #else
43 #define NOTHROW
44 #define fgetsX std::fgets
45 #define fputsX std::fputs
46 #define fprintfX std::fprintf
47 #define ferrorX std::ferror
48 #endif
51 parameters parameters::the;
54 char *
55 peak::__repr__() const {
56 static char temp[128];
57 sprintf(temp, "<peak mz=%.4f intensity=%.4f>", mz, intensity);
58 return &temp[0];
61 const int FIRST_SPECTRUM_ID = 1;
62 int spectrum::next_id = FIRST_SPECTRUM_ID;
63 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
65 std::vector<spectrum> spectrum::searchable_spectra;
66 // parent mass -> searchable_spectra index
67 std::multimap<double,
68 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
70 typedef
71 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
72 spmi_c_it; // ugh
75 char *
76 spectrum::__repr__() const {
77 static char temp[1024];
78 sprintf(temp,
79 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
80 " [%d peaks, maxI=%f, sumI=%f]>",
81 id, physical_id, name.c_str(), mass, charge, peaks.size(),
82 max_peak_intensity, sum_peak_intensity);
83 return &temp[0];
88 // This is an error exit for read_spectra and filter_ms2_by_mass.
89 static inline void
90 io_error(FILE *f, const char *message="") {
91 if (ferrorX(f))
92 message = "I/O error while reading (or writing) spectrum file";
93 throw std::ios_base::failure(message);
97 // Return sorted list of parent masses present in the given ms2 files.
98 std::vector<double>
99 spectrum::read_ms2_spectrum_masses(std::vector<int> fds) {
100 std::vector<double> masses;
102 for (std::vector<int>::const_iterator fdit=fds.begin(); fdit != fds.end();
103 fdit++) {
104 FILE *f = fdopen(*fdit, "r");
106 const int bufsiz = 1024;
107 char buf[bufsiz];
108 char *endp;
109 char *result = fgetsX(buf, bufsiz, f);
110 while (true) {
111 // read headers
112 while (true) {
113 if (ferrorX(f))
114 io_error(f);
115 if (not result or buf[0] != ':')
116 break;
117 if (not fgetsX(buf, bufsiz, f))
118 io_error(f, "bad ms2 format: mass/charge line expected");
119 errno = 0;
120 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
121 if (errno or endp == buf)
122 io_error(f, "bad ms2 format: bad mass");
123 result = fgetsX(buf, bufsiz, f);
125 if (not result)
126 break;
127 // read peaks
128 std::vector<peak> peaks;
129 while (true) {
130 result = fgetsX(buf, bufsiz, f);
131 if (ferrorX(f))
132 io_error(f);
133 if (not result or buf[0] == ':')
134 break;
138 std::sort(masses.begin(), masses.end());
139 return masses;
143 // Read spectra from file in ms2 format, tagging them with file_id. Spectra
144 // with charge zero are omitted from the result. (All of them are read,
145 // though, to keep spectrum ids in sync.) If file_id == -1, the ms2 file is
146 // an annotated file produced by split_ms2_by_mass_band.
148 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
149 // having the same physical id. The peak list is initially sorted by mz.
150 // Throws an exception on invalid input.
152 std::vector<spectrum>
153 spectrum::read_spectra_from_ms2(FILE *f, const int file_id) {
154 std::vector<spectrum> spectra;
156 const int bufsiz = 1024;
157 char buf[bufsiz];
158 char *endp;
160 char *result = fgetsX(buf, bufsiz, f);
161 while (true) {
162 std::vector<std::string> names;
163 std::vector<double> masses;
164 std::vector<int> charges;
165 // for annotated ms2
166 std::vector<int> file_ids;
167 std::vector<int> physical_ids;
168 std::vector<int> ids;
170 // read headers
171 while (true) {
172 if (ferrorX(f))
173 io_error(f);
174 if (not result or buf[0] != ':')
175 break;
176 if (file_id != -1)
177 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
178 else {
179 char *anno_hash = std::strrchr(buf, '#'); // const, but C++ doesn't
180 // like it declared thus--yuck
181 if (not anno_hash)
182 io_error(f, "bad ms2+ format: '#' not found");
183 names.push_back(std::string(buf+1, anno_hash));
184 errno = 0;
185 file_ids.push_back(std::strtol(anno_hash+1, &endp, 10));
186 physical_ids.push_back(std::strtol(endp+1, &endp, 10));
187 ids.push_back(std::strtol(endp+1, &endp, 10));
188 if (errno) // FIX: improve error check
189 io_error(f, "bad ms2+ format: bad values");
191 if (not fgetsX(buf, bufsiz, f))
192 io_error(f, "bad ms2 format: mass/charge line expected");
193 errno = 0;
194 masses.push_back(std::strtod(buf, &endp)); // need double accuracy here
195 if (errno or endp == buf)
196 io_error(f, "bad ms2 format: bad mass");
197 const char *endp0 = endp;
198 charges.push_back(std::strtol(endp0, &endp, 10));
199 if (errno or endp == endp0)
200 io_error(f, "bad ms2 format: bad charge");
201 result = fgetsX(buf, bufsiz, f);
203 if (not result)
204 break;
205 // read peaks
206 std::vector<peak> peaks;
207 while (true) {
208 peak p;
209 errno = 0;
210 p.mz = strtod(buf, &endp);
211 if (errno or endp == buf)
212 io_error(f, "bad ms2 format: bad peak mz");
213 const char *endp0 = endp;
214 p.intensity = strtod(endp0, &endp);
215 if (errno or endp == endp0)
216 io_error(f, "bad ms2 format: bad peak intensity");
217 peaks.push_back(p);
219 result = fgetsX(buf, bufsiz, f);
220 if (ferrorX(f))
221 io_error(f);
222 if (not result or buf[0] == ':')
223 break;
226 // add spectra to vector
227 if (names.empty() and not peaks.empty())
228 io_error(f, "bad ms2 format: missing header lines?");
230 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
232 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
233 spectrum sp(masses[i], charges[i]);
234 sp.peaks = peaks;
235 sp.name = names[i];
236 if (file_id != -1) {
237 sp.file_id = file_id;
238 sp.physical_id = next_physical_id;
239 } else {
240 sp.file_id = file_ids[i];
241 sp.physical_id = physical_ids[i];
242 sp.id = ids[i];
243 spectrum::next_id = std::max(spectrum::next_id, sp.id+1);
244 spectrum::next_physical_id = std::max(spectrum::next_physical_id,
245 sp.physical_id+1);
247 sp.calculate_intensity_statistics();
248 spectra.push_back(sp);
250 if (file_id != -1)
251 spectrum::next_physical_id += 1;
253 return spectra;
257 // Copy spectra from an ms2 file to a set of output ms2 files, one for each
258 // band in the set of mass bands described by their upper bounds. Extra
259 // information is written in the first header line of each spectra so that
260 // id's may be recovered. So, for example, ':0002.0002.1' might become
261 // ':0002.0002.1 # 0 45 78', where 0, 45, 78 are the spectrum's file_id,
262 // physical_id, and id, respectively. Multiply charged spectra will be
263 // split into separate spectra (having the same physical_id).
265 // FIX: cleaner to just pass in a vector of filenames, rather than fds?
266 void
267 spectrum::split_ms2_by_mass_band(FILE *inf, const std::vector<int> &outfds,
268 const int file_id,
269 const std::vector<double>
270 &mass_band_upper_bounds) {
271 std::map<double, FILE *> mass_file_index;
273 if (outfds.size() < 1
274 or outfds.size() != mass_band_upper_bounds.size())
275 throw std::invalid_argument("invalid argument value");
277 for (std::vector<int>::size_type i=0; i<outfds.size(); i++) {
278 FILE *f = fdopen(outfds[i], "w");
279 if (not f)
280 throw std::ios_base::failure("fdopen failed: fd limit exceeded?");
281 mass_file_index.insert(std::make_pair(mass_band_upper_bounds[i], f));
284 int o_next_id = FIRST_SPECTRUM_ID;
285 int o_next_physical_id = FIRST_SPECTRUM_ID;
287 const int bufsiz = 1024;
288 char buf0[bufsiz], buf1[bufsiz];
289 char *endp;
290 char *result = fgetsX(buf0, bufsiz, inf);
291 while (true) {
292 double mass;
293 std::set<FILE *> sp_files;
295 // copy headers
296 while (true) {
297 if (ferrorX(inf))
298 io_error(inf);
299 if (not result or buf0[0] != ':')
300 break;
301 if (not fgetsX(buf1, bufsiz, inf))
302 io_error(inf, "bad ms2 format: mass/charge line expected");
303 errno = 0;
304 mass = std::strtod(buf1, &endp); // need double accuracy here
305 if (errno or endp == buf1)
306 io_error(inf, "bad ms2 format: bad mass");
307 errno = 0;
308 std::map<double, FILE *>::const_iterator bit
309 = mass_file_index.lower_bound(mass);
310 if (bit == mass_file_index.end())
311 throw std::invalid_argument("internal error: mass out of range");
312 FILE *bandf = bit->second;
313 sp_files.insert(bandf);
314 assert(buf0[strlen(buf0)-1] == '\n');
315 buf0[strlen(buf0)-1] = 0; // chop newline
316 fprintfX(bandf, "%s # %d %d %d\n", buf0, file_id, o_next_physical_id,
317 o_next_id++);
318 fputsX(buf1, bandf);
319 if (errno)
320 io_error(bandf, "error writing ms2 file");
321 result = fgetsX(buf0, bufsiz, inf);
323 if (not result)
324 break;
325 if (sp_files.empty())
326 io_error(inf, "bad ms2 format: missing header lines?");
327 // copy peaks
328 while (true) {
329 peak p;
330 errno = 0;
331 for (std::set<FILE *>::const_iterator fit = sp_files.begin();
332 fit != sp_files.end(); fit++) {
333 fputsX(buf0, *fit);
334 if (errno)
335 io_error(*fit, "error writing ms2 file");
337 result = fgetsX(buf0, bufsiz, inf);
338 if (ferrorX(inf))
339 io_error(inf);
340 if (not result or buf0[0] == ':')
341 break;
343 o_next_physical_id += 1;
346 // flush all output files (is this necessary?)
347 for (std::map<double, FILE *>::const_iterator it = mass_file_index.begin();
348 it != mass_file_index.end(); it++)
349 std::fflush(it->second);
353 // For each *non-overlapping* group of peaks with mz within an interval of
354 // (less than) length 'limit', keep only the most intense peak. (The
355 // algorithm works from the right, which matter here.)
357 // FIX: Is this really the best approach? The obvious alternative would be to
358 // eliminate peaks so that no interval of length 'limit' has more than one
359 // peak (eliminating weak peaks to make this so).
361 static inline void
362 remove_close_peaks(std::vector<peak> &peaks, double limit) {
363 for (std::vector<peak>::size_type i=0; i+1<peaks.size(); i++) {
364 double ub_mz = peaks[i].mz + limit;
365 while (i+1 < peaks.size() and peaks[i+1].mz < ub_mz)
366 peaks.erase(peaks.begin()
367 + (peaks[i+1].intensity > peaks[i].intensity
368 ? i : i+1));
373 // Examine this spectrum to see if it should be kept. If so, return true and
374 // sort the peaks by mz, normalize them and limit their number.
375 bool
376 spectrum::filter_and_normalize(double minimum_fragment_mz,
377 double dynamic_range, int minimum_peaks,
378 int total_peaks) {
379 // "spectrum, maximum parent charge", noise suppression check,
380 // "spectrum, minimum parent m+h" not implemented
382 if (minimum_fragment_mz < 0 or dynamic_range <= 0 or minimum_peaks < 0
383 or total_peaks < 0)
384 throw std::invalid_argument("invalid argument value");
386 // remove_isotopes
387 // >>> removes multiple entries within 0.95 Da of each other, retaining the
388 // highest value. this is necessary because of the behavior of some peak
389 // finding routines in commercial software
391 // peaks already sorted by mz upon read
392 remove_close_peaks(peaks, 0.95);
394 // remove parent
395 // >>> set up m/z regions to ignore: those immediately below the m/z of the
396 // parent ion which will contain uninformative neutral loss ions, and those
397 // immediately above the parent ion m/z, which will contain the parent ion
398 // and its isotope pattern
400 const double parent_mz = 1.00727 + (mass - 1.00727) / charge;
401 for (std::vector<peak>::size_type i=0; i<peaks.size();)
402 if (parent_mz >= peaks[i].mz
403 ? std::abs(parent_mz - peaks[i].mz) < (50.0 / charge)
404 : std::abs(parent_mz - peaks[i].mz) < (5.0 / charge))
405 peaks.erase(peaks.begin() + i);
406 else
407 i++;
410 // remove_low_masses
411 std::vector<peak>::iterator pi;
412 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= minimum_fragment_mz;
413 pi++);
414 peaks.erase(peaks.begin(), pi);
417 // normalize spectrum
418 // >>> use the dynamic range parameter to set the maximum intensity value
419 // for the spectrum. then remove all peaks with a normalized intensity < 1
420 if (peaks.empty())
421 return false;
422 std::vector<peak>::iterator most_intense_peak
423 = std::max_element(peaks.begin(), peaks.end(), peak::less_intensity);
424 normalization_factor
425 = std::max<double>(1.0, most_intense_peak->intensity) / dynamic_range;
426 for (std::vector<peak>::iterator p=peaks.begin(); p != peaks.end(); p++)
427 p->intensity /= normalization_factor;
428 for (std::vector<peak>::size_type i=0; i<peaks.size();)
429 if (peaks[i].intensity < 1.0)
430 peaks.erase(peaks.begin() + i);
431 else
432 i++;
434 if (peaks.size() < (unsigned int) minimum_peaks)
435 return false;
437 // # check is_noise (NYI)
438 // # * is_noise attempts to determine if the spectrum is simply
439 // # noise. if the spectrum
440 // # * does not have any peaks within a window near the parent ion mass,
441 // # it is considered
442 // # * noise.
443 // #limit = sp.mass / sp.charge
444 // #if sp.charge < 3:
445 // # limit = sp.mass - 600.0
446 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
447 // # return "looks like noise"
449 // clean_isotopes removes peaks that are probably C13 isotopes
450 remove_close_peaks(peaks, 1.5);
452 // keep only the most intense peaks
453 if (peaks.size() > (unsigned int) total_peaks) {
454 std::sort(peaks.begin(), peaks.end(), peak::less_intensity);
455 peaks.erase(peaks.begin(), peaks.end() - total_peaks);
456 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
459 // update statistics, now that we've removed some peaks
460 calculate_intensity_statistics();
462 return true;
466 // Store the list of spectra that search_peptide will search against, and also
467 // build spectrum_mass_index.
468 void
469 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
470 searchable_spectra = spectra;
471 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
472 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
476 // Multiply peak intensities by residue-dependent factors to generate a more
477 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
478 // reverse.
479 static inline void
480 synthesize_ladder_intensities(std::vector<peak> &mass_ladder,
481 const std::string &peptide_seq,
482 const bool is_XYZ) NOTHROW {
483 const parameters &CP = parameters::the;
485 assert(mass_ladder.size() == peptide_seq.size() - 1);
486 assert(mass_ladder.size() >= 2);
488 // An intensity boost is given for the second peptide bond/peak (*) if the
489 // N-terminal residue is 'P':
490 // 0 * 2 3 (peaks)
491 // A P C D E
492 const int proline_bonus_position = is_XYZ ? mass_ladder.size()-2 : 1;
493 mass_ladder[proline_bonus_position].intensity = 3.0;
494 if (peptide_seq[1] == 'P')
495 mass_ladder[proline_bonus_position].intensity = 10.0;
497 if (CP.spectrum_synthesis)
498 for (unsigned int i=0; i<mass_ladder.size(); i++) {
499 // FIX: there must be a better way
500 const char peptide_index = is_XYZ ? mass_ladder.size()-i-1 : i;
501 switch (peptide_seq[peptide_index]) {
502 case 'D':
503 mass_ladder[i].intensity *= 5.0;
504 break;
505 case 'V':
506 case 'E':
507 case 'I':
508 case 'L':
509 mass_ladder[i].intensity *= 3.0;
510 break;
511 case 'N':
512 case 'Q':
513 mass_ladder[i].intensity *= 2.0;
514 break;
516 switch (peptide_seq[peptide_index+1]) {
517 case 'P':
518 mass_ladder[i].intensity *= 5.0;
519 break;
525 // Calculate peaks for a synthesized mass (not mz) ladder.
526 static inline void
527 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
528 const std::string &peptide_seq,
529 const std::vector<double> &mass_list,
530 const double fragment_C_fixed_mass) NOTHROW {
531 assert(peptide_seq.size() == mass_list.size());
532 double m = fragment_C_fixed_mass;
534 const int ladder_size = mass_ladder.size();
535 for (int i=ladder_size-1; i>=0; i--) {
536 m += mass_list[i+1];
537 mass_ladder[ladder_size-1-i] = peak(m, 1.0);
540 synthesize_ladder_intensities(mass_ladder, peptide_seq, true);
544 // Calculate peaks for a synthesized mass (not mz) ladder.
545 static inline void
546 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
547 const std::string &peptide_seq,
548 const std::vector<double> &mass_list,
549 const double fragment_N_fixed_mass) NOTHROW {
550 double m = fragment_N_fixed_mass;
552 const int ladder_size = mass_ladder.size();
553 for (int i=0; i<=ladder_size-1; i++) {
554 m += mass_list[i];
555 mass_ladder[i] = peak(m, 1.0);
558 synthesize_ladder_intensities(mass_ladder, peptide_seq, false);
562 static inline void
563 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
564 const std::string &peptide_seq,
565 const std::vector<double> &mass_list,
566 const double fragment_N_fixed_mass,
567 const double fragment_C_fixed_mass,
568 const double max_fragment_charge) NOTHROW {
569 std::vector<peak> mass_ladder(mass_list.size()-1);
571 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
572 switch (ion_type) {
573 case ION_Y:
574 get_synthetic_Y_mass_ladder(mass_ladder, peptide_seq, mass_list,
575 fragment_C_fixed_mass);
576 break;
577 case ION_B:
578 get_synthetic_B_mass_ladder(mass_ladder, peptide_seq, mass_list,
579 fragment_N_fixed_mass);
580 break;
581 default:
582 assert(false);
585 //for (unsigned int i=0; i<mass_ladder.size(); i++)
586 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
588 for (int charge=1; charge<=max_fragment_charge; charge++) {
589 spectrum &sp = synth_sp[charge][ion_type];
590 //sp.mass = fragment_peptide_mass;
591 sp.charge = charge;
592 sp.peaks.resize(mass_ladder.size());
593 for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++) {
594 sp.peaks[i].intensity = mass_ladder[i].intensity;
595 sp.peaks[i].mz = peak::get_mz(mass_ladder[i].mz, charge);
602 // Return the similarity score between this spectrum and that, and also a
603 // count of common peaks in *peak_count.
605 // xtandem quantizes mz values into bins of width fragment_mass_error first,
606 // and adds a bin on either side. This makes the effective fragment mass
607 // error for each peak an arbitrary value between 1x and 2x the parameter,
608 // based on quantization of that peak. If quirks_mode is on, we'll try to
609 // emulate this behavior.
610 static inline double
611 score_similarity_(const spectrum &x, const spectrum &y,
612 int *peak_count) NOTHROW {
613 const double frag_err = parameters::the.fragment_mass_error;
614 const bool quirks_mode = parameters::the.quirks_mode;
615 const double error_limit = quirks_mode ? frag_err*1.5 : frag_err;
617 double score = 0;
618 *peak_count = 0;
619 std::vector<peak>::const_iterator x_it = x.peaks.begin();
620 std::vector<peak>::const_iterator y_it = y.peaks.begin();
622 while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
623 double x_mz = x_it->mz, y_mz = y_it->mz;
624 if (quirks_mode) {
625 const float ffe = frag_err;
626 x_mz = int(static_cast<float>(x_mz) / ffe) * ffe;
627 y_mz = int(static_cast<float>(y_mz) / ffe) * ffe;
629 // in quirks mode, must be within one frag_err-wide bin
630 const double delta = y_mz - x_mz;
631 if (std::abs(delta) < error_limit) {
632 *peak_count += 1;
633 score += (x_it->intensity * y_it->intensity);
634 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
636 if (delta > 0)
637 x_it++;
638 else
639 y_it++;
642 return score;
646 double
647 spectrum::score_similarity(const spectrum &x, const spectrum &y, int
648 *peak_count) {
649 return score_similarity_(x, y, peak_count);
653 // IDEA: could we combine all these spectra and just do the correlation once?
655 // Score the specified spectrum against a group of synthetic fragment
656 // spectra.
657 static inline void
658 score_spectrum(double &hyper_score, double &convolution_score,
659 std::vector<double> &ion_scores, std::vector<int> &ion_peaks,
660 spectrum synth_sp[/* max_fragment_charge+1 */][ION_MAX],
661 const int spectrum_id, const int max_fragment_charge) NOTHROW {
662 const parameters &CP = parameters::the;
663 const spectrum sp = spectrum::searchable_spectra[spectrum_id];
664 const int spectrum_charge = sp.charge;
665 hyper_score = 1.0;
666 convolution_score = 0;
668 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
669 int i_peaks = 0;
670 double i_scores = 0.0;
671 const int charge_limit = std::max<int>(1, spectrum_charge-1);
672 for (int charge=1; charge<=charge_limit; charge++) {
673 assert(charge <= max_fragment_charge);
675 // convolution score is just sum over charges/ions
676 // hyperscore is product of p! over charges/ions (where p is corr peak
677 // count) times the convolution score (clipped to FLT_MAX)
678 // > blurred!
679 int common_peak_count; // FIX!
680 double conv_score = score_similarity_(synth_sp[charge][ion_type],
681 sp, &common_peak_count);
682 i_peaks += common_peak_count;
683 i_scores += conv_score;
684 hyper_score *= CP.factorial[common_peak_count];
685 convolution_score += conv_score;
688 ion_peaks[ion_type] = i_peaks;
689 ion_scores[ion_type] = i_scores;
691 hyper_score *= convolution_score;
695 // FIX: does this actually help inlining?
696 // returns log-scaled hyperscore
697 static inline double
698 scale_hyperscore_(double hyper_score) NOTHROW {
699 assert(hyper_score >= 0); // otherwise check for EDOM, ERANGE
700 if (hyper_score == 0)
701 return -DBL_MAX; // FIX: this shouldn't be reached?
702 return 4 * std::log10(hyper_score);
704 double
705 scale_hyperscore(double hyper_score) { return scale_hyperscore_(hyper_score); }
708 struct mass_trace_list {
709 mass_trace_item item;
710 const mass_trace_list *next;
712 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
716 // Search for matches of this particular peptide modification variation
717 // against the spectra. Updates score_stats and returns the number of
718 // candidate spectra found.
719 static inline void
720 evaluate_peptide(const search_context &context, match &m,
721 const mass_trace_list *mtlp,
722 const std::vector<double> &mass_list,
723 const spmi_c_it &candidate_spectra_info_begin,
724 const spmi_c_it &candidate_spectra_info_end,
725 score_stats &stats) NOTHROW {
726 const parameters &CP = parameters::the;
727 assert(m.peptide_sequence.size() == mass_list.size());
729 int max_candidate_charge = 0;
730 for (spmi_c_it it=candidate_spectra_info_begin;
731 it != candidate_spectra_info_end; it++)
732 max_candidate_charge
733 = std::max<int>(max_candidate_charge,
734 spectrum::searchable_spectra[it->second].charge);
735 assert(max_candidate_charge >= 1);
737 int max_fragment_charge = max_candidate_charge;
738 if (not CP.check_all_fragment_charges)
739 max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
740 assert(max_fragment_charge <= spectrum::max_supported_charge);
741 // e.g. +2 -> 'B' -> spectrum
742 // FIX: should this be a std::vector??
743 static spectrum synth_sp[spectrum::max_supported_charge+1][ION_MAX];
744 synthetic_spectra(synth_sp, m.peptide_sequence, mass_list,
745 context.fragment_N_fixed_mass,
746 context.fragment_C_fixed_mass, max_fragment_charge);
748 for (spmi_c_it candidate_it = candidate_spectra_info_begin;
749 candidate_it != candidate_spectra_info_end; candidate_it++) {
750 m.spectrum_index = candidate_it->second;
751 stats.candidate_spectrum_count += 1;
752 if (CP.estimate_only)
753 continue;
755 //std::cerr << "sp " << m.spectrum_index << std::endl;
756 score_spectrum(m.hyper_score, m.convolution_score, m.ion_scores,
757 m.ion_peaks, synth_sp, m.spectrum_index,
758 max_fragment_charge);
759 //std::cerr << "score " << m.convolution_score << std::endl;
760 if (m.convolution_score <= 2.0)
761 continue;
763 ////std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
764 //<< " " << m.peptide_mass << " " <<
765 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
767 // update spectrum histograms
768 std::vector<int> &hh = stats.hyperscore_histogram[m.spectrum_index];
769 int binned_scaled_hyper_score = int(0.5 + scale_hyperscore_(m.hyper_score));
770 assert(binned_scaled_hyper_score >= 0);
771 // FIX
772 if (static_cast<int>(hh.size())-1 < binned_scaled_hyper_score) {
773 assert(hh.size() < INT_MAX/2);
774 hh.resize(std::max<unsigned int>(binned_scaled_hyper_score+1,
775 2*hh.size()));
777 hh[binned_scaled_hyper_score] += 1;
778 // incr m_tPeptideScoredCount
779 // nyi: permute stuff
780 // FIX
781 const int sp_ion_count = m.ion_peaks[ION_B] + m.ion_peaks[ION_Y];
782 if (sp_ion_count < CP.minimum_ion_count)
783 continue;
784 const bool has_b_and_y = m.ion_peaks[ION_B] and m.ion_peaks[ION_Y];
785 if (not has_b_and_y)
786 continue;
788 // check that parent masses are within error range (isotope ni) already
789 // done above (why does xtandem do it here? something to do with isotope
790 // jitter?) if check fails, only eligible for 2nd-best record
792 // Remember all of the highest-hyper-scoring matches against each
793 // spectrum. These might be in multiple domains (pos, length, mods) in
794 // multiple proteins.
796 // To avoid the problems that xtandem has with inexact float-point
797 // calculations, we consider hyperscores within the "epsilon ratio" to be
798 // "equal". The best_match vector will need to be re-screened against
799 // this ratio, because it may finally contain entries which don't meet our
800 // criterion. (We don't take the time to get it exactly right here,
801 // because we will often end up discarding these later anyway.)
803 const double current_ratio = (m.hyper_score
804 / stats.best_score[m.spectrum_index]);
806 if (CP.quirks_mode and (m.hyper_score < stats.best_score[m.spectrum_index])
807 or (not CP.quirks_mode
808 and (current_ratio < CP.hyper_score_epsilon_ratio))) {
809 // This isn't a best score, so see if it's a second-best score.
810 if (m.hyper_score > stats.second_best_score[m.spectrum_index])
811 stats.second_best_score[m.spectrum_index] = m.hyper_score;
812 continue;
814 if (CP.quirks_mode and (m.hyper_score > stats.best_score[m.spectrum_index])
815 or (not CP.quirks_mode
816 and (current_ratio > (1/CP.hyper_score_epsilon_ratio)))) {
817 // This score is significantly better, so forget previous matches.
818 stats.improved_candidates += 1;
819 //if (stats.best_match[m.spectrum_index].empty()) why doesn't this work?
820 if (stats.best_score[m.spectrum_index] == 100.0)
821 stats.spectra_with_candidates += 1;
822 stats.best_score[m.spectrum_index] = m.hyper_score;
823 stats.best_match[m.spectrum_index].clear();
825 // Now remember this match because it's at least as good as those
826 // previously seen.
827 m.mass_trace.clear();
828 for (const mass_trace_list *p=mtlp; p; p=p->next)
829 m.mass_trace.push_back(p->item);
830 stats.best_match[m.spectrum_index].push_back(m);
835 // IDEA FIX: Use a cost parameter to define the iterative search front.
838 // Choose one possible residue modification position. Once they're all
839 // chosen, then evaluate.
840 static inline void
841 choose_residue_mod(const search_context &context, match &m,
842 const mass_trace_list *mtlp, std::vector<double> &mass_list,
843 const spmi_c_it &candidate_spectra_info_begin,
844 const spmi_c_it &candidate_spectra_info_end,
845 score_stats &stats,
846 std::vector<int> &db_remaining,
847 const unsigned remaining_positions_to_choose,
848 const unsigned next_position_to_consider) NOTHROW {
849 const parameters &CP = parameters::the;
850 assert(remaining_positions_to_choose
851 <= m.peptide_sequence.size() - next_position_to_consider);
853 if (CP.maximum_modification_combinations_searched
854 and (stats.combinations_searched
855 >= CP.maximum_modification_combinations_searched))
856 return;
858 if (remaining_positions_to_choose == 0) {
859 stats.combinations_searched += 1;
860 evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra_info_begin,
861 candidate_spectra_info_end, stats);
862 } else {
863 mass_trace_list mtl(mtlp);
865 // consider all of the positions where we could next add a mod
866 for (unsigned int i=next_position_to_consider;
867 i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
868 mtl.item.position = i;
869 const double save_pos_mass=mass_list[i];
870 const char pos_res = m.peptide_sequence[i];
872 // consider the possibilities for this position
873 for (std::vector<int>::const_iterator
874 it=context.delta_bag_lookup[pos_res].begin();
875 it != context.delta_bag_lookup[pos_res].end(); it++) {
876 const int db_index = *it;
877 if (db_remaining[db_index] < 1)
878 continue;
879 db_remaining[db_index] -= 1;
880 mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
881 mtl.item.delta = context.delta_bag_delta[db_index];
882 choose_residue_mod(context, m, &mtl, mass_list,
883 candidate_spectra_info_begin,
884 candidate_spectra_info_end, stats, db_remaining,
885 remaining_positions_to_choose-1, i+1);
886 db_remaining[db_index] += 1;
888 mass_list[i] = save_pos_mass;
894 // p_begin (p_end) to begin_index (end_index), updating p_mass in the process.
895 // The sign determines whether mass increases or decreases as p_begin (p_end)
896 // is moved forward--so it should be -1 for the begin case and +1 for the end
897 // case.
898 // FIX: is this worth its complexity? kill or else add reset
899 static inline void
900 update_p_mass(double &p_mass, int &p_begin, int begin_index, int sign,
901 const std::string &run_sequence,
902 const std::vector<double> &fixed_residue_mass) {
903 assert(sign == +1 or sign == -1);
904 assert(begin_index >= 0);
905 const bool moving_forward = begin_index >= p_begin;
906 int p0=p_begin, p1=begin_index;
907 if (not moving_forward) {
908 std::swap(p0, p1);
909 sign = -sign;
911 for (int i=p0; i<p1; i++)
912 p_mass += sign * fixed_residue_mass[run_sequence[i]];
913 p_begin = begin_index;
918 // Search a sequence run for matches according to the context against the
919 // spectra. Updates score_stats and the number of candidate spectra found.
921 // FIX: examine carefully for signed/unsigned problems
923 void inline
924 search_run(const search_context &context, const sequence_run &sequence_run,
925 score_stats &stats) {
926 const parameters &CP = parameters::the;
927 const int min_peptide_length = std::max<int>(CP.minimum_peptide_length,
928 context.mod_count);
929 const std::vector<double> &fixed_parent_mass \
930 = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
932 const std::string &run_sequence = sequence_run.sequence;
933 const std::vector<int> &cleavage_points = sequence_run.cleavage_points;
935 // This match will be passed inward and used to record information that we
936 // need to remember about a match when we finally see one. At that point, a
937 // copy of this match will be saved.
938 match m; // FIX: move inward?
939 m.sequence_index = sequence_run.sequence_index;
940 m.sequence_offset = sequence_run.sequence_offset;
942 assert(context.delta_bag_delta.size()
943 == context.delta_bag_count.size());
944 // counts remaining as mod positions are chosen
945 std::vector<int> db_remaining = context.delta_bag_count;
947 // the rightmost 'end' seen when all spectra masses were too high
948 // (0 means none yet encountered.)
949 unsigned int next_end = 0;
951 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
952 double p_mass = context.parent_fixed_mass;
953 int p_begin=0, p_end=0;
955 // FIX: optimize the non-specific cleavage case?
956 for (unsigned int begin=0; begin<cleavage_points.size()-1; begin++) {
957 const int begin_index = m.peptide_begin = cleavage_points[begin];
958 if (not context.pca_residues.empty())
959 if (context.pca_residues.find(run_sequence[begin_index])
960 == std::string::npos)
961 continue;
962 update_p_mass(p_mass, p_begin, begin_index, -1, run_sequence,
963 fixed_parent_mass);
964 unsigned int end = std::max<unsigned int>(begin + 1, next_end);
965 for (; end<cleavage_points.size(); end++) {
966 m.missed_cleavage_count = end - begin - 1;
967 if (m.missed_cleavage_count > context.maximum_missed_cleavage_sites)
968 break;
970 const int end_index = cleavage_points[end];
971 assert(end_index - begin_index > 0);
972 const int peptide_size = end_index - begin_index;
973 assert(peptide_size > 0);
974 if (peptide_size < min_peptide_length)
975 continue;
976 update_p_mass(p_mass, p_end, end_index, +1, run_sequence,
977 fixed_parent_mass);
978 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
981 double sp_mass_lb = p_mass + CP.parent_monoisotopic_mass_error_minus;
982 double sp_mass_ub = p_mass + CP.parent_monoisotopic_mass_error_plus;
984 const spmi_c_it candidate_spectra_info_begin
985 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
986 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
987 // spectrum masses all too low to match peptide (peptide too long)
988 //std::cerr << "peptide: sp low" << std::endl;
989 break;
992 const spmi_c_it candidate_spectra_info_end
993 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
994 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
995 // spectrum masses all too high to match peptide
996 // (peptide is too short, so increase end, if possible, else return)
997 //std::cerr << "peptide: sp high" << std::endl;
998 if (end == cleavage_points.size() - 1)
999 return;
1000 next_end = end;
1001 continue;
1003 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
1004 // no spectrum with close-enough parent mass
1005 //std::cerr << "peptide: sp none nearby" << std::endl;
1006 continue;
1009 m.parent_peptide_mass = p_mass;
1010 //m.fragment_peptide_mass = ???; unneeded???
1012 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
1014 std::vector<double> mass_list(peptide_size);
1015 for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size();
1016 i++)
1017 mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
1018 .fixed_residue_mass[m.peptide_sequence[i]]);
1020 choose_residue_mod(context, m, NULL, mass_list,
1021 candidate_spectra_info_begin,
1022 candidate_spectra_info_end, stats, db_remaining,
1023 context.mod_count, 0);
1029 // Search all sequence runs for matches according to the context against the
1030 // spectra. Updates score_stats and the number of candidate spectra found.
1031 void
1032 spectrum::search_runs(const search_context &context, score_stats &stats) {
1033 const parameters &CP = parameters::the;
1035 const int no_runs = context.sequence_runs.size();
1036 for (int i=0; i<no_runs; i++) {
1037 search_run(context, context.sequence_runs[i], stats);
1039 if (CP.show_progress)
1040 std::cerr << i+1 << " of " << no_runs << " sequences, "
1041 << stats.candidate_spectrum_count << " cand for "
1042 << stats.spectra_with_candidates << " sp, "
1043 << stats.improved_candidates << "++\r" << std::flush;
1046 if (CP.show_progress)
1047 std::cerr << std::endl;