minor code reorg (use new Python if/else operator)
[greylag.git] / cgreylag.cpp
blob5f1224bb6cf02cc21099e60e73973d2505398314
1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
23 #include <cerrno>
24 #include <cfloat>
25 #include <climits>
26 #include <cstdlib>
27 #include <cmath>
28 #include <iostream>
29 #include <numeric>
30 #include <set>
31 #include <stdexcept>
35 #ifdef __GNU__
36 // Use faster, non-threadsafe versions, since we don't use threads. This is
37 // maybe 10% faster?
38 #define fgetsX fgets_unlocked
39 #define fputsX fputs_unlocked
40 #define fprintfX __builtin_fprintf_unlocked
41 #define ferrorX ferror_unlocked
42 #define getcX getc_unlocked
43 #else
44 #define fgetsX std::fgets
45 #define fputsX std::fputs
46 #define fprintfX std::fprintf
47 #define ferrorX std::ferror
48 #define getcX std::getc
49 #endif
52 parameters parameters::the;
53 const parameters &CP = parameters::the;
56 char *
57 peak::__repr__() const {
58 static char temp[128];
59 sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
60 mz, intensity, intensity_class);
61 return &temp[0];
64 const int FIRST_SPECTRUM_ID = 1;
65 int spectrum::next_id = FIRST_SPECTRUM_ID;
66 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
68 std::vector<spectrum> spectrum::searchable_spectra;
69 // parent mass -> searchable_spectra index
70 std::multimap<double,
71 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
73 typedef
74 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
75 spmi_c_it; // ugh
78 char *
79 spectrum::__repr__() const {
80 static char temp[1024];
81 sprintf(temp,
82 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
83 id, physical_id, name.c_str(), mass, charge, peaks.size());
84 return &temp[0];
88 // FIX: does this actually help inlining?
89 // Return ln of n_C_k.
90 static inline double
91 ln_combination_(unsigned int n, unsigned int k) {
92 // Occasionally happens due to problems in scoring function. (FIX)
93 if (n < k)
94 return 0;
96 assert(0 <= k and k <= n and n < CP.ln_factorial.size());
97 return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
99 double
100 ln_combination(unsigned int n, unsigned int k) {
101 return ln_combination(n, k);
106 // This is an error exit for read_spectra*.
107 static inline void
108 io_error(FILE *f, const char *message="") {
109 if (ferrorX(f))
110 message = "I/O error while reading (or writing) spectrum file";
111 throw std::ios_base::failure(message);
115 // true iff s consists entirely of whitespace
116 static bool
117 at_eol(const char *s) {
118 return s[strspn(s, " \t\r\n\f\v")] == 0;
122 // Check whether we've past the end offset, throwing exception on I/O error.
123 static inline bool
124 check_past_end(FILE *f, const long offset_end) {
125 if (offset_end == -1)
126 return false;
127 long pos = std::ftell(f);
128 if (pos == -1)
129 io_error(f);
130 return pos >= offset_end;
134 // Read spectra from file in ms2 format, tagging them with file_id. Before
135 // reading, seek to absolute position offset_begin. If offset_end != -1, any
136 // spectra that begin after position offset_end in the file are not read.
138 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
139 // having the same physical id. Note that depending on
140 // offset_begin/offset_end, we may end up with the first charge and not the
141 // second, or vice versa.
143 // The peak list is initially sorted by mz. Throws an exception on invalid
144 // input. Error checking is the most stringent in this function. Other
145 // readers can be a little looser since all ms2 files eventually get read here
146 // anyway.
148 // This is pretty hideous. Is there a simpler way to read, reasonably
149 // efficiently, catching any input error?
151 std::vector<spectrum>
152 spectrum::read_spectra_from_ms2(FILE *f, const int file_id,
153 const long offset_begin,
154 const long offset_end) {
155 std::vector<spectrum> spectra;
157 const int bufsiz = 1024;
158 char buf[bufsiz];
159 char *endp;
161 // first seek to offset_begin and synchronize at next "\n:"
162 if (offset_begin > 0) {
163 if (std::fseek(f, offset_begin-1, SEEK_SET) == -1)
164 io_error(f);
165 int c = getcX(f);
166 while (c != '\n' and c != EOF)
167 c = getcX(f);
168 if (ferrorX(f))
169 io_error(f);
170 if (c == '\n') {
171 do {
172 c = getcX(f);
173 } while (c != ':' and c != EOF);
174 if (ferrorX(f))
175 io_error(f);
176 if (c == ':')
177 std::ungetc(c, f);
180 // at this point we expect to read the first ':' of a header, or EOF
182 bool past_end = check_past_end(f, offset_end);
183 char *result = fgetsX(buf, bufsiz, f);
184 while (true) {
185 std::vector<std::string> names;
186 std::vector<double> masses;
187 std::vector<int> charges;
189 // read headers
190 while (true) {
191 if (ferrorX(f))
192 io_error(f);
193 if (not result or buf[0] != ':')
194 break;
195 if (not past_end)
196 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
197 if (not fgetsX(buf, bufsiz, f))
198 io_error(f, "bad ms2 format: mass/charge line expected");
199 errno = 0;
200 double mass = std::strtod(buf, &endp); // need double accuracy here
201 if (not past_end)
202 masses.push_back(mass);
203 if (errno or endp == buf or mass <= 0)
204 io_error(f, "bad ms2 format: bad mass");
205 const char *endp0 = endp;
206 int charge = std::strtol(endp0, &endp, 10);
207 if (not past_end)
208 charges.push_back(charge);
209 if (errno or endp == endp0 or charge <= 0)
210 io_error(f, "bad ms2 format: bad charge");
211 if (not at_eol(endp))
212 io_error(f, "bad ms2 format: junk at end of mass/charge line");
213 past_end = past_end or check_past_end(f, offset_end);
214 result = fgetsX(buf, bufsiz, f);
216 if (names.empty() and past_end)
217 break;
218 if (not result) {
219 if (not names.empty())
220 io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
221 break;
223 // read peaks
224 std::vector<peak> peaks;
225 while (true) {
226 peak p;
227 errno = 0;
228 p.mz = strtod(buf, &endp);
229 if (errno or endp == buf or p.mz <= 0)
230 io_error(f, "bad ms2 format: bad peak mz");
231 const char *endp0 = endp;
232 p.intensity = strtod(endp0, &endp);
233 if (errno or endp == endp0 or p.intensity <= 0)
234 io_error(f, "bad ms2 format: bad peak intensity");
235 if (not at_eol(endp))
236 io_error(f, "bad ms2 format: junk at end of peak line");
237 peaks.push_back(p);
239 past_end = past_end or check_past_end(f, offset_end);
240 result = fgetsX(buf, bufsiz, f);
241 if (ferrorX(f))
242 io_error(f);
243 if (not result or buf[0] == ':')
244 break;
247 // add spectra to vector
248 if (names.empty() and not peaks.empty())
249 io_error(f, "bad ms2 format: missing header lines?");
251 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
253 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
254 spectrum sp(masses[i], charges[i]);
255 sp.peaks = peaks;
256 sp.name = names[i];
257 sp.file_id = file_id;
258 sp.physical_id = next_physical_id;
259 spectra.push_back(sp);
261 if (file_id != -1)
262 spectrum::next_physical_id += 1;
264 return spectra;
268 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
269 // to remove those too large to be fragment products. (Peaks are assumed to
270 // be ordered by increasing mz.)
271 void
272 spectrum::filter_peaks(double TIC_cutoff_proportion,
273 double parent_mass_tolerance_max) {
274 if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
275 or parent_mass_tolerance_max < 0)
276 throw std::invalid_argument("invalid argument value");
278 // remove peaks too large to be fragment products
279 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
280 const double peak_mz_limit = mass + parent_mass_tolerance_max;
282 std::vector<peak>::iterator pi;
283 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
284 peaks.erase(pi, peaks.end());
286 // FIX: note mzLowerBound..mzUpperBound here
288 // now filter by TIC
289 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
291 total_ion_current = 0;
292 for (pi=peaks.begin(); pi != peaks.end(); pi++)
293 total_ion_current += pi->intensity;
295 const double i_limit = total_ion_current * TIC_cutoff_proportion;
296 double accumulated_intensity = 0;
297 for (pi=peaks.begin();
298 pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
299 accumulated_intensity += pi->intensity;
301 peaks.erase(pi, peaks.end());
303 // restore order
304 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
308 // Classify peaks and update class_counts and peak bin counts.
309 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
310 void
311 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
312 double fragment_mass_tolerance) {
313 if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
314 throw std::invalid_argument("invalid argument value");
316 intensity_class_counts.clear();
317 intensity_class_counts.resize(intensity_class_count);
319 // e.g., 7 = 1 + 2 + 4
320 // FIX: does this fail for non-integer ratios?
321 int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
322 / (intensity_class_ratio - 1));
324 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
325 std::vector<peak>::iterator pi=peaks.begin();
326 for (int i_class=0; i_class<intensity_class_count; i_class++) {
327 int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
328 / min_count) + 0.5);
329 for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
330 pi->intensity_class = i_class;
331 intensity_class_counts[i_class]++;
334 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
336 if (not peaks.empty()) {
337 min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
338 max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
339 total_peak_bins = (int((max_peak_mz - min_peak_mz)
340 / (2 * fragment_mass_tolerance) + 0.5));
341 // This is the MyriMatch estimate. Wouldn't it be better to simply count
342 // how many are empty?
343 empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
348 // Update the *_cache fields from the peaks field.
349 void
350 spectrum::update_peak_cache() {
351 clear_peak_cache();
353 const unsigned int peak_count = peaks.size();
354 peak_mz_cache = new double[peak_count+1];
355 peak_intensity_class_cache = new int[peak_count+1];
357 for (unsigned int i=0; i<peak_count; i++) {
358 peak_mz_cache[i] = peaks[i].mz;
359 peak_intensity_class_cache[i] = peaks[i].intensity_class;
362 // negative value is terminator
363 peak_mz_cache[peak_count] = -1;
364 peak_intensity_class_cache[peak_count] = -1;
368 // Store the list of spectra that search_peptide will search against, and also
369 // build spectrum_mass_index.
370 void
371 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
372 searchable_spectra = spectra;
373 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++) {
374 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
375 searchable_spectra[i].update_peak_cache();
380 // Calculate peaks for a synthesized mass (not mz) ladder.
381 static inline void
382 get_synthetic_Y_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
383 const double *mass_list,
384 const double fragment_C_fixed_mass) {
385 double m = fragment_C_fixed_mass;
387 for (unsigned int i=ladder_size; i>0; i--) {
388 m += mass_list[i];
389 mass_ladder[ladder_size-i] = m;
394 // Calculate peaks for a synthesized mass (not mz) ladder.
395 static inline void
396 get_synthetic_B_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
397 const double *mass_list,
398 const double fragment_N_fixed_mass) {
399 double m = fragment_N_fixed_mass;
401 for (unsigned int i=0; i<ladder_size; i++) {
402 m += mass_list[i];
403 mass_ladder[i] = m;
408 // Generate synthetic spectra for a set of charges. Only the charge and peaks
409 // of these spectra are initialized.
410 static inline void
411 synthetic_spectra(double synth_sp_mz[/* max_fragment_charge+1 */]
412 [spectrum::MAX_THEORETICAL_FRAGMENTS],
413 const double *mass_list, const unsigned int mass_list_size,
414 const double fragment_N_fixed_mass,
415 const double fragment_C_fixed_mass,
416 const int max_fragment_charge) {
417 assert(mass_list_size >= 1);
418 const unsigned int ladder_size = mass_list_size - 1;
419 double Y_mass_ladder[ladder_size];
420 double B_mass_ladder[ladder_size];
422 for (int charge=1; charge<=max_fragment_charge; charge++) {
423 get_synthetic_Y_mass_ladder(Y_mass_ladder, ladder_size, mass_list,
424 fragment_C_fixed_mass);
425 get_synthetic_B_mass_ladder(B_mass_ladder, ladder_size, mass_list,
426 fragment_N_fixed_mass);
428 unsigned int y_i=0, b_i=0;
429 double *sp_mz_p = synth_sp_mz[charge];
431 // merge the two (already sorted) mass lists, converting them to mz values
432 // in the process
433 // FIX: try algorithm::merge?
434 while (y_i < ladder_size and b_i < ladder_size)
435 if (Y_mass_ladder[y_i] < B_mass_ladder[b_i])
436 *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
437 else
438 *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
439 while (y_i < ladder_size)
440 *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
441 while (b_i < ladder_size)
442 *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
443 *sp_mz_p = -1; // invalid mz as terminator
448 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
449 // theoretical spectrum (y) generated from a peptide candidate.
451 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
452 // better yet, sets) for peak lists simpler? As fast or faster?
454 // This is the innermost loop, so it's worthwhile to optimize this some.
455 // FIX const declaration
456 static inline double
457 score_spectrum(const spectrum &x,
458 const double synth_mzs[spectrum::MAX_THEORETICAL_FRAGMENTS]) {
459 // FIX this
460 unsigned int y_peak_count=0;
461 for (const double *p=synth_mzs; *p >= 0; p++, y_peak_count++);
463 double peak_best_delta[y_peak_count];
464 int peak_best_class[y_peak_count];
465 for (unsigned int i=0; i<y_peak_count; i++) {
466 peak_best_delta[i] = CP.fragment_mass_tolerance;
467 peak_best_class[i] = -1;
470 // Iterate over all closest pairs of peaks, collecting info on the class of
471 // the best (meaning closest mz) real peak matching each theoretical peak
472 // (for real/theoretical peak pairs that are no farther apart than
473 // fragment_mass_tolerance).
475 const unsigned int x_peak_count=x.peaks.size();
476 assert(x_peak_count > 0 and y_peak_count > 0);
478 unsigned int x_index=0, y_index=0;
479 while (true) {
480 const double delta = synth_mzs[y_index] - x.peak_mz_cache[x_index];
481 if (std::abs(delta) <= peak_best_delta[y_index]) {
482 peak_best_class[y_index] = x.peak_intensity_class_cache[x_index];
483 peak_best_delta[y_index] = std::abs(delta);
485 if (delta > 0) {
486 if (++x_index >= x_peak_count)
487 break;
488 } else {
489 if (++y_index >= y_peak_count)
490 break;
494 assert(CP.intensity_class_count < INT_MAX);
495 const unsigned int intensity_class_count = CP.intensity_class_count;
496 unsigned int peak_hit_histogram[intensity_class_count];
497 for (unsigned int i=0; i<intensity_class_count; i++)
498 peak_hit_histogram[i] = 0;
499 for (unsigned int i=0; i<y_peak_count; i++) {
500 const int bc = peak_best_class[i];
501 if (bc >= 0) {
502 assert(bc < static_cast<int>(intensity_class_count));
503 peak_hit_histogram[bc] += 1;
507 // How many theoretical peaks overlap the real peak range?
508 int valid_theoretical_peaks = 0;
509 for (unsigned int i=0; i<y_peak_count; i++)
510 if (x.min_peak_mz <= synth_mzs[i] and synth_mzs[i] <= x.max_peak_mz)
511 valid_theoretical_peaks++;
513 unsigned int total_peak_hits = 0;
514 for (unsigned int i=0; i<intensity_class_count; i++)
515 total_peak_hits += peak_hit_histogram[i];
517 // How many valid theoretical peaks were misses?
518 const int peak_misses = valid_theoretical_peaks - total_peak_hits;
519 assert(peak_misses >= 0);
521 double score = ln_combination_(x.empty_peak_bins, peak_misses);
522 for (unsigned int i=0; i<intensity_class_count; i++)
523 score += ln_combination_(x.intensity_class_counts[i],
524 peak_hit_histogram[i]);
525 score -= ln_combination_(x.total_peak_bins, valid_theoretical_peaks);
527 return score;
531 // Return the score of the specified spectrum against a group of synthetic
532 // fragment spectra.
534 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
535 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
536 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
537 // or +2b/+1y?)
538 // FIX: implement MM smart +3 model, and/or come up with something better.
539 // FIX const decl
540 static inline double
541 score_spectrum_over_charges(const double
542 synth_sp_mz[/* max_fragment_charge+1 */]
543 [spectrum::MAX_THEORETICAL_FRAGMENTS],
544 const spectrum &sp,
545 const int max_fragment_charge) {
546 double best_score = score_spectrum(sp, synth_sp_mz[1]);
548 const int charge_limit = std::max<int>(1, sp.charge-1);
549 for (int charge=2; charge<=charge_limit; charge++) {
550 assert(charge <= max_fragment_charge);
551 best_score += score_spectrum(sp, synth_sp_mz[charge]);
553 return best_score;
557 struct mass_trace_list {
558 mass_trace_item item;
559 const mass_trace_list *next;
561 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
565 // fuzzy comparison might not be necessary, but it's safer
566 static inline bool
567 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
570 // Search for matches of this particular peptide modification variation
571 // against the spectra. Updates score_stats and returns the number of
572 // candidate spectra found.
573 static inline void
574 evaluate_peptide(const search_context &context, match &m,
575 const mass_trace_list *mtlp, const double *mass_list,
576 const std::vector<std::vector<spectrum>::size_type>
577 &candidate_spectra,
578 score_stats &stats) {
579 int max_candidate_charge = 0;
580 typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
581 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
582 max_candidate_charge
583 = std::max<int>(max_candidate_charge,
584 spectrum::searchable_spectra[*it].charge);
585 assert(max_candidate_charge >= 1);
587 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
588 assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
590 const unsigned int peptide_size = m.peptide_sequence.size();
591 // FIX: "2" assumes just two ion series (e.g., B and Y)
592 assert(peptide_size <= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS-1));
594 // This array is preallocated to avoid the overhead of allocation and
595 // deallocation within the inner loop.
596 // FIX
597 // charge -> mz array
598 static double synth_sp_mz[spectrum::MAX_SUPPORTED_CHARGE+1]
599 [spectrum::MAX_THEORETICAL_FRAGMENTS];
601 synthetic_spectra(synth_sp_mz, mass_list, peptide_size,
602 context.fragment_N_fixed_mass,
603 context.fragment_C_fixed_mass, max_fragment_charge);
605 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
606 m.spectrum_index = *it;
607 spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
609 sp.comparisons += 1;
610 stats.candidate_spectrum_count += 1;
611 if (CP.estimate_only)
612 continue;
614 //std::cerr << "sp " << m.spectrum_index << std::endl;
615 m.score = score_spectrum_over_charges(synth_sp_mz, sp, max_fragment_charge);
616 //std::cerr << "score " << m.score << std::endl;
618 std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
619 // Is this score good enough to be in the best score list? (Better scores
620 // are more negative.)
621 if (m.score > sp_best_matches.back().score)
622 continue;
624 // We're saving this match, so remember the mass trace info, too.
625 m.mass_trace.clear();
626 for (const mass_trace_list *p=mtlp; p; p=p->next)
627 m.mass_trace.push_back(p->item);
629 // If this is duplicate, just append to the existing match and return
630 for (std::vector<match>::reverse_iterator rit
631 = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
632 if (score_equal(rit->score, m.score)
633 and rit->peptide_sequence == m.peptide_sequence
634 and rit->mass_regime_index == m.mass_regime_index
635 and rit->conjunct_index == m.conjunct_index
636 and rit->mass_trace == m.mass_trace) {
637 rit->peptide_begin.push_back(m.peptide_begin[0]);
638 rit->sequence_name.push_back(m.sequence_name[0]);
639 return;
642 // Otherwise, insert this match in the correct position
643 assert(sp_best_matches.size() >= 1);
644 std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
645 for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
646 *rit = *(rit+1);
647 *rit = m;
652 // IDEA FIX: Use a cost parameter to define the iterative search front.
655 // Choose one possible residue modification position. Once they're all
656 // chosen, then evaluate.
657 static inline void
658 choose_residue_mod(const search_context &context, match &m,
659 const mass_trace_list *mtlp, double *mass_list,
660 const std::vector<std::vector<spectrum>::size_type>
661 &candidate_spectra,
662 score_stats &stats,
663 std::vector<int> &db_remaining,
664 const unsigned int remaining_positions_to_choose,
665 const unsigned int next_position_to_consider) {
666 assert(remaining_positions_to_choose
667 <= m.peptide_sequence.size() - next_position_to_consider);
669 if (remaining_positions_to_choose == 0) {
670 stats.combinations_searched += 1;
671 evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
672 } else {
673 mass_trace_list mtl(mtlp);
675 // consider all of the positions where we could next add a mod
676 for (unsigned int i=next_position_to_consider;
677 i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
678 mtl.item.position = i;
679 const double save_pos_mass=mass_list[i];
680 const char pos_res = m.peptide_sequence[i];
682 // consider the possibilities for this position
683 for (std::vector<int>::const_iterator
684 it=context.delta_bag_lookup[pos_res].begin();
685 it != context.delta_bag_lookup[pos_res].end(); it++) {
686 const int db_index = *it;
687 if (db_remaining[db_index] < 1)
688 continue;
689 db_remaining[db_index] -= 1;
690 mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
691 mtl.item.delta = context.delta_bag_delta[db_index];
692 choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
693 stats, db_remaining,
694 remaining_positions_to_choose-1, i+1);
695 db_remaining[db_index] += 1;
697 mass_list[i] = save_pos_mass;
703 static inline int
704 search_peptide(const search_context &context, const double &p_mass, match &m,
705 const std::string &run_sequence, const int &begin_index,
706 const unsigned int &peptide_size, score_stats &stats,
707 std::vector<int> &db_remaining) {
708 // We'd like to consider only the spectra whose masses are within the
709 // tolerance range of theoretical mass of the peptide (p_mass).
710 // Unfortunately, the tolerance range of each spectrum depends on its
711 // charge, since the tolerance is specified in m/z units. Furthermore,
712 // when deciding how to adjust begin/end to generate the next peptide,
713 // we need to be conservative, as a high-charge spectrum may come into
714 // range, even though only +1 spectra are currently in-range. So, we
715 // first screen using the maximum tolerance range, then check the actual
716 // spectra below.
718 const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
719 const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
721 const spmi_c_it candidate_spectra_info_begin
722 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
723 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
724 // spectrum masses all too low to match peptide (peptide too long)
725 return -1;
728 const spmi_c_it candidate_spectra_info_end
729 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
730 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
731 // spectrum masses all too high to match peptide (peptide too short)
732 return 1;
734 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
735 // no spectrum with close-enough parent mass
736 return 0;
739 // Now generate the list of candidate spectra that are actually
740 // in-range, considering the charge of each spectrum.
741 std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
742 for (spmi_c_it it=candidate_spectra_info_begin;
743 it != candidate_spectra_info_end; it++) {
744 const spectrum &csp = spectrum::searchable_spectra[it->second];
745 if (std::abs(csp.mass - p_mass) <= (csp.charge
746 * CP.parent_mass_tolerance_1))
747 candidate_spectra_0.push_back(it->second);
749 if (candidate_spectra_0.empty())
750 return 0;
752 m.predicted_parent_mass = p_mass;
753 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
755 double mass_list[peptide_size];
756 for (unsigned int i=0; i<peptide_size; i++)
757 mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
758 .fixed_residue_mass[m.peptide_sequence[i]]);
760 choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
761 stats, db_remaining, context.mod_count, 0);
762 return 0;
766 // Search a sequence run for matches according to the context against the
767 // spectra. Updates score_stats and the number of candidate spectra found.
769 // FIX: examine carefully for signed/unsigned problems
771 static inline void
772 search_run(const search_context &context, const sequence_run &sequence_run,
773 score_stats &stats) {
774 const unsigned int min_peptide_length
775 = std::max<unsigned int>(CP.minimum_peptide_length, context.mod_count);
776 const std::vector<double> &fixed_parent_mass
777 = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
779 const std::string &run_sequence = sequence_run.sequence;
781 // This match will be passed inward and used to record information that we
782 // need to remember about a match when we finally see one. At that point, a
783 // copy of this match will be saved.
784 // FIX: can this be done better?
785 match m;
786 m.sequence_name.push_back(sequence_run.name);
787 m.peptide_begin.push_back(0);
788 int &peptide_begin = m.peptide_begin[0];
789 m.mass_regime_index = context.mass_regime_index;
790 m.conjunct_index = context.conjunct_index;
791 m.pca_delta = context.pca_delta;
793 assert(context.delta_bag_delta.size() == context.delta_bag_count.size());
794 // counts remaining as mod positions are chosen
795 std::vector<int> db_remaining = context.delta_bag_count;
797 // the rightmost 'end' seen when all spectra masses were too high
798 // (0 means none yet encountered.)
799 unsigned int next_end = 0;
801 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
802 double p_mass = context.parent_fixed_mass;
803 int p_begin=0, p_end=0;
805 const std::vector<int> *cleavage_points = &sequence_run.cleavage_points;
806 std::vector<int>::size_type cleavage_points_size = cleavage_points->size();
807 // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
808 static std::vector<int> nonspecific_cleavage_points;
809 if (context.nonspecific_cleavage) {
810 cleavage_points_size = run_sequence.size() + 1;
811 for (unsigned int i=nonspecific_cleavage_points.size();
812 i<cleavage_points_size; i++)
813 nonspecific_cleavage_points.push_back(i);
814 cleavage_points = &nonspecific_cleavage_points;
816 assert(cleavage_points_size >= 2); // endpoints must be present
818 for (unsigned int begin=0; begin<cleavage_points_size-1; begin++) {
819 const int begin_index = (*cleavage_points)[begin];
820 peptide_begin = begin_index + sequence_run.sequence_offset;
822 // If pca_residues specified, require one of them to be the peptide
823 // N-terminal residue
824 switch (context.pca_residues.size()) {
825 case 0:
826 break;
827 case 2:
828 if (context.pca_residues[1] == run_sequence[begin_index])
829 break;
830 case 1:
831 if (context.pca_residues[0] == run_sequence[begin_index])
832 break;
833 continue;
834 default:
835 assert(false);
838 assert(begin_index >= p_begin and begin_index >= 0);
839 for (int i=p_begin; i<begin_index; i++)
840 p_mass -= fixed_parent_mass[run_sequence[i]];
841 p_begin = begin_index;
843 unsigned int end = std::max<unsigned int>(begin + 1, next_end);
844 for (; end<cleavage_points_size; end++) {
845 if (not context.nonspecific_cleavage) {
846 const int missed_cleavage_count = end - begin - 1;
847 if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
848 break;
851 const int end_index = (*cleavage_points)[end];
852 assert(end_index > begin_index);
853 const unsigned int peptide_size = end_index - begin_index;
854 if (peptide_size < min_peptide_length)
855 continue;
857 assert(end_index >= 0);
858 if (end_index >= p_end)
859 for (int i=p_end; i<end_index; i++)
860 p_mass += fixed_parent_mass[run_sequence[i]];
861 else
862 for (int i=end_index; i<p_end; i++)
863 p_mass -= fixed_parent_mass[run_sequence[i]];
864 p_end = end_index;
867 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
869 int status = search_peptide(context, p_mass, m, run_sequence,
870 begin_index, peptide_size, stats,
871 db_remaining);
872 if (status == -1) // peptide was too long
873 break;
874 if (status == 1) { // peptide was too short
875 // so next start with a peptide at least this long
876 if (end == cleavage_points_size - 1)
877 return;
878 next_end = end;
885 // Search all sequence runs for matches according to the context against the
886 // spectra. Updates score_stats and the number of candidate spectra found.
887 void
888 spectrum::search_runs(const search_context &context, score_stats &stats) {
889 const int no_runs = context.sequence_runs.size();
890 for (int i=0; i<no_runs; i++) {
891 search_run(context, context.sequence_runs[i], stats);
893 if (CP.show_progress)
894 std::cerr << i+1 << " of " << no_runs << " sequences, "
895 << stats.candidate_spectrum_count << " candidates\r"
896 << std::flush;
899 if (CP.show_progress)
900 std::cerr << std::endl;