add chase_error (better error messages to user), hex sha1
[greylag.git] / cgreylag.cpp
blob9e523c8c4eebb02a1cce212af95cd648654bf5ae
1 // C++ module for greylag
3 // greylag, a collection of programs for MS/MS protein analysis
4 // Copyright (C) 2006-2008 Stowers Institute for Medical Research
5 //
6 // This program is free software: you can redistribute it and/or modify
7 // it under the terms of the GNU General Public License as published by
8 // the Free Software Foundation, either version 3 of the License, or
9 // (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19 // Contact: Mike Coleman
20 // Stowers Institute for Medical Research
21 // 1000 East 50th Street
22 // Kansas City, Missouri 64110
23 // USA
26 #include "cgreylag.hpp"
29 #include <cerrno>
30 #include <cfloat>
31 #include <climits>
32 #include <cstdlib>
33 #include <cmath>
34 #include <iostream>
35 #include <numeric>
36 #include <set>
37 #include <stdexcept>
41 #ifdef __GNU__
42 // Use faster, non-threadsafe versions, since we don't use threads. This is
43 // maybe 10% faster?
44 #define fgetsX fgets_unlocked
45 #define fputsX fputs_unlocked
46 #define fprintfX __builtin_fprintf_unlocked
47 #define ferrorX ferror_unlocked
48 #define getcX getc_unlocked
49 #else
50 #define fgetsX std::fgets
51 #define fputsX std::fputs
52 #define fprintfX std::fprintf
53 #define ferrorX std::ferror
54 #define getcX std::getc
55 #endif
58 parameters parameters::the;
59 const parameters &CP = parameters::the;
62 char *
63 peak::__repr__() const {
64 static char temp[128];
65 sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
66 mz, intensity, intensity_class);
67 return &temp[0];
70 const int FIRST_SPECTRUM_ID = 1;
71 int spectrum::next_id = FIRST_SPECTRUM_ID;
72 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
74 std::vector<spectrum> spectrum::searchable_spectra;
75 // parent mass -> searchable_spectra index
76 std::multimap<double,
77 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
79 typedef
80 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
81 spmi_c_it; // ugh
84 char *
85 spectrum::__repr__() const {
86 static char temp[1024];
87 sprintf(temp,
88 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
89 id, physical_id, name.c_str(), mass, charge, peaks.size());
90 return &temp[0];
94 // Return ln of n_C_k.
95 static inline double
96 ln_combination(unsigned int n, unsigned int k) {
97 // Occasionally happens due to problems in scoring function. (FIX)
98 if (n < k)
99 return 0;
101 assert(0 <= k and k <= n and n < CP.ln_factorial.size());
102 return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
106 // FIX: Currently we're reading spectrum files in Python, which is fast enough?
107 // // This is an error exit for read_spectra*.
108 // static inline void
109 // io_error(FILE *f, const char *message="") {
110 // if (ferrorX(f))
111 // message = "I/O error while reading (or writing) spectrum file";
112 // throw std::ios_base::failure(message);
113 // }
116 // // true iff s consists entirely of whitespace
117 // static inline bool
118 // at_eol(const char *s) {
119 // return s[strspn(s, " \t\r\n\f\v")] == 0;
120 // }
123 // // Read spectra from file in ms2 format, tagging them with file_id.
125 // // Multiply charged spectra (e.g., +2/+3) are split into separate spectra
126 // // having the same physical id. Note that depending on
127 // // offset_begin/offset_end, we may end up with the first charge and not the
128 // // second, or vice versa.
130 // // The peak list is initially sorted by mz. Throws an exception on invalid
131 // // input. Error checking is stringent in this function.
133 // // This is pretty hideous. Is there a simpler way to read, reasonably
134 // // efficiently, catching any input error?
136 // std::vector<spectrum>
137 // spectrum::read_spectra_from_ms2(FILE *f, const int file_id) {
138 // std::vector<spectrum> spectra;
140 // const int bufsiz = 1024;
141 // char buf[bufsiz];
142 // char *endp;
144 // char *result = fgetsX(buf, bufsiz, f);
145 // while (true) {
146 // std::vector<std::string> names;
147 // std::vector<double> masses;
148 // std::vector<int> charges;
150 // // read headers
151 // while (true) {
152 // if (ferrorX(f))
153 // io_error(f);
154 // if (not result or buf[0] != ':')
155 // break;
156 // names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
157 // if (not fgetsX(buf, bufsiz, f))
158 // io_error(f, "bad ms2 format: mass/charge line expected");
159 // errno = 0;
160 // double mass = std::strtod(buf, &endp); // need double accuracy here
161 // masses.push_back(mass);
162 // if (errno or endp == buf or mass <= 0)
163 // io_error(f, "bad ms2 format: bad mass");
164 // const char *endp0 = endp;
165 // int charge = std::strtol(endp0, &endp, 10);
166 // charges.push_back(charge);
167 // if (errno or endp == endp0 or charge <= 0)
168 // io_error(f, "bad ms2 format: bad charge");
169 // if (not at_eol(endp))
170 // io_error(f, "bad ms2 format: junk at end of mass/charge line");
171 // result = fgetsX(buf, bufsiz, f);
172 // }
173 // if (not result) {
174 // if (not names.empty())
175 // io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
176 // break;
177 // }
178 // // read peaks
179 // std::vector<peak> peaks;
180 // while (true) {
181 // peak p;
182 // errno = 0;
183 // p.mz = strtod(buf, &endp);
184 // if (errno or endp == buf or p.mz <= 0)
185 // io_error(f, "bad ms2 format: bad peak mz");
186 // const char *endp0 = endp;
187 // p.intensity = strtod(endp0, &endp);
188 // if (errno or endp == endp0 or p.intensity <= 0)
189 // io_error(f, "bad ms2 format: bad peak intensity");
190 // if (not at_eol(endp))
191 // io_error(f, "bad ms2 format: junk at end of peak line");
192 // peaks.push_back(p);
194 // result = fgetsX(buf, bufsiz, f);
195 // if (ferrorX(f))
196 // io_error(f);
197 // if (not result or buf[0] == ':')
198 // break;
199 // }
201 // // add spectra to vector
202 // if (names.empty() and not peaks.empty())
203 // io_error(f, "bad ms2 format: missing header lines?");
205 // std::sort(peaks.begin(), peaks.end(), peak::less_mz);
207 // for (std::vector<double>::size_type i=0; i<names.size(); i++) {
208 // spectrum sp(masses[i], charges[i]);
209 // sp.peaks = peaks;
210 // sp.name = names[i];
211 // sp.file_id = file_id;
212 // sp.physical_id = next_physical_id;
213 // spectra.push_back(sp);
214 // }
215 // if (file_id != -1)
216 // spectrum::next_physical_id += 1;
217 // }
218 // return spectra;
219 // }
222 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
223 // to remove those too large to be fragment products. (Peaks are assumed to
224 // be ordered by increasing mz.)
225 void
226 spectrum::filter_peaks(double TIC_cutoff_proportion,
227 double parent_mass_tolerance_max) {
228 if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
229 or parent_mass_tolerance_max < 0)
230 throw std::invalid_argument("invalid argument value");
232 // remove peaks too large to be fragment products
233 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
234 const double peak_mz_limit = mass + parent_mass_tolerance_max;
236 std::vector<peak>::iterator pi;
237 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
238 peaks.erase(pi, peaks.end());
240 // FIX: note mzLowerBound..mzUpperBound here
242 // now filter by TIC
243 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
245 total_ion_current = 0;
246 for (pi=peaks.begin(); pi != peaks.end(); pi++)
247 total_ion_current += pi->intensity;
249 const double i_limit = total_ion_current * TIC_cutoff_proportion;
250 double accumulated_intensity = 0;
251 for (pi=peaks.begin();
252 pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
253 accumulated_intensity += pi->intensity;
255 peaks.erase(pi, peaks.end());
257 // restore order
258 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
262 // Classify peaks and update class_counts and peak bin counts.
263 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
264 void
265 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
266 double fragment_mass_tolerance) {
267 if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
268 throw std::invalid_argument("invalid argument value");
270 intensity_class_counts.clear();
271 intensity_class_counts.resize(intensity_class_count);
273 // e.g., 7 = 1 + 2 + 4
274 // FIX: does this fail for non-integer ratios?
275 int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
276 / (intensity_class_ratio - 1));
278 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
279 std::vector<peak>::iterator pi=peaks.begin();
280 for (int i_class=0; i_class<intensity_class_count; i_class++) {
281 int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
282 / min_count) + 0.5);
283 for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
284 pi->intensity_class = i_class;
285 intensity_class_counts[i_class]++;
288 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
290 if (not peaks.empty()) {
291 min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
292 max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
293 total_peak_bins = (int((max_peak_mz - min_peak_mz)
294 / (2 * fragment_mass_tolerance) + 0.5));
295 // This is the MyriMatch estimate. Wouldn't it be better to simply count
296 // how many are empty?
297 empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
302 // Update the *_cache fields from the peaks field.
303 void
304 spectrum::update_peak_cache() {
305 clear_peak_cache();
307 const unsigned int peak_count = peaks.size();
308 peak_mz_cache = new double[peak_count+1];
309 peak_intensity_class_cache = new int[peak_count+1];
311 for (unsigned int i=0; i<peak_count; i++) {
312 peak_mz_cache[i] = peaks[i].mz;
313 peak_intensity_class_cache[i] = peaks[i].intensity_class;
316 // negative value is terminator
317 peak_mz_cache[peak_count] = -1;
318 peak_intensity_class_cache[peak_count] = -1;
322 // Store the list of spectra that search_peptide will search against, and also
323 // build spectrum_mass_index.
324 void
325 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
326 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
327 spectra[i].clear_peak_cache();
328 searchable_spectra = spectra;
330 spectrum_mass_index.clear();
331 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++) {
332 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
333 searchable_spectra[i].update_peak_cache();
338 // Calculate peaks for a synthesized mass (not mz) ladder.
339 static inline void
340 get_synthetic_Y_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
341 const double *mass_list,
342 const double fragment_C_fixed_mass) {
343 double m = fragment_C_fixed_mass;
345 for (unsigned int i=ladder_size; i>0; i--) {
346 m += mass_list[i];
347 mass_ladder[ladder_size-i] = m;
352 // Calculate peaks for a synthesized mass (not mz) ladder.
353 static inline void
354 get_synthetic_B_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
355 const double *mass_list,
356 const double fragment_N_fixed_mass) {
357 double m = fragment_N_fixed_mass;
359 for (unsigned int i=0; i<ladder_size; i++) {
360 m += mass_list[i];
361 mass_ladder[i] = m;
366 // Generate synthetic spectra for a set of charges. Only the charge and peaks
367 // of these spectra are initialized.
368 static inline void
369 synthetic_spectra(double synth_sp_mz[/* max_fragment_charge+1 */]
370 [spectrum::MAX_THEORETICAL_FRAGMENTS],
371 const double *mass_list, const unsigned int mass_list_size,
372 const double fragment_N_fixed_mass,
373 const double fragment_C_fixed_mass,
374 const int max_fragment_charge) {
375 assert(mass_list_size >= 1);
376 const unsigned int ladder_size = mass_list_size - 1;
377 double Y_mass_ladder[ladder_size];
378 double B_mass_ladder[ladder_size];
380 for (int charge=1; charge<=max_fragment_charge; charge++) {
381 get_synthetic_Y_mass_ladder(Y_mass_ladder, ladder_size, mass_list,
382 fragment_C_fixed_mass);
383 get_synthetic_B_mass_ladder(B_mass_ladder, ladder_size, mass_list,
384 fragment_N_fixed_mass);
386 unsigned int y_i=0, b_i=0;
387 double *sp_mz_p = synth_sp_mz[charge];
389 // merge the two (already sorted) mass lists, converting them to mz values
390 // in the process
391 // FIX: try algorithm::merge?
392 while (y_i < ladder_size and b_i < ladder_size)
393 if (Y_mass_ladder[y_i] < B_mass_ladder[b_i])
394 *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
395 else
396 *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
397 while (y_i < ladder_size)
398 *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
399 while (b_i < ladder_size)
400 *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
401 *sp_mz_p = -1; // invalid mz as terminator
406 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
407 // theoretical spectrum (y) generated from a peptide candidate.
409 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
410 // better yet, sets) for peak lists simpler? As fast or faster?
412 // This is the innermost loop, so it's worthwhile to optimize this some.
413 // FIX const declaration
414 static inline double
415 score_spectrum(const spectrum &x,
416 const double synth_mzs[spectrum::MAX_THEORETICAL_FRAGMENTS]) {
417 // FIX this
418 unsigned int y_peak_count=0;
419 for (const double *p=synth_mzs; *p >= 0; p++, y_peak_count++);
421 double peak_best_delta[y_peak_count];
422 int peak_best_class[y_peak_count];
423 for (unsigned int i=0; i<y_peak_count; i++) {
424 peak_best_delta[i] = CP.fragment_mass_tolerance;
425 peak_best_class[i] = -1;
428 // Iterate over all closest pairs of peaks, collecting info on the class of
429 // the best (meaning closest mz) real peak matching each theoretical peak
430 // (for real/theoretical peak pairs that are no farther apart than
431 // fragment_mass_tolerance).
433 const unsigned int x_peak_count=x.peaks.size();
434 assert(x_peak_count > 0 and y_peak_count > 0);
436 unsigned int x_index=0, y_index=0;
437 while (true) {
438 const double delta = synth_mzs[y_index] - x.peak_mz_cache[x_index];
439 if (std::abs(delta) <= peak_best_delta[y_index]) {
440 peak_best_class[y_index] = x.peak_intensity_class_cache[x_index];
441 peak_best_delta[y_index] = std::abs(delta);
443 if (delta > 0) {
444 if (++x_index >= x_peak_count)
445 break;
446 } else {
447 if (++y_index >= y_peak_count)
448 break;
452 assert(CP.intensity_class_count < INT_MAX);
453 const unsigned int intensity_class_count = CP.intensity_class_count;
454 unsigned int peak_hit_histogram[intensity_class_count];
455 for (unsigned int i=0; i<intensity_class_count; i++)
456 peak_hit_histogram[i] = 0;
457 for (unsigned int i=0; i<y_peak_count; i++) {
458 const int bc = peak_best_class[i];
459 if (bc >= 0) {
460 assert(bc < static_cast<int>(intensity_class_count));
461 peak_hit_histogram[bc] += 1;
465 // How many theoretical peaks overlap the real peak range?
466 int valid_theoretical_peaks = 0;
467 for (unsigned int i=0; i<y_peak_count; i++)
468 if (x.min_peak_mz <= synth_mzs[i] and synth_mzs[i] <= x.max_peak_mz)
469 valid_theoretical_peaks++;
471 unsigned int total_peak_hits = 0;
472 for (unsigned int i=0; i<intensity_class_count; i++)
473 total_peak_hits += peak_hit_histogram[i];
475 // How many valid theoretical peaks were misses?
476 const int peak_misses = valid_theoretical_peaks - total_peak_hits;
477 assert(peak_misses >= 0);
479 double score = ln_combination(x.empty_peak_bins, peak_misses);
480 for (unsigned int i=0; i<intensity_class_count; i++)
481 score += ln_combination(x.intensity_class_counts[i],
482 peak_hit_histogram[i]);
483 score -= ln_combination(x.total_peak_bins, valid_theoretical_peaks);
485 return score;
489 // Return the score of the specified spectrum against a group of synthetic
490 // fragment spectra.
492 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
493 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
494 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
495 // or +2b/+1y?)
496 // FIX: implement MM smart +3 model, and/or come up with something better.
497 // FIX const decl
498 static inline double
499 score_spectrum_over_charges(const double
500 synth_sp_mz[/* max_fragment_charge+1 */]
501 [spectrum::MAX_THEORETICAL_FRAGMENTS],
502 const spectrum &sp,
503 const int max_fragment_charge) {
504 double best_score = score_spectrum(sp, synth_sp_mz[1]);
506 const int charge_limit = std::max<int>(1, sp.charge-1);
507 for (int charge=2; charge<=charge_limit; charge++) {
508 assert(charge <= max_fragment_charge);
509 best_score += score_spectrum(sp, synth_sp_mz[charge]);
511 return best_score;
515 struct mass_trace_list {
516 mass_trace_item item;
517 const mass_trace_list *next;
519 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
523 // fuzzy comparison might not be necessary, but it's safer
524 static inline bool
525 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
528 // Search for matches of this particular peptide modification variation
529 // against the spectra. Updates score_stats and returns the number of
530 // candidate spectra found.
531 static inline void
532 evaluate_peptide(const search_context &context, match &m,
533 const mass_trace_list *mtlp, const double *mass_list,
534 const std::vector<std::vector<spectrum>::size_type>
535 &candidate_spectra,
536 score_stats &stats) {
537 int max_candidate_charge = 0;
538 typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
539 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
540 max_candidate_charge
541 = std::max<int>(max_candidate_charge,
542 spectrum::searchable_spectra[*it].charge);
543 assert(max_candidate_charge >= 1);
545 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
546 assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
548 const unsigned int peptide_size = m.peptide_sequence.size();
549 // FIX: "2" assumes just two ion series (e.g., B and Y)
550 assert(peptide_size <= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS-1));
552 // This array is preallocated to avoid the overhead of allocation and
553 // deallocation within the inner loop.
554 // FIX
555 // charge -> mz array
556 static double synth_sp_mz[spectrum::MAX_SUPPORTED_CHARGE+1]
557 [spectrum::MAX_THEORETICAL_FRAGMENTS];
559 synthetic_spectra(synth_sp_mz, mass_list, peptide_size,
560 context.fragment_N_fixed_mass,
561 context.fragment_C_fixed_mass, max_fragment_charge);
563 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
564 m.spectrum_index = *it;
565 spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
567 sp.comparisons += 1;
568 stats.candidate_spectrum_count += 1;
569 if (CP.estimate_only)
570 continue;
572 //std::cerr << "sp " << m.spectrum_index << std::endl;
573 m.score = score_spectrum_over_charges(synth_sp_mz, sp, max_fragment_charge);
574 //std::cerr << "score " << m.score << std::endl;
576 std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
577 // Is this score good enough to be in the best score list? (Better scores
578 // are more negative.)
579 if (m.score > sp_best_matches.back().score)
580 continue;
582 // We're saving this match, so remember the mass trace info, too.
583 m.mass_trace.clear();
584 for (const mass_trace_list *p=mtlp; p; p=p->next)
585 m.mass_trace.push_back(p->item);
587 // If this is duplicate, just append to the existing match and continue
588 for (std::vector<match>::reverse_iterator rit
589 = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
590 if (score_equal(rit->score, m.score)
591 and rit->peptide_sequence == m.peptide_sequence
592 and rit->mass_regime_index == m.mass_regime_index
593 and rit->conjunct_index == m.conjunct_index
594 and rit->mass_trace == m.mass_trace) {
595 rit->peptide_begin.push_back(m.peptide_begin[0]);
596 rit->sequence_name.push_back(m.sequence_name[0]);
597 continue;
600 // Otherwise, insert this match in the correct position
601 assert(sp_best_matches.size() >= 1);
602 std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
603 for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
604 *rit = *(rit+1);
605 *rit = m;
610 // IDEA FIX: Use a cost parameter to define the iterative search front.
613 // Choose one possible residue modification position. Once they're all
614 // chosen, then evaluate.
615 static inline void
616 choose_residue_mod(const search_context &context, match &m,
617 const mass_trace_list *mtlp, double *mass_list,
618 const std::vector<std::vector<spectrum>::size_type>
619 &candidate_spectra,
620 score_stats &stats,
621 std::vector<int> &db_remaining,
622 const unsigned int remaining_positions_to_choose,
623 const unsigned int next_position_to_consider) {
624 assert(remaining_positions_to_choose
625 <= m.peptide_sequence.size() - next_position_to_consider);
627 if (remaining_positions_to_choose == 0) {
628 stats.evaluation_count += 1;
629 evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
630 } else {
631 mass_trace_list mtl(mtlp);
633 // consider all of the positions where we could next add a mod
634 for (unsigned int i=next_position_to_consider;
635 i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
636 mtl.item.position = i;
637 const double save_pos_mass=mass_list[i];
638 const char pos_res = m.peptide_sequence[i];
640 // consider the possibilities for this position
641 for (std::vector<int>::const_iterator
642 it=context.delta_bag_lookup[pos_res].begin();
643 it != context.delta_bag_lookup[pos_res].end(); it++) {
644 const int db_index = *it;
645 if (db_remaining[db_index] < 1)
646 continue;
647 db_remaining[db_index] -= 1;
648 mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
649 mtl.item.conjunct_item_index = db_index;
650 choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
651 stats, db_remaining,
652 remaining_positions_to_choose-1, i+1);
653 db_remaining[db_index] += 1;
655 mass_list[i] = save_pos_mass;
661 static inline int
662 search_peptide(const search_context &context, const double &p_mass, match &m,
663 const std::string &run_sequence, const int &begin_index,
664 const unsigned int &peptide_size, score_stats &stats,
665 std::vector<int> &db_remaining) {
666 // We'd like to consider only the spectra whose masses are within the
667 // tolerance range of theoretical mass of the peptide (p_mass).
668 // Unfortunately, the tolerance range of each spectrum depends on its
669 // charge, since the tolerance is specified in m/z units. Furthermore,
670 // when deciding how to adjust begin/end to generate the next peptide,
671 // we need to be conservative, as a high-charge spectrum may come into
672 // range, even though only +1 spectra are currently in-range. So, we
673 // first screen using the maximum tolerance range, then check the actual
674 // spectra below.
676 const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
677 const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
679 const spmi_c_it candidate_spectra_info_begin
680 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
681 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
682 // spectrum masses all too low to match peptide (peptide too long)
683 return -1;
686 const spmi_c_it candidate_spectra_info_end
687 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
688 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
689 // spectrum masses all too high to match peptide (peptide too short)
690 return 1;
692 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
693 // no spectrum with close-enough parent mass
694 return 0;
697 // Now generate the list of candidate spectra that are actually
698 // in-range, considering the charge of each spectrum.
699 std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
700 for (spmi_c_it it=candidate_spectra_info_begin;
701 it != candidate_spectra_info_end; it++) {
702 const spectrum &csp = spectrum::searchable_spectra[it->second];
703 if (std::abs(csp.mass - p_mass) <= (csp.charge
704 * CP.parent_mass_tolerance_1))
705 candidate_spectra_0.push_back(it->second);
707 if (candidate_spectra_0.empty())
708 return 0;
710 m.predicted_parent_mass = p_mass;
711 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
713 double mass_list[peptide_size];
714 for (unsigned int i=0; i<peptide_size; i++)
715 mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
716 .fixed_residue_mass[m.peptide_sequence[i]]);
718 choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
719 stats, db_remaining, context.mod_count, 0);
720 return 0;
724 // Search a sequence run for matches according to the context against the
725 // spectra. Updates score_stats and the number of candidate spectra found.
727 // FIX: examine carefully for signed/unsigned problems
729 static inline void
730 search_run(const search_context &context, const sequence_run &sequence_run,
731 score_stats &stats) {
732 const unsigned int min_peptide_length
733 = std::max<unsigned int>(CP.minimum_peptide_length, context.mod_count);
734 const std::vector<double> &fixed_parent_mass
735 = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
737 const std::string &run_sequence = sequence_run.sequence;
739 // This match will be passed inward and used to record information that we
740 // need to remember about a match when we finally see one. At that point, a
741 // copy of this match will be saved.
742 // FIX: can this be done better?
743 match m;
744 m.sequence_name.push_back(sequence_run.name);
745 m.peptide_begin.push_back(0);
746 int &peptide_begin = m.peptide_begin[0];
747 m.mass_regime_index = context.mass_regime_index;
748 m.conjunct_index = context.conjunct_index;
749 m.pca_delta = context.pca_delta;
751 assert(context.delta_bag_delta.size() == context.delta_bag_count.size());
752 // counts remaining as mod positions are chosen
753 std::vector<int> db_remaining = context.delta_bag_count;
755 // the rightmost 'end' seen when all spectra masses were too high
756 // (0 means none yet encountered.)
757 unsigned int next_end = 0;
759 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
760 double p_mass = context.parent_fixed_mass;
761 int p_begin=0, p_end=0;
763 const std::vector<int> *cleavage_points = &sequence_run.cleavage_points;
764 std::vector<int>::size_type cleavage_points_size = cleavage_points->size();
765 // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
766 static std::vector<int> nonspecific_cleavage_points;
767 if (context.nonspecific_cleavage) {
768 cleavage_points_size = run_sequence.size() + 1;
769 for (unsigned int i=nonspecific_cleavage_points.size();
770 i<cleavage_points_size; i++)
771 nonspecific_cleavage_points.push_back(i);
772 cleavage_points = &nonspecific_cleavage_points;
774 assert(cleavage_points_size >= 2); // endpoints must be present
776 for (unsigned int begin=0; begin<cleavage_points_size-1; begin++) {
777 const int begin_index = (*cleavage_points)[begin];
778 peptide_begin = begin_index + sequence_run.sequence_offset;
780 // If pca_residues specified, require one of them to be the peptide
781 // N-terminal residue
782 switch (context.pca_residues.size()) {
783 case 0:
784 break;
785 case 2:
786 if (context.pca_residues[1] == run_sequence[begin_index])
787 break;
788 case 1:
789 if (context.pca_residues[0] == run_sequence[begin_index])
790 break;
791 continue;
792 default:
793 assert(false);
796 assert(begin_index >= p_begin and begin_index >= 0);
797 for (int i=p_begin; i<begin_index; i++)
798 p_mass -= fixed_parent_mass[run_sequence[i]];
799 p_begin = begin_index;
801 unsigned int end = std::max<unsigned int>(begin + 1, next_end);
802 for (; end<cleavage_points_size; end++) {
803 if (not context.nonspecific_cleavage) {
804 const int missed_cleavage_count = end - begin - 1;
805 if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
806 break;
809 const int end_index = (*cleavage_points)[end];
810 assert(end_index > begin_index);
811 const unsigned int peptide_size = end_index - begin_index;
812 if (peptide_size < min_peptide_length)
813 continue;
815 assert(end_index >= 0);
816 if (end_index >= p_end)
817 for (int i=p_end; i<end_index; i++)
818 p_mass += fixed_parent_mass[run_sequence[i]];
819 else
820 for (int i=end_index; i<p_end; i++)
821 p_mass -= fixed_parent_mass[run_sequence[i]];
822 p_end = end_index;
824 // std::cerr << "peptide: "
825 // << std::string(run_sequence, begin_index, peptide_size)
826 // << " p_mass: " << p_mass << std::endl;
828 int status = search_peptide(context, p_mass, m, run_sequence,
829 begin_index, peptide_size, stats,
830 db_remaining);
831 if (status == -1) // peptide was too long
832 break;
833 if (status == 1) { // peptide was too short
834 // so next start with a peptide at least this long
835 if (end == cleavage_points_size - 1)
836 return;
837 next_end = end;
844 // Search all sequence runs for matches according to the context against the
845 // spectra. Updates score_stats and the number of candidate spectra found.
846 void
847 spectrum::search_runs(const search_context &context, score_stats &stats) {
848 const int no_runs = context.sequence_runs.size();
849 for (int i=0; i<no_runs; i++) {
850 search_run(context, context.sequence_runs[i], stats);
852 if (CP.show_progress)
853 std::cerr << i+1 << " of " << no_runs << " sequences, "
854 << stats.candidate_spectrum_count << " candidates\r"
855 << std::flush;
858 if (CP.show_progress)
859 std::cerr << std::endl;