Cleanups: dead code removal/etc.
[greylag.git] / cgreylag.cpp
blob12d7f7f6edec3e8e3544acc7b15e3a52471fa7ea
1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
23 #include <cerrno>
24 #include <cfloat>
25 #include <climits>
26 #include <cstdlib>
27 #include <cmath>
28 #include <iostream>
29 #include <numeric>
30 #include <set>
31 #include <stdexcept>
35 #ifdef __GNU__
36 #define NOTHROW __attribute__ ((nothrow))
37 // Use faster, non-threadsafe versions, since we don't use threads. This is
38 // maybe 10% faster?
39 #define fgetsX fgets_unlocked
40 #define fputsX fputs_unlocked
41 #define fprintfX __builtin_fprintf_unlocked
42 #define ferrorX ferror_unlocked
43 #define getcX getc_unlocked
44 #else
45 #define NOTHROW
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #define getcX std::getc
51 #endif
54 parameters parameters::the;
55 const parameters &CP = parameters::the;
58 char *
59 peak::__repr__() const {
60 static char temp[128];
61 sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
62 mz, intensity, intensity_class);
63 return &temp[0];
66 const int FIRST_SPECTRUM_ID = 1;
67 int spectrum::next_id = FIRST_SPECTRUM_ID;
68 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
70 std::vector<spectrum> spectrum::searchable_spectra;
71 // parent mass -> searchable_spectra index
72 std::multimap<double,
73 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
75 typedef
76 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
77 spmi_c_it; // ugh
80 char *
81 spectrum::__repr__() const {
82 static char temp[1024];
83 sprintf(temp,
84 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
85 id, physical_id, name.c_str(), mass, charge, peaks.size());
86 return &temp[0];
90 // FIX: does this actually help inlining?
91 // Return ln of n_C_k.
92 static inline double
93 ln_combination_(unsigned int n, unsigned int k) NOTHROW {
94 // Occasionally happens due to problems in scoring function. (FIX)
95 if (n < k)
96 return 0;
98 assert(0 <= k and k <= n and n < CP.ln_factorial.size());
99 return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
101 double
102 ln_combination(unsigned int n, unsigned int k) {
103 return ln_combination(n, k);
108 // This is an error exit for read_spectra*.
109 static inline void
110 io_error(FILE *f, const char *message="") {
111 if (ferrorX(f))
112 message = "I/O error while reading (or writing) spectrum file";
113 throw std::ios_base::failure(message);
117 // true iff s consists entirely of whitespace
118 static bool
119 at_eol(const char *s) {
120 return s[strspn(s, " \t\r\n\f\v")] == 0;
124 // Check whether we've past the end offset, throwing exception on I/O error.
125 static inline bool
126 check_past_end(FILE *f, const long offset_end) {
127 if (offset_end == -1)
128 return false;
129 long pos = std::ftell(f);
130 if (pos == -1)
131 io_error(f);
132 return pos >= offset_end;
136 // Read spectra from file in ms2 format, tagging them with file_id. Before
137 // reading, seek to absolute position offset_begin. If offset_end != -1, any
138 // spectra that begin after position offset_end in the file are not read.
140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
141 // having the same physical id. Note that depending on
142 // offset_begin/offset_end, we may end up with the first charge and not the
143 // second, or vice versa.
145 // The peak list is initially sorted by mz. Throws an exception on invalid
146 // input. Error checking is the most stringent in this function. Other
147 // readers can be a little looser since all ms2 files eventually get read here
148 // anyway.
150 // This is pretty hideous. Is there a simpler way to read, reasonably
151 // efficiently, catching any input error?
153 std::vector<spectrum>
154 spectrum::read_spectra_from_ms2(FILE *f, const int file_id,
155 const long offset_begin,
156 const long offset_end) {
157 std::vector<spectrum> spectra;
159 const int bufsiz = 1024;
160 char buf[bufsiz];
161 char *endp;
163 // first seek to offset_begin and synchronize at next "\n:"
164 if (offset_begin > 0) {
165 if (std::fseek(f, offset_begin-1, SEEK_SET) == -1)
166 io_error(f);
167 int c = getcX(f);
168 while (c != '\n' and c != EOF)
169 c = getcX(f);
170 if (ferrorX(f))
171 io_error(f);
172 if (c == '\n') {
173 do {
174 c = getcX(f);
175 } while (c != ':' and c != EOF);
176 if (ferrorX(f))
177 io_error(f);
178 if (c == ':')
179 std::ungetc(c, f);
182 // at this point we expect to read the first ':' of a header, or EOF
184 bool past_end = check_past_end(f, offset_end);
185 char *result = fgetsX(buf, bufsiz, f);
186 while (true) {
187 std::vector<std::string> names;
188 std::vector<double> masses;
189 std::vector<int> charges;
190 // for annotated ms2
191 std::vector<int> file_ids;
192 std::vector<int> physical_ids;
193 std::vector<int> ids;
195 // read headers
196 while (true) {
197 if (ferrorX(f))
198 io_error(f);
199 if (not result or buf[0] != ':')
200 break;
201 if (not past_end)
202 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
203 if (not fgetsX(buf, bufsiz, f))
204 io_error(f, "bad ms2 format: mass/charge line expected");
205 errno = 0;
206 double mass = std::strtod(buf, &endp); // need double accuracy here
207 if (not past_end)
208 masses.push_back(mass);
209 if (errno or endp == buf or mass <= 0)
210 io_error(f, "bad ms2 format: bad mass");
211 const char *endp0 = endp;
212 int charge = std::strtol(endp0, &endp, 10);
213 if (not past_end)
214 charges.push_back(charge);
215 if (errno or endp == endp0 or charge <= 0)
216 io_error(f, "bad ms2 format: bad charge");
217 if (not at_eol(endp))
218 io_error(f, "bad ms2 format: junk at end of mass/charge line");
219 past_end = past_end or check_past_end(f, offset_end);
220 result = fgetsX(buf, bufsiz, f);
222 if (names.empty() and past_end)
223 break;
224 if (not result) {
225 if (not names.empty())
226 io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
227 break;
229 // read peaks
230 std::vector<peak> peaks;
231 while (true) {
232 peak p;
233 errno = 0;
234 p.mz = strtod(buf, &endp);
235 if (errno or endp == buf or p.mz <= 0)
236 io_error(f, "bad ms2 format: bad peak mz");
237 const char *endp0 = endp;
238 p.intensity = strtod(endp0, &endp);
239 if (errno or endp == endp0 or p.intensity <= 0)
240 io_error(f, "bad ms2 format: bad peak intensity");
241 if (not at_eol(endp))
242 io_error(f, "bad ms2 format: junk at end of peak line");
243 peaks.push_back(p);
245 past_end = past_end or check_past_end(f, offset_end);
246 result = fgetsX(buf, bufsiz, f);
247 if (ferrorX(f))
248 io_error(f);
249 if (not result or buf[0] == ':')
250 break;
253 // add spectra to vector
254 if (names.empty() and not peaks.empty())
255 io_error(f, "bad ms2 format: missing header lines?");
257 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
259 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
260 spectrum sp(masses[i], charges[i]);
261 sp.peaks = peaks;
262 sp.name = names[i];
263 if (file_id != -1) {
264 sp.file_id = file_id;
265 sp.physical_id = next_physical_id;
266 } else {
267 sp.file_id = file_ids[i];
268 sp.physical_id = physical_ids[i];
269 sp.id = ids[i];
270 spectrum::next_id = std::max(spectrum::next_id, sp.id+1);
271 spectrum::next_physical_id = std::max(spectrum::next_physical_id,
272 sp.physical_id+1);
274 spectra.push_back(sp);
276 if (file_id != -1)
277 spectrum::next_physical_id += 1;
279 return spectra;
283 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
284 // to remove those too large to be fragment products. (Peaks are assumed to
285 // be ordered by increasing mz.)
286 void
287 spectrum::filter_peaks(double TIC_cutoff_proportion,
288 double parent_mass_tolerance_max) {
289 if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
290 or parent_mass_tolerance_max < 0)
291 throw std::invalid_argument("invalid argument value");
293 // remove peaks too large to be fragment products
294 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
295 const double peak_mz_limit = mass + parent_mass_tolerance_max;
297 std::vector<peak>::iterator pi;
298 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
299 peaks.erase(pi, peaks.end());
301 // FIX: note mzLowerBound..mzUpperBound here
303 // now filter by TIC
304 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
306 total_ion_current = 0;
307 for (pi=peaks.begin(); pi != peaks.end(); pi++)
308 total_ion_current += pi->intensity;
310 const double i_limit = total_ion_current * TIC_cutoff_proportion;
311 double accumulated_intensity = 0;
312 for (pi=peaks.begin();
313 pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
314 accumulated_intensity += pi->intensity;
316 peaks.erase(pi, peaks.end());
318 // restore order
319 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
323 // Classify peaks and update class_counts and peak bin counts.
324 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
325 void
326 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
327 double fragment_mass_tolerance) {
328 if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
329 throw std::invalid_argument("invalid argument value");
331 intensity_class_counts.clear();
332 intensity_class_counts.resize(intensity_class_count);
334 // e.g., 7 = 1 + 2 + 4
335 // FIX: does this fail for non-integer ratios?
336 int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
337 / (intensity_class_ratio - 1));
339 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
340 std::vector<peak>::iterator pi=peaks.begin();
341 for (int i_class=0; i_class<intensity_class_count; i_class++) {
342 int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
343 / min_count) + 0.5);
344 for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
345 pi->intensity_class = i_class;
346 intensity_class_counts[i_class]++;
349 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
351 if (not peaks.empty()) {
352 min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
353 max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
354 total_peak_bins = (int((max_peak_mz - min_peak_mz)
355 / (2 * fragment_mass_tolerance) + 0.5));
356 // This is the MyriMatch estimate. Wouldn't it be better to simply count
357 // how many are empty?
358 empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
363 // Store the list of spectra that search_peptide will search against, and also
364 // build spectrum_mass_index.
365 void
366 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
367 searchable_spectra = spectra;
368 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
369 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
373 // Calculate peaks for a synthesized mass (not mz) ladder.
374 static inline void
375 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
376 const std::vector<double> &mass_list,
377 const double fragment_C_fixed_mass) NOTHROW {
378 double m = fragment_C_fixed_mass;
380 const int ladder_size = mass_ladder.size();
381 for (int i=ladder_size-1; i>=0; i--) {
382 m += mass_list[i+1];
383 mass_ladder[ladder_size-1-i] = peak(m, 1.0);
388 // Calculate peaks for a synthesized mass (not mz) ladder.
389 static inline void
390 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
391 const std::vector<double> &mass_list,
392 const double fragment_N_fixed_mass) NOTHROW {
393 double m = fragment_N_fixed_mass;
395 const int ladder_size = mass_ladder.size();
396 for (int i=0; i<=ladder_size-1; i++) {
397 m += mass_list[i];
398 mass_ladder[i] = peak(m, 1.0);
403 // Generate synthetic spectra for a set of charges. Only the charge and peaks
404 // of these spectra are initialized.
405 static inline void
406 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */],
407 const std::vector<double> &mass_list,
408 const double fragment_N_fixed_mass,
409 const double fragment_C_fixed_mass,
410 const double max_fragment_charge) NOTHROW {
411 std::vector<peak> mass_ladder(mass_list.size()-1);
413 for (int charge=1; charge<=max_fragment_charge; charge++) {
414 spectrum &sp = synth_sp[charge];
415 //sp.mass = fragment_peptide_mass;
416 sp.charge = charge;
417 sp.peaks.resize(mass_ladder.size() * (ION_MAX-ION_MIN));
418 std::vector<peak>::size_type pi=0;
420 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
421 switch (ion_type) {
422 case ION_Y:
423 get_synthetic_Y_mass_ladder(mass_ladder, mass_list,
424 fragment_C_fixed_mass);
425 break;
426 case ION_B:
427 get_synthetic_B_mass_ladder(mass_ladder, mass_list,
428 fragment_N_fixed_mass);
429 break;
430 default:
431 assert(false);
434 for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++) {
435 sp.peaks[pi++].mz = peak::get_mz(mass_ladder[i].mz, charge);
438 std::sort(sp.peaks.begin(), sp.peaks.end(), peak::less_mz);
443 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
444 // theoretical spectrum (y) generated from a peptide candidate.
446 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
447 // better yet, sets) for peak lists simpler? As fast or faster?
449 // This is the innermost loop, so it's worthwhile to optimize this some.
450 // FIX: Should some of these vectors be arrays?
451 static inline double
452 score_spectrum(const spectrum &x, const spectrum &y) NOTHROW {
453 assert (not x.peaks.empty() and not y.peaks.empty()); // FIX
455 std::vector<double> peak_best_delta(y.peaks.size(),
456 CP.fragment_mass_tolerance);
457 std::vector<int> peak_best_class(y.peaks.size(), -1);
459 std::vector<peak>::const_iterator x_it = x.peaks.begin();
460 std::vector<peak>::const_iterator y_it = y.peaks.begin();
462 // Iterate over all closest pairs of peaks, collecting info on the class of
463 // the best (meaning closest mz) real peak matching each theoretical peak
464 // (for real/theoretical peak pairs that are no farther apart than
465 // fragment_mass_tolerance).
466 int y_index = 0;
467 while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
468 const double delta = y_it->mz - x_it->mz;
469 if (std::abs(delta) <= peak_best_delta[y_index]) {
470 peak_best_class[y_index] = x_it->intensity_class;
471 peak_best_delta[y_index] = std::abs(delta);
473 if (delta > 0)
474 x_it++;
475 else {
476 y_it++;
477 y_index++;
481 assert(CP.intensity_class_count < INT_MAX);
482 std::vector<unsigned> peak_hit_histogram(CP.intensity_class_count);
483 std::vector<int>::const_iterator b_it;
484 for (b_it = peak_best_class.begin(); b_it != peak_best_class.end(); b_it++)
485 if (*b_it >= 0) {
486 assert(*b_it < static_cast<int>(peak_hit_histogram.size()));
487 peak_hit_histogram[*b_it] += 1;
490 // How many theoretical peaks overlap the real peak range?
491 int valid_theoretical_peaks = 0;
492 for (y_it = y.peaks.begin(); y_it != y.peaks.end(); y_it++)
493 if (x.min_peak_mz <= y_it->mz and y_it->mz <= x.max_peak_mz)
494 valid_theoretical_peaks++;
496 // How many valid theoretical peaks were misses?
497 const int peak_misses = (valid_theoretical_peaks
498 - accumulate(peak_hit_histogram.begin(),
499 peak_hit_histogram.end(), 0));
500 assert(peak_misses >= 0);
502 double score = 0.0;
503 for (unsigned int i=0; i<peak_hit_histogram.size(); i++)
504 score += ln_combination_(x.intensity_class_counts[i],
505 peak_hit_histogram[i]);
506 score += ln_combination_(x.empty_peak_bins, peak_misses);
507 score -= ln_combination_(x.total_peak_bins, valid_theoretical_peaks);
509 return score;
513 // Return the score of the specified spectrum against a group of synthetic
514 // fragment spectra.
516 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
517 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
518 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
519 // or +2b/+1y?)
520 // FIX: implement MM smart +3 model, and/or come up with something better.
521 static inline double
522 score_spectrum_over_charges(spectrum synth_sp[/* max_fragment_charge+1 */],
523 const spectrum &sp,
524 const int max_fragment_charge) NOTHROW {
525 double best_score = score_spectrum(sp, synth_sp[1]);
527 const int charge_limit = std::max<int>(1, sp.charge-1);
528 for (int charge=2; charge<=charge_limit; charge++) {
529 assert(charge <= max_fragment_charge);
530 best_score += score_spectrum(sp, synth_sp[charge]);
532 return best_score;
536 struct mass_trace_list {
537 mass_trace_item item;
538 const mass_trace_list *next;
540 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
544 // fuzzy comparison might not be necessary, but it's safer
545 static inline bool
546 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
549 // Search for matches of this particular peptide modification variation
550 // against the spectra. Updates score_stats and returns the number of
551 // candidate spectra found.
552 static inline void
553 evaluate_peptide(const search_context &context, match &m,
554 const mass_trace_list *mtlp,
555 const std::vector<double> &mass_list,
556 const std::vector<std::vector<spectrum>::size_type>
557 &candidate_spectra,
558 score_stats &stats) NOTHROW {
559 assert(m.peptide_sequence.size() == mass_list.size());
561 int max_candidate_charge = 0;
562 typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
563 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
564 max_candidate_charge
565 = std::max<int>(max_candidate_charge,
566 spectrum::searchable_spectra[*it].charge);
567 assert(max_candidate_charge >= 1);
569 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
570 assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
571 // e.g. +2 -> 'B' -> spectrum
572 // FIX: should this be a std::vector??
573 static spectrum synth_sp[spectrum::MAX_SUPPORTED_CHARGE+1];
574 synthetic_spectra(synth_sp, mass_list, context.fragment_N_fixed_mass,
575 context.fragment_C_fixed_mass, max_fragment_charge);
577 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
578 m.spectrum_index = *it;
579 spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
581 sp.comparisons += 1;
582 stats.candidate_spectrum_count += 1;
583 if (CP.estimate_only)
584 continue;
586 //std::cerr << "sp " << m.spectrum_index << std::endl;
587 m.score = score_spectrum_over_charges(synth_sp, sp, max_fragment_charge);
588 //std::cerr << "score " << m.score << std::endl;
590 std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
591 // Is this score good enough to be in the best score list? (Better scores
592 // are more negative.)
593 if (m.score > sp_best_matches.back().score)
594 continue;
596 // We're saving this match, so remember the mass trace info, too.
597 m.mass_trace.clear();
598 for (const mass_trace_list *p=mtlp; p; p=p->next)
599 m.mass_trace.push_back(p->item);
601 // If this is duplicate, just append to the existing match and return
602 // FIX: does comparison of mass trace work for all cases?
603 for (std::vector<match>::reverse_iterator rit
604 = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
605 if (score_equal(rit->score, m.score)
606 and rit->peptide_sequence == m.peptide_sequence
607 and rit->mass_trace == m.mass_trace) {
608 rit->peptide_begin.push_back(m.peptide_begin[0]);
609 rit->sequence_name.push_back(m.sequence_name[0]);
610 return;
613 // Otherwise, insert this match in the correct position
614 assert(sp_best_matches.size() >= 1);
615 std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
616 for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
617 *rit = *(rit+1);
618 *rit = m;
623 // IDEA FIX: Use a cost parameter to define the iterative search front.
626 // Choose one possible residue modification position. Once they're all
627 // chosen, then evaluate.
628 static inline void
629 choose_residue_mod(const search_context &context, match &m,
630 const mass_trace_list *mtlp, std::vector<double> &mass_list,
631 const std::vector<std::vector<spectrum>::size_type>
632 &candidate_spectra,
633 score_stats &stats,
634 std::vector<int> &db_remaining,
635 const unsigned remaining_positions_to_choose,
636 const unsigned next_position_to_consider) NOTHROW {
637 assert(remaining_positions_to_choose
638 <= m.peptide_sequence.size() - next_position_to_consider);
640 if (remaining_positions_to_choose == 0) {
641 stats.combinations_searched += 1;
642 evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
643 } else {
644 mass_trace_list mtl(mtlp);
646 // consider all of the positions where we could next add a mod
647 for (unsigned int i=next_position_to_consider;
648 i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
649 mtl.item.position = i;
650 const double save_pos_mass=mass_list[i];
651 const char pos_res = m.peptide_sequence[i];
653 // consider the possibilities for this position
654 for (std::vector<int>::const_iterator
655 it=context.delta_bag_lookup[pos_res].begin();
656 it != context.delta_bag_lookup[pos_res].end(); it++) {
657 const int db_index = *it;
658 if (db_remaining[db_index] < 1)
659 continue;
660 db_remaining[db_index] -= 1;
661 mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
662 mtl.item.delta = context.delta_bag_delta[db_index];
663 choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
664 stats, db_remaining,
665 remaining_positions_to_choose-1, i+1);
666 db_remaining[db_index] += 1;
668 mass_list[i] = save_pos_mass;
674 // p_begin (p_end) to begin_index (end_index), updating p_mass in the process.
675 // The sign determines whether mass increases or decreases as p_begin (p_end)
676 // is moved forward--so it should be -1 for the begin case and +1 for the end
677 // case.
678 // FIX: is this worth its complexity? kill or else add reset
679 static inline void
680 update_p_mass(double &p_mass, int &p_begin, int begin_index, int sign,
681 const std::string &run_sequence,
682 const std::vector<double> &fixed_residue_mass) {
683 assert(sign == +1 or sign == -1);
684 assert(begin_index >= 0);
685 const bool moving_forward = begin_index >= p_begin;
686 int p0=p_begin, p1=begin_index;
687 if (not moving_forward) {
688 std::swap(p0, p1);
689 sign = -sign;
691 for (int i=p0; i<p1; i++)
692 p_mass += sign * fixed_residue_mass[run_sequence[i]];
693 p_begin = begin_index;
698 // Search a sequence run for matches according to the context against the
699 // spectra. Updates score_stats and the number of candidate spectra found.
701 // FIX: examine carefully for signed/unsigned problems
703 void inline
704 search_run(const search_context &context, const sequence_run &sequence_run,
705 score_stats &stats) {
706 const int min_peptide_length = std::max<int>(CP.minimum_peptide_length,
707 context.mod_count);
708 const std::vector<double> &fixed_parent_mass \
709 = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
711 const std::string &run_sequence = sequence_run.sequence;
712 const std::vector<int> &cleavage_points = sequence_run.cleavage_points;
713 assert(cleavage_points.size() >= 2); // endpoints assumed always present
715 // This match will be passed inward and used to record information that we
716 // need to remember about a match when we finally see one. At that point, a
717 // copy of this match will be saved.
718 // FIX: can this be done better?
719 match m;
720 m.sequence_name.push_back(sequence_run.name);
721 m.peptide_begin.push_back(0);
722 int &peptide_begin = m.peptide_begin[0];
724 assert(context.delta_bag_delta.size()
725 == context.delta_bag_count.size());
726 // counts remaining as mod positions are chosen
727 std::vector<int> db_remaining = context.delta_bag_count;
729 // the rightmost 'end' seen when all spectra masses were too high
730 // (0 means none yet encountered.)
731 unsigned int next_end = 0;
733 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
734 double p_mass = context.parent_fixed_mass;
735 int p_begin=0, p_end=0;
737 // FIX: optimize the non-specific cleavage case?
738 for (unsigned int begin=0; begin<cleavage_points.size()-1; begin++) {
739 const int begin_index = cleavage_points[begin];
740 peptide_begin = begin_index + sequence_run.sequence_offset;
741 if (not context.pca_residues.empty())
742 if (context.pca_residues.find(run_sequence[begin_index])
743 == std::string::npos)
744 continue;
745 update_p_mass(p_mass, p_begin, begin_index, -1, run_sequence,
746 fixed_parent_mass);
747 unsigned int end = std::max<unsigned int>(begin + 1, next_end);
748 for (; end<cleavage_points.size(); end++) {
749 const int missed_cleavage_count = end - begin - 1;
750 if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
751 break;
753 const int end_index = cleavage_points[end];
754 assert(end_index - begin_index > 0);
755 const int peptide_size = end_index - begin_index;
756 assert(peptide_size > 0);
757 if (peptide_size < min_peptide_length)
758 continue;
759 update_p_mass(p_mass, p_end, end_index, +1, run_sequence,
760 fixed_parent_mass);
761 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
763 // We'd like to consider only the spectra whose masses are within the
764 // tolerance range of theoretical mass of the peptide (p_mass).
765 // Unfortunately, the tolerance range of each spectrum depends on its
766 // charge, since the tolerance is specified in m/z units. Furthermore,
767 // when deciding how to adjust begin/end to generate the next peptide,
768 // we need to be conservative, as a high-charge spectrum may come into
769 // range, even though only +1 spectra are currently in-range. So, we
770 // first screen using the maximum tolerance range, then check the actual
771 // spectra below.
773 const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
774 const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
776 const spmi_c_it candidate_spectra_info_begin
777 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
778 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
779 // spectrum masses all too low to match peptide (peptide too long)
780 //std::cerr << "peptide: sp low" << std::endl;
781 break;
784 const spmi_c_it candidate_spectra_info_end
785 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
786 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
787 // spectrum masses all too high to match peptide
788 // (peptide is too short, so increase end, if possible, else return)
789 //std::cerr << "peptide: sp high" << std::endl;
790 if (end == cleavage_points.size() - 1)
791 return;
792 next_end = end;
793 continue;
795 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
796 // no spectrum with close-enough parent mass
797 //std::cerr << "peptide: sp none nearby" << std::endl;
798 continue;
801 // Now generate the list of candidate spectra that are actually
802 // in-range, considering the charge of each spectrum.
803 std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
804 for (spmi_c_it it=candidate_spectra_info_begin;
805 it != candidate_spectra_info_end; it++) {
806 const spectrum &csp = spectrum::searchable_spectra[it->second];
807 if (std::abs(csp.mass - p_mass) <= (csp.charge
808 * CP.parent_mass_tolerance_1))
809 candidate_spectra_0.push_back(it->second);
811 if (candidate_spectra_0.empty())
812 continue;
814 m.predicted_parent_mass = p_mass;
815 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
817 std::vector<double> mass_list(peptide_size);
818 for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size(); i++)
819 mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
820 .fixed_residue_mass[m.peptide_sequence[i]]);
822 choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
823 stats, db_remaining, context.mod_count, 0);
829 // Search all sequence runs for matches according to the context against the
830 // spectra. Updates score_stats and the number of candidate spectra found.
831 void
832 spectrum::search_runs(const search_context &context, score_stats &stats) {
833 const int no_runs = context.sequence_runs.size();
834 for (int i=0; i<no_runs; i++) {
835 search_run(context, context.sequence_runs[i], stats);
837 if (CP.show_progress)
838 std::cerr << i+1 << " of " << no_runs << " sequences, "
839 << stats.candidate_spectrum_count << " candidates\r"
840 << std::flush;
843 if (CP.show_progress)
844 std::cerr << std::endl;