Check sequence db locus name uniqueness
[greylag.git] / cgreylag.cpp
blob05b340ec3ae706bce655908e43c40e685d8309a6
1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
23 #include <cerrno>
24 #include <cfloat>
25 #include <climits>
26 #include <cstdlib>
27 #include <cmath>
28 #include <iostream>
29 #include <numeric>
30 #include <set>
31 #include <stdexcept>
35 #ifdef __GNU__
36 #define NOTHROW __attribute__ ((nothrow))
37 // Use faster, non-threadsafe versions, since we don't use threads. This is
38 // maybe 10% faster?
39 #define fgetsX fgets_unlocked
40 #define fputsX fputs_unlocked
41 #define fprintfX __builtin_fprintf_unlocked
42 #define ferrorX ferror_unlocked
43 #define getcX getc_unlocked
44 #else
45 #define NOTHROW
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #define getcX std::getc
51 #endif
54 parameters parameters::the;
55 const parameters &CP = parameters::the;
58 char *
59 peak::__repr__() const {
60 static char temp[128];
61 sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
62 mz, intensity, intensity_class);
63 return &temp[0];
66 const int FIRST_SPECTRUM_ID = 1;
67 int spectrum::next_id = FIRST_SPECTRUM_ID;
68 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
70 std::vector<spectrum> spectrum::searchable_spectra;
71 // parent mass -> searchable_spectra index
72 std::multimap<double,
73 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
75 typedef
76 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
77 spmi_c_it; // ugh
80 char *
81 spectrum::__repr__() const {
82 static char temp[1024];
83 sprintf(temp,
84 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
85 id, physical_id, name.c_str(), mass, charge, peaks.size());
86 return &temp[0];
90 // FIX: does this actually help inlining?
91 // Return ln of n_C_k.
92 static inline double
93 ln_combination_(unsigned int n, unsigned int k) NOTHROW {
94 // Occasionally happens due to problems in scoring function. (FIX)
95 if (n < k)
96 return 0;
98 assert(0 <= k and k <= n and n < CP.ln_factorial.size());
99 return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
101 double
102 ln_combination(unsigned int n, unsigned int k) {
103 return ln_combination(n, k);
108 // This is an error exit for read_spectra*.
109 static inline void
110 io_error(FILE *f, const char *message="") {
111 if (ferrorX(f))
112 message = "I/O error while reading (or writing) spectrum file";
113 throw std::ios_base::failure(message);
117 // true iff s consists entirely of whitespace
118 static bool
119 at_eol(const char *s) {
120 return s[strspn(s, " \t\r\n\f\v")] == 0;
124 // Check whether we've past the end offset, throwing exception on I/O error.
125 static inline bool
126 check_past_end(FILE *f, const long offset_end) {
127 if (offset_end == -1)
128 return false;
129 long pos = std::ftell(f);
130 if (pos == -1)
131 io_error(f);
132 return pos >= offset_end;
136 // Read spectra from file in ms2 format, tagging them with file_id. Before
137 // reading, seek to absolute position offset_begin. If offset_end != -1, any
138 // spectra that begin after position offset_end in the file are not read.
140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
141 // having the same physical id. Note that depending on
142 // offset_begin/offset_end, we may end up with the first charge and not the
143 // second, or vice versa.
145 // The peak list is initially sorted by mz. Throws an exception on invalid
146 // input. Error checking is the most stringent in this function. Other
147 // readers can be a little looser since all ms2 files eventually get read here
148 // anyway.
150 // This is pretty hideous. Is there a simpler way to read, reasonably
151 // efficiently, catching any input error?
153 std::vector<spectrum>
154 spectrum::read_spectra_from_ms2(FILE *f, const int file_id,
155 const long offset_begin,
156 const long offset_end) {
157 std::vector<spectrum> spectra;
159 const int bufsiz = 1024;
160 char buf[bufsiz];
161 char *endp;
163 // first seek to offset_begin and synchronize at next "\n:"
164 if (offset_begin > 0) {
165 if (std::fseek(f, offset_begin-1, SEEK_SET) == -1)
166 io_error(f);
167 int c = getcX(f);
168 while (c != '\n' and c != EOF)
169 c = getcX(f);
170 if (ferrorX(f))
171 io_error(f);
172 if (c == '\n') {
173 do {
174 c = getcX(f);
175 } while (c != ':' and c != EOF);
176 if (ferrorX(f))
177 io_error(f);
178 if (c == ':')
179 std::ungetc(c, f);
182 // at this point we expect to read the first ':' of a header, or EOF
184 bool past_end = check_past_end(f, offset_end);
185 char *result = fgetsX(buf, bufsiz, f);
186 while (true) {
187 std::vector<std::string> names;
188 std::vector<double> masses;
189 std::vector<int> charges;
190 // for annotated ms2
191 std::vector<int> file_ids;
192 std::vector<int> physical_ids;
193 std::vector<int> ids;
195 // read headers
196 while (true) {
197 if (ferrorX(f))
198 io_error(f);
199 if (not result or buf[0] != ':')
200 break;
201 if (not past_end)
202 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
203 if (not fgetsX(buf, bufsiz, f))
204 io_error(f, "bad ms2 format: mass/charge line expected");
205 errno = 0;
206 double mass = std::strtod(buf, &endp); // need double accuracy here
207 if (not past_end)
208 masses.push_back(mass);
209 if (errno or endp == buf or mass <= 0)
210 io_error(f, "bad ms2 format: bad mass");
211 const char *endp0 = endp;
212 int charge = std::strtol(endp0, &endp, 10);
213 if (not past_end)
214 charges.push_back(charge);
215 if (errno or endp == endp0 or charge <= 0)
216 io_error(f, "bad ms2 format: bad charge");
217 if (not at_eol(endp))
218 io_error(f, "bad ms2 format: junk at end of mass/charge line");
219 past_end = past_end or check_past_end(f, offset_end);
220 result = fgetsX(buf, bufsiz, f);
222 if (names.empty() and past_end)
223 break;
224 if (not result) {
225 if (not names.empty())
226 io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
227 break;
229 // read peaks
230 std::vector<peak> peaks;
231 while (true) {
232 peak p;
233 errno = 0;
234 p.mz = strtod(buf, &endp);
235 if (errno or endp == buf or p.mz <= 0)
236 io_error(f, "bad ms2 format: bad peak mz");
237 const char *endp0 = endp;
238 p.intensity = strtod(endp0, &endp);
239 if (errno or endp == endp0 or p.intensity <= 0)
240 io_error(f, "bad ms2 format: bad peak intensity");
241 if (not at_eol(endp))
242 io_error(f, "bad ms2 format: junk at end of peak line");
243 peaks.push_back(p);
245 past_end = past_end or check_past_end(f, offset_end);
246 result = fgetsX(buf, bufsiz, f);
247 if (ferrorX(f))
248 io_error(f);
249 if (not result or buf[0] == ':')
250 break;
253 // add spectra to vector
254 if (names.empty() and not peaks.empty())
255 io_error(f, "bad ms2 format: missing header lines?");
257 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
259 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
260 spectrum sp(masses[i], charges[i]);
261 sp.peaks = peaks;
262 sp.name = names[i];
263 if (file_id != -1) {
264 sp.file_id = file_id;
265 sp.physical_id = next_physical_id;
266 } else {
267 sp.file_id = file_ids[i];
268 sp.physical_id = physical_ids[i];
269 sp.id = ids[i];
270 spectrum::next_id = std::max(spectrum::next_id, sp.id+1);
271 spectrum::next_physical_id = std::max(spectrum::next_physical_id,
272 sp.physical_id+1);
274 spectra.push_back(sp);
276 if (file_id != -1)
277 spectrum::next_physical_id += 1;
279 return spectra;
283 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
284 // to remove those too large to be fragment products. (Peaks are assumed to
285 // be ordered by increasing mz.)
286 void
287 spectrum::filter_peaks(double TIC_cutoff_proportion,
288 double parent_mass_tolerance_max) {
289 if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
290 or parent_mass_tolerance_max < 0)
291 throw std::invalid_argument("invalid argument value");
293 // remove peaks too large to be fragment products
294 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
295 const double peak_mz_limit = mass + parent_mass_tolerance_max;
297 std::vector<peak>::iterator pi;
298 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
299 peaks.erase(pi, peaks.end());
301 // FIX: note mzLowerBound..mzUpperBound here
303 // now filter by TIC
304 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
306 total_ion_current = 0;
307 for (pi=peaks.begin(); pi != peaks.end(); pi++)
308 total_ion_current += pi->intensity;
310 const double i_limit = total_ion_current * TIC_cutoff_proportion;
311 double accumulated_intensity = 0;
312 for (pi=peaks.begin();
313 pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
314 accumulated_intensity += pi->intensity;
316 peaks.erase(pi, peaks.end());
318 // restore order
319 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
323 // Classify peaks and update class_counts and peak bin counts.
324 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
325 void
326 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
327 double fragment_mass_tolerance) {
328 if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
329 throw std::invalid_argument("invalid argument value");
331 intensity_class_counts.clear();
332 intensity_class_counts.resize(intensity_class_count);
334 // e.g., 7 = 1 + 2 + 4
335 // FIX: does this fail for non-integer ratios?
336 int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
337 / (intensity_class_ratio - 1));
339 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
340 std::vector<peak>::iterator pi=peaks.begin();
341 for (int i_class=0; i_class<intensity_class_count; i_class++) {
342 int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
343 / min_count) + 0.5);
344 for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
345 pi->intensity_class = i_class;
346 intensity_class_counts[i_class]++;
349 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
351 if (not peaks.empty()) {
352 min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
353 max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
354 total_peak_bins = (int((max_peak_mz - min_peak_mz)
355 / (2 * fragment_mass_tolerance) + 0.5));
356 // This is the MyriMatch estimate. Wouldn't it be better to simply count
357 // how many are empty?
358 empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
363 // Store the list of spectra that search_peptide will search against, and also
364 // build spectrum_mass_index.
365 void
366 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
367 searchable_spectra = spectra;
368 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++)
369 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
373 // Calculate peaks for a synthesized mass (not mz) ladder.
374 static inline void
375 get_synthetic_Y_mass_ladder(std::vector<peak> &mass_ladder,
376 const std::vector<double> &mass_list,
377 const double fragment_C_fixed_mass) NOTHROW {
378 double m = fragment_C_fixed_mass;
380 const int ladder_size = mass_ladder.size();
381 for (int i=ladder_size-1; i>=0; i--) {
382 m += mass_list[i+1];
383 mass_ladder[ladder_size-1-i] = peak(m, 1.0);
388 // Calculate peaks for a synthesized mass (not mz) ladder.
389 static inline void
390 get_synthetic_B_mass_ladder(std::vector<peak> &mass_ladder,
391 const std::vector<double> &mass_list,
392 const double fragment_N_fixed_mass) NOTHROW {
393 double m = fragment_N_fixed_mass;
395 const int ladder_size = mass_ladder.size();
396 for (int i=0; i<=ladder_size-1; i++) {
397 m += mass_list[i];
398 mass_ladder[i] = peak(m, 1.0);
403 // Generate synthetic spectra for a set of charges. Only the charge and peaks
404 // of these spectra are initialized.
405 static inline void
406 synthetic_spectra(spectrum synth_sp[/* max_fragment_charge+1 */],
407 const std::vector<double> &mass_list,
408 const double fragment_N_fixed_mass,
409 const double fragment_C_fixed_mass,
410 const double max_fragment_charge) NOTHROW {
411 std::vector<peak> mass_ladder(mass_list.size()-1);
413 for (int charge=1; charge<=max_fragment_charge; charge++) {
414 spectrum &sp = synth_sp[charge];
415 //sp.mass = fragment_peptide_mass;
416 sp.charge = charge;
417 sp.peaks.resize(mass_ladder.size() * (ION_MAX-ION_MIN));
418 std::vector<peak>::size_type pi=0;
420 for (ion ion_type=ION_MIN; ion_type<ION_MAX; ion_type++) {
421 switch (ion_type) {
422 case ION_Y:
423 get_synthetic_Y_mass_ladder(mass_ladder, mass_list,
424 fragment_C_fixed_mass);
425 break;
426 case ION_B:
427 get_synthetic_B_mass_ladder(mass_ladder, mass_list,
428 fragment_N_fixed_mass);
429 break;
430 default:
431 assert(false);
434 for (std::vector<peak>::size_type i=0; i<mass_ladder.size(); i++) {
435 sp.peaks[pi++].mz = peak::get_mz(mass_ladder[i].mz, charge);
438 std::sort(sp.peaks.begin(), sp.peaks.end(), peak::less_mz);
443 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
444 // theoretical spectrum (y) generated from a peptide candidate.
446 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
447 // better yet, sets) for peak lists simpler? As fast or faster?
449 // This is the innermost loop, so it's worthwhile to optimize this some.
450 // FIX: Should some of these vectors be arrays?
451 static inline double
452 score_spectrum(const spectrum &x, const spectrum &y) NOTHROW {
453 assert (not x.peaks.empty() and not y.peaks.empty()); // FIX
455 std::vector<double> peak_best_delta(y.peaks.size(),
456 CP.fragment_mass_tolerance);
457 std::vector<int> peak_best_class(y.peaks.size(), -1);
459 std::vector<peak>::const_iterator x_it = x.peaks.begin();
460 std::vector<peak>::const_iterator y_it = y.peaks.begin();
462 // Iterate over all closest pairs of peaks, collecting info on the class of
463 // the best (meaning closest mz) real peak matching each theoretical peak
464 // (for real/theoretical peak pairs that are no farther apart than
465 // fragment_mass_tolerance).
466 int y_index = 0;
467 while (x_it != x.peaks.end() and y_it != y.peaks.end()) {
468 const double delta = y_it->mz - x_it->mz;
469 if (std::abs(delta) <= peak_best_delta[y_index]) {
470 peak_best_class[y_index] = x_it->intensity_class;
471 peak_best_delta[y_index] = std::abs(delta);
473 if (delta > 0)
474 x_it++;
475 else {
476 y_it++;
477 y_index++;
481 assert(CP.intensity_class_count < INT_MAX);
482 std::vector<unsigned> peak_hit_histogram(CP.intensity_class_count);
483 std::vector<int>::const_iterator b_it;
484 for (b_it = peak_best_class.begin(); b_it != peak_best_class.end(); b_it++)
485 if (*b_it >= 0) {
486 assert(*b_it < static_cast<int>(peak_hit_histogram.size()));
487 peak_hit_histogram[*b_it] += 1;
490 // How many theoretical peaks overlap the real peak range?
491 int valid_theoretical_peaks = 0;
492 for (y_it = y.peaks.begin(); y_it != y.peaks.end(); y_it++)
493 if (x.min_peak_mz <= y_it->mz and y_it->mz <= x.max_peak_mz)
494 valid_theoretical_peaks++;
496 // How many valid theoretical peaks were misses?
497 const int peak_misses = (valid_theoretical_peaks
498 - accumulate(peak_hit_histogram.begin(),
499 peak_hit_histogram.end(), 0));
500 #if 0
501 for (unsigned int i=0; i<peak_hit_histogram.size(); i++)
502 if (x.intensity_class_counts[i] < peak_hit_histogram[i])
503 std::cerr << i << " C(" << x.intensity_class_counts[i] << ", "
504 << peak_hit_histogram[i] << ")" << std::endl;
505 if (x.empty_peak_bins < peak_misses)
506 std::cerr << "M C(" << x.empty_peak_bins << ", " << peak_misses << ")"
507 << std::endl;
508 if (x.total_peak_bins < valid_theoretical_peaks)
509 std::cerr << "T C(" << x.total_peak_bins << ", " << valid_theoretical_peaks
510 << ")" << std::endl;
511 #endif
513 #if 0
514 for (unsigned int i=0; i<peak_hit_histogram.size(); i++)
515 std::cerr << i << " C(" << x.intensity_class_counts[i] << ", "
516 << peak_hit_histogram[i] << ")" << std::endl;
517 std::cerr << "C(" << x.empty_peak_bins << ", " << peak_misses << ")"
518 << std::endl;
519 std::cerr << "C(" << x.total_peak_bins << ", " << valid_theoretical_peaks
520 << ")" << std::endl;
521 std::cerr << std::endl;
522 #endif
524 assert(peak_misses >= 0);
526 double score = 0.0;
527 for (unsigned int i=0; i<peak_hit_histogram.size(); i++)
528 score += ln_combination_(x.intensity_class_counts[i],
529 peak_hit_histogram[i]);
530 score += ln_combination_(x.empty_peak_bins, peak_misses);
531 score -= ln_combination_(x.total_peak_bins, valid_theoretical_peaks);
533 return score;
537 // Return the score of the specified spectrum against a group of synthetic
538 // fragment spectra.
540 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
541 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
542 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
543 // or +2b/+1y?)
544 // FIX: implement MM smart +3 model, and/or come up with something better.
545 static inline double
546 score_spectrum_over_charges(spectrum synth_sp[/* max_fragment_charge+1 */],
547 const spectrum &sp,
548 const int max_fragment_charge) NOTHROW {
549 double best_score = score_spectrum(sp, synth_sp[1]);
551 const int charge_limit = std::max<int>(1, sp.charge-1);
552 for (int charge=2; charge<=charge_limit; charge++) {
553 assert(charge <= max_fragment_charge);
554 best_score += score_spectrum(sp, synth_sp[charge]);
556 return best_score;
560 struct mass_trace_list {
561 mass_trace_item item;
562 const mass_trace_list *next;
564 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
568 // Search for matches of this particular peptide modification variation
569 // against the spectra. Updates score_stats and returns the number of
570 // candidate spectra found.
571 static inline void
572 evaluate_peptide(const search_context &context, match &m,
573 const mass_trace_list *mtlp,
574 const std::vector<double> &mass_list,
575 const std::vector<std::vector<spectrum>::size_type>
576 &candidate_spectra,
577 score_stats &stats) NOTHROW {
578 assert(m.peptide_sequence.size() == mass_list.size());
580 int max_candidate_charge = 0;
581 typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
582 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
583 max_candidate_charge
584 = std::max<int>(max_candidate_charge,
585 spectrum::searchable_spectra[*it].charge);
586 assert(max_candidate_charge >= 1);
588 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
589 assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
590 // e.g. +2 -> 'B' -> spectrum
591 // FIX: should this be a std::vector??
592 static spectrum synth_sp[spectrum::MAX_SUPPORTED_CHARGE+1];
593 synthetic_spectra(synth_sp, mass_list, context.fragment_N_fixed_mass,
594 context.fragment_C_fixed_mass, max_fragment_charge);
596 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
597 m.spectrum_index = *it;
598 spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
600 sp.comparisons += 1;
601 stats.candidate_spectrum_count += 1;
602 if (CP.estimate_only)
603 continue;
605 //std::cerr << "sp " << m.spectrum_index << std::endl;
606 m.score = score_spectrum_over_charges(synth_sp, sp, max_fragment_charge);
607 //std::cerr << "score " << m.score << std::endl;
609 // Is this score good enough to be in the best score list? (Better scores
610 // are more negative.)
611 if (m.score >= stats.best_matches[m.spectrum_index].back().score)
612 continue;
614 // We're saving this match, so remember the mass trace info, too.
615 m.mass_trace.clear();
616 for (const mass_trace_list *p=mtlp; p; p=p->next)
617 m.mass_trace.push_back(p->item);
619 // Is this match the first candidate for this spectrum?
620 if (stats.best_matches[m.spectrum_index].front().score == 0)
621 stats.spectra_with_candidates += 1;
623 // Insert this match in the correct position.
624 int i;
625 for (i=stats.best_matches[m.spectrum_index].size() - 2; i>=0; i--)
626 if (stats.best_matches[m.spectrum_index][i].score <= m.score)
627 break;
628 else
629 stats.best_matches[m.spectrum_index][i+1]
630 = stats.best_matches[m.spectrum_index][i];
631 stats.best_matches[m.spectrum_index][i+1] = m;
636 // IDEA FIX: Use a cost parameter to define the iterative search front.
639 // Choose one possible residue modification position. Once they're all
640 // chosen, then evaluate.
641 static inline void
642 choose_residue_mod(const search_context &context, match &m,
643 const mass_trace_list *mtlp, std::vector<double> &mass_list,
644 const std::vector<std::vector<spectrum>::size_type>
645 &candidate_spectra,
646 score_stats &stats,
647 std::vector<int> &db_remaining,
648 const unsigned remaining_positions_to_choose,
649 const unsigned next_position_to_consider) NOTHROW {
650 assert(remaining_positions_to_choose
651 <= m.peptide_sequence.size() - next_position_to_consider);
653 if (remaining_positions_to_choose == 0) {
654 stats.combinations_searched += 1;
655 evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
656 } else {
657 mass_trace_list mtl(mtlp);
659 // consider all of the positions where we could next add a mod
660 for (unsigned int i=next_position_to_consider;
661 i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
662 mtl.item.position = i;
663 const double save_pos_mass=mass_list[i];
664 const char pos_res = m.peptide_sequence[i];
666 // consider the possibilities for this position
667 for (std::vector<int>::const_iterator
668 it=context.delta_bag_lookup[pos_res].begin();
669 it != context.delta_bag_lookup[pos_res].end(); it++) {
670 const int db_index = *it;
671 if (db_remaining[db_index] < 1)
672 continue;
673 db_remaining[db_index] -= 1;
674 mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
675 mtl.item.delta = context.delta_bag_delta[db_index];
676 choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
677 stats, db_remaining,
678 remaining_positions_to_choose-1, i+1);
679 db_remaining[db_index] += 1;
681 mass_list[i] = save_pos_mass;
687 // p_begin (p_end) to begin_index (end_index), updating p_mass in the process.
688 // The sign determines whether mass increases or decreases as p_begin (p_end)
689 // is moved forward--so it should be -1 for the begin case and +1 for the end
690 // case.
691 // FIX: is this worth its complexity? kill or else add reset
692 static inline void
693 update_p_mass(double &p_mass, int &p_begin, int begin_index, int sign,
694 const std::string &run_sequence,
695 const std::vector<double> &fixed_residue_mass) {
696 assert(sign == +1 or sign == -1);
697 assert(begin_index >= 0);
698 const bool moving_forward = begin_index >= p_begin;
699 int p0=p_begin, p1=begin_index;
700 if (not moving_forward) {
701 std::swap(p0, p1);
702 sign = -sign;
704 for (int i=p0; i<p1; i++)
705 p_mass += sign * fixed_residue_mass[run_sequence[i]];
706 p_begin = begin_index;
711 // Search a sequence run for matches according to the context against the
712 // spectra. Updates score_stats and the number of candidate spectra found.
714 // FIX: examine carefully for signed/unsigned problems
716 void inline
717 search_run(const search_context &context, const sequence_run &sequence_run,
718 score_stats &stats) {
719 const int min_peptide_length = std::max<int>(CP.minimum_peptide_length,
720 context.mod_count);
721 const std::vector<double> &fixed_parent_mass \
722 = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
724 const std::string &run_sequence = sequence_run.sequence;
725 const std::vector<int> &cleavage_points = sequence_run.cleavage_points;
726 assert(cleavage_points.size() >= 2); // endpoints assumed always present
728 // This match will be passed inward and used to record information that we
729 // need to remember about a match when we finally see one. At that point, a
730 // copy of this match will be saved.
731 match m; // FIX: move inward?
732 m.sequence_name = sequence_run.name;
734 assert(context.delta_bag_delta.size()
735 == context.delta_bag_count.size());
736 // counts remaining as mod positions are chosen
737 std::vector<int> db_remaining = context.delta_bag_count;
739 // the rightmost 'end' seen when all spectra masses were too high
740 // (0 means none yet encountered.)
741 unsigned int next_end = 0;
743 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
744 double p_mass = context.parent_fixed_mass;
745 int p_begin=0, p_end=0;
747 // FIX: optimize the non-specific cleavage case?
748 for (unsigned int begin=0; begin<cleavage_points.size()-1; begin++) {
749 const int begin_index = cleavage_points[begin];
750 m.peptide_begin = begin_index + sequence_run.sequence_offset;
751 if (not context.pca_residues.empty())
752 if (context.pca_residues.find(run_sequence[begin_index])
753 == std::string::npos)
754 continue;
755 update_p_mass(p_mass, p_begin, begin_index, -1, run_sequence,
756 fixed_parent_mass);
757 unsigned int end = std::max<unsigned int>(begin + 1, next_end);
758 for (; end<cleavage_points.size(); end++) {
759 m.missed_cleavage_count = end - begin - 1;
760 if (m.missed_cleavage_count > context.maximum_missed_cleavage_sites)
761 break;
763 const int end_index = cleavage_points[end];
764 assert(end_index - begin_index > 0);
765 const int peptide_size = end_index - begin_index;
766 assert(peptide_size > 0);
767 if (peptide_size < min_peptide_length)
768 continue;
769 update_p_mass(p_mass, p_end, end_index, +1, run_sequence,
770 fixed_parent_mass);
771 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
773 // We'd like to consider only the spectra whose masses are within the
774 // tolerance range of theoretical mass of the peptide (p_mass).
775 // Unfortunately, the tolerance range of each spectrum depends on its
776 // charge, since the tolerance is specified in m/z units. Furthermore,
777 // when deciding how to adjust begin/end to generate the next peptide,
778 // we need to be conservative, as a high-charge spectrum may come into
779 // range, even though only +1 spectra are currently in-range. So, we
780 // first screen using the maximum tolerance range, then check the actual
781 // spectra below.
783 const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
784 const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
786 const spmi_c_it candidate_spectra_info_begin
787 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
788 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
789 // spectrum masses all too low to match peptide (peptide too long)
790 //std::cerr << "peptide: sp low" << std::endl;
791 break;
794 const spmi_c_it candidate_spectra_info_end
795 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
796 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
797 // spectrum masses all too high to match peptide
798 // (peptide is too short, so increase end, if possible, else return)
799 //std::cerr << "peptide: sp high" << std::endl;
800 if (end == cleavage_points.size() - 1)
801 return;
802 next_end = end;
803 continue;
805 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
806 // no spectrum with close-enough parent mass
807 //std::cerr << "peptide: sp none nearby" << std::endl;
808 continue;
811 // Now generate the list of candidate spectra that are actually
812 // in-range, considering the charge of each spectrum.
813 std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
814 for (spmi_c_it it=candidate_spectra_info_begin;
815 it != candidate_spectra_info_end; it++) {
816 const spectrum &csp = spectrum::searchable_spectra[it->second];
817 if (std::abs(csp.mass - p_mass) <= (csp.charge
818 * CP.parent_mass_tolerance_1))
819 candidate_spectra_0.push_back(it->second);
821 if (candidate_spectra_0.empty())
822 continue;
824 m.predicted_parent_mass = p_mass;
825 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
827 std::vector<double> mass_list(peptide_size);
828 for (std::vector<double>::size_type i=0; i<m.peptide_sequence.size(); i++)
829 mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
830 .fixed_residue_mass[m.peptide_sequence[i]]);
832 choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
833 stats, db_remaining, context.mod_count, 0);
839 // Search all sequence runs for matches according to the context against the
840 // spectra. Updates score_stats and the number of candidate spectra found.
841 void
842 spectrum::search_runs(const search_context &context, score_stats &stats) {
843 const int no_runs = context.sequence_runs.size();
844 for (int i=0; i<no_runs; i++) {
845 search_run(context, context.sequence_runs[i], stats);
847 if (CP.show_progress)
848 std::cerr << i+1 << " of " << no_runs << " sequences, "
849 << stats.candidate_spectrum_count << " cand for "
850 << stats.spectra_with_candidates << " sp\r" << std::flush;
853 if (CP.show_progress)
854 std::cerr << std::endl;