update README and TODO
[greylag.git] / cgreylag.cpp
blob40ef19115c6c9c4a6db5cc4a9297cc3e810845d3
1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
23 #include <cerrno>
24 #include <cfloat>
25 #include <climits>
26 #include <cstdlib>
27 #include <cmath>
28 #include <iostream>
29 #include <numeric>
30 #include <set>
31 #include <stdexcept>
35 #ifdef __GNU__
36 #define NOTHROW __attribute__ ((nothrow))
37 // Use faster, non-threadsafe versions, since we don't use threads. This is
38 // maybe 10% faster?
39 #define fgetsX fgets_unlocked
40 #define fputsX fputs_unlocked
41 #define fprintfX __builtin_fprintf_unlocked
42 #define ferrorX ferror_unlocked
43 #define getcX getc_unlocked
44 #else
45 #define NOTHROW
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #define getcX std::getc
51 #endif
54 parameters parameters::the;
55 const parameters &CP = parameters::the;
58 char *
59 peak::__repr__() const {
60 static char temp[128];
61 sprintf(temp, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
62 mz, intensity, intensity_class);
63 return &temp[0];
66 const int FIRST_SPECTRUM_ID = 1;
67 int spectrum::next_id = FIRST_SPECTRUM_ID;
68 int spectrum::next_physical_id = FIRST_SPECTRUM_ID;
70 std::vector<spectrum> spectrum::searchable_spectra;
71 // parent mass -> searchable_spectra index
72 std::multimap<double,
73 std::vector<spectrum>::size_type> spectrum::spectrum_mass_index;
75 typedef
76 std::multimap<double, std::vector<spectrum>::size_type>::const_iterator
77 spmi_c_it; // ugh
80 char *
81 spectrum::__repr__() const {
82 static char temp[1024];
83 sprintf(temp,
84 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
85 id, physical_id, name.c_str(), mass, charge, peaks.size());
86 return &temp[0];
90 // FIX: does this actually help inlining?
91 // Return ln of n_C_k.
92 static inline double
93 ln_combination_(unsigned int n, unsigned int k) NOTHROW {
94 // Occasionally happens due to problems in scoring function. (FIX)
95 if (n < k)
96 return 0;
98 assert(0 <= k and k <= n and n < CP.ln_factorial.size());
99 return CP.ln_factorial[n] - CP.ln_factorial[k] - CP.ln_factorial[n-k];
101 double
102 ln_combination(unsigned int n, unsigned int k) {
103 return ln_combination(n, k);
108 // This is an error exit for read_spectra*.
109 static inline void
110 io_error(FILE *f, const char *message="") {
111 if (ferrorX(f))
112 message = "I/O error while reading (or writing) spectrum file";
113 throw std::ios_base::failure(message);
117 // true iff s consists entirely of whitespace
118 static bool
119 at_eol(const char *s) {
120 return s[strspn(s, " \t\r\n\f\v")] == 0;
124 // Check whether we've past the end offset, throwing exception on I/O error.
125 static inline bool
126 check_past_end(FILE *f, const long offset_end) {
127 if (offset_end == -1)
128 return false;
129 long pos = std::ftell(f);
130 if (pos == -1)
131 io_error(f);
132 return pos >= offset_end;
136 // Read spectra from file in ms2 format, tagging them with file_id. Before
137 // reading, seek to absolute position offset_begin. If offset_end != -1, any
138 // spectra that begin after position offset_end in the file are not read.
140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
141 // having the same physical id. Note that depending on
142 // offset_begin/offset_end, we may end up with the first charge and not the
143 // second, or vice versa.
145 // The peak list is initially sorted by mz. Throws an exception on invalid
146 // input. Error checking is the most stringent in this function. Other
147 // readers can be a little looser since all ms2 files eventually get read here
148 // anyway.
150 // This is pretty hideous. Is there a simpler way to read, reasonably
151 // efficiently, catching any input error?
153 std::vector<spectrum>
154 spectrum::read_spectra_from_ms2(FILE *f, const int file_id,
155 const long offset_begin,
156 const long offset_end) {
157 std::vector<spectrum> spectra;
159 const int bufsiz = 1024;
160 char buf[bufsiz];
161 char *endp;
163 // first seek to offset_begin and synchronize at next "\n:"
164 if (offset_begin > 0) {
165 if (std::fseek(f, offset_begin-1, SEEK_SET) == -1)
166 io_error(f);
167 int c = getcX(f);
168 while (c != '\n' and c != EOF)
169 c = getcX(f);
170 if (ferrorX(f))
171 io_error(f);
172 if (c == '\n') {
173 do {
174 c = getcX(f);
175 } while (c != ':' and c != EOF);
176 if (ferrorX(f))
177 io_error(f);
178 if (c == ':')
179 std::ungetc(c, f);
182 // at this point we expect to read the first ':' of a header, or EOF
184 bool past_end = check_past_end(f, offset_end);
185 char *result = fgetsX(buf, bufsiz, f);
186 while (true) {
187 std::vector<std::string> names;
188 std::vector<double> masses;
189 std::vector<int> charges;
191 // read headers
192 while (true) {
193 if (ferrorX(f))
194 io_error(f);
195 if (not result or buf[0] != ':')
196 break;
197 if (not past_end)
198 names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
199 if (not fgetsX(buf, bufsiz, f))
200 io_error(f, "bad ms2 format: mass/charge line expected");
201 errno = 0;
202 double mass = std::strtod(buf, &endp); // need double accuracy here
203 if (not past_end)
204 masses.push_back(mass);
205 if (errno or endp == buf or mass <= 0)
206 io_error(f, "bad ms2 format: bad mass");
207 const char *endp0 = endp;
208 int charge = std::strtol(endp0, &endp, 10);
209 if (not past_end)
210 charges.push_back(charge);
211 if (errno or endp == endp0 or charge <= 0)
212 io_error(f, "bad ms2 format: bad charge");
213 if (not at_eol(endp))
214 io_error(f, "bad ms2 format: junk at end of mass/charge line");
215 past_end = past_end or check_past_end(f, offset_end);
216 result = fgetsX(buf, bufsiz, f);
218 if (names.empty() and past_end)
219 break;
220 if (not result) {
221 if (not names.empty())
222 io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
223 break;
225 // read peaks
226 std::vector<peak> peaks;
227 while (true) {
228 peak p;
229 errno = 0;
230 p.mz = strtod(buf, &endp);
231 if (errno or endp == buf or p.mz <= 0)
232 io_error(f, "bad ms2 format: bad peak mz");
233 const char *endp0 = endp;
234 p.intensity = strtod(endp0, &endp);
235 if (errno or endp == endp0 or p.intensity <= 0)
236 io_error(f, "bad ms2 format: bad peak intensity");
237 if (not at_eol(endp))
238 io_error(f, "bad ms2 format: junk at end of peak line");
239 peaks.push_back(p);
241 past_end = past_end or check_past_end(f, offset_end);
242 result = fgetsX(buf, bufsiz, f);
243 if (ferrorX(f))
244 io_error(f);
245 if (not result or buf[0] == ':')
246 break;
249 // add spectra to vector
250 if (names.empty() and not peaks.empty())
251 io_error(f, "bad ms2 format: missing header lines?");
253 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
255 for (std::vector<double>::size_type i=0; i<names.size(); i++) {
256 spectrum sp(masses[i], charges[i]);
257 sp.peaks = peaks;
258 sp.name = names[i];
259 sp.file_id = file_id;
260 sp.physical_id = next_physical_id;
261 spectra.push_back(sp);
263 if (file_id != -1)
264 spectrum::next_physical_id += 1;
266 return spectra;
270 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
271 // to remove those too large to be fragment products. (Peaks are assumed to
272 // be ordered by increasing mz.)
273 void
274 spectrum::filter_peaks(double TIC_cutoff_proportion,
275 double parent_mass_tolerance_max) {
276 if (not (0 <= TIC_cutoff_proportion and TIC_cutoff_proportion <= 1)
277 or parent_mass_tolerance_max < 0)
278 throw std::invalid_argument("invalid argument value");
280 // remove peaks too large to be fragment products
281 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
282 const double peak_mz_limit = mass + parent_mass_tolerance_max;
284 std::vector<peak>::iterator pi;
285 for (pi=peaks.begin(); pi != peaks.end() and pi->mz <= peak_mz_limit; pi++);
286 peaks.erase(pi, peaks.end());
288 // FIX: note mzLowerBound..mzUpperBound here
290 // now filter by TIC
291 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
293 total_ion_current = 0;
294 for (pi=peaks.begin(); pi != peaks.end(); pi++)
295 total_ion_current += pi->intensity;
297 const double i_limit = total_ion_current * TIC_cutoff_proportion;
298 double accumulated_intensity = 0;
299 for (pi=peaks.begin();
300 pi != peaks.end() and accumulated_intensity <= i_limit; pi++)
301 accumulated_intensity += pi->intensity;
303 peaks.erase(pi, peaks.end());
305 // restore order
306 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
310 // Classify peaks and update class_counts and peak bin counts.
311 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
312 void
313 spectrum::classify(int intensity_class_count, double intensity_class_ratio,
314 double fragment_mass_tolerance) {
315 if (intensity_class_count < 1 or intensity_class_ratio <= 1.0)
316 throw std::invalid_argument("invalid argument value");
318 intensity_class_counts.clear();
319 intensity_class_counts.resize(intensity_class_count);
321 // e.g., 7 = 1 + 2 + 4
322 // FIX: does this fail for non-integer ratios?
323 int min_count = int((pow(intensity_class_ratio, intensity_class_count) - 1)
324 / (intensity_class_ratio - 1));
326 std::sort(peaks.begin(), peaks.end(), peak::greater_intensity);
327 std::vector<peak>::iterator pi=peaks.begin();
328 for (int i_class=0; i_class<intensity_class_count; i_class++) {
329 int peaks_this_class = int((pow(intensity_class_ratio, i_class) * peaks.size()
330 / min_count) + 0.5);
331 for (int cp=0; cp < peaks_this_class and pi != peaks.end(); cp++, pi++) {
332 pi->intensity_class = i_class;
333 intensity_class_counts[i_class]++;
336 std::sort(peaks.begin(), peaks.end(), peak::less_mz);
338 if (not peaks.empty()) {
339 min_peak_mz = peaks.front().mz - CP.fragment_mass_tolerance;
340 max_peak_mz = peaks.back().mz + CP.fragment_mass_tolerance;
341 total_peak_bins = (int((max_peak_mz - min_peak_mz)
342 / (2 * fragment_mass_tolerance) + 0.5));
343 // This is the MyriMatch estimate. Wouldn't it be better to simply count
344 // how many are empty?
345 empty_peak_bins = std::max<int>(total_peak_bins - peaks.size(), 0);
350 // Update the *_cache fields from the peaks field.
351 void
352 spectrum::update_peak_cache() {
353 clear_peak_cache();
355 const unsigned int peak_count = peaks.size();
356 peak_mz_cache = new double[peak_count+1];
357 peak_intensity_class_cache = new int[peak_count+1];
359 for (unsigned int i=0; i<peak_count; i++) {
360 peak_mz_cache[i] = peaks[i].mz;
361 peak_intensity_class_cache[i] = peaks[i].intensity_class;
364 // negative value is terminator
365 peak_mz_cache[peak_count] = -1;
366 peak_intensity_class_cache[peak_count] = -1;
370 // Store the list of spectra that search_peptide will search against, and also
371 // build spectrum_mass_index.
372 void
373 spectrum::set_searchable_spectra(const std::vector<spectrum> &spectra) {
374 searchable_spectra = spectra;
375 for (std::vector<spectrum>::size_type i=0; i<spectra.size(); i++) {
376 spectrum_mass_index.insert(std::make_pair(spectra[i].mass, i));
377 searchable_spectra[i].update_peak_cache();
382 // Calculate peaks for a synthesized mass (not mz) ladder.
383 static inline void
384 get_synthetic_Y_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
385 const double *mass_list,
386 const double fragment_C_fixed_mass) NOTHROW {
387 double m = fragment_C_fixed_mass;
389 for (unsigned int i=ladder_size; i>0; i--) {
390 m += mass_list[i];
391 mass_ladder[ladder_size-i] = m;
396 // Calculate peaks for a synthesized mass (not mz) ladder.
397 static inline void
398 get_synthetic_B_mass_ladder(double *mass_ladder, const unsigned int ladder_size,
399 const double *mass_list,
400 const double fragment_N_fixed_mass) NOTHROW {
401 double m = fragment_N_fixed_mass;
403 for (unsigned int i=0; i<ladder_size; i++) {
404 m += mass_list[i];
405 mass_ladder[i] = m;
410 // Generate synthetic spectra for a set of charges. Only the charge and peaks
411 // of these spectra are initialized.
412 static inline void
413 synthetic_spectra(double synth_sp_mz[/* max_fragment_charge+1 */]
414 [spectrum::MAX_THEORETICAL_FRAGMENTS],
415 const double *mass_list, const unsigned int mass_list_size,
416 const double fragment_N_fixed_mass,
417 const double fragment_C_fixed_mass,
418 const int max_fragment_charge) NOTHROW {
419 assert(mass_list_size >= 1);
420 const unsigned int ladder_size = mass_list_size - 1;
421 double Y_mass_ladder[ladder_size];
422 double B_mass_ladder[ladder_size];
424 for (int charge=1; charge<=max_fragment_charge; charge++) {
425 get_synthetic_Y_mass_ladder(Y_mass_ladder, ladder_size, mass_list,
426 fragment_C_fixed_mass);
427 get_synthetic_B_mass_ladder(B_mass_ladder, ladder_size, mass_list,
428 fragment_N_fixed_mass);
430 unsigned int y_i=0, b_i=0;
431 double *sp_mz_p = synth_sp_mz[charge];
433 // merge the two (already sorted) mass lists, converting them to mz values
434 // in the process
435 // FIX: try algorithm::merge?
436 while (y_i < ladder_size and b_i < ladder_size)
437 if (Y_mass_ladder[y_i] < B_mass_ladder[b_i])
438 *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
439 else
440 *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
441 while (y_i < ladder_size)
442 *(sp_mz_p++) = peak::get_mz(Y_mass_ladder[y_i++], charge);
443 while (b_i < ladder_size)
444 *(sp_mz_p++) = peak::get_mz(B_mass_ladder[b_i++], charge);
445 *sp_mz_p = -1; // invalid mz as terminator
450 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
451 // theoretical spectrum (y) generated from a peptide candidate.
453 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
454 // better yet, sets) for peak lists simpler? As fast or faster?
456 // This is the innermost loop, so it's worthwhile to optimize this some.
457 // FIX const declaration
458 static inline double
459 score_spectrum(const spectrum &x,
460 const double synth_mzs[spectrum::MAX_THEORETICAL_FRAGMENTS])
461 NOTHROW {
462 // FIX this
463 unsigned int y_peak_count=0;
464 for (const double *p=synth_mzs; *p >= 0; p++, y_peak_count++);
466 double peak_best_delta[y_peak_count];
467 int peak_best_class[y_peak_count];
468 for (unsigned int i=0; i<y_peak_count; i++) {
469 peak_best_delta[i] = CP.fragment_mass_tolerance;
470 peak_best_class[i] = -1;
473 // Iterate over all closest pairs of peaks, collecting info on the class of
474 // the best (meaning closest mz) real peak matching each theoretical peak
475 // (for real/theoretical peak pairs that are no farther apart than
476 // fragment_mass_tolerance).
478 const unsigned int x_peak_count=x.peaks.size();
479 assert(x_peak_count > 0 and y_peak_count > 0);
481 unsigned int x_index=0, y_index=0;
482 while (true) {
483 const double delta = synth_mzs[y_index] - x.peak_mz_cache[x_index];
484 if (std::abs(delta) <= peak_best_delta[y_index]) {
485 peak_best_class[y_index] = x.peak_intensity_class_cache[x_index];
486 peak_best_delta[y_index] = std::abs(delta);
488 if (delta > 0) {
489 if (++x_index >= x_peak_count)
490 break;
491 } else {
492 if (++y_index >= y_peak_count)
493 break;
497 assert(CP.intensity_class_count < INT_MAX);
498 const unsigned int intensity_class_count = CP.intensity_class_count;
499 unsigned int peak_hit_histogram[intensity_class_count];
500 for (unsigned int i=0; i<intensity_class_count; i++)
501 peak_hit_histogram[i] = 0;
502 for (unsigned int i=0; i<y_peak_count; i++) {
503 const int bc = peak_best_class[i];
504 if (bc >= 0) {
505 assert(bc < static_cast<int>(intensity_class_count));
506 peak_hit_histogram[bc] += 1;
510 // How many theoretical peaks overlap the real peak range?
511 int valid_theoretical_peaks = 0;
512 for (unsigned int i=0; i<y_peak_count; i++)
513 if (x.min_peak_mz <= synth_mzs[i] and synth_mzs[i] <= x.max_peak_mz)
514 valid_theoretical_peaks++;
516 unsigned int total_peak_hits = 0;
517 for (unsigned int i=0; i<intensity_class_count; i++)
518 total_peak_hits += peak_hit_histogram[i];
520 // How many valid theoretical peaks were misses?
521 const int peak_misses = valid_theoretical_peaks - total_peak_hits;
522 assert(peak_misses >= 0);
524 double score = ln_combination_(x.empty_peak_bins, peak_misses);
525 for (unsigned int i=0; i<intensity_class_count; i++)
526 score += ln_combination_(x.intensity_class_counts[i],
527 peak_hit_histogram[i]);
528 score -= ln_combination_(x.total_peak_bins, valid_theoretical_peaks);
530 return score;
534 // Return the score of the specified spectrum against a group of synthetic
535 // fragment spectra.
537 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
538 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
539 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
540 // or +2b/+1y?)
541 // FIX: implement MM smart +3 model, and/or come up with something better.
542 // FIX const decl
543 static inline double
544 score_spectrum_over_charges(const double
545 synth_sp_mz[/* max_fragment_charge+1 */]
546 [spectrum::MAX_THEORETICAL_FRAGMENTS],
547 const spectrum &sp,
548 const int max_fragment_charge) NOTHROW {
549 double best_score = score_spectrum(sp, synth_sp_mz[1]);
551 const int charge_limit = std::max<int>(1, sp.charge-1);
552 for (int charge=2; charge<=charge_limit; charge++) {
553 assert(charge <= max_fragment_charge);
554 best_score += score_spectrum(sp, synth_sp_mz[charge]);
556 return best_score;
560 struct mass_trace_list {
561 mass_trace_item item;
562 const mass_trace_list *next;
564 mass_trace_list(const mass_trace_list *p=0) : next(p) { }
568 // fuzzy comparison might not be necessary, but it's safer
569 static inline bool
570 score_equal(double s1, double s2) { return std::abs(s1-s2) < 1e-6; }
573 // Search for matches of this particular peptide modification variation
574 // against the spectra. Updates score_stats and returns the number of
575 // candidate spectra found.
576 static inline void
577 evaluate_peptide(const search_context &context, match &m,
578 const mass_trace_list *mtlp, const double *mass_list,
579 const std::vector<std::vector<spectrum>::size_type>
580 &candidate_spectra,
581 score_stats &stats) NOTHROW {
582 int max_candidate_charge = 0;
583 typedef std::vector<std::vector<spectrum>::size_type>::const_iterator c_it;
584 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++)
585 max_candidate_charge
586 = std::max<int>(max_candidate_charge,
587 spectrum::searchable_spectra[*it].charge);
588 assert(max_candidate_charge >= 1);
590 const int max_fragment_charge = std::max<int>(1, max_candidate_charge-1);
591 assert(max_fragment_charge <= spectrum::MAX_SUPPORTED_CHARGE);
593 const unsigned int peptide_size = m.peptide_sequence.size();
594 // FIX: "2" assumes just two ion series (e.g., B and Y)
595 assert(peptide_size <= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS-1));
597 // This array is preallocated to avoid the overhead of allocation and
598 // deallocation within the inner loop.
599 // FIX
600 // charge -> mz array
601 static double synth_sp_mz[spectrum::MAX_SUPPORTED_CHARGE+1]
602 [spectrum::MAX_THEORETICAL_FRAGMENTS];
604 synthetic_spectra(synth_sp_mz, mass_list, peptide_size,
605 context.fragment_N_fixed_mass,
606 context.fragment_C_fixed_mass, max_fragment_charge);
608 for (c_it it=candidate_spectra.begin(); it != candidate_spectra.end(); it++) {
609 m.spectrum_index = *it;
610 spectrum &sp = spectrum::searchable_spectra[m.spectrum_index];
612 sp.comparisons += 1;
613 stats.candidate_spectrum_count += 1;
614 if (CP.estimate_only)
615 continue;
617 //std::cerr << "sp " << m.spectrum_index << std::endl;
618 m.score = score_spectrum_over_charges(synth_sp_mz, sp, max_fragment_charge);
619 //std::cerr << "score " << m.score << std::endl;
621 std::vector<match> &sp_best_matches = stats.best_matches[m.spectrum_index];
622 // Is this score good enough to be in the best score list? (Better scores
623 // are more negative.)
624 if (m.score > sp_best_matches.back().score)
625 continue;
627 // We're saving this match, so remember the mass trace info, too.
628 m.mass_trace.clear();
629 for (const mass_trace_list *p=mtlp; p; p=p->next)
630 m.mass_trace.push_back(p->item);
632 // If this is duplicate, just append to the existing match and return
633 for (std::vector<match>::reverse_iterator rit
634 = sp_best_matches.rbegin(); rit != sp_best_matches.rend(); rit++)
635 if (score_equal(rit->score, m.score)
636 and rit->peptide_sequence == m.peptide_sequence
637 and rit->mass_regime_index == m.mass_regime_index
638 and rit->conjunct_index == m.conjunct_index
639 and rit->mass_trace == m.mass_trace) {
640 rit->peptide_begin.push_back(m.peptide_begin[0]);
641 rit->sequence_name.push_back(m.sequence_name[0]);
642 return;
645 // Otherwise, insert this match in the correct position
646 assert(sp_best_matches.size() >= 1);
647 std::vector<match>::reverse_iterator rit = sp_best_matches.rbegin();
648 for (;rit+1 != sp_best_matches.rend() and (rit+1)->score > m.score; rit++)
649 *rit = *(rit+1);
650 *rit = m;
655 // IDEA FIX: Use a cost parameter to define the iterative search front.
658 // Choose one possible residue modification position. Once they're all
659 // chosen, then evaluate.
660 static inline void
661 choose_residue_mod(const search_context &context, match &m,
662 const mass_trace_list *mtlp, double *mass_list,
663 const std::vector<std::vector<spectrum>::size_type>
664 &candidate_spectra,
665 score_stats &stats,
666 std::vector<int> &db_remaining,
667 const unsigned int remaining_positions_to_choose,
668 const unsigned int next_position_to_consider) NOTHROW {
669 assert(remaining_positions_to_choose
670 <= m.peptide_sequence.size() - next_position_to_consider);
672 if (remaining_positions_to_choose == 0) {
673 stats.combinations_searched += 1;
674 evaluate_peptide(context, m, mtlp, mass_list, candidate_spectra, stats);
675 } else {
676 mass_trace_list mtl(mtlp);
678 // consider all of the positions where we could next add a mod
679 for (unsigned int i=next_position_to_consider;
680 i <= m.peptide_sequence.size()-remaining_positions_to_choose; i++) {
681 mtl.item.position = i;
682 const double save_pos_mass=mass_list[i];
683 const char pos_res = m.peptide_sequence[i];
685 // consider the possibilities for this position
686 for (std::vector<int>::const_iterator
687 it=context.delta_bag_lookup[pos_res].begin();
688 it != context.delta_bag_lookup[pos_res].end(); it++) {
689 const int db_index = *it;
690 if (db_remaining[db_index] < 1)
691 continue;
692 db_remaining[db_index] -= 1;
693 mass_list[i] = save_pos_mass + context.delta_bag_delta[db_index];
694 mtl.item.delta = context.delta_bag_delta[db_index];
695 choose_residue_mod(context, m, &mtl, mass_list, candidate_spectra,
696 stats, db_remaining,
697 remaining_positions_to_choose-1, i+1);
698 db_remaining[db_index] += 1;
700 mass_list[i] = save_pos_mass;
706 static inline int
707 search_peptide(const search_context &context, const double &p_mass, match &m,
708 const std::string &run_sequence, const int &begin_index,
709 const unsigned int &peptide_size, score_stats &stats,
710 std::vector<int> &db_remaining) {
711 // We'd like to consider only the spectra whose masses are within the
712 // tolerance range of theoretical mass of the peptide (p_mass).
713 // Unfortunately, the tolerance range of each spectrum depends on its
714 // charge, since the tolerance is specified in m/z units. Furthermore,
715 // when deciding how to adjust begin/end to generate the next peptide,
716 // we need to be conservative, as a high-charge spectrum may come into
717 // range, even though only +1 spectra are currently in-range. So, we
718 // first screen using the maximum tolerance range, then check the actual
719 // spectra below.
721 const double sp_mass_lb = p_mass - CP.parent_mass_tolerance_max;
722 const double sp_mass_ub = p_mass + CP.parent_mass_tolerance_max;
724 const spmi_c_it candidate_spectra_info_begin
725 = spectrum::spectrum_mass_index.lower_bound(sp_mass_lb);
726 if (candidate_spectra_info_begin == spectrum::spectrum_mass_index.end()) {
727 // spectrum masses all too low to match peptide (peptide too long)
728 return -1;
731 const spmi_c_it candidate_spectra_info_end
732 = spectrum::spectrum_mass_index.upper_bound(sp_mass_ub);
733 if (candidate_spectra_info_end == spectrum::spectrum_mass_index.begin()) {
734 // spectrum masses all too high to match peptide (peptide too short)
735 return 1;
737 if (candidate_spectra_info_begin == candidate_spectra_info_end) {
738 // no spectrum with close-enough parent mass
739 return 0;
742 // Now generate the list of candidate spectra that are actually
743 // in-range, considering the charge of each spectrum.
744 std::vector<std::vector<spectrum>::size_type> candidate_spectra_0;
745 for (spmi_c_it it=candidate_spectra_info_begin;
746 it != candidate_spectra_info_end; it++) {
747 const spectrum &csp = spectrum::searchable_spectra[it->second];
748 if (std::abs(csp.mass - p_mass) <= (csp.charge
749 * CP.parent_mass_tolerance_1))
750 candidate_spectra_0.push_back(it->second);
752 if (candidate_spectra_0.empty())
753 return 0;
755 m.predicted_parent_mass = p_mass;
756 m.peptide_sequence.assign(run_sequence, begin_index, peptide_size);
758 double mass_list[peptide_size];
759 for (unsigned int i=0; i<peptide_size; i++)
760 mass_list[i] = (CP.fragment_mass_regime[context.mass_regime_index]
761 .fixed_residue_mass[m.peptide_sequence[i]]);
763 choose_residue_mod(context, m, NULL, mass_list, candidate_spectra_0,
764 stats, db_remaining, context.mod_count, 0);
765 return 0;
769 // Search a sequence run for matches according to the context against the
770 // spectra. Updates score_stats and the number of candidate spectra found.
772 // FIX: examine carefully for signed/unsigned problems
774 static inline void
775 search_run(const search_context &context, const sequence_run &sequence_run,
776 score_stats &stats) {
777 const unsigned int min_peptide_length
778 = std::max<unsigned int>(CP.minimum_peptide_length, context.mod_count);
779 const std::vector<double> &fixed_parent_mass
780 = CP.parent_mass_regime[context.mass_regime_index].fixed_residue_mass;
782 const std::string &run_sequence = sequence_run.sequence;
784 // This match will be passed inward and used to record information that we
785 // need to remember about a match when we finally see one. At that point, a
786 // copy of this match will be saved.
787 // FIX: can this be done better?
788 match m;
789 m.sequence_name.push_back(sequence_run.name);
790 m.peptide_begin.push_back(0);
791 int &peptide_begin = m.peptide_begin[0];
792 m.mass_regime_index = context.mass_regime_index;
793 m.conjunct_index = context.conjunct_index;
794 m.pca_delta = context.pca_delta;
796 assert(context.delta_bag_delta.size() == context.delta_bag_count.size());
797 // counts remaining as mod positions are chosen
798 std::vector<int> db_remaining = context.delta_bag_count;
800 // the rightmost 'end' seen when all spectra masses were too high
801 // (0 means none yet encountered.)
802 unsigned int next_end = 0;
804 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
805 double p_mass = context.parent_fixed_mass;
806 int p_begin=0, p_end=0;
808 const std::vector<int> *cleavage_points = &sequence_run.cleavage_points;
809 std::vector<int>::size_type cleavage_points_size = cleavage_points->size();
810 // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
811 static std::vector<int> nonspecific_cleavage_points;
812 if (context.nonspecific_cleavage) {
813 cleavage_points_size = run_sequence.size() + 1;
814 for (unsigned int i=nonspecific_cleavage_points.size();
815 i<cleavage_points_size; i++)
816 nonspecific_cleavage_points.push_back(i);
817 cleavage_points = &nonspecific_cleavage_points;
819 assert(cleavage_points_size >= 2); // endpoints must be present
821 for (unsigned int begin=0; begin<cleavage_points_size-1; begin++) {
822 const int begin_index = (*cleavage_points)[begin];
823 peptide_begin = begin_index + sequence_run.sequence_offset;
825 // If pca_residues specified, require one of them to be the peptide
826 // N-terminal residue
827 switch (context.pca_residues.size()) {
828 case 0:
829 break;
830 case 2:
831 if (context.pca_residues[1] == run_sequence[begin_index])
832 break;
833 case 1:
834 if (context.pca_residues[0] == run_sequence[begin_index])
835 break;
836 continue;
837 default:
838 assert(false);
841 assert(begin_index >= p_begin and begin_index >= 0);
842 for (int i=p_begin; i<begin_index; i++)
843 p_mass -= fixed_parent_mass[run_sequence[i]];
844 p_begin = begin_index;
846 unsigned int end = std::max<unsigned int>(begin + 1, next_end);
847 for (; end<cleavage_points_size; end++) {
848 if (not context.nonspecific_cleavage) {
849 const int missed_cleavage_count = end - begin - 1;
850 if (missed_cleavage_count > context.maximum_missed_cleavage_sites)
851 break;
854 const int end_index = (*cleavage_points)[end];
855 assert(end_index > begin_index);
856 const unsigned int peptide_size = end_index - begin_index;
857 if (peptide_size < min_peptide_length)
858 continue;
860 assert(end_index >= 0);
861 if (end_index >= p_end)
862 for (int i=p_end; i<end_index; i++)
863 p_mass += fixed_parent_mass[run_sequence[i]];
864 else
865 for (int i=end_index; i<p_end; i++)
866 p_mass -= fixed_parent_mass[run_sequence[i]];
867 p_end = end_index;
870 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
872 int status = search_peptide(context, p_mass, m, run_sequence,
873 begin_index, peptide_size, stats,
874 db_remaining);
875 if (status == -1) // peptide was too long
876 break;
877 if (status == 1) { // peptide was too short
878 // so next start with a peptide at least this long
879 if (end == cleavage_points_size - 1)
880 return;
881 next_end = end;
888 // Search all sequence runs for matches according to the context against the
889 // spectra. Updates score_stats and the number of candidate spectra found.
890 void
891 spectrum::search_runs(const search_context &context, score_stats &stats) {
892 const int no_runs = context.sequence_runs.size();
893 for (int i=0; i<no_runs; i++) {
894 search_run(context, context.sequence_runs[i], stats);
896 if (CP.show_progress)
897 std::cerr << i+1 << " of " << no_runs << " sequences, "
898 << stats.candidate_spectrum_count << " candidates\r"
899 << std::flush;
902 if (CP.show_progress)
903 std::cerr << std::endl;