1 // C++ module for greylag
3 // greylag, a collection of programs for MS/MS protein analysis
4 // Copyright (C) 2006-2008 Stowers Institute for Medical Research
6 // This program is free software: you can redistribute it and/or modify
7 // it under the terms of the GNU General Public License as published by
8 // the Free Software Foundation, either version 3 of the License, or
9 // (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19 // Contact: Mike Coleman
20 // Stowers Institute for Medical Research
21 // 1000 East 50th Street
22 // Kansas City, Missouri 64110
26 #include "cgreylag.hpp"
42 // Use faster, non-threadsafe versions, since we don't use threads. This is
44 #define fgetsX fgets_unlocked
45 #define fputsX fputs_unlocked
46 #define fprintfX __builtin_fprintf_unlocked
47 #define ferrorX ferror_unlocked
48 #define getcX getc_unlocked
50 #define fgetsX std::fgets
51 #define fputsX std::fputs
52 #define fprintfX std::fprintf
53 #define ferrorX std::ferror
54 #define getcX std::getc
58 parameters
parameters::the
;
59 const parameters
&CP
= parameters::the
;
63 peak::__repr__() const {
64 static char temp
[128];
65 sprintf(temp
, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
66 mz
, intensity
, intensity_class
);
70 const int FIRST_SPECTRUM_ID
= 1;
71 int spectrum::next_id
= FIRST_SPECTRUM_ID
;
72 int spectrum::next_physical_id
= FIRST_SPECTRUM_ID
;
74 std::vector
<spectrum
> spectrum::searchable_spectra
;
75 // parent mass -> searchable_spectra index
77 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
80 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
85 spectrum::__repr__() const {
86 static char temp
[1024];
88 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
89 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size());
94 // Return ln of n_C_k.
96 ln_combination(unsigned int n
, unsigned int k
) {
97 // Occasionally happens due to problems in scoring function. (FIX)
101 assert(0 <= k
and k
<= n
and n
< CP
.ln_factorial
.size());
102 return CP
.ln_factorial
[n
] - CP
.ln_factorial
[k
] - CP
.ln_factorial
[n
-k
];
106 // FIX: Currently we're reading spectrum files in Python, which is fast enough?
107 // // This is an error exit for read_spectra*.
108 // static inline void
109 // io_error(FILE *f, const char *message="") {
111 // message = "I/O error while reading (or writing) spectrum file";
112 // throw std::ios_base::failure(message);
116 // // true iff s consists entirely of whitespace
117 // static inline bool
118 // at_eol(const char *s) {
119 // return s[strspn(s, " \t\r\n\f\v")] == 0;
123 // // Read spectra from file in ms2 format, tagging them with file_id.
125 // // Multiply charged spectra (e.g., +2/+3) are split into separate spectra
126 // // having the same physical id. Note that depending on
127 // // offset_begin/offset_end, we may end up with the first charge and not the
128 // // second, or vice versa.
130 // // The peak list is initially sorted by mz. Throws an exception on invalid
131 // // input. Error checking is stringent in this function.
133 // // This is pretty hideous. Is there a simpler way to read, reasonably
134 // // efficiently, catching any input error?
136 // std::vector<spectrum>
137 // spectrum::read_spectra_from_ms2(FILE *f, const int file_id) {
138 // std::vector<spectrum> spectra;
140 // const int bufsiz = 1024;
144 // char *result = fgetsX(buf, bufsiz, f);
146 // std::vector<std::string> names;
147 // std::vector<double> masses;
148 // std::vector<int> charges;
154 // if (not result or buf[0] != ':')
156 // names.push_back(std::string(buf+1, buf+std::strlen(buf)-1));
157 // if (not fgetsX(buf, bufsiz, f))
158 // io_error(f, "bad ms2 format: mass/charge line expected");
160 // double mass = std::strtod(buf, &endp); // need double accuracy here
161 // masses.push_back(mass);
162 // if (errno or endp == buf or mass <= 0)
163 // io_error(f, "bad ms2 format: bad mass");
164 // const char *endp0 = endp;
165 // int charge = std::strtol(endp0, &endp, 10);
166 // charges.push_back(charge);
167 // if (errno or endp == endp0 or charge <= 0)
168 // io_error(f, "bad ms2 format: bad charge");
169 // if (not at_eol(endp))
170 // io_error(f, "bad ms2 format: junk at end of mass/charge line");
171 // result = fgetsX(buf, bufsiz, f);
174 // if (not names.empty())
175 // io_error(f, "bad ms2 format: spectrum has no peaks (file truncated?)");
179 // std::vector<peak> peaks;
183 // p.mz = strtod(buf, &endp);
184 // if (errno or endp == buf or p.mz <= 0)
185 // io_error(f, "bad ms2 format: bad peak mz");
186 // const char *endp0 = endp;
187 // p.intensity = strtod(endp0, &endp);
188 // if (errno or endp == endp0 or p.intensity <= 0)
189 // io_error(f, "bad ms2 format: bad peak intensity");
190 // if (not at_eol(endp))
191 // io_error(f, "bad ms2 format: junk at end of peak line");
192 // peaks.push_back(p);
194 // result = fgetsX(buf, bufsiz, f);
197 // if (not result or buf[0] == ':')
201 // // add spectra to vector
202 // if (names.empty() and not peaks.empty())
203 // io_error(f, "bad ms2 format: missing header lines?");
205 // std::sort(peaks.begin(), peaks.end(), peak::less_mz);
207 // for (std::vector<double>::size_type i=0; i<names.size(); i++) {
208 // spectrum sp(masses[i], charges[i]);
210 // sp.name = names[i];
211 // sp.file_id = file_id;
212 // sp.physical_id = next_physical_id;
213 // spectra.push_back(sp);
215 // if (file_id != -1)
216 // spectrum::next_physical_id += 1;
222 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
223 // to remove those too large to be fragment products. (Peaks are assumed to
224 // be ordered by increasing mz.)
226 spectrum::filter_peaks(double TIC_cutoff_proportion
,
227 double parent_mass_tolerance_max
) {
228 if (not (0 <= TIC_cutoff_proportion
and TIC_cutoff_proportion
<= 1)
229 or parent_mass_tolerance_max
< 0)
230 throw std::invalid_argument("invalid argument value");
232 // remove peaks too large to be fragment products
233 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
234 const double peak_mz_limit
= mass
+ parent_mass_tolerance_max
;
236 std::vector
<peak
>::iterator pi
;
237 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= peak_mz_limit
; pi
++);
238 peaks
.erase(pi
, peaks
.end());
240 // FIX: note mzLowerBound..mzUpperBound here
243 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
245 total_ion_current
= 0;
246 for (pi
=peaks
.begin(); pi
!= peaks
.end(); pi
++)
247 total_ion_current
+= pi
->intensity
;
249 const double i_limit
= total_ion_current
* TIC_cutoff_proportion
;
250 double accumulated_intensity
= 0;
251 for (pi
=peaks
.begin();
252 pi
!= peaks
.end() and accumulated_intensity
<= i_limit
; pi
++)
253 accumulated_intensity
+= pi
->intensity
;
255 peaks
.erase(pi
, peaks
.end());
258 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
262 // Classify peaks and update class_counts and peak bin counts.
263 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
265 spectrum::classify(int intensity_class_count
, double intensity_class_ratio
,
266 double fragment_mass_tolerance
) {
267 if (intensity_class_count
< 1 or intensity_class_ratio
<= 1.0)
268 throw std::invalid_argument("invalid argument value");
270 intensity_class_counts
.clear();
271 intensity_class_counts
.resize(intensity_class_count
);
273 // e.g., 7 = 1 + 2 + 4
274 // FIX: does this fail for non-integer ratios?
275 int min_count
= int((pow(intensity_class_ratio
, intensity_class_count
) - 1)
276 / (intensity_class_ratio
- 1));
278 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
279 std::vector
<peak
>::iterator pi
=peaks
.begin();
280 for (int i_class
=0; i_class
<intensity_class_count
; i_class
++) {
281 int peaks_this_class
= int((pow(intensity_class_ratio
, i_class
) * peaks
.size()
283 for (int cp
=0; cp
< peaks_this_class
and pi
!= peaks
.end(); cp
++, pi
++) {
284 pi
->intensity_class
= i_class
;
285 intensity_class_counts
[i_class
]++;
288 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
290 if (not peaks
.empty()) {
291 min_peak_mz
= peaks
.front().mz
- CP
.fragment_mass_tolerance
;
292 max_peak_mz
= peaks
.back().mz
+ CP
.fragment_mass_tolerance
;
293 total_peak_bins
= (int((max_peak_mz
- min_peak_mz
)
294 / (2 * fragment_mass_tolerance
) + 0.5));
295 // This is the MyriMatch estimate. Wouldn't it be better to simply count
296 // how many are empty?
297 empty_peak_bins
= std::max
<int>(total_peak_bins
- peaks
.size(), 0);
302 // Update the *_cache fields from the peaks field.
304 spectrum::update_peak_cache() {
307 const unsigned int peak_count
= peaks
.size();
308 peak_mz_cache
= new double[peak_count
+1];
309 peak_intensity_class_cache
= new int[peak_count
+1];
311 for (unsigned int i
=0; i
<peak_count
; i
++) {
312 peak_mz_cache
[i
] = peaks
[i
].mz
;
313 peak_intensity_class_cache
[i
] = peaks
[i
].intensity_class
;
316 // negative value is terminator
317 peak_mz_cache
[peak_count
] = -1;
318 peak_intensity_class_cache
[peak_count
] = -1;
322 // Store the list of spectra that search_peptide will search against, and also
323 // build spectrum_mass_index.
325 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
326 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++)
327 spectra
[i
].clear_peak_cache();
328 searchable_spectra
= spectra
;
330 spectrum_mass_index
.clear();
331 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++) {
332 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
333 searchable_spectra
[i
].update_peak_cache();
338 // Calculate peaks for a synthesized mass (not mz) ladder.
340 get_synthetic_Y_mass_ladder(double *mass_ladder
, const unsigned int ladder_size
,
341 const double *mass_list
,
342 const double fragment_C_fixed_mass
) {
343 double m
= fragment_C_fixed_mass
;
345 for (unsigned int i
=ladder_size
; i
>0; i
--) {
347 mass_ladder
[ladder_size
-i
] = m
;
352 // Calculate peaks for a synthesized mass (not mz) ladder.
354 get_synthetic_B_mass_ladder(double *mass_ladder
, const unsigned int ladder_size
,
355 const double *mass_list
,
356 const double fragment_N_fixed_mass
) {
357 double m
= fragment_N_fixed_mass
;
359 for (unsigned int i
=0; i
<ladder_size
; i
++) {
366 // Generate synthetic spectra for a set of charges. Only the charge and peaks
367 // of these spectra are initialized.
369 synthetic_spectra(double synth_sp_mz
[/* max_fragment_charge+1 */]
370 [spectrum::MAX_THEORETICAL_FRAGMENTS
],
371 const double *mass_list
, const unsigned int mass_list_size
,
372 const double fragment_N_fixed_mass
,
373 const double fragment_C_fixed_mass
,
374 const int max_fragment_charge
) {
375 assert(mass_list_size
>= 1);
376 const unsigned int ladder_size
= mass_list_size
- 1;
377 double Y_mass_ladder
[ladder_size
];
378 double B_mass_ladder
[ladder_size
];
380 for (int charge
=1; charge
<=max_fragment_charge
; charge
++) {
381 get_synthetic_Y_mass_ladder(Y_mass_ladder
, ladder_size
, mass_list
,
382 fragment_C_fixed_mass
);
383 get_synthetic_B_mass_ladder(B_mass_ladder
, ladder_size
, mass_list
,
384 fragment_N_fixed_mass
);
386 unsigned int y_i
=0, b_i
=0;
387 double *sp_mz_p
= synth_sp_mz
[charge
];
389 // merge the two (already sorted) mass lists, converting them to mz values
391 // FIX: try algorithm::merge?
392 while (y_i
< ladder_size
and b_i
< ladder_size
)
393 if (Y_mass_ladder
[y_i
] < B_mass_ladder
[b_i
])
394 *(sp_mz_p
++) = peak::get_mz(Y_mass_ladder
[y_i
++], charge
);
396 *(sp_mz_p
++) = peak::get_mz(B_mass_ladder
[b_i
++], charge
);
397 while (y_i
< ladder_size
)
398 *(sp_mz_p
++) = peak::get_mz(Y_mass_ladder
[y_i
++], charge
);
399 while (b_i
< ladder_size
)
400 *(sp_mz_p
++) = peak::get_mz(B_mass_ladder
[b_i
++], charge
);
401 *sp_mz_p
= -1; // invalid mz as terminator
406 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
407 // theoretical spectrum (y) generated from a peptide candidate.
409 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
410 // better yet, sets) for peak lists simpler? As fast or faster?
412 // This is the innermost loop, so it's worthwhile to optimize this some.
413 // FIX const declaration
415 score_spectrum(const spectrum
&x
,
416 const double synth_mzs
[spectrum::MAX_THEORETICAL_FRAGMENTS
]) {
418 unsigned int y_peak_count
=0;
419 for (const double *p
=synth_mzs
; *p
>= 0; p
++, y_peak_count
++);
421 double peak_best_delta
[y_peak_count
];
422 int peak_best_class
[y_peak_count
];
423 for (unsigned int i
=0; i
<y_peak_count
; i
++) {
424 peak_best_delta
[i
] = CP
.fragment_mass_tolerance
;
425 peak_best_class
[i
] = -1;
428 // Iterate over all closest pairs of peaks, collecting info on the class of
429 // the best (meaning closest mz) real peak matching each theoretical peak
430 // (for real/theoretical peak pairs that are no farther apart than
431 // fragment_mass_tolerance).
433 const unsigned int x_peak_count
=x
.peaks
.size();
434 assert(x_peak_count
> 0 and y_peak_count
> 0);
436 unsigned int x_index
=0, y_index
=0;
438 const double delta
= synth_mzs
[y_index
] - x
.peak_mz_cache
[x_index
];
439 if (std::abs(delta
) <= peak_best_delta
[y_index
]) {
440 peak_best_class
[y_index
] = x
.peak_intensity_class_cache
[x_index
];
441 peak_best_delta
[y_index
] = std::abs(delta
);
444 if (++x_index
>= x_peak_count
)
447 if (++y_index
>= y_peak_count
)
452 assert(CP
.intensity_class_count
< INT_MAX
);
453 const unsigned int intensity_class_count
= CP
.intensity_class_count
;
454 unsigned int peak_hit_histogram
[intensity_class_count
];
455 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
456 peak_hit_histogram
[i
] = 0;
457 for (unsigned int i
=0; i
<y_peak_count
; i
++) {
458 const int bc
= peak_best_class
[i
];
460 assert(bc
< static_cast<int>(intensity_class_count
));
461 peak_hit_histogram
[bc
] += 1;
465 // How many theoretical peaks overlap the real peak range?
466 int valid_theoretical_peaks
= 0;
467 for (unsigned int i
=0; i
<y_peak_count
; i
++)
468 if (x
.min_peak_mz
<= synth_mzs
[i
] and synth_mzs
[i
] <= x
.max_peak_mz
)
469 valid_theoretical_peaks
++;
471 unsigned int total_peak_hits
= 0;
472 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
473 total_peak_hits
+= peak_hit_histogram
[i
];
475 // How many valid theoretical peaks were misses?
476 const int peak_misses
= valid_theoretical_peaks
- total_peak_hits
;
477 assert(peak_misses
>= 0);
479 double score
= ln_combination(x
.empty_peak_bins
, peak_misses
);
480 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
481 score
+= ln_combination(x
.intensity_class_counts
[i
],
482 peak_hit_histogram
[i
]);
483 score
-= ln_combination(x
.total_peak_bins
, valid_theoretical_peaks
);
489 // Return the score of the specified spectrum against a group of synthetic
492 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
493 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
494 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
496 // FIX: implement MM smart +3 model, and/or come up with something better.
499 score_spectrum_over_charges(const double
500 synth_sp_mz
[/* max_fragment_charge+1 */]
501 [spectrum::MAX_THEORETICAL_FRAGMENTS
],
503 const int max_fragment_charge
) {
504 double best_score
= score_spectrum(sp
, synth_sp_mz
[1]);
506 const int charge_limit
= std::max
<int>(1, sp
.charge
-1);
507 for (int charge
=2; charge
<=charge_limit
; charge
++) {
508 assert(charge
<= max_fragment_charge
);
509 best_score
+= score_spectrum(sp
, synth_sp_mz
[charge
]);
515 struct mass_trace_list
{
516 mass_trace_item item
;
517 const mass_trace_list
*next
;
519 mass_trace_list(const mass_trace_list
*p
=0) : next(p
) { }
523 // fuzzy comparison might not be necessary, but it's safer
525 score_equal(double s1
, double s2
) { return std::abs(s1
-s2
) < 1e-6; }
528 // Search for matches of this particular peptide modification variation
529 // against the spectra. Updates score_stats and returns the number of
530 // candidate spectra found.
532 evaluate_peptide(const search_context
&context
, match
&m
,
533 const mass_trace_list
*mtlp
, const double *mass_list
,
534 const std::vector
<std::vector
<spectrum
>::size_type
>
536 score_stats
&stats
) {
537 int max_candidate_charge
= 0;
538 typedef std::vector
<std::vector
<spectrum
>::size_type
>::const_iterator c_it
;
539 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++)
541 = std::max
<int>(max_candidate_charge
,
542 spectrum::searchable_spectra
[*it
].charge
);
543 assert(max_candidate_charge
>= 1);
545 const int max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
546 assert(max_fragment_charge
<= spectrum::MAX_SUPPORTED_CHARGE
);
548 const unsigned int peptide_size
= m
.peptide_sequence
.size();
549 // FIX: "2" assumes just two ion series (e.g., B and Y)
550 assert(peptide_size
<= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS
-1));
552 // This array is preallocated to avoid the overhead of allocation and
553 // deallocation within the inner loop.
555 // charge -> mz array
556 static double synth_sp_mz
[spectrum::MAX_SUPPORTED_CHARGE
+1]
557 [spectrum::MAX_THEORETICAL_FRAGMENTS
];
559 synthetic_spectra(synth_sp_mz
, mass_list
, peptide_size
,
560 context
.fragment_N_fixed_mass
,
561 context
.fragment_C_fixed_mass
, max_fragment_charge
);
563 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++) {
564 m
.spectrum_index
= *it
;
565 spectrum
&sp
= spectrum::searchable_spectra
[m
.spectrum_index
];
568 stats
.candidate_spectrum_count
+= 1;
569 if (CP
.estimate_only
)
572 //std::cerr << "sp " << m.spectrum_index << std::endl;
573 m
.score
= score_spectrum_over_charges(synth_sp_mz
, sp
, max_fragment_charge
);
574 //std::cerr << "score " << m.score << std::endl;
576 std::vector
<match
> &sp_best_matches
= stats
.best_matches
[m
.spectrum_index
];
577 // Is this score good enough to be in the best score list? (Better scores
578 // are more negative.)
579 if (m
.score
> sp_best_matches
.back().score
)
582 // We're saving this match, so remember the mass trace info, too.
583 m
.mass_trace
.clear();
584 for (const mass_trace_list
*p
=mtlp
; p
; p
=p
->next
)
585 m
.mass_trace
.push_back(p
->item
);
587 // If this is duplicate, just append to the existing match and continue
588 bool duplicate
= false;
589 for (std::vector
<match
>::reverse_iterator rit
590 = sp_best_matches
.rbegin(); rit
!= sp_best_matches
.rend(); rit
++)
591 if (score_equal(rit
->score
, m
.score
)
592 and rit
->peptide_sequence
== m
.peptide_sequence
593 and rit
->mass_regime_index
== m
.mass_regime_index
594 and rit
->conjunct_index
== m
.conjunct_index
595 and rit
->mass_trace
== m
.mass_trace
) {
596 rit
->peptide_begin
.push_back(m
.peptide_begin
[0]);
597 rit
->sequence_name
.push_back(m
.sequence_name
[0]);
602 // Otherwise, insert this match in the correct position
604 assert(sp_best_matches
.size() >= 1);
605 std::vector
<match
>::reverse_iterator rit
= sp_best_matches
.rbegin();
606 for (;rit
+1 != sp_best_matches
.rend() and (rit
+1)->score
> m
.score
; rit
++)
614 // IDEA FIX: Use a cost parameter to define the iterative search front.
617 // Choose one possible residue modification position. Once they're all
618 // chosen, then evaluate.
620 choose_residue_mod(const search_context
&context
, match
&m
,
621 const mass_trace_list
*mtlp
, double *mass_list
,
622 const std::vector
<std::vector
<spectrum
>::size_type
>
625 std::vector
<int> &db_remaining
,
626 const unsigned int remaining_positions_to_choose
,
627 const unsigned int next_position_to_consider
) {
628 assert(remaining_positions_to_choose
629 <= m
.peptide_sequence
.size() - next_position_to_consider
);
631 if (remaining_positions_to_choose
== 0) {
632 stats
.evaluation_count
+= 1;
633 evaluate_peptide(context
, m
, mtlp
, mass_list
, candidate_spectra
, stats
);
635 mass_trace_list
mtl(mtlp
);
637 // consider all of the positions where we could next add a mod
638 for (unsigned int i
=next_position_to_consider
;
639 i
<= m
.peptide_sequence
.size()-remaining_positions_to_choose
; i
++) {
640 mtl
.item
.position
= i
;
641 const double save_pos_mass
=mass_list
[i
];
642 const char pos_res
= m
.peptide_sequence
[i
];
644 // consider the possibilities for this position
645 for (std::vector
<int>::const_iterator
646 it
=context
.delta_bag_lookup
[pos_res
].begin();
647 it
!= context
.delta_bag_lookup
[pos_res
].end(); it
++) {
648 const int db_index
= *it
;
649 if (db_remaining
[db_index
] < 1)
651 db_remaining
[db_index
] -= 1;
652 mass_list
[i
] = save_pos_mass
+ context
.delta_bag_delta
[db_index
];
653 mtl
.item
.conjunct_item_index
= db_index
;
654 choose_residue_mod(context
, m
, &mtl
, mass_list
, candidate_spectra
,
656 remaining_positions_to_choose
-1, i
+1);
657 db_remaining
[db_index
] += 1;
659 mass_list
[i
] = save_pos_mass
;
666 search_peptide(const search_context
&context
, const double &p_mass
, match
&m
,
667 const std::string
&run_sequence
, const int &begin_index
,
668 const unsigned int &peptide_size
, score_stats
&stats
,
669 std::vector
<int> &db_remaining
) {
670 // We'd like to consider only the spectra whose masses are within the
671 // tolerance range of theoretical mass of the peptide (p_mass).
672 // Unfortunately, the tolerance range of each spectrum depends on its
673 // charge, since the tolerance is specified in m/z units. Furthermore,
674 // when deciding how to adjust begin/end to generate the next peptide,
675 // we need to be conservative, as a high-charge spectrum may come into
676 // range, even though only +1 spectra are currently in-range. So, we
677 // first screen using the maximum tolerance range, then check the actual
680 const double sp_mass_lb
= p_mass
- CP
.parent_mass_tolerance_max
;
681 const double sp_mass_ub
= p_mass
+ CP
.parent_mass_tolerance_max
;
683 const spmi_c_it candidate_spectra_info_begin
684 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
685 if (candidate_spectra_info_begin
== spectrum::spectrum_mass_index
.end()) {
686 // spectrum masses all too low to match peptide (peptide too long)
690 const spmi_c_it candidate_spectra_info_end
691 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
692 if (candidate_spectra_info_end
== spectrum::spectrum_mass_index
.begin()) {
693 // spectrum masses all too high to match peptide (peptide too short)
696 if (candidate_spectra_info_begin
== candidate_spectra_info_end
) {
697 // no spectrum with close-enough parent mass
701 // Now generate the list of candidate spectra that are actually
702 // in-range, considering the charge of each spectrum.
703 std::vector
<std::vector
<spectrum
>::size_type
> candidate_spectra_0
;
704 for (spmi_c_it it
=candidate_spectra_info_begin
;
705 it
!= candidate_spectra_info_end
; it
++) {
706 const spectrum
&csp
= spectrum::searchable_spectra
[it
->second
];
707 if (std::abs(csp
.mass
- p_mass
) <= (csp
.charge
708 * CP
.parent_mass_tolerance_1
))
709 candidate_spectra_0
.push_back(it
->second
);
711 if (candidate_spectra_0
.empty())
714 m
.predicted_parent_mass
= p_mass
;
715 m
.peptide_sequence
.assign(run_sequence
, begin_index
, peptide_size
);
717 double mass_list
[peptide_size
];
718 for (unsigned int i
=0; i
<peptide_size
; i
++)
719 mass_list
[i
] = (CP
.fragment_mass_regime
[context
.mass_regime_index
]
720 .fixed_residue_mass
[m
.peptide_sequence
[i
]]);
722 choose_residue_mod(context
, m
, NULL
, mass_list
, candidate_spectra_0
,
723 stats
, db_remaining
, context
.mod_count
, 0);
728 // Search a sequence run for matches according to the context against the
729 // spectra. Updates score_stats and the number of candidate spectra found.
731 // FIX: examine carefully for signed/unsigned problems
734 search_run(const search_context
&context
, const sequence_run
&sequence_run
,
735 score_stats
&stats
) {
736 const unsigned int min_peptide_length
737 = std::max
<unsigned int>(CP
.minimum_peptide_length
, context
.mod_count
);
738 const std::vector
<double> &fixed_parent_mass
739 = CP
.parent_mass_regime
[context
.mass_regime_index
].fixed_residue_mass
;
741 const std::string
&run_sequence
= sequence_run
.sequence
;
743 // This match will be passed inward and used to record information that we
744 // need to remember about a match when we finally see one. At that point, a
745 // copy of this match will be saved.
746 // FIX: can this be done better?
748 m
.sequence_name
.push_back(sequence_run
.name
);
749 m
.peptide_begin
.push_back(0);
750 int &peptide_begin
= m
.peptide_begin
[0];
751 m
.mass_regime_index
= context
.mass_regime_index
;
752 m
.conjunct_index
= context
.conjunct_index
;
753 m
.pca_delta
= context
.pca_delta
;
755 assert(context
.delta_bag_delta
.size() == context
.delta_bag_count
.size());
756 // counts remaining as mod positions are chosen
757 std::vector
<int> db_remaining
= context
.delta_bag_count
;
759 // the rightmost 'end' seen when all spectra masses were too high
760 // (0 means none yet encountered.)
761 unsigned int next_end
= 0;
763 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
764 double p_mass
= context
.parent_fixed_mass
;
765 int p_begin
=0, p_end
=0;
767 const std::vector
<int> *cleavage_points
= &sequence_run
.cleavage_points
;
768 std::vector
<int>::size_type cleavage_points_size
= cleavage_points
->size();
769 // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
770 static std::vector
<int> nonspecific_cleavage_points
;
771 if (context
.nonspecific_cleavage
) {
772 cleavage_points_size
= run_sequence
.size() + 1;
773 for (unsigned int i
=nonspecific_cleavage_points
.size();
774 i
<cleavage_points_size
; i
++)
775 nonspecific_cleavage_points
.push_back(i
);
776 cleavage_points
= &nonspecific_cleavage_points
;
778 assert(cleavage_points_size
>= 2); // endpoints must be present
780 for (unsigned int begin
=0; begin
<cleavage_points_size
-1; begin
++) {
781 const int begin_index
= (*cleavage_points
)[begin
];
782 peptide_begin
= begin_index
+ sequence_run
.sequence_offset
;
784 // If pca_residues specified, require one of them to be the peptide
785 // N-terminal residue
786 switch (context
.pca_residues
.size()) {
790 if (context
.pca_residues
[1] == run_sequence
[begin_index
])
793 if (context
.pca_residues
[0] == run_sequence
[begin_index
])
800 assert(begin_index
>= p_begin
and begin_index
>= 0);
801 for (int i
=p_begin
; i
<begin_index
; i
++)
802 p_mass
-= fixed_parent_mass
[run_sequence
[i
]];
803 p_begin
= begin_index
;
805 unsigned int end
= std::max
<unsigned int>(begin
+ 1, next_end
);
806 for (; end
<cleavage_points_size
; end
++) {
807 if (not context
.nonspecific_cleavage
) {
808 const int missed_cleavage_count
= end
- begin
- 1;
809 if (missed_cleavage_count
> context
.maximum_missed_cleavage_sites
)
813 const int end_index
= (*cleavage_points
)[end
];
814 assert(end_index
> begin_index
);
815 const unsigned int peptide_size
= end_index
- begin_index
;
816 if (peptide_size
< min_peptide_length
)
819 assert(end_index
>= 0);
820 if (end_index
>= p_end
)
821 for (int i
=p_end
; i
<end_index
; i
++)
822 p_mass
+= fixed_parent_mass
[run_sequence
[i
]];
824 for (int i
=end_index
; i
<p_end
; i
++)
825 p_mass
-= fixed_parent_mass
[run_sequence
[i
]];
828 // std::cerr << "peptide: "
829 // << std::string(run_sequence, begin_index, peptide_size)
830 // << " p_mass: " << p_mass << std::endl;
832 int status
= search_peptide(context
, p_mass
, m
, run_sequence
,
833 begin_index
, peptide_size
, stats
,
835 if (status
== -1) // peptide was too long
837 if (status
== 1) { // peptide was too short
838 // so next start with a peptide at least this long
839 if (end
== cleavage_points_size
- 1)
848 // Search all sequence runs for matches according to the context against the
849 // spectra. Updates score_stats and the number of candidate spectra found.
851 spectrum::search_runs(const search_context
&context
, score_stats
&stats
) {
852 const int no_runs
= context
.sequence_runs
.size();
853 for (int i
=0; i
<no_runs
; i
++) {
854 search_run(context
, context
.sequence_runs
[i
], stats
);
856 if (CP
.show_progress
)
857 std::cerr
<< i
+1 << " of " << no_runs
<< " sequences, "
858 << stats
.candidate_spectrum_count
<< " candidates\r"
862 if (CP
.show_progress
)
863 std::cerr
<< std::endl
;