1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
36 // Use faster, non-threadsafe versions, since we don't use threads. This is
38 #define fgetsX fgets_unlocked
39 #define fputsX fputs_unlocked
40 #define fprintfX __builtin_fprintf_unlocked
41 #define ferrorX ferror_unlocked
42 #define getcX getc_unlocked
44 #define fgetsX std::fgets
45 #define fputsX std::fputs
46 #define fprintfX std::fprintf
47 #define ferrorX std::ferror
48 #define getcX std::getc
52 parameters
parameters::the
;
53 const parameters
&CP
= parameters::the
;
57 peak::__repr__() const {
58 static char temp
[128];
59 sprintf(temp
, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
60 mz
, intensity
, intensity_class
);
64 const int FIRST_SPECTRUM_ID
= 1;
65 int spectrum::next_id
= FIRST_SPECTRUM_ID
;
66 int spectrum::next_physical_id
= FIRST_SPECTRUM_ID
;
68 std::vector
<spectrum
> spectrum::searchable_spectra
;
69 // parent mass -> searchable_spectra index
71 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
74 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
79 spectrum::__repr__() const {
80 static char temp
[1024];
82 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
83 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size());
88 // FIX: does this actually help inlining?
89 // Return ln of n_C_k.
91 ln_combination_(unsigned int n
, unsigned int k
) {
92 // Occasionally happens due to problems in scoring function. (FIX)
96 assert(0 <= k
and k
<= n
and n
< CP
.ln_factorial
.size());
97 return CP
.ln_factorial
[n
] - CP
.ln_factorial
[k
] - CP
.ln_factorial
[n
-k
];
100 ln_combination(unsigned int n
, unsigned int k
) {
101 return ln_combination(n
, k
);
106 // This is an error exit for read_spectra*.
108 io_error(FILE *f
, const char *message
="") {
110 message
= "I/O error while reading (or writing) spectrum file";
111 throw std::ios_base::failure(message
);
115 // true iff s consists entirely of whitespace
117 at_eol(const char *s
) {
118 return s
[strspn(s
, " \t\r\n\f\v")] == 0;
122 // Check whether we've past the end offset, throwing exception on I/O error.
124 check_past_end(FILE *f
, const long offset_end
) {
125 if (offset_end
== -1)
127 long pos
= std::ftell(f
);
130 return pos
>= offset_end
;
134 // Read spectra from file in ms2 format, tagging them with file_id. Before
135 // reading, seek to absolute position offset_begin. If offset_end != -1, any
136 // spectra that begin after position offset_end in the file are not read.
138 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
139 // having the same physical id. Note that depending on
140 // offset_begin/offset_end, we may end up with the first charge and not the
141 // second, or vice versa.
143 // The peak list is initially sorted by mz. Throws an exception on invalid
144 // input. Error checking is the most stringent in this function. Other
145 // readers can be a little looser since all ms2 files eventually get read here
148 // This is pretty hideous. Is there a simpler way to read, reasonably
149 // efficiently, catching any input error?
151 std::vector
<spectrum
>
152 spectrum::read_spectra_from_ms2(FILE *f
, const int file_id
,
153 const long offset_begin
,
154 const long offset_end
) {
155 std::vector
<spectrum
> spectra
;
157 const int bufsiz
= 1024;
161 // first seek to offset_begin and synchronize at next "\n:"
162 if (offset_begin
> 0) {
163 if (std::fseek(f
, offset_begin
-1, SEEK_SET
) == -1)
166 while (c
!= '\n' and c
!= EOF
)
173 } while (c
!= ':' and c
!= EOF
);
180 // at this point we expect to read the first ':' of a header, or EOF
182 bool past_end
= check_past_end(f
, offset_end
);
183 char *result
= fgetsX(buf
, bufsiz
, f
);
185 std::vector
<std::string
> names
;
186 std::vector
<double> masses
;
187 std::vector
<int> charges
;
193 if (not result
or buf
[0] != ':')
196 names
.push_back(std::string(buf
+1, buf
+std::strlen(buf
)-1));
197 if (not fgetsX(buf
, bufsiz
, f
))
198 io_error(f
, "bad ms2 format: mass/charge line expected");
200 double mass
= std::strtod(buf
, &endp
); // need double accuracy here
202 masses
.push_back(mass
);
203 if (errno
or endp
== buf
or mass
<= 0)
204 io_error(f
, "bad ms2 format: bad mass");
205 const char *endp0
= endp
;
206 int charge
= std::strtol(endp0
, &endp
, 10);
208 charges
.push_back(charge
);
209 if (errno
or endp
== endp0
or charge
<= 0)
210 io_error(f
, "bad ms2 format: bad charge");
211 if (not at_eol(endp
))
212 io_error(f
, "bad ms2 format: junk at end of mass/charge line");
213 past_end
= past_end
or check_past_end(f
, offset_end
);
214 result
= fgetsX(buf
, bufsiz
, f
);
216 if (names
.empty() and past_end
)
219 if (not names
.empty())
220 io_error(f
, "bad ms2 format: spectrum has no peaks (file truncated?)");
224 std::vector
<peak
> peaks
;
228 p
.mz
= strtod(buf
, &endp
);
229 if (errno
or endp
== buf
or p
.mz
<= 0)
230 io_error(f
, "bad ms2 format: bad peak mz");
231 const char *endp0
= endp
;
232 p
.intensity
= strtod(endp0
, &endp
);
233 if (errno
or endp
== endp0
or p
.intensity
<= 0)
234 io_error(f
, "bad ms2 format: bad peak intensity");
235 if (not at_eol(endp
))
236 io_error(f
, "bad ms2 format: junk at end of peak line");
239 past_end
= past_end
or check_past_end(f
, offset_end
);
240 result
= fgetsX(buf
, bufsiz
, f
);
243 if (not result
or buf
[0] == ':')
247 // add spectra to vector
248 if (names
.empty() and not peaks
.empty())
249 io_error(f
, "bad ms2 format: missing header lines?");
251 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
253 for (std::vector
<double>::size_type i
=0; i
<names
.size(); i
++) {
254 spectrum
sp(masses
[i
], charges
[i
]);
257 sp
.file_id
= file_id
;
258 sp
.physical_id
= next_physical_id
;
259 spectra
.push_back(sp
);
262 spectrum::next_physical_id
+= 1;
268 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
269 // to remove those too large to be fragment products. (Peaks are assumed to
270 // be ordered by increasing mz.)
272 spectrum::filter_peaks(double TIC_cutoff_proportion
,
273 double parent_mass_tolerance_max
) {
274 if (not (0 <= TIC_cutoff_proportion
and TIC_cutoff_proportion
<= 1)
275 or parent_mass_tolerance_max
< 0)
276 throw std::invalid_argument("invalid argument value");
278 // remove peaks too large to be fragment products
279 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
280 const double peak_mz_limit
= mass
+ parent_mass_tolerance_max
;
282 std::vector
<peak
>::iterator pi
;
283 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= peak_mz_limit
; pi
++);
284 peaks
.erase(pi
, peaks
.end());
286 // FIX: note mzLowerBound..mzUpperBound here
289 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
291 total_ion_current
= 0;
292 for (pi
=peaks
.begin(); pi
!= peaks
.end(); pi
++)
293 total_ion_current
+= pi
->intensity
;
295 const double i_limit
= total_ion_current
* TIC_cutoff_proportion
;
296 double accumulated_intensity
= 0;
297 for (pi
=peaks
.begin();
298 pi
!= peaks
.end() and accumulated_intensity
<= i_limit
; pi
++)
299 accumulated_intensity
+= pi
->intensity
;
301 peaks
.erase(pi
, peaks
.end());
304 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
308 // Classify peaks and update class_counts and peak bin counts.
309 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
311 spectrum::classify(int intensity_class_count
, double intensity_class_ratio
,
312 double fragment_mass_tolerance
) {
313 if (intensity_class_count
< 1 or intensity_class_ratio
<= 1.0)
314 throw std::invalid_argument("invalid argument value");
316 intensity_class_counts
.clear();
317 intensity_class_counts
.resize(intensity_class_count
);
319 // e.g., 7 = 1 + 2 + 4
320 // FIX: does this fail for non-integer ratios?
321 int min_count
= int((pow(intensity_class_ratio
, intensity_class_count
) - 1)
322 / (intensity_class_ratio
- 1));
324 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
325 std::vector
<peak
>::iterator pi
=peaks
.begin();
326 for (int i_class
=0; i_class
<intensity_class_count
; i_class
++) {
327 int peaks_this_class
= int((pow(intensity_class_ratio
, i_class
) * peaks
.size()
329 for (int cp
=0; cp
< peaks_this_class
and pi
!= peaks
.end(); cp
++, pi
++) {
330 pi
->intensity_class
= i_class
;
331 intensity_class_counts
[i_class
]++;
334 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
336 if (not peaks
.empty()) {
337 min_peak_mz
= peaks
.front().mz
- CP
.fragment_mass_tolerance
;
338 max_peak_mz
= peaks
.back().mz
+ CP
.fragment_mass_tolerance
;
339 total_peak_bins
= (int((max_peak_mz
- min_peak_mz
)
340 / (2 * fragment_mass_tolerance
) + 0.5));
341 // This is the MyriMatch estimate. Wouldn't it be better to simply count
342 // how many are empty?
343 empty_peak_bins
= std::max
<int>(total_peak_bins
- peaks
.size(), 0);
348 // Update the *_cache fields from the peaks field.
350 spectrum::update_peak_cache() {
353 const unsigned int peak_count
= peaks
.size();
354 peak_mz_cache
= new double[peak_count
+1];
355 peak_intensity_class_cache
= new int[peak_count
+1];
357 for (unsigned int i
=0; i
<peak_count
; i
++) {
358 peak_mz_cache
[i
] = peaks
[i
].mz
;
359 peak_intensity_class_cache
[i
] = peaks
[i
].intensity_class
;
362 // negative value is terminator
363 peak_mz_cache
[peak_count
] = -1;
364 peak_intensity_class_cache
[peak_count
] = -1;
368 // Store the list of spectra that search_peptide will search against, and also
369 // build spectrum_mass_index.
371 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
372 searchable_spectra
= spectra
;
373 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++) {
374 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
375 searchable_spectra
[i
].update_peak_cache();
380 // Calculate peaks for a synthesized mass (not mz) ladder.
382 get_synthetic_Y_mass_ladder(double *mass_ladder
, const unsigned int ladder_size
,
383 const double *mass_list
,
384 const double fragment_C_fixed_mass
) {
385 double m
= fragment_C_fixed_mass
;
387 for (unsigned int i
=ladder_size
; i
>0; i
--) {
389 mass_ladder
[ladder_size
-i
] = m
;
394 // Calculate peaks for a synthesized mass (not mz) ladder.
396 get_synthetic_B_mass_ladder(double *mass_ladder
, const unsigned int ladder_size
,
397 const double *mass_list
,
398 const double fragment_N_fixed_mass
) {
399 double m
= fragment_N_fixed_mass
;
401 for (unsigned int i
=0; i
<ladder_size
; i
++) {
408 // Generate synthetic spectra for a set of charges. Only the charge and peaks
409 // of these spectra are initialized.
411 synthetic_spectra(double synth_sp_mz
[/* max_fragment_charge+1 */]
412 [spectrum::MAX_THEORETICAL_FRAGMENTS
],
413 const double *mass_list
, const unsigned int mass_list_size
,
414 const double fragment_N_fixed_mass
,
415 const double fragment_C_fixed_mass
,
416 const int max_fragment_charge
) {
417 assert(mass_list_size
>= 1);
418 const unsigned int ladder_size
= mass_list_size
- 1;
419 double Y_mass_ladder
[ladder_size
];
420 double B_mass_ladder
[ladder_size
];
422 for (int charge
=1; charge
<=max_fragment_charge
; charge
++) {
423 get_synthetic_Y_mass_ladder(Y_mass_ladder
, ladder_size
, mass_list
,
424 fragment_C_fixed_mass
);
425 get_synthetic_B_mass_ladder(B_mass_ladder
, ladder_size
, mass_list
,
426 fragment_N_fixed_mass
);
428 unsigned int y_i
=0, b_i
=0;
429 double *sp_mz_p
= synth_sp_mz
[charge
];
431 // merge the two (already sorted) mass lists, converting them to mz values
433 // FIX: try algorithm::merge?
434 while (y_i
< ladder_size
and b_i
< ladder_size
)
435 if (Y_mass_ladder
[y_i
] < B_mass_ladder
[b_i
])
436 *(sp_mz_p
++) = peak::get_mz(Y_mass_ladder
[y_i
++], charge
);
438 *(sp_mz_p
++) = peak::get_mz(B_mass_ladder
[b_i
++], charge
);
439 while (y_i
< ladder_size
)
440 *(sp_mz_p
++) = peak::get_mz(Y_mass_ladder
[y_i
++], charge
);
441 while (b_i
< ladder_size
)
442 *(sp_mz_p
++) = peak::get_mz(B_mass_ladder
[b_i
++], charge
);
443 *sp_mz_p
= -1; // invalid mz as terminator
448 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
449 // theoretical spectrum (y) generated from a peptide candidate.
451 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
452 // better yet, sets) for peak lists simpler? As fast or faster?
454 // This is the innermost loop, so it's worthwhile to optimize this some.
455 // FIX const declaration
457 score_spectrum(const spectrum
&x
,
458 const double synth_mzs
[spectrum::MAX_THEORETICAL_FRAGMENTS
]) {
460 unsigned int y_peak_count
=0;
461 for (const double *p
=synth_mzs
; *p
>= 0; p
++, y_peak_count
++);
463 double peak_best_delta
[y_peak_count
];
464 int peak_best_class
[y_peak_count
];
465 for (unsigned int i
=0; i
<y_peak_count
; i
++) {
466 peak_best_delta
[i
] = CP
.fragment_mass_tolerance
;
467 peak_best_class
[i
] = -1;
470 // Iterate over all closest pairs of peaks, collecting info on the class of
471 // the best (meaning closest mz) real peak matching each theoretical peak
472 // (for real/theoretical peak pairs that are no farther apart than
473 // fragment_mass_tolerance).
475 const unsigned int x_peak_count
=x
.peaks
.size();
476 assert(x_peak_count
> 0 and y_peak_count
> 0);
478 unsigned int x_index
=0, y_index
=0;
480 const double delta
= synth_mzs
[y_index
] - x
.peak_mz_cache
[x_index
];
481 if (std::abs(delta
) <= peak_best_delta
[y_index
]) {
482 peak_best_class
[y_index
] = x
.peak_intensity_class_cache
[x_index
];
483 peak_best_delta
[y_index
] = std::abs(delta
);
486 if (++x_index
>= x_peak_count
)
489 if (++y_index
>= y_peak_count
)
494 assert(CP
.intensity_class_count
< INT_MAX
);
495 const unsigned int intensity_class_count
= CP
.intensity_class_count
;
496 unsigned int peak_hit_histogram
[intensity_class_count
];
497 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
498 peak_hit_histogram
[i
] = 0;
499 for (unsigned int i
=0; i
<y_peak_count
; i
++) {
500 const int bc
= peak_best_class
[i
];
502 assert(bc
< static_cast<int>(intensity_class_count
));
503 peak_hit_histogram
[bc
] += 1;
507 // How many theoretical peaks overlap the real peak range?
508 int valid_theoretical_peaks
= 0;
509 for (unsigned int i
=0; i
<y_peak_count
; i
++)
510 if (x
.min_peak_mz
<= synth_mzs
[i
] and synth_mzs
[i
] <= x
.max_peak_mz
)
511 valid_theoretical_peaks
++;
513 unsigned int total_peak_hits
= 0;
514 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
515 total_peak_hits
+= peak_hit_histogram
[i
];
517 // How many valid theoretical peaks were misses?
518 const int peak_misses
= valid_theoretical_peaks
- total_peak_hits
;
519 assert(peak_misses
>= 0);
521 double score
= ln_combination_(x
.empty_peak_bins
, peak_misses
);
522 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
523 score
+= ln_combination_(x
.intensity_class_counts
[i
],
524 peak_hit_histogram
[i
]);
525 score
-= ln_combination_(x
.total_peak_bins
, valid_theoretical_peaks
);
531 // Return the score of the specified spectrum against a group of synthetic
534 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
535 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
536 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
538 // FIX: implement MM smart +3 model, and/or come up with something better.
541 score_spectrum_over_charges(const double
542 synth_sp_mz
[/* max_fragment_charge+1 */]
543 [spectrum::MAX_THEORETICAL_FRAGMENTS
],
545 const int max_fragment_charge
) {
546 double best_score
= score_spectrum(sp
, synth_sp_mz
[1]);
548 const int charge_limit
= std::max
<int>(1, sp
.charge
-1);
549 for (int charge
=2; charge
<=charge_limit
; charge
++) {
550 assert(charge
<= max_fragment_charge
);
551 best_score
+= score_spectrum(sp
, synth_sp_mz
[charge
]);
557 struct mass_trace_list
{
558 mass_trace_item item
;
559 const mass_trace_list
*next
;
561 mass_trace_list(const mass_trace_list
*p
=0) : next(p
) { }
565 // fuzzy comparison might not be necessary, but it's safer
567 score_equal(double s1
, double s2
) { return std::abs(s1
-s2
) < 1e-6; }
570 // Search for matches of this particular peptide modification variation
571 // against the spectra. Updates score_stats and returns the number of
572 // candidate spectra found.
574 evaluate_peptide(const search_context
&context
, match
&m
,
575 const mass_trace_list
*mtlp
, const double *mass_list
,
576 const std::vector
<std::vector
<spectrum
>::size_type
>
578 score_stats
&stats
) {
579 int max_candidate_charge
= 0;
580 typedef std::vector
<std::vector
<spectrum
>::size_type
>::const_iterator c_it
;
581 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++)
583 = std::max
<int>(max_candidate_charge
,
584 spectrum::searchable_spectra
[*it
].charge
);
585 assert(max_candidate_charge
>= 1);
587 const int max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
588 assert(max_fragment_charge
<= spectrum::MAX_SUPPORTED_CHARGE
);
590 const unsigned int peptide_size
= m
.peptide_sequence
.size();
591 // FIX: "2" assumes just two ion series (e.g., B and Y)
592 assert(peptide_size
<= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS
-1));
594 // This array is preallocated to avoid the overhead of allocation and
595 // deallocation within the inner loop.
597 // charge -> mz array
598 static double synth_sp_mz
[spectrum::MAX_SUPPORTED_CHARGE
+1]
599 [spectrum::MAX_THEORETICAL_FRAGMENTS
];
601 synthetic_spectra(synth_sp_mz
, mass_list
, peptide_size
,
602 context
.fragment_N_fixed_mass
,
603 context
.fragment_C_fixed_mass
, max_fragment_charge
);
605 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++) {
606 m
.spectrum_index
= *it
;
607 spectrum
&sp
= spectrum::searchable_spectra
[m
.spectrum_index
];
610 stats
.candidate_spectrum_count
+= 1;
611 if (CP
.estimate_only
)
614 //std::cerr << "sp " << m.spectrum_index << std::endl;
615 m
.score
= score_spectrum_over_charges(synth_sp_mz
, sp
, max_fragment_charge
);
616 //std::cerr << "score " << m.score << std::endl;
618 std::vector
<match
> &sp_best_matches
= stats
.best_matches
[m
.spectrum_index
];
619 // Is this score good enough to be in the best score list? (Better scores
620 // are more negative.)
621 if (m
.score
> sp_best_matches
.back().score
)
624 // We're saving this match, so remember the mass trace info, too.
625 m
.mass_trace
.clear();
626 for (const mass_trace_list
*p
=mtlp
; p
; p
=p
->next
)
627 m
.mass_trace
.push_back(p
->item
);
629 // If this is duplicate, just append to the existing match and return
630 for (std::vector
<match
>::reverse_iterator rit
631 = sp_best_matches
.rbegin(); rit
!= sp_best_matches
.rend(); rit
++)
632 if (score_equal(rit
->score
, m
.score
)
633 and rit
->peptide_sequence
== m
.peptide_sequence
634 and rit
->mass_regime_index
== m
.mass_regime_index
635 and rit
->conjunct_index
== m
.conjunct_index
636 and rit
->mass_trace
== m
.mass_trace
) {
637 rit
->peptide_begin
.push_back(m
.peptide_begin
[0]);
638 rit
->sequence_name
.push_back(m
.sequence_name
[0]);
642 // Otherwise, insert this match in the correct position
643 assert(sp_best_matches
.size() >= 1);
644 std::vector
<match
>::reverse_iterator rit
= sp_best_matches
.rbegin();
645 for (;rit
+1 != sp_best_matches
.rend() and (rit
+1)->score
> m
.score
; rit
++)
652 // IDEA FIX: Use a cost parameter to define the iterative search front.
655 // Choose one possible residue modification position. Once they're all
656 // chosen, then evaluate.
658 choose_residue_mod(const search_context
&context
, match
&m
,
659 const mass_trace_list
*mtlp
, double *mass_list
,
660 const std::vector
<std::vector
<spectrum
>::size_type
>
663 std::vector
<int> &db_remaining
,
664 const unsigned int remaining_positions_to_choose
,
665 const unsigned int next_position_to_consider
) {
666 assert(remaining_positions_to_choose
667 <= m
.peptide_sequence
.size() - next_position_to_consider
);
669 if (remaining_positions_to_choose
== 0) {
670 stats
.combinations_searched
+= 1;
671 evaluate_peptide(context
, m
, mtlp
, mass_list
, candidate_spectra
, stats
);
673 mass_trace_list
mtl(mtlp
);
675 // consider all of the positions where we could next add a mod
676 for (unsigned int i
=next_position_to_consider
;
677 i
<= m
.peptide_sequence
.size()-remaining_positions_to_choose
; i
++) {
678 mtl
.item
.position
= i
;
679 const double save_pos_mass
=mass_list
[i
];
680 const char pos_res
= m
.peptide_sequence
[i
];
682 // consider the possibilities for this position
683 for (std::vector
<int>::const_iterator
684 it
=context
.delta_bag_lookup
[pos_res
].begin();
685 it
!= context
.delta_bag_lookup
[pos_res
].end(); it
++) {
686 const int db_index
= *it
;
687 if (db_remaining
[db_index
] < 1)
689 db_remaining
[db_index
] -= 1;
690 mass_list
[i
] = save_pos_mass
+ context
.delta_bag_delta
[db_index
];
691 mtl
.item
.delta
= context
.delta_bag_delta
[db_index
];
692 choose_residue_mod(context
, m
, &mtl
, mass_list
, candidate_spectra
,
694 remaining_positions_to_choose
-1, i
+1);
695 db_remaining
[db_index
] += 1;
697 mass_list
[i
] = save_pos_mass
;
704 search_peptide(const search_context
&context
, const double &p_mass
, match
&m
,
705 const std::string
&run_sequence
, const int &begin_index
,
706 const unsigned int &peptide_size
, score_stats
&stats
,
707 std::vector
<int> &db_remaining
) {
708 // We'd like to consider only the spectra whose masses are within the
709 // tolerance range of theoretical mass of the peptide (p_mass).
710 // Unfortunately, the tolerance range of each spectrum depends on its
711 // charge, since the tolerance is specified in m/z units. Furthermore,
712 // when deciding how to adjust begin/end to generate the next peptide,
713 // we need to be conservative, as a high-charge spectrum may come into
714 // range, even though only +1 spectra are currently in-range. So, we
715 // first screen using the maximum tolerance range, then check the actual
718 const double sp_mass_lb
= p_mass
- CP
.parent_mass_tolerance_max
;
719 const double sp_mass_ub
= p_mass
+ CP
.parent_mass_tolerance_max
;
721 const spmi_c_it candidate_spectra_info_begin
722 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
723 if (candidate_spectra_info_begin
== spectrum::spectrum_mass_index
.end()) {
724 // spectrum masses all too low to match peptide (peptide too long)
728 const spmi_c_it candidate_spectra_info_end
729 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
730 if (candidate_spectra_info_end
== spectrum::spectrum_mass_index
.begin()) {
731 // spectrum masses all too high to match peptide (peptide too short)
734 if (candidate_spectra_info_begin
== candidate_spectra_info_end
) {
735 // no spectrum with close-enough parent mass
739 // Now generate the list of candidate spectra that are actually
740 // in-range, considering the charge of each spectrum.
741 std::vector
<std::vector
<spectrum
>::size_type
> candidate_spectra_0
;
742 for (spmi_c_it it
=candidate_spectra_info_begin
;
743 it
!= candidate_spectra_info_end
; it
++) {
744 const spectrum
&csp
= spectrum::searchable_spectra
[it
->second
];
745 if (std::abs(csp
.mass
- p_mass
) <= (csp
.charge
746 * CP
.parent_mass_tolerance_1
))
747 candidate_spectra_0
.push_back(it
->second
);
749 if (candidate_spectra_0
.empty())
752 m
.predicted_parent_mass
= p_mass
;
753 m
.peptide_sequence
.assign(run_sequence
, begin_index
, peptide_size
);
755 double mass_list
[peptide_size
];
756 for (unsigned int i
=0; i
<peptide_size
; i
++)
757 mass_list
[i
] = (CP
.fragment_mass_regime
[context
.mass_regime_index
]
758 .fixed_residue_mass
[m
.peptide_sequence
[i
]]);
760 choose_residue_mod(context
, m
, NULL
, mass_list
, candidate_spectra_0
,
761 stats
, db_remaining
, context
.mod_count
, 0);
766 // Search a sequence run for matches according to the context against the
767 // spectra. Updates score_stats and the number of candidate spectra found.
769 // FIX: examine carefully for signed/unsigned problems
772 search_run(const search_context
&context
, const sequence_run
&sequence_run
,
773 score_stats
&stats
) {
774 const unsigned int min_peptide_length
775 = std::max
<unsigned int>(CP
.minimum_peptide_length
, context
.mod_count
);
776 const std::vector
<double> &fixed_parent_mass
777 = CP
.parent_mass_regime
[context
.mass_regime_index
].fixed_residue_mass
;
779 const std::string
&run_sequence
= sequence_run
.sequence
;
781 // This match will be passed inward and used to record information that we
782 // need to remember about a match when we finally see one. At that point, a
783 // copy of this match will be saved.
784 // FIX: can this be done better?
786 m
.sequence_name
.push_back(sequence_run
.name
);
787 m
.peptide_begin
.push_back(0);
788 int &peptide_begin
= m
.peptide_begin
[0];
789 m
.mass_regime_index
= context
.mass_regime_index
;
790 m
.conjunct_index
= context
.conjunct_index
;
791 m
.pca_delta
= context
.pca_delta
;
793 assert(context
.delta_bag_delta
.size() == context
.delta_bag_count
.size());
794 // counts remaining as mod positions are chosen
795 std::vector
<int> db_remaining
= context
.delta_bag_count
;
797 // the rightmost 'end' seen when all spectra masses were too high
798 // (0 means none yet encountered.)
799 unsigned int next_end
= 0;
801 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
802 double p_mass
= context
.parent_fixed_mass
;
803 int p_begin
=0, p_end
=0;
805 const std::vector
<int> *cleavage_points
= &sequence_run
.cleavage_points
;
806 std::vector
<int>::size_type cleavage_points_size
= cleavage_points
->size();
807 // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
808 static std::vector
<int> nonspecific_cleavage_points
;
809 if (context
.nonspecific_cleavage
) {
810 cleavage_points_size
= run_sequence
.size() + 1;
811 for (unsigned int i
=nonspecific_cleavage_points
.size();
812 i
<cleavage_points_size
; i
++)
813 nonspecific_cleavage_points
.push_back(i
);
814 cleavage_points
= &nonspecific_cleavage_points
;
816 assert(cleavage_points_size
>= 2); // endpoints must be present
818 for (unsigned int begin
=0; begin
<cleavage_points_size
-1; begin
++) {
819 const int begin_index
= (*cleavage_points
)[begin
];
820 peptide_begin
= begin_index
+ sequence_run
.sequence_offset
;
822 // If pca_residues specified, require one of them to be the peptide
823 // N-terminal residue
824 switch (context
.pca_residues
.size()) {
828 if (context
.pca_residues
[1] == run_sequence
[begin_index
])
831 if (context
.pca_residues
[0] == run_sequence
[begin_index
])
838 assert(begin_index
>= p_begin
and begin_index
>= 0);
839 for (int i
=p_begin
; i
<begin_index
; i
++)
840 p_mass
-= fixed_parent_mass
[run_sequence
[i
]];
841 p_begin
= begin_index
;
843 unsigned int end
= std::max
<unsigned int>(begin
+ 1, next_end
);
844 for (; end
<cleavage_points_size
; end
++) {
845 if (not context
.nonspecific_cleavage
) {
846 const int missed_cleavage_count
= end
- begin
- 1;
847 if (missed_cleavage_count
> context
.maximum_missed_cleavage_sites
)
851 const int end_index
= (*cleavage_points
)[end
];
852 assert(end_index
> begin_index
);
853 const unsigned int peptide_size
= end_index
- begin_index
;
854 if (peptide_size
< min_peptide_length
)
857 assert(end_index
>= 0);
858 if (end_index
>= p_end
)
859 for (int i
=p_end
; i
<end_index
; i
++)
860 p_mass
+= fixed_parent_mass
[run_sequence
[i
]];
862 for (int i
=end_index
; i
<p_end
; i
++)
863 p_mass
-= fixed_parent_mass
[run_sequence
[i
]];
867 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
869 int status
= search_peptide(context
, p_mass
, m
, run_sequence
,
870 begin_index
, peptide_size
, stats
,
872 if (status
== -1) // peptide was too long
874 if (status
== 1) { // peptide was too short
875 // so next start with a peptide at least this long
876 if (end
== cleavage_points_size
- 1)
885 // Search all sequence runs for matches according to the context against the
886 // spectra. Updates score_stats and the number of candidate spectra found.
888 spectrum::search_runs(const search_context
&context
, score_stats
&stats
) {
889 const int no_runs
= context
.sequence_runs
.size();
890 for (int i
=0; i
<no_runs
; i
++) {
891 search_run(context
, context
.sequence_runs
[i
], stats
);
893 if (CP
.show_progress
)
894 std::cerr
<< i
+1 << " of " << no_runs
<< " sequences, "
895 << stats
.candidate_spectrum_count
<< " candidates\r"
899 if (CP
.show_progress
)
900 std::cerr
<< std::endl
;