1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
36 #define NOTHROW __attribute__ ((nothrow))
37 // Use faster, non-threadsafe versions, since we don't use threads. This is
39 #define fgetsX fgets_unlocked
40 #define fputsX fputs_unlocked
41 #define fprintfX __builtin_fprintf_unlocked
42 #define ferrorX ferror_unlocked
43 #define getcX getc_unlocked
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #define getcX std::getc
54 parameters
parameters::the
;
55 const parameters
&CP
= parameters::the
;
59 peak::__repr__() const {
60 static char temp
[128];
61 sprintf(temp
, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
62 mz
, intensity
, intensity_class
);
66 const int FIRST_SPECTRUM_ID
= 1;
67 int spectrum::next_id
= FIRST_SPECTRUM_ID
;
68 int spectrum::next_physical_id
= FIRST_SPECTRUM_ID
;
70 std::vector
<spectrum
> spectrum::searchable_spectra
;
71 // parent mass -> searchable_spectra index
73 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
76 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
81 spectrum::__repr__() const {
82 static char temp
[1024];
84 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
85 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size());
90 // FIX: does this actually help inlining?
91 // Return ln of n_C_k.
93 ln_combination_(unsigned int n
, unsigned int k
) NOTHROW
{
94 // Occasionally happens due to problems in scoring function. (FIX)
98 assert(0 <= k
and k
<= n
and n
< CP
.ln_factorial
.size());
99 return CP
.ln_factorial
[n
] - CP
.ln_factorial
[k
] - CP
.ln_factorial
[n
-k
];
102 ln_combination(unsigned int n
, unsigned int k
) {
103 return ln_combination(n
, k
);
108 // This is an error exit for read_spectra*.
110 io_error(FILE *f
, const char *message
="") {
112 message
= "I/O error while reading (or writing) spectrum file";
113 throw std::ios_base::failure(message
);
117 // true iff s consists entirely of whitespace
119 at_eol(const char *s
) {
120 return s
[strspn(s
, " \t\r\n\f\v")] == 0;
124 // Check whether we've past the end offset, throwing exception on I/O error.
126 check_past_end(FILE *f
, const long offset_end
) {
127 if (offset_end
== -1)
129 long pos
= std::ftell(f
);
132 return pos
>= offset_end
;
136 // Read spectra from file in ms2 format, tagging them with file_id. Before
137 // reading, seek to absolute position offset_begin. If offset_end != -1, any
138 // spectra that begin after position offset_end in the file are not read.
140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
141 // having the same physical id. Note that depending on
142 // offset_begin/offset_end, we may end up with the first charge and not the
143 // second, or vice versa.
145 // The peak list is initially sorted by mz. Throws an exception on invalid
146 // input. Error checking is the most stringent in this function. Other
147 // readers can be a little looser since all ms2 files eventually get read here
150 // This is pretty hideous. Is there a simpler way to read, reasonably
151 // efficiently, catching any input error?
153 std::vector
<spectrum
>
154 spectrum::read_spectra_from_ms2(FILE *f
, const int file_id
,
155 const long offset_begin
,
156 const long offset_end
) {
157 std::vector
<spectrum
> spectra
;
159 const int bufsiz
= 1024;
163 // first seek to offset_begin and synchronize at next "\n:"
164 if (offset_begin
> 0) {
165 if (std::fseek(f
, offset_begin
-1, SEEK_SET
) == -1)
168 while (c
!= '\n' and c
!= EOF
)
175 } while (c
!= ':' and c
!= EOF
);
182 // at this point we expect to read the first ':' of a header, or EOF
184 bool past_end
= check_past_end(f
, offset_end
);
185 char *result
= fgetsX(buf
, bufsiz
, f
);
187 std::vector
<std::string
> names
;
188 std::vector
<double> masses
;
189 std::vector
<int> charges
;
195 if (not result
or buf
[0] != ':')
198 names
.push_back(std::string(buf
+1, buf
+std::strlen(buf
)-1));
199 if (not fgetsX(buf
, bufsiz
, f
))
200 io_error(f
, "bad ms2 format: mass/charge line expected");
202 double mass
= std::strtod(buf
, &endp
); // need double accuracy here
204 masses
.push_back(mass
);
205 if (errno
or endp
== buf
or mass
<= 0)
206 io_error(f
, "bad ms2 format: bad mass");
207 const char *endp0
= endp
;
208 int charge
= std::strtol(endp0
, &endp
, 10);
210 charges
.push_back(charge
);
211 if (errno
or endp
== endp0
or charge
<= 0)
212 io_error(f
, "bad ms2 format: bad charge");
213 if (not at_eol(endp
))
214 io_error(f
, "bad ms2 format: junk at end of mass/charge line");
215 past_end
= past_end
or check_past_end(f
, offset_end
);
216 result
= fgetsX(buf
, bufsiz
, f
);
218 if (names
.empty() and past_end
)
221 if (not names
.empty())
222 io_error(f
, "bad ms2 format: spectrum has no peaks (file truncated?)");
226 std::vector
<peak
> peaks
;
230 p
.mz
= strtod(buf
, &endp
);
231 if (errno
or endp
== buf
or p
.mz
<= 0)
232 io_error(f
, "bad ms2 format: bad peak mz");
233 const char *endp0
= endp
;
234 p
.intensity
= strtod(endp0
, &endp
);
235 if (errno
or endp
== endp0
or p
.intensity
<= 0)
236 io_error(f
, "bad ms2 format: bad peak intensity");
237 if (not at_eol(endp
))
238 io_error(f
, "bad ms2 format: junk at end of peak line");
241 past_end
= past_end
or check_past_end(f
, offset_end
);
242 result
= fgetsX(buf
, bufsiz
, f
);
245 if (not result
or buf
[0] == ':')
249 // add spectra to vector
250 if (names
.empty() and not peaks
.empty())
251 io_error(f
, "bad ms2 format: missing header lines?");
253 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
255 for (std::vector
<double>::size_type i
=0; i
<names
.size(); i
++) {
256 spectrum
sp(masses
[i
], charges
[i
]);
259 sp
.file_id
= file_id
;
260 sp
.physical_id
= next_physical_id
;
261 spectra
.push_back(sp
);
264 spectrum::next_physical_id
+= 1;
270 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
271 // to remove those too large to be fragment products. (Peaks are assumed to
272 // be ordered by increasing mz.)
274 spectrum::filter_peaks(double TIC_cutoff_proportion
,
275 double parent_mass_tolerance_max
) {
276 if (not (0 <= TIC_cutoff_proportion
and TIC_cutoff_proportion
<= 1)
277 or parent_mass_tolerance_max
< 0)
278 throw std::invalid_argument("invalid argument value");
280 // remove peaks too large to be fragment products
281 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
282 const double peak_mz_limit
= mass
+ parent_mass_tolerance_max
;
284 std::vector
<peak
>::iterator pi
;
285 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= peak_mz_limit
; pi
++);
286 peaks
.erase(pi
, peaks
.end());
288 // FIX: note mzLowerBound..mzUpperBound here
291 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
293 total_ion_current
= 0;
294 for (pi
=peaks
.begin(); pi
!= peaks
.end(); pi
++)
295 total_ion_current
+= pi
->intensity
;
297 const double i_limit
= total_ion_current
* TIC_cutoff_proportion
;
298 double accumulated_intensity
= 0;
299 for (pi
=peaks
.begin();
300 pi
!= peaks
.end() and accumulated_intensity
<= i_limit
; pi
++)
301 accumulated_intensity
+= pi
->intensity
;
303 peaks
.erase(pi
, peaks
.end());
306 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
310 // Classify peaks and update class_counts and peak bin counts.
311 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
313 spectrum::classify(int intensity_class_count
, double intensity_class_ratio
,
314 double fragment_mass_tolerance
) {
315 if (intensity_class_count
< 1 or intensity_class_ratio
<= 1.0)
316 throw std::invalid_argument("invalid argument value");
318 intensity_class_counts
.clear();
319 intensity_class_counts
.resize(intensity_class_count
);
321 // e.g., 7 = 1 + 2 + 4
322 // FIX: does this fail for non-integer ratios?
323 int min_count
= int((pow(intensity_class_ratio
, intensity_class_count
) - 1)
324 / (intensity_class_ratio
- 1));
326 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
327 std::vector
<peak
>::iterator pi
=peaks
.begin();
328 for (int i_class
=0; i_class
<intensity_class_count
; i_class
++) {
329 int peaks_this_class
= int((pow(intensity_class_ratio
, i_class
) * peaks
.size()
331 for (int cp
=0; cp
< peaks_this_class
and pi
!= peaks
.end(); cp
++, pi
++) {
332 pi
->intensity_class
= i_class
;
333 intensity_class_counts
[i_class
]++;
336 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
338 if (not peaks
.empty()) {
339 min_peak_mz
= peaks
.front().mz
- CP
.fragment_mass_tolerance
;
340 max_peak_mz
= peaks
.back().mz
+ CP
.fragment_mass_tolerance
;
341 total_peak_bins
= (int((max_peak_mz
- min_peak_mz
)
342 / (2 * fragment_mass_tolerance
) + 0.5));
343 // This is the MyriMatch estimate. Wouldn't it be better to simply count
344 // how many are empty?
345 empty_peak_bins
= std::max
<int>(total_peak_bins
- peaks
.size(), 0);
350 // Update the *_cache fields from the peaks field.
352 spectrum::update_peak_cache() {
355 const unsigned int peak_count
= peaks
.size();
356 peak_mz_cache
= new double[peak_count
+1];
357 peak_intensity_class_cache
= new int[peak_count
+1];
359 for (unsigned int i
=0; i
<peak_count
; i
++) {
360 peak_mz_cache
[i
] = peaks
[i
].mz
;
361 peak_intensity_class_cache
[i
] = peaks
[i
].intensity_class
;
364 // negative value is terminator
365 peak_mz_cache
[peak_count
] = -1;
366 peak_intensity_class_cache
[peak_count
] = -1;
370 // Store the list of spectra that search_peptide will search against, and also
371 // build spectrum_mass_index.
373 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
374 searchable_spectra
= spectra
;
375 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++) {
376 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
377 searchable_spectra
[i
].update_peak_cache();
382 // Calculate peaks for a synthesized mass (not mz) ladder.
384 get_synthetic_Y_mass_ladder(double *mass_ladder
, const unsigned int ladder_size
,
385 const double *mass_list
,
386 const double fragment_C_fixed_mass
) NOTHROW
{
387 double m
= fragment_C_fixed_mass
;
389 for (unsigned int i
=ladder_size
; i
>0; i
--) {
391 mass_ladder
[ladder_size
-i
] = m
;
396 // Calculate peaks for a synthesized mass (not mz) ladder.
398 get_synthetic_B_mass_ladder(double *mass_ladder
, const unsigned int ladder_size
,
399 const double *mass_list
,
400 const double fragment_N_fixed_mass
) NOTHROW
{
401 double m
= fragment_N_fixed_mass
;
403 for (unsigned int i
=0; i
<ladder_size
; i
++) {
410 // Generate synthetic spectra for a set of charges. Only the charge and peaks
411 // of these spectra are initialized.
413 synthetic_spectra(double synth_sp_mz
[/* max_fragment_charge+1 */]
414 [spectrum::MAX_THEORETICAL_FRAGMENTS
],
415 const double *mass_list
, const unsigned int mass_list_size
,
416 const double fragment_N_fixed_mass
,
417 const double fragment_C_fixed_mass
,
418 const int max_fragment_charge
) NOTHROW
{
419 assert(mass_list_size
>= 1);
420 const unsigned int ladder_size
= mass_list_size
- 1;
421 double Y_mass_ladder
[ladder_size
];
422 double B_mass_ladder
[ladder_size
];
424 for (int charge
=1; charge
<=max_fragment_charge
; charge
++) {
425 get_synthetic_Y_mass_ladder(Y_mass_ladder
, ladder_size
, mass_list
,
426 fragment_C_fixed_mass
);
427 get_synthetic_B_mass_ladder(B_mass_ladder
, ladder_size
, mass_list
,
428 fragment_N_fixed_mass
);
430 unsigned int y_i
=0, b_i
=0;
431 double *sp_mz_p
= synth_sp_mz
[charge
];
433 // merge the two (already sorted) mass lists, converting them to mz values
435 // FIX: try algorithm::merge?
436 while (y_i
< ladder_size
and b_i
< ladder_size
)
437 if (Y_mass_ladder
[y_i
] < B_mass_ladder
[b_i
])
438 *(sp_mz_p
++) = peak::get_mz(Y_mass_ladder
[y_i
++], charge
);
440 *(sp_mz_p
++) = peak::get_mz(B_mass_ladder
[b_i
++], charge
);
441 while (y_i
< ladder_size
)
442 *(sp_mz_p
++) = peak::get_mz(Y_mass_ladder
[y_i
++], charge
);
443 while (b_i
< ladder_size
)
444 *(sp_mz_p
++) = peak::get_mz(B_mass_ladder
[b_i
++], charge
);
445 *sp_mz_p
= -1; // invalid mz as terminator
450 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
451 // theoretical spectrum (y) generated from a peptide candidate.
453 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
454 // better yet, sets) for peak lists simpler? As fast or faster?
456 // This is the innermost loop, so it's worthwhile to optimize this some.
457 // FIX const declaration
459 score_spectrum(const spectrum
&x
,
460 const double synth_mzs
[spectrum::MAX_THEORETICAL_FRAGMENTS
])
463 unsigned int y_peak_count
=0;
464 for (const double *p
=synth_mzs
; *p
>= 0; p
++, y_peak_count
++);
466 double peak_best_delta
[y_peak_count
];
467 int peak_best_class
[y_peak_count
];
468 for (unsigned int i
=0; i
<y_peak_count
; i
++) {
469 peak_best_delta
[i
] = CP
.fragment_mass_tolerance
;
470 peak_best_class
[i
] = -1;
473 // Iterate over all closest pairs of peaks, collecting info on the class of
474 // the best (meaning closest mz) real peak matching each theoretical peak
475 // (for real/theoretical peak pairs that are no farther apart than
476 // fragment_mass_tolerance).
478 const unsigned int x_peak_count
=x
.peaks
.size();
479 assert(x_peak_count
> 0 and y_peak_count
> 0);
481 unsigned int x_index
=0, y_index
=0;
483 const double delta
= synth_mzs
[y_index
] - x
.peak_mz_cache
[x_index
];
484 if (std::abs(delta
) <= peak_best_delta
[y_index
]) {
485 peak_best_class
[y_index
] = x
.peak_intensity_class_cache
[x_index
];
486 peak_best_delta
[y_index
] = std::abs(delta
);
489 if (++x_index
>= x_peak_count
)
492 if (++y_index
>= y_peak_count
)
497 assert(CP
.intensity_class_count
< INT_MAX
);
498 const unsigned int intensity_class_count
= CP
.intensity_class_count
;
499 unsigned int peak_hit_histogram
[intensity_class_count
];
500 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
501 peak_hit_histogram
[i
] = 0;
502 for (unsigned int i
=0; i
<y_peak_count
; i
++) {
503 const int bc
= peak_best_class
[i
];
505 assert(bc
< static_cast<int>(intensity_class_count
));
506 peak_hit_histogram
[bc
] += 1;
510 // How many theoretical peaks overlap the real peak range?
511 int valid_theoretical_peaks
= 0;
512 for (unsigned int i
=0; i
<y_peak_count
; i
++)
513 if (x
.min_peak_mz
<= synth_mzs
[i
] and synth_mzs
[i
] <= x
.max_peak_mz
)
514 valid_theoretical_peaks
++;
516 unsigned int total_peak_hits
= 0;
517 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
518 total_peak_hits
+= peak_hit_histogram
[i
];
520 // How many valid theoretical peaks were misses?
521 const int peak_misses
= valid_theoretical_peaks
- total_peak_hits
;
522 assert(peak_misses
>= 0);
524 double score
= ln_combination_(x
.empty_peak_bins
, peak_misses
);
525 for (unsigned int i
=0; i
<intensity_class_count
; i
++)
526 score
+= ln_combination_(x
.intensity_class_counts
[i
],
527 peak_hit_histogram
[i
]);
528 score
-= ln_combination_(x
.total_peak_bins
, valid_theoretical_peaks
);
534 // Return the score of the specified spectrum against a group of synthetic
537 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
538 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
539 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
541 // FIX: implement MM smart +3 model, and/or come up with something better.
544 score_spectrum_over_charges(const double
545 synth_sp_mz
[/* max_fragment_charge+1 */]
546 [spectrum::MAX_THEORETICAL_FRAGMENTS
],
548 const int max_fragment_charge
) NOTHROW
{
549 double best_score
= score_spectrum(sp
, synth_sp_mz
[1]);
551 const int charge_limit
= std::max
<int>(1, sp
.charge
-1);
552 for (int charge
=2; charge
<=charge_limit
; charge
++) {
553 assert(charge
<= max_fragment_charge
);
554 best_score
+= score_spectrum(sp
, synth_sp_mz
[charge
]);
560 struct mass_trace_list
{
561 mass_trace_item item
;
562 const mass_trace_list
*next
;
564 mass_trace_list(const mass_trace_list
*p
=0) : next(p
) { }
568 // fuzzy comparison might not be necessary, but it's safer
570 score_equal(double s1
, double s2
) { return std::abs(s1
-s2
) < 1e-6; }
573 // Search for matches of this particular peptide modification variation
574 // against the spectra. Updates score_stats and returns the number of
575 // candidate spectra found.
577 evaluate_peptide(const search_context
&context
, match
&m
,
578 const mass_trace_list
*mtlp
, const double *mass_list
,
579 const std::vector
<std::vector
<spectrum
>::size_type
>
581 score_stats
&stats
) NOTHROW
{
582 int max_candidate_charge
= 0;
583 typedef std::vector
<std::vector
<spectrum
>::size_type
>::const_iterator c_it
;
584 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++)
586 = std::max
<int>(max_candidate_charge
,
587 spectrum::searchable_spectra
[*it
].charge
);
588 assert(max_candidate_charge
>= 1);
590 const int max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
591 assert(max_fragment_charge
<= spectrum::MAX_SUPPORTED_CHARGE
);
593 const unsigned int peptide_size
= m
.peptide_sequence
.size();
594 // FIX: "2" assumes just two ion series (e.g., B and Y)
595 assert(peptide_size
<= 2*(spectrum::MAX_THEORETICAL_FRAGMENTS
-1));
597 // This array is preallocated to avoid the overhead of allocation and
598 // deallocation within the inner loop.
600 // charge -> mz array
601 static double synth_sp_mz
[spectrum::MAX_SUPPORTED_CHARGE
+1]
602 [spectrum::MAX_THEORETICAL_FRAGMENTS
];
604 synthetic_spectra(synth_sp_mz
, mass_list
, peptide_size
,
605 context
.fragment_N_fixed_mass
,
606 context
.fragment_C_fixed_mass
, max_fragment_charge
);
608 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++) {
609 m
.spectrum_index
= *it
;
610 spectrum
&sp
= spectrum::searchable_spectra
[m
.spectrum_index
];
613 stats
.candidate_spectrum_count
+= 1;
614 if (CP
.estimate_only
)
617 //std::cerr << "sp " << m.spectrum_index << std::endl;
618 m
.score
= score_spectrum_over_charges(synth_sp_mz
, sp
, max_fragment_charge
);
619 //std::cerr << "score " << m.score << std::endl;
621 std::vector
<match
> &sp_best_matches
= stats
.best_matches
[m
.spectrum_index
];
622 // Is this score good enough to be in the best score list? (Better scores
623 // are more negative.)
624 if (m
.score
> sp_best_matches
.back().score
)
627 // We're saving this match, so remember the mass trace info, too.
628 m
.mass_trace
.clear();
629 for (const mass_trace_list
*p
=mtlp
; p
; p
=p
->next
)
630 m
.mass_trace
.push_back(p
->item
);
632 // If this is duplicate, just append to the existing match and return
633 for (std::vector
<match
>::reverse_iterator rit
634 = sp_best_matches
.rbegin(); rit
!= sp_best_matches
.rend(); rit
++)
635 if (score_equal(rit
->score
, m
.score
)
636 and rit
->peptide_sequence
== m
.peptide_sequence
637 and rit
->mass_regime_index
== m
.mass_regime_index
638 and rit
->conjunct_index
== m
.conjunct_index
639 and rit
->mass_trace
== m
.mass_trace
) {
640 rit
->peptide_begin
.push_back(m
.peptide_begin
[0]);
641 rit
->sequence_name
.push_back(m
.sequence_name
[0]);
645 // Otherwise, insert this match in the correct position
646 assert(sp_best_matches
.size() >= 1);
647 std::vector
<match
>::reverse_iterator rit
= sp_best_matches
.rbegin();
648 for (;rit
+1 != sp_best_matches
.rend() and (rit
+1)->score
> m
.score
; rit
++)
655 // IDEA FIX: Use a cost parameter to define the iterative search front.
658 // Choose one possible residue modification position. Once they're all
659 // chosen, then evaluate.
661 choose_residue_mod(const search_context
&context
, match
&m
,
662 const mass_trace_list
*mtlp
, double *mass_list
,
663 const std::vector
<std::vector
<spectrum
>::size_type
>
666 std::vector
<int> &db_remaining
,
667 const unsigned int remaining_positions_to_choose
,
668 const unsigned int next_position_to_consider
) NOTHROW
{
669 assert(remaining_positions_to_choose
670 <= m
.peptide_sequence
.size() - next_position_to_consider
);
672 if (remaining_positions_to_choose
== 0) {
673 stats
.combinations_searched
+= 1;
674 evaluate_peptide(context
, m
, mtlp
, mass_list
, candidate_spectra
, stats
);
676 mass_trace_list
mtl(mtlp
);
678 // consider all of the positions where we could next add a mod
679 for (unsigned int i
=next_position_to_consider
;
680 i
<= m
.peptide_sequence
.size()-remaining_positions_to_choose
; i
++) {
681 mtl
.item
.position
= i
;
682 const double save_pos_mass
=mass_list
[i
];
683 const char pos_res
= m
.peptide_sequence
[i
];
685 // consider the possibilities for this position
686 for (std::vector
<int>::const_iterator
687 it
=context
.delta_bag_lookup
[pos_res
].begin();
688 it
!= context
.delta_bag_lookup
[pos_res
].end(); it
++) {
689 const int db_index
= *it
;
690 if (db_remaining
[db_index
] < 1)
692 db_remaining
[db_index
] -= 1;
693 mass_list
[i
] = save_pos_mass
+ context
.delta_bag_delta
[db_index
];
694 mtl
.item
.delta
= context
.delta_bag_delta
[db_index
];
695 choose_residue_mod(context
, m
, &mtl
, mass_list
, candidate_spectra
,
697 remaining_positions_to_choose
-1, i
+1);
698 db_remaining
[db_index
] += 1;
700 mass_list
[i
] = save_pos_mass
;
707 search_peptide(const search_context
&context
, const double &p_mass
, match
&m
,
708 const std::string
&run_sequence
, const int &begin_index
,
709 const unsigned int &peptide_size
, score_stats
&stats
,
710 std::vector
<int> &db_remaining
) {
711 // We'd like to consider only the spectra whose masses are within the
712 // tolerance range of theoretical mass of the peptide (p_mass).
713 // Unfortunately, the tolerance range of each spectrum depends on its
714 // charge, since the tolerance is specified in m/z units. Furthermore,
715 // when deciding how to adjust begin/end to generate the next peptide,
716 // we need to be conservative, as a high-charge spectrum may come into
717 // range, even though only +1 spectra are currently in-range. So, we
718 // first screen using the maximum tolerance range, then check the actual
721 const double sp_mass_lb
= p_mass
- CP
.parent_mass_tolerance_max
;
722 const double sp_mass_ub
= p_mass
+ CP
.parent_mass_tolerance_max
;
724 const spmi_c_it candidate_spectra_info_begin
725 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
726 if (candidate_spectra_info_begin
== spectrum::spectrum_mass_index
.end()) {
727 // spectrum masses all too low to match peptide (peptide too long)
731 const spmi_c_it candidate_spectra_info_end
732 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
733 if (candidate_spectra_info_end
== spectrum::spectrum_mass_index
.begin()) {
734 // spectrum masses all too high to match peptide (peptide too short)
737 if (candidate_spectra_info_begin
== candidate_spectra_info_end
) {
738 // no spectrum with close-enough parent mass
742 // Now generate the list of candidate spectra that are actually
743 // in-range, considering the charge of each spectrum.
744 std::vector
<std::vector
<spectrum
>::size_type
> candidate_spectra_0
;
745 for (spmi_c_it it
=candidate_spectra_info_begin
;
746 it
!= candidate_spectra_info_end
; it
++) {
747 const spectrum
&csp
= spectrum::searchable_spectra
[it
->second
];
748 if (std::abs(csp
.mass
- p_mass
) <= (csp
.charge
749 * CP
.parent_mass_tolerance_1
))
750 candidate_spectra_0
.push_back(it
->second
);
752 if (candidate_spectra_0
.empty())
755 m
.predicted_parent_mass
= p_mass
;
756 m
.peptide_sequence
.assign(run_sequence
, begin_index
, peptide_size
);
758 double mass_list
[peptide_size
];
759 for (unsigned int i
=0; i
<peptide_size
; i
++)
760 mass_list
[i
] = (CP
.fragment_mass_regime
[context
.mass_regime_index
]
761 .fixed_residue_mass
[m
.peptide_sequence
[i
]]);
763 choose_residue_mod(context
, m
, NULL
, mass_list
, candidate_spectra_0
,
764 stats
, db_remaining
, context
.mod_count
, 0);
769 // Search a sequence run for matches according to the context against the
770 // spectra. Updates score_stats and the number of candidate spectra found.
772 // FIX: examine carefully for signed/unsigned problems
775 search_run(const search_context
&context
, const sequence_run
&sequence_run
,
776 score_stats
&stats
) {
777 const unsigned int min_peptide_length
778 = std::max
<unsigned int>(CP
.minimum_peptide_length
, context
.mod_count
);
779 const std::vector
<double> &fixed_parent_mass
780 = CP
.parent_mass_regime
[context
.mass_regime_index
].fixed_residue_mass
;
782 const std::string
&run_sequence
= sequence_run
.sequence
;
784 // This match will be passed inward and used to record information that we
785 // need to remember about a match when we finally see one. At that point, a
786 // copy of this match will be saved.
787 // FIX: can this be done better?
789 m
.sequence_name
.push_back(sequence_run
.name
);
790 m
.peptide_begin
.push_back(0);
791 int &peptide_begin
= m
.peptide_begin
[0];
792 m
.mass_regime_index
= context
.mass_regime_index
;
793 m
.conjunct_index
= context
.conjunct_index
;
794 m
.pca_delta
= context
.pca_delta
;
796 assert(context
.delta_bag_delta
.size() == context
.delta_bag_count
.size());
797 // counts remaining as mod positions are chosen
798 std::vector
<int> db_remaining
= context
.delta_bag_count
;
800 // the rightmost 'end' seen when all spectra masses were too high
801 // (0 means none yet encountered.)
802 unsigned int next_end
= 0;
804 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
805 double p_mass
= context
.parent_fixed_mass
;
806 int p_begin
=0, p_end
=0;
808 const std::vector
<int> *cleavage_points
= &sequence_run
.cleavage_points
;
809 std::vector
<int>::size_type cleavage_points_size
= cleavage_points
->size();
810 // "fake" cleavage point vector used for nonspecific cleavage (v[i]==i)
811 static std::vector
<int> nonspecific_cleavage_points
;
812 if (context
.nonspecific_cleavage
) {
813 cleavage_points_size
= run_sequence
.size() + 1;
814 for (unsigned int i
=nonspecific_cleavage_points
.size();
815 i
<cleavage_points_size
; i
++)
816 nonspecific_cleavage_points
.push_back(i
);
817 cleavage_points
= &nonspecific_cleavage_points
;
819 assert(cleavage_points_size
>= 2); // endpoints must be present
821 for (unsigned int begin
=0; begin
<cleavage_points_size
-1; begin
++) {
822 const int begin_index
= (*cleavage_points
)[begin
];
823 peptide_begin
= begin_index
+ sequence_run
.sequence_offset
;
825 // If pca_residues specified, require one of them to be the peptide
826 // N-terminal residue
827 switch (context
.pca_residues
.size()) {
831 if (context
.pca_residues
[1] == run_sequence
[begin_index
])
834 if (context
.pca_residues
[0] == run_sequence
[begin_index
])
841 assert(begin_index
>= p_begin
and begin_index
>= 0);
842 for (int i
=p_begin
; i
<begin_index
; i
++)
843 p_mass
-= fixed_parent_mass
[run_sequence
[i
]];
844 p_begin
= begin_index
;
846 unsigned int end
= std::max
<unsigned int>(begin
+ 1, next_end
);
847 for (; end
<cleavage_points_size
; end
++) {
848 if (not context
.nonspecific_cleavage
) {
849 const int missed_cleavage_count
= end
- begin
- 1;
850 if (missed_cleavage_count
> context
.maximum_missed_cleavage_sites
)
854 const int end_index
= (*cleavage_points
)[end
];
855 assert(end_index
> begin_index
);
856 const unsigned int peptide_size
= end_index
- begin_index
;
857 if (peptide_size
< min_peptide_length
)
860 assert(end_index
>= 0);
861 if (end_index
>= p_end
)
862 for (int i
=p_end
; i
<end_index
; i
++)
863 p_mass
+= fixed_parent_mass
[run_sequence
[i
]];
865 for (int i
=end_index
; i
<p_end
; i
++)
866 p_mass
-= fixed_parent_mass
[run_sequence
[i
]];
870 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
872 int status
= search_peptide(context
, p_mass
, m
, run_sequence
,
873 begin_index
, peptide_size
, stats
,
875 if (status
== -1) // peptide was too long
877 if (status
== 1) { // peptide was too short
878 // so next start with a peptide at least this long
879 if (end
== cleavage_points_size
- 1)
888 // Search all sequence runs for matches according to the context against the
889 // spectra. Updates score_stats and the number of candidate spectra found.
891 spectrum::search_runs(const search_context
&context
, score_stats
&stats
) {
892 const int no_runs
= context
.sequence_runs
.size();
893 for (int i
=0; i
<no_runs
; i
++) {
894 search_run(context
, context
.sequence_runs
[i
], stats
);
896 if (CP
.show_progress
)
897 std::cerr
<< i
+1 << " of " << no_runs
<< " sequences, "
898 << stats
.candidate_spectrum_count
<< " candidates\r"
902 if (CP
.show_progress
)
903 std::cerr
<< std::endl
;