1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
36 #define NOTHROW __attribute__ ((nothrow))
37 // Use faster, non-threadsafe versions, since we don't use threads. This is
39 #define fgetsX fgets_unlocked
40 #define fputsX fputs_unlocked
41 #define fprintfX __builtin_fprintf_unlocked
42 #define ferrorX ferror_unlocked
43 #define getcX getc_unlocked
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
50 #define getcX std::getc
54 parameters
parameters::the
;
55 const parameters
&CP
= parameters::the
;
59 peak::__repr__() const {
60 static char temp
[128];
61 sprintf(temp
, "<peak mz=%.4f intensity=%.4f intensity_class=%d>",
62 mz
, intensity
, intensity_class
);
66 const int FIRST_SPECTRUM_ID
= 1;
67 int spectrum::next_id
= FIRST_SPECTRUM_ID
;
68 int spectrum::next_physical_id
= FIRST_SPECTRUM_ID
;
70 std::vector
<spectrum
> spectrum::searchable_spectra
;
71 // parent mass -> searchable_spectra index
73 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
76 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
81 spectrum::__repr__() const {
82 static char temp
[1024];
84 "<spectrum #%d (phys #%d) '%s' %.4f/%+d [%zd peaks]>",
85 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size());
90 // FIX: does this actually help inlining?
91 // Return ln of n_C_k.
93 ln_combination_(unsigned int n
, unsigned int k
) NOTHROW
{
94 // Occasionally happens due to problems in scoring function. (FIX)
98 assert(0 <= k
and k
<= n
and n
< CP
.ln_factorial
.size());
99 return CP
.ln_factorial
[n
] - CP
.ln_factorial
[k
] - CP
.ln_factorial
[n
-k
];
102 ln_combination(unsigned int n
, unsigned int k
) {
103 return ln_combination(n
, k
);
108 // This is an error exit for read_spectra*.
110 io_error(FILE *f
, const char *message
="") {
112 message
= "I/O error while reading (or writing) spectrum file";
113 throw std::ios_base::failure(message
);
117 // true iff s consists entirely of whitespace
119 at_eol(const char *s
) {
120 return s
[strspn(s
, " \t\r\n\f\v")] == 0;
124 // Check whether we've past the end offset, throwing exception on I/O error.
126 check_past_end(FILE *f
, const long offset_end
) {
127 if (offset_end
== -1)
129 long pos
= std::ftell(f
);
132 return pos
>= offset_end
;
136 // Read spectra from file in ms2 format, tagging them with file_id. Before
137 // reading, seek to absolute position offset_begin. If offset_end != -1, any
138 // spectra that begin after position offset_end in the file are not read.
140 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
141 // having the same physical id. Note that depending on
142 // offset_begin/offset_end, we may end up with the first charge and not the
143 // second, or vice versa.
145 // The peak list is initially sorted by mz. Throws an exception on invalid
146 // input. Error checking is the most stringent in this function. Other
147 // readers can be a little looser since all ms2 files eventually get read here
150 // This is pretty hideous. Is there a simpler way to read, reasonably
151 // efficiently, catching any input error?
153 std::vector
<spectrum
>
154 spectrum::read_spectra_from_ms2(FILE *f
, const int file_id
,
155 const long offset_begin
,
156 const long offset_end
) {
157 std::vector
<spectrum
> spectra
;
159 const int bufsiz
= 1024;
163 // first seek to offset_begin and synchronize at next "\n:"
164 if (offset_begin
> 0) {
165 if (std::fseek(f
, offset_begin
-1, SEEK_SET
) == -1)
168 while (c
!= '\n' and c
!= EOF
)
175 } while (c
!= ':' and c
!= EOF
);
182 // at this point we expect to read the first ':' of a header, or EOF
184 bool past_end
= check_past_end(f
, offset_end
);
185 char *result
= fgetsX(buf
, bufsiz
, f
);
187 std::vector
<std::string
> names
;
188 std::vector
<double> masses
;
189 std::vector
<int> charges
;
191 std::vector
<int> file_ids
;
192 std::vector
<int> physical_ids
;
193 std::vector
<int> ids
;
199 if (not result
or buf
[0] != ':')
202 names
.push_back(std::string(buf
+1, buf
+std::strlen(buf
)-1));
203 if (not fgetsX(buf
, bufsiz
, f
))
204 io_error(f
, "bad ms2 format: mass/charge line expected");
206 double mass
= std::strtod(buf
, &endp
); // need double accuracy here
208 masses
.push_back(mass
);
209 if (errno
or endp
== buf
or mass
<= 0)
210 io_error(f
, "bad ms2 format: bad mass");
211 const char *endp0
= endp
;
212 int charge
= std::strtol(endp0
, &endp
, 10);
214 charges
.push_back(charge
);
215 if (errno
or endp
== endp0
or charge
<= 0)
216 io_error(f
, "bad ms2 format: bad charge");
217 if (not at_eol(endp
))
218 io_error(f
, "bad ms2 format: junk at end of mass/charge line");
219 past_end
= past_end
or check_past_end(f
, offset_end
);
220 result
= fgetsX(buf
, bufsiz
, f
);
222 if (names
.empty() and past_end
)
225 if (not names
.empty())
226 io_error(f
, "bad ms2 format: spectrum has no peaks (file truncated?)");
230 std::vector
<peak
> peaks
;
234 p
.mz
= strtod(buf
, &endp
);
235 if (errno
or endp
== buf
or p
.mz
<= 0)
236 io_error(f
, "bad ms2 format: bad peak mz");
237 const char *endp0
= endp
;
238 p
.intensity
= strtod(endp0
, &endp
);
239 if (errno
or endp
== endp0
or p
.intensity
<= 0)
240 io_error(f
, "bad ms2 format: bad peak intensity");
241 if (not at_eol(endp
))
242 io_error(f
, "bad ms2 format: junk at end of peak line");
245 past_end
= past_end
or check_past_end(f
, offset_end
);
246 result
= fgetsX(buf
, bufsiz
, f
);
249 if (not result
or buf
[0] == ':')
253 // add spectra to vector
254 if (names
.empty() and not peaks
.empty())
255 io_error(f
, "bad ms2 format: missing header lines?");
257 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
259 for (std::vector
<double>::size_type i
=0; i
<names
.size(); i
++) {
260 spectrum
sp(masses
[i
], charges
[i
]);
264 sp
.file_id
= file_id
;
265 sp
.physical_id
= next_physical_id
;
267 sp
.file_id
= file_ids
[i
];
268 sp
.physical_id
= physical_ids
[i
];
270 spectrum::next_id
= std::max(spectrum::next_id
, sp
.id
+1);
271 spectrum::next_physical_id
= std::max(spectrum::next_physical_id
,
274 spectra
.push_back(sp
);
277 spectrum::next_physical_id
+= 1;
283 // Filter peaks to limit their number according to TIC_cutoff_proportion, and
284 // to remove those too large to be fragment products. (Peaks are assumed to
285 // be ordered by increasing mz.)
287 spectrum::filter_peaks(double TIC_cutoff_proportion
,
288 double parent_mass_tolerance_max
) {
289 if (not (0 <= TIC_cutoff_proportion
and TIC_cutoff_proportion
<= 1)
290 or parent_mass_tolerance_max
< 0)
291 throw std::invalid_argument("invalid argument value");
293 // remove peaks too large to be fragment products
294 // (i.e., peak m/z > parent [M+H+] + parent_mass_tolerance * charge_limit)
295 const double peak_mz_limit
= mass
+ parent_mass_tolerance_max
;
297 std::vector
<peak
>::iterator pi
;
298 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= peak_mz_limit
; pi
++);
299 peaks
.erase(pi
, peaks
.end());
301 // FIX: note mzLowerBound..mzUpperBound here
304 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
306 total_ion_current
= 0;
307 for (pi
=peaks
.begin(); pi
!= peaks
.end(); pi
++)
308 total_ion_current
+= pi
->intensity
;
310 const double i_limit
= total_ion_current
* TIC_cutoff_proportion
;
311 double accumulated_intensity
= 0;
312 for (pi
=peaks
.begin();
313 pi
!= peaks
.end() and accumulated_intensity
<= i_limit
; pi
++)
314 accumulated_intensity
+= pi
->intensity
;
316 peaks
.erase(pi
, peaks
.end());
319 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
323 // Classify peaks and update class_counts and peak bin counts.
324 // FIX: hoist some of this up into Python? use (1,2,4) as parameter
326 spectrum::classify(int intensity_class_count
, double intensity_class_ratio
,
327 double fragment_mass_tolerance
) {
328 if (intensity_class_count
< 1 or intensity_class_ratio
<= 1.0)
329 throw std::invalid_argument("invalid argument value");
331 intensity_class_counts
.clear();
332 intensity_class_counts
.resize(intensity_class_count
);
334 // e.g., 7 = 1 + 2 + 4
335 // FIX: does this fail for non-integer ratios?
336 int min_count
= int((pow(intensity_class_ratio
, intensity_class_count
) - 1)
337 / (intensity_class_ratio
- 1));
339 std::sort(peaks
.begin(), peaks
.end(), peak::greater_intensity
);
340 std::vector
<peak
>::iterator pi
=peaks
.begin();
341 for (int i_class
=0; i_class
<intensity_class_count
; i_class
++) {
342 int peaks_this_class
= int((pow(intensity_class_ratio
, i_class
) * peaks
.size()
344 for (int cp
=0; cp
< peaks_this_class
and pi
!= peaks
.end(); cp
++, pi
++) {
345 pi
->intensity_class
= i_class
;
346 intensity_class_counts
[i_class
]++;
349 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
351 if (not peaks
.empty()) {
352 min_peak_mz
= peaks
.front().mz
- CP
.fragment_mass_tolerance
;
353 max_peak_mz
= peaks
.back().mz
+ CP
.fragment_mass_tolerance
;
354 total_peak_bins
= (int((max_peak_mz
- min_peak_mz
)
355 / (2 * fragment_mass_tolerance
) + 0.5));
356 // This is the MyriMatch estimate. Wouldn't it be better to simply count
357 // how many are empty?
358 empty_peak_bins
= std::max
<int>(total_peak_bins
- peaks
.size(), 0);
363 // Store the list of spectra that search_peptide will search against, and also
364 // build spectrum_mass_index.
366 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
367 searchable_spectra
= spectra
;
368 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++)
369 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
373 // Calculate peaks for a synthesized mass (not mz) ladder.
375 get_synthetic_Y_mass_ladder(std::vector
<peak
> &mass_ladder
,
376 const std::vector
<double> &mass_list
,
377 const double fragment_C_fixed_mass
) NOTHROW
{
378 double m
= fragment_C_fixed_mass
;
380 const int ladder_size
= mass_ladder
.size();
381 for (int i
=ladder_size
-1; i
>=0; i
--) {
383 mass_ladder
[ladder_size
-1-i
] = peak(m
, 1.0);
388 // Calculate peaks for a synthesized mass (not mz) ladder.
390 get_synthetic_B_mass_ladder(std::vector
<peak
> &mass_ladder
,
391 const std::vector
<double> &mass_list
,
392 const double fragment_N_fixed_mass
) NOTHROW
{
393 double m
= fragment_N_fixed_mass
;
395 const int ladder_size
= mass_ladder
.size();
396 for (int i
=0; i
<=ladder_size
-1; i
++) {
398 mass_ladder
[i
] = peak(m
, 1.0);
403 // Generate synthetic spectra for a set of charges. Only the charge and peaks
404 // of these spectra are initialized.
406 synthetic_spectra(spectrum synth_sp
[/* max_fragment_charge+1 */],
407 const std::vector
<double> &mass_list
,
408 const double fragment_N_fixed_mass
,
409 const double fragment_C_fixed_mass
,
410 const double max_fragment_charge
) NOTHROW
{
411 std::vector
<peak
> mass_ladder(mass_list
.size()-1);
413 for (int charge
=1; charge
<=max_fragment_charge
; charge
++) {
414 spectrum
&sp
= synth_sp
[charge
];
415 //sp.mass = fragment_peptide_mass;
417 sp
.peaks
.resize(mass_ladder
.size() * (ION_MAX
-ION_MIN
));
418 std::vector
<peak
>::size_type pi
=0;
420 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
423 get_synthetic_Y_mass_ladder(mass_ladder
, mass_list
,
424 fragment_C_fixed_mass
);
427 get_synthetic_B_mass_ladder(mass_ladder
, mass_list
,
428 fragment_N_fixed_mass
);
434 for (std::vector
<peak
>::size_type i
=0; i
<mass_ladder
.size(); i
++) {
435 sp
.peaks
[pi
++].mz
= peak::get_mz(mass_ladder
[i
].mz
, charge
);
438 std::sort(sp
.peaks
.begin(), sp
.peaks
.end(), peak::less_mz
);
443 // Return the MyriMatch-style score of a (real) spectrum (x) versus a
444 // theoretical spectrum (y) generated from a peptide candidate.
446 // FIX: This code seems complicated. Is the MyriMatch idea of using maps (or
447 // better yet, sets) for peak lists simpler? As fast or faster?
449 // This is the innermost loop, so it's worthwhile to optimize this some.
450 // FIX: Should some of these vectors be arrays?
452 score_spectrum(const spectrum
&x
, const spectrum
&y
) NOTHROW
{
453 assert (not x
.peaks
.empty() and not y
.peaks
.empty()); // FIX
455 std::vector
<double> peak_best_delta(y
.peaks
.size(),
456 CP
.fragment_mass_tolerance
);
457 std::vector
<int> peak_best_class(y
.peaks
.size(), -1);
459 std::vector
<peak
>::const_iterator x_it
= x
.peaks
.begin();
460 std::vector
<peak
>::const_iterator y_it
= y
.peaks
.begin();
462 // Iterate over all closest pairs of peaks, collecting info on the class of
463 // the best (meaning closest mz) real peak matching each theoretical peak
464 // (for real/theoretical peak pairs that are no farther apart than
465 // fragment_mass_tolerance).
467 while (x_it
!= x
.peaks
.end() and y_it
!= y
.peaks
.end()) {
468 const double delta
= y_it
->mz
- x_it
->mz
;
469 if (std::abs(delta
) <= peak_best_delta
[y_index
]) {
470 peak_best_class
[y_index
] = x_it
->intensity_class
;
471 peak_best_delta
[y_index
] = std::abs(delta
);
481 assert(CP
.intensity_class_count
< INT_MAX
);
482 std::vector
<unsigned> peak_hit_histogram(CP
.intensity_class_count
);
483 std::vector
<int>::const_iterator b_it
;
484 for (b_it
= peak_best_class
.begin(); b_it
!= peak_best_class
.end(); b_it
++)
486 assert(*b_it
< static_cast<int>(peak_hit_histogram
.size()));
487 peak_hit_histogram
[*b_it
] += 1;
490 // How many theoretical peaks overlap the real peak range?
491 int valid_theoretical_peaks
= 0;
492 for (y_it
= y
.peaks
.begin(); y_it
!= y
.peaks
.end(); y_it
++)
493 if (x
.min_peak_mz
<= y_it
->mz
and y_it
->mz
<= x
.max_peak_mz
)
494 valid_theoretical_peaks
++;
496 // How many valid theoretical peaks were misses?
497 const int peak_misses
= (valid_theoretical_peaks
498 - accumulate(peak_hit_histogram
.begin(),
499 peak_hit_histogram
.end(), 0));
501 for (unsigned int i
=0; i
<peak_hit_histogram
.size(); i
++)
502 if (x
.intensity_class_counts
[i
] < peak_hit_histogram
[i
])
503 std::cerr
<< i
<< " C(" << x
.intensity_class_counts
[i
] << ", "
504 << peak_hit_histogram
[i
] << ")" << std::endl
;
505 if (x
.empty_peak_bins
< peak_misses
)
506 std::cerr
<< "M C(" << x
.empty_peak_bins
<< ", " << peak_misses
<< ")"
508 if (x
.total_peak_bins
< valid_theoretical_peaks
)
509 std::cerr
<< "T C(" << x
.total_peak_bins
<< ", " << valid_theoretical_peaks
514 for (unsigned int i
=0; i
<peak_hit_histogram
.size(); i
++)
515 std::cerr
<< i
<< " C(" << x
.intensity_class_counts
[i
] << ", "
516 << peak_hit_histogram
[i
] << ")" << std::endl
;
517 std::cerr
<< "C(" << x
.empty_peak_bins
<< ", " << peak_misses
<< ")"
519 std::cerr
<< "C(" << x
.total_peak_bins
<< ", " << valid_theoretical_peaks
521 std::cerr
<< std::endl
;
524 assert(peak_misses
>= 0);
527 for (unsigned int i
=0; i
<peak_hit_histogram
.size(); i
++)
528 score
+= ln_combination_(x
.intensity_class_counts
[i
],
529 peak_hit_histogram
[i
]);
530 score
+= ln_combination_(x
.empty_peak_bins
, peak_misses
);
531 score
-= ln_combination_(x
.total_peak_bins
, valid_theoretical_peaks
);
537 // Return the score of the specified spectrum against a group of synthetic
540 // For +1 and +2 precursors, this does what MyriMatch does. For +3 (and
541 // above), this currently takes the sum of +1b/+1y or +2b/+2y, which may or
542 // may not make much sense. (Wouldn't it be better to take the max of +1b/+2y
544 // FIX: implement MM smart +3 model, and/or come up with something better.
546 score_spectrum_over_charges(spectrum synth_sp
[/* max_fragment_charge+1 */],
548 const int max_fragment_charge
) NOTHROW
{
549 double best_score
= score_spectrum(sp
, synth_sp
[1]);
551 const int charge_limit
= std::max
<int>(1, sp
.charge
-1);
552 for (int charge
=2; charge
<=charge_limit
; charge
++) {
553 assert(charge
<= max_fragment_charge
);
554 best_score
+= score_spectrum(sp
, synth_sp
[charge
]);
560 struct mass_trace_list
{
561 mass_trace_item item
;
562 const mass_trace_list
*next
;
564 mass_trace_list(const mass_trace_list
*p
=0) : next(p
) { }
568 // Search for matches of this particular peptide modification variation
569 // against the spectra. Updates score_stats and returns the number of
570 // candidate spectra found.
572 evaluate_peptide(const search_context
&context
, match
&m
,
573 const mass_trace_list
*mtlp
,
574 const std::vector
<double> &mass_list
,
575 const std::vector
<std::vector
<spectrum
>::size_type
>
577 score_stats
&stats
) NOTHROW
{
578 assert(m
.peptide_sequence
.size() == mass_list
.size());
580 int max_candidate_charge
= 0;
581 typedef std::vector
<std::vector
<spectrum
>::size_type
>::const_iterator c_it
;
582 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++)
584 = std::max
<int>(max_candidate_charge
,
585 spectrum::searchable_spectra
[*it
].charge
);
586 assert(max_candidate_charge
>= 1);
588 const int max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
589 assert(max_fragment_charge
<= spectrum::MAX_SUPPORTED_CHARGE
);
590 // e.g. +2 -> 'B' -> spectrum
591 // FIX: should this be a std::vector??
592 static spectrum synth_sp
[spectrum::MAX_SUPPORTED_CHARGE
+1];
593 synthetic_spectra(synth_sp
, mass_list
, context
.fragment_N_fixed_mass
,
594 context
.fragment_C_fixed_mass
, max_fragment_charge
);
596 for (c_it it
=candidate_spectra
.begin(); it
!= candidate_spectra
.end(); it
++) {
597 m
.spectrum_index
= *it
;
598 spectrum
&sp
= spectrum::searchable_spectra
[m
.spectrum_index
];
601 stats
.candidate_spectrum_count
+= 1;
602 if (CP
.estimate_only
)
605 //std::cerr << "sp " << m.spectrum_index << std::endl;
606 m
.score
= score_spectrum_over_charges(synth_sp
, sp
, max_fragment_charge
);
607 //std::cerr << "score " << m.score << std::endl;
609 // Is this score good enough to be in the best score list? (Better scores
610 // are more negative.)
611 if (m
.score
>= stats
.best_matches
[m
.spectrum_index
].back().score
)
614 // We're saving this match, so remember the mass trace info, too.
615 m
.mass_trace
.clear();
616 for (const mass_trace_list
*p
=mtlp
; p
; p
=p
->next
)
617 m
.mass_trace
.push_back(p
->item
);
619 // Is this match the first candidate for this spectrum?
620 if (stats
.best_matches
[m
.spectrum_index
].front().score
== 0)
621 stats
.spectra_with_candidates
+= 1;
623 // Insert this match in the correct position.
625 for (i
=stats
.best_matches
[m
.spectrum_index
].size() - 2; i
>=0; i
--)
626 if (stats
.best_matches
[m
.spectrum_index
][i
].score
<= m
.score
)
629 stats
.best_matches
[m
.spectrum_index
][i
+1]
630 = stats
.best_matches
[m
.spectrum_index
][i
];
631 stats
.best_matches
[m
.spectrum_index
][i
+1] = m
;
636 // IDEA FIX: Use a cost parameter to define the iterative search front.
639 // Choose one possible residue modification position. Once they're all
640 // chosen, then evaluate.
642 choose_residue_mod(const search_context
&context
, match
&m
,
643 const mass_trace_list
*mtlp
, std::vector
<double> &mass_list
,
644 const std::vector
<std::vector
<spectrum
>::size_type
>
647 std::vector
<int> &db_remaining
,
648 const unsigned remaining_positions_to_choose
,
649 const unsigned next_position_to_consider
) NOTHROW
{
650 assert(remaining_positions_to_choose
651 <= m
.peptide_sequence
.size() - next_position_to_consider
);
653 if (remaining_positions_to_choose
== 0) {
654 stats
.combinations_searched
+= 1;
655 evaluate_peptide(context
, m
, mtlp
, mass_list
, candidate_spectra
, stats
);
657 mass_trace_list
mtl(mtlp
);
659 // consider all of the positions where we could next add a mod
660 for (unsigned int i
=next_position_to_consider
;
661 i
<= m
.peptide_sequence
.size()-remaining_positions_to_choose
; i
++) {
662 mtl
.item
.position
= i
;
663 const double save_pos_mass
=mass_list
[i
];
664 const char pos_res
= m
.peptide_sequence
[i
];
666 // consider the possibilities for this position
667 for (std::vector
<int>::const_iterator
668 it
=context
.delta_bag_lookup
[pos_res
].begin();
669 it
!= context
.delta_bag_lookup
[pos_res
].end(); it
++) {
670 const int db_index
= *it
;
671 if (db_remaining
[db_index
] < 1)
673 db_remaining
[db_index
] -= 1;
674 mass_list
[i
] = save_pos_mass
+ context
.delta_bag_delta
[db_index
];
675 mtl
.item
.delta
= context
.delta_bag_delta
[db_index
];
676 choose_residue_mod(context
, m
, &mtl
, mass_list
, candidate_spectra
,
678 remaining_positions_to_choose
-1, i
+1);
679 db_remaining
[db_index
] += 1;
681 mass_list
[i
] = save_pos_mass
;
687 // p_begin (p_end) to begin_index (end_index), updating p_mass in the process.
688 // The sign determines whether mass increases or decreases as p_begin (p_end)
689 // is moved forward--so it should be -1 for the begin case and +1 for the end
691 // FIX: is this worth its complexity? kill or else add reset
693 update_p_mass(double &p_mass
, int &p_begin
, int begin_index
, int sign
,
694 const std::string
&run_sequence
,
695 const std::vector
<double> &fixed_residue_mass
) {
696 assert(sign
== +1 or sign
== -1);
697 assert(begin_index
>= 0);
698 const bool moving_forward
= begin_index
>= p_begin
;
699 int p0
=p_begin
, p1
=begin_index
;
700 if (not moving_forward
) {
704 for (int i
=p0
; i
<p1
; i
++)
705 p_mass
+= sign
* fixed_residue_mass
[run_sequence
[i
]];
706 p_begin
= begin_index
;
711 // Search a sequence run for matches according to the context against the
712 // spectra. Updates score_stats and the number of candidate spectra found.
714 // FIX: examine carefully for signed/unsigned problems
717 search_run(const search_context
&context
, const sequence_run
&sequence_run
,
718 score_stats
&stats
) {
719 const int min_peptide_length
= std::max
<int>(CP
.minimum_peptide_length
,
721 const std::vector
<double> &fixed_parent_mass \
722 = CP
.parent_mass_regime
[context
.mass_regime_index
].fixed_residue_mass
;
724 const std::string
&run_sequence
= sequence_run
.sequence
;
725 const std::vector
<int> &cleavage_points
= sequence_run
.cleavage_points
;
726 assert(cleavage_points
.size() >= 2); // endpoints assumed always present
728 // This match will be passed inward and used to record information that we
729 // need to remember about a match when we finally see one. At that point, a
730 // copy of this match will be saved.
731 match m
; // FIX: move inward?
732 m
.sequence_name
= sequence_run
.name
;
734 assert(context
.delta_bag_delta
.size()
735 == context
.delta_bag_count
.size());
736 // counts remaining as mod positions are chosen
737 std::vector
<int> db_remaining
= context
.delta_bag_count
;
739 // the rightmost 'end' seen when all spectra masses were too high
740 // (0 means none yet encountered.)
741 unsigned int next_end
= 0;
743 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
744 double p_mass
= context
.parent_fixed_mass
;
745 int p_begin
=0, p_end
=0;
747 // FIX: optimize the non-specific cleavage case?
748 for (unsigned int begin
=0; begin
<cleavage_points
.size()-1; begin
++) {
749 const int begin_index
= cleavage_points
[begin
];
750 m
.peptide_begin
= begin_index
+ sequence_run
.sequence_offset
;
751 if (not context
.pca_residues
.empty())
752 if (context
.pca_residues
.find(run_sequence
[begin_index
])
753 == std::string::npos
)
755 update_p_mass(p_mass
, p_begin
, begin_index
, -1, run_sequence
,
757 unsigned int end
= std::max
<unsigned int>(begin
+ 1, next_end
);
758 for (; end
<cleavage_points
.size(); end
++) {
759 m
.missed_cleavage_count
= end
- begin
- 1;
760 if (m
.missed_cleavage_count
> context
.maximum_missed_cleavage_sites
)
763 const int end_index
= cleavage_points
[end
];
764 assert(end_index
- begin_index
> 0);
765 const int peptide_size
= end_index
- begin_index
;
766 assert(peptide_size
> 0);
767 if (peptide_size
< min_peptide_length
)
769 update_p_mass(p_mass
, p_end
, end_index
, +1, run_sequence
,
771 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
773 // We'd like to consider only the spectra whose masses are within the
774 // tolerance range of theoretical mass of the peptide (p_mass).
775 // Unfortunately, the tolerance range of each spectrum depends on its
776 // charge, since the tolerance is specified in m/z units. Furthermore,
777 // when deciding how to adjust begin/end to generate the next peptide,
778 // we need to be conservative, as a high-charge spectrum may come into
779 // range, even though only +1 spectra are currently in-range. So, we
780 // first screen using the maximum tolerance range, then check the actual
783 const double sp_mass_lb
= p_mass
- CP
.parent_mass_tolerance_max
;
784 const double sp_mass_ub
= p_mass
+ CP
.parent_mass_tolerance_max
;
786 const spmi_c_it candidate_spectra_info_begin
787 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
788 if (candidate_spectra_info_begin
== spectrum::spectrum_mass_index
.end()) {
789 // spectrum masses all too low to match peptide (peptide too long)
790 //std::cerr << "peptide: sp low" << std::endl;
794 const spmi_c_it candidate_spectra_info_end
795 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
796 if (candidate_spectra_info_end
== spectrum::spectrum_mass_index
.begin()) {
797 // spectrum masses all too high to match peptide
798 // (peptide is too short, so increase end, if possible, else return)
799 //std::cerr << "peptide: sp high" << std::endl;
800 if (end
== cleavage_points
.size() - 1)
805 if (candidate_spectra_info_begin
== candidate_spectra_info_end
) {
806 // no spectrum with close-enough parent mass
807 //std::cerr << "peptide: sp none nearby" << std::endl;
811 // Now generate the list of candidate spectra that are actually
812 // in-range, considering the charge of each spectrum.
813 std::vector
<std::vector
<spectrum
>::size_type
> candidate_spectra_0
;
814 for (spmi_c_it it
=candidate_spectra_info_begin
;
815 it
!= candidate_spectra_info_end
; it
++) {
816 const spectrum
&csp
= spectrum::searchable_spectra
[it
->second
];
817 if (std::abs(csp
.mass
- p_mass
) <= (csp
.charge
818 * CP
.parent_mass_tolerance_1
))
819 candidate_spectra_0
.push_back(it
->second
);
821 if (candidate_spectra_0
.empty())
824 m
.predicted_parent_mass
= p_mass
;
825 m
.peptide_sequence
.assign(run_sequence
, begin_index
, peptide_size
);
827 std::vector
<double> mass_list(peptide_size
);
828 for (std::vector
<double>::size_type i
=0; i
<m
.peptide_sequence
.size(); i
++)
829 mass_list
[i
] = (CP
.fragment_mass_regime
[context
.mass_regime_index
]
830 .fixed_residue_mass
[m
.peptide_sequence
[i
]]);
832 choose_residue_mod(context
, m
, NULL
, mass_list
, candidate_spectra_0
,
833 stats
, db_remaining
, context
.mod_count
, 0);
839 // Search all sequence runs for matches according to the context against the
840 // spectra. Updates score_stats and the number of candidate spectra found.
842 spectrum::search_runs(const search_context
&context
, score_stats
&stats
) {
843 const int no_runs
= context
.sequence_runs
.size();
844 for (int i
=0; i
<no_runs
; i
++) {
845 search_run(context
, context
.sequence_runs
[i
], stats
);
847 if (CP
.show_progress
)
848 std::cerr
<< i
+1 << " of " << no_runs
<< " sequences, "
849 << stats
.candidate_spectrum_count
<< " cand for "
850 << stats
.spectra_with_candidates
<< " sp\r" << std::flush
;
853 if (CP
.show_progress
)
854 std::cerr
<< std::endl
;