1 // C++ module for greylag
5 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
7 // This program is free software; you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation; either version 2 of the License, or
10 // (at your option) any later version.
12 // This program is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
17 // You should have received a copy of the GNU General Public License along
18 // with this program; if not, write to the Free Software Foundation, Inc.,
19 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 #include "cgreylag.hpp"
38 #define NOTHROW __attribute__ ((nothrow))
39 // use faster, non-threadsafe versions, since we don't use threads
40 #define fgetsX fgets_unlocked
41 #define fputsX fputs_unlocked
42 #define fprintfX __builtin_fprintf_unlocked
43 #define ferrorX ferror_unlocked
46 #define fgetsX std::fgets
47 #define fputsX std::fputs
48 #define fprintfX std::fprintf
49 #define ferrorX std::ferror
53 parameters
parameters::the
;
57 peak::__repr__() const {
58 static char temp
[128];
59 sprintf(temp
, "<peak mz=%.4f intensity=%.4f>", mz
, intensity
);
63 const int FIRST_SPECTRUM_ID
= 1;
64 int spectrum::next_id
= FIRST_SPECTRUM_ID
;
65 int spectrum::next_physical_id
= FIRST_SPECTRUM_ID
;
67 std::vector
<spectrum
> spectrum::searchable_spectra
;
68 // parent mass -> searchable_spectra index
70 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
74 spectrum::__repr__() const {
75 static char temp
[1024];
77 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
78 " [%d peaks, maxI=%f, sumI=%f]>",
79 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size(),
80 max_peak_intensity
, sum_peak_intensity
);
86 // This is an error exit for read_spectra and filter_ms2_by_mass.
88 io_error(FILE *f
, const char *message
="") {
90 message
= "I/O error while reading (or writing) spectrum file";
91 throw std::ios_base::failure(message
);
95 // Return sorted list of parent masses present in the given ms2 files.
97 spectrum::read_ms2_spectrum_masses(std::vector
<int> fds
) {
98 std::vector
<double> masses
;
100 for (std::vector
<int>::const_iterator fdit
=fds
.begin(); fdit
!= fds
.end();
102 FILE *f
= fdopen(*fdit
, "r");
104 const int bufsiz
= 1024;
107 char *result
= fgetsX(buf
, bufsiz
, f
);
113 if (not result
or buf
[0] != ':')
115 if (not fgetsX(buf
, bufsiz
, f
))
116 io_error(f
, "bad ms2 format: mass/charge line expected");
118 masses
.push_back(std::strtod(buf
, &endp
)); // need double accuracy here
119 if (errno
or endp
== buf
)
120 io_error(f
, "bad ms2 format: bad mass");
121 result
= fgetsX(buf
, bufsiz
, f
);
126 std::vector
<peak
> peaks
;
128 result
= fgetsX(buf
, bufsiz
, f
);
131 if (not result
or buf
[0] == ':')
136 std::sort(masses
.begin(), masses
.end());
141 // Read spectra from file in ms2 format, tagging them with file_id. Spectra
142 // with charge zero are omitted from the result. (All of them are read,
143 // though, to keep spectrum ids in sync.) If file_id == -1, the ms2 file is
144 // an annotated file produced by split_ms2_by_mass_band.
146 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
147 // having the same physical id. The peak list is initially sorted by mz.
148 // Throws an exception on invalid input.
150 std::vector
<spectrum
>
151 spectrum::read_spectra_from_ms2(FILE *f
, const int file_id
) {
152 std::vector
<spectrum
> spectra
;
154 const int bufsiz
= 1024;
158 char *result
= fgetsX(buf
, bufsiz
, f
);
160 std::vector
<std::string
> names
;
161 std::vector
<double> masses
;
162 std::vector
<int> charges
;
164 std::vector
<int> file_ids
;
165 std::vector
<int> physical_ids
;
166 std::vector
<int> ids
;
172 if (not result
or buf
[0] != ':')
175 names
.push_back(std::string(buf
+1, buf
+std::strlen(buf
)-1));
177 char *anno_hash
= std::strrchr(buf
, '#'); // const, but C++ doesn't
178 // like it declared thus--yuck
180 io_error(f
, "bad ms2+ format: '#' not found");
181 names
.push_back(std::string(buf
+1, anno_hash
));
183 file_ids
.push_back(std::strtol(anno_hash
+1, &endp
, 10));
184 physical_ids
.push_back(std::strtol(endp
+1, &endp
, 10));
185 ids
.push_back(std::strtol(endp
+1, &endp
, 10));
186 if (errno
) // FIX: improve error check
187 io_error(f
, "bad ms2+ format: bad values");
189 if (not fgetsX(buf
, bufsiz
, f
))
190 io_error(f
, "bad ms2 format: mass/charge line expected");
192 masses
.push_back(std::strtod(buf
, &endp
)); // need double accuracy here
193 if (errno
or endp
== buf
)
194 io_error(f
, "bad ms2 format: bad mass");
195 const char *endp0
= endp
;
196 charges
.push_back(std::strtol(endp0
, &endp
, 10));
197 if (errno
or endp
== endp0
)
198 io_error(f
, "bad ms2 format: bad charge");
199 result
= fgetsX(buf
, bufsiz
, f
);
204 std::vector
<peak
> peaks
;
208 p
.mz
= strtod(buf
, &endp
);
209 if (errno
or endp
== buf
)
210 io_error(f
, "bad ms2 format: bad peak mz");
211 const char *endp0
= endp
;
212 p
.intensity
= strtod(endp0
, &endp
);
213 if (errno
or endp
== endp0
)
214 io_error(f
, "bad ms2 format: bad peak intensity");
217 result
= fgetsX(buf
, bufsiz
, f
);
220 if (not result
or buf
[0] == ':')
224 // add spectra to vector
225 if (names
.empty() and not peaks
.empty())
226 io_error(f
, "bad ms2 format: missing header lines?");
228 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
230 for (std::vector
<double>::size_type i
=0; i
<names
.size(); i
++) {
231 spectrum
sp(masses
[i
], charges
[i
]);
235 sp
.file_id
= file_id
;
236 sp
.physical_id
= next_physical_id
;
238 sp
.file_id
= file_ids
[i
];
239 sp
.physical_id
= physical_ids
[i
];
241 spectrum::next_id
= std::max(spectrum::next_id
, sp
.id
+1);
242 spectrum::next_physical_id
= std::max(spectrum::next_physical_id
,
245 sp
.calculate_intensity_statistics();
246 spectra
.push_back(sp
);
249 spectrum::next_physical_id
++;
255 // Copy spectra from an ms2 file to a set of output ms2 files, one for each
256 // band in the set of mass bands described by their upper bounds. Extra
257 // information is written in the first header line of each spectra so that
258 // id's may be recovered. So, for example, ':0002.0002.1' might become
259 // ':0002.0002.1 # 0 45 78', where 0, 45, 78 are the spectrum's file_id,
260 // physical_id, and id, respectively. Multiply charged spectra will be
261 // split into separate spectra (having the same physical_id).
263 // FIX: cleaner to just pass in a vector of filenames, rather than fds?
265 spectrum::split_ms2_by_mass_band(FILE *inf
, const std::vector
<int> &outfds
,
267 const std::vector
<double> &mass_band_upper_bounds
) {
268 std::map
<double, FILE *> mass_file_index
;
270 if (outfds
.size() < 1
271 or outfds
.size() != mass_band_upper_bounds
.size())
272 throw std::invalid_argument("invalid argument value");
274 for (std::vector
<int>::size_type i
=0; i
<outfds
.size(); i
++) {
275 FILE *f
= fdopen(outfds
[i
], "w");
277 throw std::ios_base::failure("fdopen failed: fd limit exceeded?");
278 mass_file_index
.insert(std::make_pair(mass_band_upper_bounds
[i
], f
));
281 int o_next_id
= FIRST_SPECTRUM_ID
;
282 int o_next_physical_id
= FIRST_SPECTRUM_ID
;
284 const int bufsiz
= 1024;
285 char buf0
[bufsiz
], buf1
[bufsiz
];
287 char *result
= fgetsX(buf0
, bufsiz
, inf
);
290 std::set
<FILE *> sp_files
;
296 if (not result
or buf0
[0] != ':')
298 if (not fgetsX(buf1
, bufsiz
, inf
))
299 io_error(inf
, "bad ms2 format: mass/charge line expected");
301 mass
= std::strtod(buf1
, &endp
); // need double accuracy here
302 if (errno
or endp
== buf1
)
303 io_error(inf
, "bad ms2 format: bad mass");
305 std::map
<double, FILE *>::const_iterator bit
306 = mass_file_index
.lower_bound(mass
);
307 if (bit
== mass_file_index
.end())
308 throw std::invalid_argument("internal error: mass out of range");
309 FILE *bandf
= bit
->second
;
310 sp_files
.insert(bandf
);
311 assert(buf0
[strlen(buf0
)-1] == '\n');
312 buf0
[strlen(buf0
)-1] = 0; // chop newline
313 fprintfX(bandf
, "%s # %d %d %d\n", buf0
, file_id
, o_next_physical_id
,
317 io_error(bandf
, "error writing ms2 file");
318 result
= fgetsX(buf0
, bufsiz
, inf
);
322 if (sp_files
.empty())
323 io_error(inf
, "bad ms2 format: missing header lines?");
328 for (std::set
<FILE *>::const_iterator fit
= sp_files
.begin();
329 fit
!= sp_files
.end(); fit
++) {
332 io_error(*fit
, "error writing ms2 file");
334 result
= fgetsX(buf0
, bufsiz
, inf
);
337 if (not result
or buf0
[0] == ':')
340 o_next_physical_id
++;
343 // flush all output files (is this necessary?)
344 for (std::map
<double, FILE *>::const_iterator it
= mass_file_index
.begin();
345 it
!= mass_file_index
.end(); it
++)
346 std::fflush(it
->second
);
350 // For each *non-overlapping* group of peaks with mz within an interval of
351 // (less than) length 'limit', keep only the most intense peak. (The
352 // algorithm works from the right, which matter here.)
354 // FIX: Is this really the best approach? The obvious alternative would be to
355 // eliminate peaks so that no interval of length 'limit' has more than one
356 // peak (eliminating weak peaks to make this so).
359 remove_close_peaks(std::vector
<peak
> &peaks
, double limit
) {
360 for (std::vector
<peak
>::size_type i
=0; i
+1<peaks
.size(); i
++) {
361 double ub_mz
= peaks
[i
].mz
+ limit
;
362 while (i
+1 < peaks
.size() and peaks
[i
+1].mz
< ub_mz
)
363 peaks
.erase(peaks
.begin()
364 + (peaks
[i
+1].intensity
> peaks
[i
].intensity
370 // Examine this spectrum to see if it should be kept. If so, return true and
371 // sort the peaks by mz, normalize them and limit their number.
373 spectrum::filter_and_normalize(double minimum_fragment_mz
,
374 double dynamic_range
, int minimum_peaks
,
376 // "spectrum, maximum parent charge", noise suppression check,
377 // "spectrum, minimum parent m+h" not implemented
379 if (minimum_fragment_mz
< 0 or dynamic_range
<= 0 or minimum_peaks
< 0
381 throw std::invalid_argument("invalid argument value");
384 // >>> removes multiple entries within 0.95 Da of each other, retaining the
385 // highest value. this is necessary because of the behavior of some peak
386 // finding routines in commercial software
388 // peaks already sorted by mz upon read
389 remove_close_peaks(peaks
, 0.95);
392 // >>> set up m/z regions to ignore: those immediately below the m/z of the
393 // parent ion which will contain uninformative neutral loss ions, and those
394 // immediately above the parent ion m/z, which will contain the parent ion
395 // and its isotope pattern
397 const double parent_mz
= 1.00727 + (mass
- 1.00727) / charge
;
398 for (std::vector
<peak
>::size_type i
=0; i
<peaks
.size();)
399 if (parent_mz
>= peaks
[i
].mz
400 ? std::abs(parent_mz
- peaks
[i
].mz
) < (50.0 / charge
)
401 : std::abs(parent_mz
- peaks
[i
].mz
) < (5.0 / charge
))
402 peaks
.erase(peaks
.begin() + i
);
408 std::vector
<peak
>::iterator pi
;
409 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= minimum_fragment_mz
;
411 peaks
.erase(peaks
.begin(), pi
);
414 // normalize spectrum
415 // >>> use the dynamic range parameter to set the maximum intensity value
416 // for the spectrum. then remove all peaks with a normalized intensity < 1
419 std::vector
<peak
>::iterator most_intense_peak
420 = std::max_element(peaks
.begin(), peaks
.end(), peak::less_intensity
);
422 = std::max
<double>(1.0, most_intense_peak
->intensity
) / dynamic_range
;
423 for (std::vector
<peak
>::iterator p
=peaks
.begin(); p
!= peaks
.end(); p
++)
424 p
->intensity
/= normalization_factor
;
425 for (std::vector
<peak
>::size_type i
=0; i
<peaks
.size();)
426 if (peaks
[i
].intensity
< 1.0)
427 peaks
.erase(peaks
.begin() + i
);
431 if (peaks
.size() < (unsigned int) minimum_peaks
)
434 // # check is_noise (NYI)
435 // # * is_noise attempts to determine if the spectrum is simply
436 // # noise. if the spectrum
437 // # * does not have any peaks within a window near the parent ion mass,
438 // # it is considered
440 // #limit = sp.mass / sp.charge
441 // #if sp.charge < 3:
442 // # limit = sp.mass - 600.0
443 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
444 // # return "looks like noise"
446 // clean_isotopes removes peaks that are probably C13 isotopes
447 remove_close_peaks(peaks
, 1.5);
449 // keep only the most intense peaks
450 if (peaks
.size() > (unsigned int) total_peaks
) {
451 std::sort(peaks
.begin(), peaks
.end(), peak::less_intensity
);
452 peaks
.erase(peaks
.begin(), peaks
.end() - total_peaks
);
453 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
456 // update statistics, now that we've removed some peaks
457 calculate_intensity_statistics();
463 // Store the list of spectra that search_peptide will search against, and also
464 // build spectrum_mass_index.
466 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
467 searchable_spectra
= spectra
;
468 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++)
469 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
473 // Multiply peak intensities by residue-dependent factors to generate a more
474 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
477 synthesize_ladder_intensities(std::vector
<peak
> &mass_ladder
,
478 const std::string
&peptide_seq
,
479 const bool is_XYZ
) NOTHROW
{
480 const parameters
&CP
= parameters::the
;
482 assert(mass_ladder
.size() == peptide_seq
.size() - 1);
483 assert(mass_ladder
.size() >= 2);
485 // An intensity boost is given for the second peptide bond/peak (*) if the
486 // N-terminal residue is 'P':
489 const int proline_bonus_position
= is_XYZ
? mass_ladder
.size()-2 : 1;
490 mass_ladder
[proline_bonus_position
].intensity
= 3.0;
491 if (peptide_seq
[1] == 'P')
492 mass_ladder
[proline_bonus_position
].intensity
= 10.0;
494 if (CP
.spectrum_synthesis
)
495 for (unsigned int i
=0; i
<mass_ladder
.size(); i
++) {
496 // FIX: there must be a better way
497 const char peptide_index
= is_XYZ
? mass_ladder
.size()-i
-1 : i
;
498 switch (peptide_seq
[peptide_index
]) {
500 mass_ladder
[i
].intensity
*= 5.0;
506 mass_ladder
[i
].intensity
*= 3.0;
510 mass_ladder
[i
].intensity
*= 2.0;
513 switch (peptide_seq
[peptide_index
+1]) {
515 mass_ladder
[i
].intensity
*= 5.0;
522 // Calculate peaks for a synthesized mass (not mz) ladder.
524 get_synthetic_Y_mass_ladder(std::vector
<peak
> &mass_ladder
,
525 const std::string
&peptide_seq
,
526 const std::vector
<double> &mass_list
,
527 const double C_terminal_mass
,
528 const unsigned mass_regime
=0) NOTHROW
{
529 const parameters
&CP
= parameters::the
;
530 assert(peptide_seq
.size() == mass_list
.size());
531 double m
= (CP
.fragment_mass_regime
[mass_regime
].water_mass
532 + (CP
.cleavage_C_terminal_mass_change
533 - CP
.fragment_mass_regime
[mass_regime
].hydroxyl_mass
)
536 const int ladder_size
= mass_ladder
.size();
537 for (int i
=ladder_size
-1; i
>=0; i
--) {
539 mass_ladder
[ladder_size
-1-i
] = peak(m
, 1.0);
542 synthesize_ladder_intensities(mass_ladder
, peptide_seq
, true);
546 // Calculate peaks for a synthesized mass (not mz) ladder.
548 get_synthetic_B_mass_ladder(std::vector
<peak
> &mass_ladder
,
549 const std::string
&peptide_seq
,
550 const std::vector
<double> &mass_list
,
551 const double N_terminal_mass
,
552 const unsigned mass_regime
=0) NOTHROW
{
553 const parameters
&CP
= parameters::the
;
554 double m
= (0.0 + (CP
.cleavage_N_terminal_mass_change
- CP
.hydrogen_mass
)
557 const int ladder_size
= mass_ladder
.size();
558 for (int i
=0; i
<=ladder_size
-1; i
++) {
560 mass_ladder
[i
] = peak(m
, 1.0);
563 synthesize_ladder_intensities(mass_ladder
, peptide_seq
, false);
568 synthetic_spectra(spectrum synth_sp
[/* max_fragment_charge+1 */][ION_MAX
],
569 const std::string
&peptide_seq
, const double peptide_mass
,
570 const std::vector
<double> &mass_list
,
571 const double N_terminal_mass
, const double C_terminal_mass
,
572 const double max_fragment_charge
) NOTHROW
{
573 std::vector
<peak
> mass_ladder(mass_list
.size()-1);
575 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
578 get_synthetic_Y_mass_ladder(mass_ladder
, peptide_seq
, mass_list
,
582 get_synthetic_B_mass_ladder(mass_ladder
, peptide_seq
, mass_list
,
589 //for (unsigned int i=0; i<mass_ladder.size(); i++)
590 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
592 for (int charge
=1; charge
<=max_fragment_charge
; charge
++) {
593 spectrum
&sp
= synth_sp
[charge
][ion_type
];
594 sp
.mass
= peptide_mass
;
596 sp
.peaks
.resize(mass_ladder
.size());
597 for (std::vector
<peak
>::size_type i
=0; i
<mass_ladder
.size(); i
++) {
598 sp
.peaks
[i
].intensity
= mass_ladder
[i
].intensity
;
599 sp
.peaks
[i
].mz
= peak::get_mz(mass_ladder
[i
].mz
, charge
);
606 // Return the similarity score between this spectrum and that, and also a
607 // count of common peaks in *peak_count.
609 // xtandem quantizes mz values into bins of width fragment_mass_error first,
610 // and adds a bin on either side. This makes the effective fragment mass
611 // error for each peak an arbitrary value between 1x and 2x the parameter,
612 // based on quantization of that peak. If quirks_mode is on, we'll try to
613 // emulate this behavior.
615 score_similarity_(const spectrum
&x
, const spectrum
&y
,
616 int *peak_count
) NOTHROW
{
617 const double frag_err
= parameters::the
.fragment_mass_error
;
618 const bool quirks_mode
= parameters::the
.quirks_mode
;
619 const double error_limit
= quirks_mode
? frag_err
*1.5 : frag_err
;
623 std::vector
<peak
>::const_iterator x_it
= x
.peaks
.begin();
624 std::vector
<peak
>::const_iterator y_it
= y
.peaks
.begin();
626 while (x_it
!= x
.peaks
.end() and y_it
!= y
.peaks
.end()) {
627 double x_mz
= x_it
->mz
, y_mz
= y_it
->mz
;
629 const float ffe
= frag_err
;
630 x_mz
= int(static_cast<float>(x_mz
) / ffe
) * ffe
;
631 y_mz
= int(static_cast<float>(y_mz
) / ffe
) * ffe
;
633 // in quirks mode, must be within one frag_err-wide bin
634 const double delta
= y_mz
- x_mz
;
635 if (std::abs(delta
) < error_limit
) {
637 score
+= (x_it
->intensity
* y_it
->intensity
);
638 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
640 // assume peaks aren't "close" together
646 if (delta
> 0) // y_mz > x_mz
657 spectrum::score_similarity(const spectrum
&x
, const spectrum
&y
, int
659 return score_similarity_(x
, y
, peak_count
);
663 // IDEA: could we combine all these spectra and just do the correlation once?
665 // Score the specified spectrum against a group of synthetic fragment
668 score_spectrum(double &hyper_score
, double &convolution_score
,
669 std::vector
<double> &ion_scores
, std::vector
<int> &ion_peaks
,
670 spectrum synth_sp
[/* max_fragment_charge+1 */][ION_MAX
],
671 const int spectrum_id
, const int max_fragment_charge
) NOTHROW
{
672 const parameters
&CP
= parameters::the
;
673 const spectrum sp
= spectrum::searchable_spectra
[spectrum_id
];
674 const int spectrum_charge
= sp
.charge
;
676 convolution_score
= 0;
678 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
680 double i_scores
= 0.0;
681 const int charge_limit
= std::max
<int>(1, spectrum_charge
-1);
682 for (int charge
=1; charge
<=charge_limit
; charge
++) {
683 assert(charge
<= max_fragment_charge
);
685 // convolution score is just sum over charges/ions
686 // hyperscore is product of p! over charges/ions (where p is corr peak
687 // count) times the convolution score (clipped to FLT_MAX)
689 int common_peak_count
; // FIX!
690 double conv_score
= spectrum::score_similarity(synth_sp
[charge
][ion_type
],
691 sp
, &common_peak_count
);
692 i_peaks
+= common_peak_count
;
693 i_scores
+= conv_score
;
694 hyper_score
*= CP
.factorial
[common_peak_count
];
695 convolution_score
+= conv_score
;
698 ion_peaks
[ion_type
] = i_peaks
;
699 ion_scores
[ion_type
] = i_scores
;
701 hyper_score
*= convolution_score
;
705 // FIX: does this actually help inlining?
706 // returns log-scaled hyperscore
708 scale_hyperscore_(double hyper_score
) NOTHROW
{
709 assert(hyper_score
>= 0); // otherwise check for EDOM, ERANGE
710 if (hyper_score
== 0)
711 return -DBL_MAX
; // FIX: this shouldn't be reached?
712 return 4 * std::log10(hyper_score
);
715 scale_hyperscore(double hyper_score
) { return scale_hyperscore_(hyper_score
); }
719 get_peptide_mass(const std::vector
<double> &mass_list
,
720 const double N_terminal_mass
,
721 const double C_terminal_mass
) NOTHROW
{
722 const parameters
&CP
= parameters::the
;
724 double m
= (N_terminal_mass
+ C_terminal_mass
725 + CP
.cleavage_N_terminal_mass_change
726 + CP
.cleavage_C_terminal_mass_change
728 return std::accumulate(mass_list
.begin(), mass_list
.end(), m
);
732 struct mass_trace_list
{
733 mass_trace_item item
;
734 const mass_trace_list
*next
;
736 mass_trace_list(const mass_trace_list
*p
=0) : next(p
) { }
740 // FIX: currently only one mass_list is implemented, meaning that parent and
741 // fragment masses must be the same (e.g., mono/mono). A workaround is to
742 // just increase the parent error plus a bit (the xtandem default seems to
745 // Search for matches of this particular peptide modification variation
746 // against the spectra. Updates score_stats and returns the number of
747 // candidate spectra found.
749 evaluate_peptide_mod_variation(match
&m
, const mass_trace_list
*mtlp
,
750 const std::vector
<double> &mass_list
,
751 const double N_terminal_mass
,
752 const double C_terminal_mass
,
753 score_stats
&stats
) NOTHROW
{
754 const parameters
&CP
= parameters::the
;
755 assert(m
.peptide_sequence
.size() == mass_list
.size());
757 m
.peptide_mass
= get_peptide_mass(mass_list
, N_terminal_mass
,
759 double sp_mass_lb
= m
.peptide_mass
+ CP
.parent_monoisotopic_mass_error_minus
;
760 double sp_mass_ub
= m
.peptide_mass
+ CP
.parent_monoisotopic_mass_error_plus
;
762 const std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
763 candidate_spectra_info_begin
764 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
765 if (candidate_spectra_info_begin
== spectrum::spectrum_mass_index
.end()) {
766 stats
.all_spectra_masses_too_high
= false;
768 return; // spectrum masses all too low to match peptide
770 stats
.all_spectra_masses_too_low
= false;
772 const std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
773 candidate_spectra_info_end
774 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
775 if (candidate_spectra_info_end
== spectrum::spectrum_mass_index
.begin()) {
777 return; // spectrum masses all too high to match peptide
779 stats
.all_spectra_masses_too_high
= false;
781 if (candidate_spectra_info_begin
== candidate_spectra_info_end
) {
783 return; // no spectrum with close-enough parent mass
788 int max_candidate_charge
= 0;
789 for (std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
790 it
=candidate_spectra_info_begin
;
791 it
!= candidate_spectra_info_end
; it
++)
793 = std::max
<int>(max_candidate_charge
,
794 spectrum::searchable_spectra
[it
->second
].charge
);
795 assert(max_candidate_charge
>= 1);
797 int max_fragment_charge
= max_candidate_charge
;
798 if (not CP
.check_all_fragment_charges
)
799 max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
800 assert(max_fragment_charge
<= spectrum::max_supported_charge
);
801 // e.g. +2 -> 'B' -> spectrum
802 // FIX: should this be a std::vector??
803 static spectrum synth_sp
[spectrum::max_supported_charge
+1][ION_MAX
];
804 synthetic_spectra(synth_sp
, m
.peptide_sequence
, m
.peptide_mass
, mass_list
,
805 N_terminal_mass
, C_terminal_mass
, max_fragment_charge
);
807 for (std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
808 candidate_it
= candidate_spectra_info_begin
;
809 candidate_it
!= candidate_spectra_info_end
; candidate_it
++) {
810 m
.spectrum_index
= candidate_it
->second
;
811 stats
.candidate_spectrum_count
++;
812 score_spectrum(m
.hyper_score
, m
.convolution_score
, m
.ion_scores
,
813 m
.ion_peaks
, synth_sp
, m
.spectrum_index
,
814 max_fragment_charge
);
815 if (m
.convolution_score
<= 2.0)
818 //std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
819 //<< " " << m.peptide_mass << " " <<
820 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
822 // update spectrum histograms
823 std::vector
<int> &hh
= stats
.hyperscore_histogram
[m
.spectrum_index
];
824 int binned_scaled_hyper_score
= int(0.5 + scale_hyperscore_(m
.hyper_score
));
825 assert(binned_scaled_hyper_score
>= 0);
827 if (static_cast<int>(hh
.size())-1 < binned_scaled_hyper_score
) {
828 assert(hh
.size() < INT_MAX
/2);
829 hh
.resize(std::max
<unsigned int>(binned_scaled_hyper_score
+1,
832 hh
[binned_scaled_hyper_score
] += 1;
833 // incr m_tPeptideScoredCount
834 // nyi: permute stuff
836 const int sp_ion_count
= m
.ion_peaks
[ION_B
] + m
.ion_peaks
[ION_Y
];
837 if (sp_ion_count
< CP
.minimum_ion_count
)
839 const bool has_b_and_y
= m
.ion_peaks
[ION_B
] and m
.ion_peaks
[ION_Y
];
843 // check that parent masses are within error range (isotope ni)
844 // already done above (why does xtandem do it here?)
845 // if check fails, only eligible for 2nd-best record
847 // Remember all of the highest-hyper-scoring matches against each
848 // spectrum. These might be in multiple domains (pos, length, mods) in
849 // multiple proteins.
851 // To avoid the problems that xtandem has with inexact float-point
852 // calculations, we consider hyperscores within the "epsilon ratio" to
853 // be "equal". The best_match vector will need to be re-screened
854 // against this ratio, because it may finally contain entries which
855 // don't meet our criterion. (We don't take the time to get it exactly
856 // right here, because we will often end up discarding it all anyway.)
858 const double current_ratio
= (m
.hyper_score
859 / stats
.best_score
[m
.spectrum_index
]);
861 if (CP
.quirks_mode
and (m
.hyper_score
< stats
.best_score
[m
.spectrum_index
])
862 or (not CP
.quirks_mode
863 and (current_ratio
< CP
.hyper_score_epsilon_ratio
))) {
864 // This isn't a best score, so see if it's a second-best score.
865 if (m
.hyper_score
> stats
.second_best_score
[m
.spectrum_index
])
866 stats
.second_best_score
[m
.spectrum_index
] = m
.hyper_score
;
869 if (CP
.quirks_mode
and (m
.hyper_score
> stats
.best_score
[m
.spectrum_index
])
870 or (not CP
.quirks_mode
871 and (current_ratio
> (1/CP
.hyper_score_epsilon_ratio
)))) {
872 // This score is significantly better, so forget previous matches.
873 stats
.best_score
[m
.spectrum_index
] = m
.hyper_score
;
874 stats
.best_match
[m
.spectrum_index
].clear();
876 // Now remember this match because it's at least as good as those
878 m
.mass_trace
.clear();
879 for (const mass_trace_list
*p
=mtlp
; p
; p
=p
->next
)
880 m
.mass_trace
.push_back(p
->item
);
881 stats
.best_match
[m
.spectrum_index
].push_back(m
);
886 // The choose_* functions generate the different modification possibilities.
888 // IDEA FIX: Use a cost parameter to define the iterative search front.
891 // These values are used to keep track of the choices made so far (because
892 // later choices may depend on earlier ones).
893 enum chosen
{ CHOSEN_NONE
, CHOSEN_PCA
};
896 // Choose a possible residue modification.
898 choose_residue_mod(match
&m
,
899 const std::vector
< std::vector
<double> > &pmm
,
900 const mass_trace_list
*mtlp
,
901 std::vector
<double> &mass_list
,
902 const double N_terminal_mass
,
903 const double C_terminal_mass
, score_stats
&stats
,
904 const unsigned remaining_residues_to_choose
,
905 const unsigned number_of_positions_to_consider
,
906 const int *const mod_positions_to_consider
) NOTHROW
{
907 const parameters
&CP
= parameters::the
;
908 assert(remaining_residues_to_choose
<= number_of_positions_to_consider
);
910 if (CP
.maximum_modification_combinations_searched
911 and (stats
.combinations_searched
912 >= CP
.maximum_modification_combinations_searched
))
915 if (remaining_residues_to_choose
== 0) {
916 stats
.combinations_searched
++;
917 evaluate_peptide_mod_variation(m
, mtlp
, mass_list
, N_terminal_mass
,
918 C_terminal_mass
, stats
);
920 mass_trace_list
mtl(mtlp
);
922 // consider all of the positions where we could next add a mod
924 i
<number_of_positions_to_consider
-remaining_residues_to_choose
+1;
926 const int pos
=mod_positions_to_consider
[i
];
927 mtl
.item
.position
= pos
;
929 const double save_mass
=mass_list
[pos
];
930 const std::vector
<double> &deltas
= pmm
[m
.peptide_sequence
[pos
]];
931 // step through the deltas for this amino acid
932 for (std::vector
<double>::const_iterator it
=deltas
.begin();
933 it
!= deltas
.end(); it
++) {
934 mtl
.item
.delta
= *it
;
935 mtl
.item
.description
= "position mod"; // FIX
936 mass_list
[pos
] = save_mass
+ *it
;
937 choose_residue_mod(m
, pmm
, &mtl
, mass_list
, N_terminal_mass
,
938 C_terminal_mass
, stats
,
939 remaining_residues_to_choose
-1,
940 number_of_positions_to_consider
-i
-1,
941 mod_positions_to_consider
+i
+1);
943 mass_list
[pos
] = save_mass
;
949 // Choose the number of modified residue positions. Start with zero and count
950 // up to the maximum number (which is not greater than the length of the
953 choose_residue_mod_count(match
&m
,
954 const std::vector
< std::vector
<double> > &pmm
,
955 const mass_trace_list
*mtlp
,
956 std::vector
<double> &mass_list
,
957 const double N_terminal_mass
,
958 const double C_terminal_mass
,
959 score_stats
&stats
) NOTHROW
{
960 const parameters
&CP
= parameters::the
;
961 int mod_positions
[m
.peptide_sequence
.size()+1];
962 unsigned int max_count
=0;
963 for (unsigned i
=0; i
<m
.peptide_sequence
.size(); i
++)
964 if (not pmm
[m
.peptide_sequence
[i
]].empty())
965 mod_positions
[max_count
++] = i
;
966 mod_positions
[max_count
] = -1;
967 stats
.combinations_searched
= 0;
969 unsigned count_limit
= max_count
;
970 if (CP
.maximum_simultaneous_modifications_searched
> 0)
971 count_limit
= std::min
<unsigned int>(count_limit
,
972 CP
.maximum_simultaneous_modifications_searched
);
974 // start at one because the zero case is handled by
975 // choose_potential_mod_alternative below (!)
976 for (unsigned count
=1; count
<= count_limit
; count
++)
977 choose_residue_mod(m
, pmm
, mtlp
, mass_list
, N_terminal_mass
,
978 C_terminal_mass
, stats
, count
, max_count
,
983 // FIX: Is there some clear and efficient way to treat terminal mods in the
984 // same way as residue mods? Or, alternatively, is it possible to gracefully
985 // merge the two choose_?_terminal_mod functions?
987 // Choose a possible peptide C-terminal modification.
989 choose_C_terminal_mod(match
&m
,
990 const std::vector
< std::vector
<double> > &pmm
,
991 const mass_trace_list
*mtlp
,
992 std::vector
<double> &mass_list
,
993 const double N_terminal_mass
,
994 const double C_terminal_mass
,
995 score_stats
&stats
) NOTHROW
{
996 choose_residue_mod_count(m
, pmm
, mtlp
, mass_list
, N_terminal_mass
,
997 C_terminal_mass
, stats
);
999 mass_trace_list
mtl(mtlp
);
1000 mtl
.item
.position
= POSITION_CTERM
;
1001 const std::vector
<double> &C_deltas
= pmm
[']'];
1003 for (std::vector
<double>::const_iterator it
=C_deltas
.begin();
1004 it
!= C_deltas
.end(); it
++) {
1005 mtl
.item
.delta
= *it
;
1006 mtl
.item
.description
= "C-terminal mod"; // FIX
1007 choose_residue_mod_count(m
, pmm
, &mtl
, mass_list
, N_terminal_mass
,
1008 C_terminal_mass
+*it
, stats
);
1013 // Choose a possible peptide N-terminal modification, unless a PCA mod has
1014 // already been chosen.
1016 choose_N_terminal_mod(match
&m
, chosen ch
,
1017 const std::vector
< std::vector
<double> > &pmm
,
1018 const mass_trace_list
*mtlp
,
1019 std::vector
<double> &mass_list
,
1020 const double N_terminal_mass
,
1021 const double C_terminal_mass
,
1022 score_stats
&stats
) NOTHROW
{
1023 choose_C_terminal_mod(m
, pmm
, mtlp
, mass_list
, N_terminal_mass
,
1024 C_terminal_mass
, stats
);
1026 if (ch
== CHOSEN_PCA
)
1028 mass_trace_list
mtl(mtlp
);
1029 mtl
.item
.position
= POSITION_NTERM
;
1030 const std::vector
<double> &N_deltas
= pmm
['['];
1032 for (std::vector
<double>::const_iterator it
=N_deltas
.begin();
1033 it
!= N_deltas
.end(); it
++) {
1034 mtl
.item
.delta
= *it
;
1035 mtl
.item
.description
= "N-terminal mod"; // FIX
1036 choose_C_terminal_mod(m
, pmm
, &mtl
, mass_list
, N_terminal_mass
+*it
,
1037 C_terminal_mass
, stats
);
1043 // Choose none or one of the (';'-separated) potential modification
1044 // alternative sets. In the 'none' case, no explicitly requested potential
1045 // mods will be chosen, but PCA mods (above) will still be searched. If an
1046 // alternative set is chosen, at least one explicit mod must be chosen, to
1047 // avoid overlap with the 'none' case.
1049 choose_potential_mod_alternative(match
&m
, chosen ch
,
1050 const mass_trace_list
*mtlp
,
1051 std::vector
<double> &mass_list
,
1052 const double N_terminal_mass
,
1053 const double C_terminal_mass
,
1054 score_stats
&stats
) NOTHROW
{
1055 const parameters
&CP
= parameters::the
;
1057 // the 'none' case--jump over the rest of the choose_*, since in this case
1058 // they choose nothing
1059 evaluate_peptide_mod_variation(m
, mtlp
, mass_list
, N_terminal_mass
,
1060 C_terminal_mass
, stats
);
1062 const mass_regime_parameters
&mrp
= CP
.fragment_mass_regime
[m
.mass_regime
];
1063 const std::vector
< std::vector
< std::vector
<double> > > &alts
1064 = mrp
.potential_modification_mass
;
1066 for (std::vector
< std::vector
< std::vector
<double> > >::const_iterator
1068 it
!= alts
.end(); it
++)
1069 choose_N_terminal_mod(m
, ch
, *it
, mtlp
, mass_list
, N_terminal_mass
,
1070 C_terminal_mass
, stats
);
1074 // Choose a possible modification to account for PCA (pyrrolidone carboxyl
1075 // acid) circularization of the peptide N-terminal. PCA mods are excluded if
1076 // a static N-terminal mod has been specified. Likewise, choosing a PCA mod
1077 // will exclude choosing a potential N-terminal mod. (The PCA mod for 'C' is
1078 // dependent on a static mod of C+57 being in effect.)
1080 choose_PCA_mod(match
&m
, const mass_trace_list
*mtlp
,
1081 std::vector
<double> &mass_list
,
1082 const double N_terminal_mass
,
1083 const double C_terminal_mass
, score_stats
&stats
) NOTHROW
{
1084 const parameters
&CP
= parameters::the
;
1086 choose_potential_mod_alternative(m
, CHOSEN_NONE
, mtlp
, mass_list
,
1087 N_terminal_mass
, C_terminal_mass
, stats
);
1089 mass_trace_list
mtl(mtlp
);
1090 mtl
.item
.position
= POSITION_NTERM
;
1091 const mass_regime_parameters
&mrp
= CP
.fragment_mass_regime
[m
.mass_regime
];
1093 // now try a possible PCA mod, if there is one
1094 // FIX: somehow xtandem generates these twice??
1095 if (mrp
.modification_mass
['['] == 0)
1096 switch (m
.peptide_sequence
[0]) {
1098 mtl
.item
.delta
= -mrp
.water_mass
;
1099 mtl
.item
.description
= "PCA";
1100 choose_potential_mod_alternative(m
, CHOSEN_PCA
, &mtl
, mass_list
,
1101 N_terminal_mass
- mrp
.water_mass
,
1102 C_terminal_mass
, stats
);
1105 // FIX: need better test for C+57? (symbolic?)
1106 const double C_mod
= mrp
.modification_mass
['C'];
1107 if (CP
.quirks_mode
? int(C_mod
) != 57 : std::abs(C_mod
- 57) > 0.5)
1108 break; // skip unless C+57
1109 // else fall through...
1112 mtl
.item
.delta
= -mrp
.ammonia_mass
;
1113 mtl
.item
.description
= "PCA";
1114 choose_potential_mod_alternative(m
, CHOSEN_PCA
, &mtl
, mass_list
,
1115 N_terminal_mass
- mrp
.ammonia_mass
,
1116 C_terminal_mass
, stats
);
1122 // Choose among the requested mass regimes (e.g., isotopes), and also choose
1123 // the static mods (including N- and C-terminal mods), all of which are
1124 // completely determined by the mass regime choice.
1126 choose_mass_regime(match
&m
, std::vector
<double> &mass_list
,
1127 const double N_terminal_mass
, const double C_terminal_mass
,
1128 score_stats
&stats
) NOTHROW
{
1129 const parameters
&CP
= parameters::the
;
1131 m
.mass_regime
= 0; // FIX
1132 const mass_regime_parameters
&mrp
= CP
.fragment_mass_regime
[m
.mass_regime
];
1133 for (std::vector
<double>::size_type i
=0; i
<m
.peptide_sequence
.size(); i
++) {
1134 const char res
= m
.peptide_sequence
[i
];
1135 mass_list
[i
] = mrp
.residue_mass
[res
] + mrp
.modification_mass
[res
];
1137 choose_PCA_mod(m
, NULL
, mass_list
,
1138 N_terminal_mass
+ mrp
.modification_mass
['['],
1139 C_terminal_mass
+ mrp
.modification_mass
[']'], stats
);
1143 // Search for matches of all modification variations of this peptide against
1144 // the spectra. Updates score_stats and the number of candidate spectra
1147 spectrum::search_peptide_all_mods(int idno
, int offset
, int begin
,
1148 const std::string
&peptide_seq
,
1149 int missed_cleavage_count
,
1150 score_stats
&stats
) {
1151 std::vector
<double> mass_list(peptide_seq
.size());
1152 double N_terminal_mass
= 0.0;
1153 double C_terminal_mass
= 0.0;
1155 // This match will be passed inward and used to record information that we
1156 // need to remember about a match when we finally see one. At that point, a
1157 // copy of this match will be saved.
1159 m
.sequence_index
= idno
;
1160 m
.sequence_offset
= offset
;
1161 m
.peptide_begin
= begin
;
1162 m
.peptide_sequence
= peptide_seq
;
1163 m
.missed_cleavage_count
= missed_cleavage_count
;
1165 choose_mass_regime(m
, mass_list
, N_terminal_mass
, C_terminal_mass
, stats
);
1169 // Search for matches of all modification variations of peptides in this
1170 // sequence run against the spectra. Updates score_stats and the number of
1171 // candidate spectra found.
1172 // FIX: optimize the non-specific case?
1175 spectrum::search_run_all_mods(const int maximum_missed_cleavage_sites
,
1176 const int min_peptide_length
,
1177 const bool no_N_term_mods
,
1178 const int idno
, const int offset
,
1179 const std::string
&run_sequence
,
1180 const std::vector
<int> cleavage_points
,
1181 score_stats
&stats
) {
1182 double N_terminal_mass
= 0.0;
1183 double C_terminal_mass
= 0.0;
1185 // This match will be passed inward and used to record information that we
1186 // need to remember about a match when we finally see one. At that point, a
1187 // copy of this match will be saved.
1189 m
.sequence_index
= idno
;
1190 m
.sequence_offset
= offset
;
1192 // FIX: examine carefully for signed/unsigned problems
1194 // The rightmost 'end' seen where 'all_spectra_masses_too_high' was true.
1195 // (0 means none yet encountered.)
1196 unsigned int next_end
= 0;
1198 for (unsigned int begin
=0; begin
<cleavage_points
.size()-1; begin
++) {
1199 const int begin_index
= m
.peptide_begin
= cleavage_points
[begin
];
1200 unsigned int end
= begin
+ 1;
1203 end
= std::max
<unsigned int>(end
, next_end
);
1204 for (; end
<cleavage_points
.size(); end
++) {
1205 m
.missed_cleavage_count
= end
- begin
- 1;
1206 if (m
.missed_cleavage_count
> maximum_missed_cleavage_sites
)
1209 const int end_index
= cleavage_points
[end
];
1210 const int peptide_size
= end_index
- begin_index
;
1211 if (peptide_size
< min_peptide_length
)
1213 // these start vacuously true, then to be falsified if possible
1214 stats
.all_spectra_masses_too_high
= true;
1215 stats
.all_spectra_masses_too_low
= true;
1216 m
.peptide_sequence
.assign(run_sequence
, begin_index
, peptide_size
);
1217 //std::cerr << "peptide: " << m.peptide_sequence << std::endl;
1218 std::vector
<double> mass_list(peptide_size
);
1219 choose_mass_regime(m
, mass_list
, N_terminal_mass
, C_terminal_mass
,
1222 // FIX: double-check that these work right even with negative potential
1224 assert(not (stats
.all_spectra_masses_too_high
1225 and stats
.all_spectra_masses_too_low
));
1226 if (stats
.all_spectra_masses_too_high
) {
1227 //std::cerr << "all_spectra_masses_too_high" << std::endl;
1228 if (end
== cleavage_points
.size() - 1)
1232 if (stats
.all_spectra_masses_too_low
) {
1233 //std::cerr << "all_spectra_masses_too_low" << std::endl;