1 // C++ module for greylag
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "cgreylag.hpp"
36 #define NOTHROW __attribute__ ((nothrow))
37 // use faster, non-threadsafe versions, since we don't use threads
38 #define fgetsX fgets_unlocked
39 #define fputsX fputs_unlocked
40 #define fprintfX __builtin_fprintf_unlocked
41 #define ferrorX ferror_unlocked
44 #define fgetsX std::fgets
45 #define fputsX std::fputs
46 #define fprintfX std::fprintf
47 #define ferrorX std::ferror
51 parameters
parameters::the
;
55 peak::__repr__() const {
56 static char temp
[128];
57 sprintf(temp
, "<peak mz=%.4f intensity=%.4f>", mz
, intensity
);
61 const int FIRST_SPECTRUM_ID
= 1;
62 int spectrum::next_id
= FIRST_SPECTRUM_ID
;
63 int spectrum::next_physical_id
= FIRST_SPECTRUM_ID
;
65 std::vector
<spectrum
> spectrum::searchable_spectra
;
66 // parent mass -> searchable_spectra index
68 std::vector
<spectrum
>::size_type
> spectrum::spectrum_mass_index
;
71 std::multimap
<double, std::vector
<spectrum
>::size_type
>::const_iterator
76 spectrum::__repr__() const {
77 static char temp
[1024];
79 "<spectrum #%d (phys #%d) '%s' %.4f/%+d"
80 " [%d peaks, maxI=%f, sumI=%f]>",
81 id
, physical_id
, name
.c_str(), mass
, charge
, peaks
.size(),
82 max_peak_intensity
, sum_peak_intensity
);
88 // This is an error exit for read_spectra and filter_ms2_by_mass.
90 io_error(FILE *f
, const char *message
="") {
92 message
= "I/O error while reading (or writing) spectrum file";
93 throw std::ios_base::failure(message
);
97 // Return sorted list of parent masses present in the given ms2 files.
99 spectrum::read_ms2_spectrum_masses(std::vector
<int> fds
) {
100 std::vector
<double> masses
;
102 for (std::vector
<int>::const_iterator fdit
=fds
.begin(); fdit
!= fds
.end();
104 FILE *f
= fdopen(*fdit
, "r");
106 const int bufsiz
= 1024;
109 char *result
= fgetsX(buf
, bufsiz
, f
);
115 if (not result
or buf
[0] != ':')
117 if (not fgetsX(buf
, bufsiz
, f
))
118 io_error(f
, "bad ms2 format: mass/charge line expected");
120 masses
.push_back(std::strtod(buf
, &endp
)); // need double accuracy here
121 if (errno
or endp
== buf
)
122 io_error(f
, "bad ms2 format: bad mass");
123 result
= fgetsX(buf
, bufsiz
, f
);
128 std::vector
<peak
> peaks
;
130 result
= fgetsX(buf
, bufsiz
, f
);
133 if (not result
or buf
[0] == ':')
138 std::sort(masses
.begin(), masses
.end());
143 // Read spectra from file in ms2 format, tagging them with file_id. Spectra
144 // with charge zero are omitted from the result. (All of them are read,
145 // though, to keep spectrum ids in sync.) If file_id == -1, the ms2 file is
146 // an annotated file produced by split_ms2_by_mass_band.
148 // Multiply charge spectra (e.g., +2/+3) are split into separate spectra
149 // having the same physical id. The peak list is initially sorted by mz.
150 // Throws an exception on invalid input.
152 std::vector
<spectrum
>
153 spectrum::read_spectra_from_ms2(FILE *f
, const int file_id
) {
154 std::vector
<spectrum
> spectra
;
156 const int bufsiz
= 1024;
160 char *result
= fgetsX(buf
, bufsiz
, f
);
162 std::vector
<std::string
> names
;
163 std::vector
<double> masses
;
164 std::vector
<int> charges
;
166 std::vector
<int> file_ids
;
167 std::vector
<int> physical_ids
;
168 std::vector
<int> ids
;
174 if (not result
or buf
[0] != ':')
177 names
.push_back(std::string(buf
+1, buf
+std::strlen(buf
)-1));
179 char *anno_hash
= std::strrchr(buf
, '#'); // const, but C++ doesn't
180 // like it declared thus--yuck
182 io_error(f
, "bad ms2+ format: '#' not found");
183 names
.push_back(std::string(buf
+1, anno_hash
));
185 file_ids
.push_back(std::strtol(anno_hash
+1, &endp
, 10));
186 physical_ids
.push_back(std::strtol(endp
+1, &endp
, 10));
187 ids
.push_back(std::strtol(endp
+1, &endp
, 10));
188 if (errno
) // FIX: improve error check
189 io_error(f
, "bad ms2+ format: bad values");
191 if (not fgetsX(buf
, bufsiz
, f
))
192 io_error(f
, "bad ms2 format: mass/charge line expected");
194 masses
.push_back(std::strtod(buf
, &endp
)); // need double accuracy here
195 if (errno
or endp
== buf
)
196 io_error(f
, "bad ms2 format: bad mass");
197 const char *endp0
= endp
;
198 charges
.push_back(std::strtol(endp0
, &endp
, 10));
199 if (errno
or endp
== endp0
)
200 io_error(f
, "bad ms2 format: bad charge");
201 result
= fgetsX(buf
, bufsiz
, f
);
206 std::vector
<peak
> peaks
;
210 p
.mz
= strtod(buf
, &endp
);
211 if (errno
or endp
== buf
)
212 io_error(f
, "bad ms2 format: bad peak mz");
213 const char *endp0
= endp
;
214 p
.intensity
= strtod(endp0
, &endp
);
215 if (errno
or endp
== endp0
)
216 io_error(f
, "bad ms2 format: bad peak intensity");
219 result
= fgetsX(buf
, bufsiz
, f
);
222 if (not result
or buf
[0] == ':')
226 // add spectra to vector
227 if (names
.empty() and not peaks
.empty())
228 io_error(f
, "bad ms2 format: missing header lines?");
230 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
232 for (std::vector
<double>::size_type i
=0; i
<names
.size(); i
++) {
233 spectrum
sp(masses
[i
], charges
[i
]);
237 sp
.file_id
= file_id
;
238 sp
.physical_id
= next_physical_id
;
240 sp
.file_id
= file_ids
[i
];
241 sp
.physical_id
= physical_ids
[i
];
243 spectrum::next_id
= std::max(spectrum::next_id
, sp
.id
+1);
244 spectrum::next_physical_id
= std::max(spectrum::next_physical_id
,
247 sp
.calculate_intensity_statistics();
248 spectra
.push_back(sp
);
251 spectrum::next_physical_id
+= 1;
257 // Copy spectra from an ms2 file to a set of output ms2 files, one for each
258 // band in the set of mass bands described by their upper bounds. Extra
259 // information is written in the first header line of each spectra so that
260 // id's may be recovered. So, for example, ':0002.0002.1' might become
261 // ':0002.0002.1 # 0 45 78', where 0, 45, 78 are the spectrum's file_id,
262 // physical_id, and id, respectively. Multiply charged spectra will be
263 // split into separate spectra (having the same physical_id).
265 // FIX: cleaner to just pass in a vector of filenames, rather than fds?
267 spectrum::split_ms2_by_mass_band(FILE *inf
, const std::vector
<int> &outfds
,
269 const std::vector
<double>
270 &mass_band_upper_bounds
) {
271 std::map
<double, FILE *> mass_file_index
;
273 if (outfds
.size() < 1
274 or outfds
.size() != mass_band_upper_bounds
.size())
275 throw std::invalid_argument("invalid argument value");
277 for (std::vector
<int>::size_type i
=0; i
<outfds
.size(); i
++) {
278 FILE *f
= fdopen(outfds
[i
], "w");
280 throw std::ios_base::failure("fdopen failed: fd limit exceeded?");
281 mass_file_index
.insert(std::make_pair(mass_band_upper_bounds
[i
], f
));
284 int o_next_id
= FIRST_SPECTRUM_ID
;
285 int o_next_physical_id
= FIRST_SPECTRUM_ID
;
287 const int bufsiz
= 1024;
288 char buf0
[bufsiz
], buf1
[bufsiz
];
290 char *result
= fgetsX(buf0
, bufsiz
, inf
);
293 std::set
<FILE *> sp_files
;
299 if (not result
or buf0
[0] != ':')
301 if (not fgetsX(buf1
, bufsiz
, inf
))
302 io_error(inf
, "bad ms2 format: mass/charge line expected");
304 mass
= std::strtod(buf1
, &endp
); // need double accuracy here
305 if (errno
or endp
== buf1
)
306 io_error(inf
, "bad ms2 format: bad mass");
308 std::map
<double, FILE *>::const_iterator bit
309 = mass_file_index
.lower_bound(mass
);
310 if (bit
== mass_file_index
.end())
311 throw std::invalid_argument("internal error: mass out of range");
312 FILE *bandf
= bit
->second
;
313 sp_files
.insert(bandf
);
314 assert(buf0
[strlen(buf0
)-1] == '\n');
315 buf0
[strlen(buf0
)-1] = 0; // chop newline
316 fprintfX(bandf
, "%s # %d %d %d\n", buf0
, file_id
, o_next_physical_id
,
320 io_error(bandf
, "error writing ms2 file");
321 result
= fgetsX(buf0
, bufsiz
, inf
);
325 if (sp_files
.empty())
326 io_error(inf
, "bad ms2 format: missing header lines?");
331 for (std::set
<FILE *>::const_iterator fit
= sp_files
.begin();
332 fit
!= sp_files
.end(); fit
++) {
335 io_error(*fit
, "error writing ms2 file");
337 result
= fgetsX(buf0
, bufsiz
, inf
);
340 if (not result
or buf0
[0] == ':')
343 o_next_physical_id
+= 1;
346 // flush all output files (is this necessary?)
347 for (std::map
<double, FILE *>::const_iterator it
= mass_file_index
.begin();
348 it
!= mass_file_index
.end(); it
++)
349 std::fflush(it
->second
);
353 // For each *non-overlapping* group of peaks with mz within an interval of
354 // (less than) length 'limit', keep only the most intense peak. (The
355 // algorithm works from the right, which matter here.)
357 // FIX: Is this really the best approach? The obvious alternative would be to
358 // eliminate peaks so that no interval of length 'limit' has more than one
359 // peak (eliminating weak peaks to make this so).
362 remove_close_peaks(std::vector
<peak
> &peaks
, double limit
) {
363 for (std::vector
<peak
>::size_type i
=0; i
+1<peaks
.size(); i
++) {
364 double ub_mz
= peaks
[i
].mz
+ limit
;
365 while (i
+1 < peaks
.size() and peaks
[i
+1].mz
< ub_mz
)
366 peaks
.erase(peaks
.begin()
367 + (peaks
[i
+1].intensity
> peaks
[i
].intensity
373 // Examine this spectrum to see if it should be kept. If so, return true and
374 // sort the peaks by mz, normalize them and limit their number.
376 spectrum::filter_and_normalize(double minimum_fragment_mz
,
377 double dynamic_range
, int minimum_peaks
,
379 // "spectrum, maximum parent charge", noise suppression check,
380 // "spectrum, minimum parent m+h" not implemented
382 if (minimum_fragment_mz
< 0 or dynamic_range
<= 0 or minimum_peaks
< 0
384 throw std::invalid_argument("invalid argument value");
387 // >>> removes multiple entries within 0.95 Da of each other, retaining the
388 // highest value. this is necessary because of the behavior of some peak
389 // finding routines in commercial software
391 // peaks already sorted by mz upon read
392 remove_close_peaks(peaks
, 0.95);
395 // >>> set up m/z regions to ignore: those immediately below the m/z of the
396 // parent ion which will contain uninformative neutral loss ions, and those
397 // immediately above the parent ion m/z, which will contain the parent ion
398 // and its isotope pattern
400 const double parent_mz
= 1.00727 + (mass
- 1.00727) / charge
;
401 for (std::vector
<peak
>::size_type i
=0; i
<peaks
.size();)
402 if (parent_mz
>= peaks
[i
].mz
403 ? std::abs(parent_mz
- peaks
[i
].mz
) < (50.0 / charge
)
404 : std::abs(parent_mz
- peaks
[i
].mz
) < (5.0 / charge
))
405 peaks
.erase(peaks
.begin() + i
);
411 std::vector
<peak
>::iterator pi
;
412 for (pi
=peaks
.begin(); pi
!= peaks
.end() and pi
->mz
<= minimum_fragment_mz
;
414 peaks
.erase(peaks
.begin(), pi
);
417 // normalize spectrum
418 // >>> use the dynamic range parameter to set the maximum intensity value
419 // for the spectrum. then remove all peaks with a normalized intensity < 1
422 std::vector
<peak
>::iterator most_intense_peak
423 = std::max_element(peaks
.begin(), peaks
.end(), peak::less_intensity
);
425 = std::max
<double>(1.0, most_intense_peak
->intensity
) / dynamic_range
;
426 for (std::vector
<peak
>::iterator p
=peaks
.begin(); p
!= peaks
.end(); p
++)
427 p
->intensity
/= normalization_factor
;
428 for (std::vector
<peak
>::size_type i
=0; i
<peaks
.size();)
429 if (peaks
[i
].intensity
< 1.0)
430 peaks
.erase(peaks
.begin() + i
);
434 if (peaks
.size() < (unsigned int) minimum_peaks
)
437 // # check is_noise (NYI)
438 // # * is_noise attempts to determine if the spectrum is simply
439 // # noise. if the spectrum
440 // # * does not have any peaks within a window near the parent ion mass,
441 // # it is considered
443 // #limit = sp.mass / sp.charge
444 // #if sp.charge < 3:
445 // # limit = sp.mass - 600.0
446 // #if not [ True for mz, i in sp.peaks if mz > limit ]:
447 // # return "looks like noise"
449 // clean_isotopes removes peaks that are probably C13 isotopes
450 remove_close_peaks(peaks
, 1.5);
452 // keep only the most intense peaks
453 if (peaks
.size() > (unsigned int) total_peaks
) {
454 std::sort(peaks
.begin(), peaks
.end(), peak::less_intensity
);
455 peaks
.erase(peaks
.begin(), peaks
.end() - total_peaks
);
456 std::sort(peaks
.begin(), peaks
.end(), peak::less_mz
);
459 // update statistics, now that we've removed some peaks
460 calculate_intensity_statistics();
466 // Store the list of spectra that search_peptide will search against, and also
467 // build spectrum_mass_index.
469 spectrum::set_searchable_spectra(const std::vector
<spectrum
> &spectra
) {
470 searchable_spectra
= spectra
;
471 for (std::vector
<spectrum
>::size_type i
=0; i
<spectra
.size(); i
++)
472 spectrum_mass_index
.insert(std::make_pair(spectra
[i
].mass
, i
));
476 // Multiply peak intensities by residue-dependent factors to generate a more
477 // realistic spectrum. If is_XYZ, walk through the peptide sequence in
480 synthesize_ladder_intensities(std::vector
<peak
> &mass_ladder
,
481 const std::string
&peptide_seq
,
482 const bool is_XYZ
) NOTHROW
{
483 const parameters
&CP
= parameters::the
;
485 assert(mass_ladder
.size() == peptide_seq
.size() - 1);
486 assert(mass_ladder
.size() >= 2);
488 // An intensity boost is given for the second peptide bond/peak (*) if the
489 // N-terminal residue is 'P':
492 const int proline_bonus_position
= is_XYZ
? mass_ladder
.size()-2 : 1;
493 mass_ladder
[proline_bonus_position
].intensity
= 3.0;
494 if (peptide_seq
[1] == 'P')
495 mass_ladder
[proline_bonus_position
].intensity
= 10.0;
497 if (CP
.spectrum_synthesis
)
498 for (unsigned int i
=0; i
<mass_ladder
.size(); i
++) {
499 // FIX: there must be a better way
500 const char peptide_index
= is_XYZ
? mass_ladder
.size()-i
-1 : i
;
501 switch (peptide_seq
[peptide_index
]) {
503 mass_ladder
[i
].intensity
*= 5.0;
509 mass_ladder
[i
].intensity
*= 3.0;
513 mass_ladder
[i
].intensity
*= 2.0;
516 switch (peptide_seq
[peptide_index
+1]) {
518 mass_ladder
[i
].intensity
*= 5.0;
525 // Calculate peaks for a synthesized mass (not mz) ladder.
527 get_synthetic_Y_mass_ladder(std::vector
<peak
> &mass_ladder
,
528 const std::string
&peptide_seq
,
529 const std::vector
<double> &mass_list
,
530 const double fragment_C_fixed_mass
) NOTHROW
{
531 assert(peptide_seq
.size() == mass_list
.size());
532 double m
= fragment_C_fixed_mass
;
534 const int ladder_size
= mass_ladder
.size();
535 for (int i
=ladder_size
-1; i
>=0; i
--) {
537 mass_ladder
[ladder_size
-1-i
] = peak(m
, 1.0);
540 synthesize_ladder_intensities(mass_ladder
, peptide_seq
, true);
544 // Calculate peaks for a synthesized mass (not mz) ladder.
546 get_synthetic_B_mass_ladder(std::vector
<peak
> &mass_ladder
,
547 const std::string
&peptide_seq
,
548 const std::vector
<double> &mass_list
,
549 const double fragment_N_fixed_mass
) NOTHROW
{
550 double m
= fragment_N_fixed_mass
;
552 const int ladder_size
= mass_ladder
.size();
553 for (int i
=0; i
<=ladder_size
-1; i
++) {
555 mass_ladder
[i
] = peak(m
, 1.0);
558 synthesize_ladder_intensities(mass_ladder
, peptide_seq
, false);
563 synthetic_spectra(spectrum synth_sp
[/* max_fragment_charge+1 */][ION_MAX
],
564 const std::string
&peptide_seq
,
565 const std::vector
<double> &mass_list
,
566 const double fragment_N_fixed_mass
,
567 const double fragment_C_fixed_mass
,
568 const double max_fragment_charge
) NOTHROW
{
569 std::vector
<peak
> mass_ladder(mass_list
.size()-1);
571 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
574 get_synthetic_Y_mass_ladder(mass_ladder
, peptide_seq
, mass_list
,
575 fragment_C_fixed_mass
);
578 get_synthetic_B_mass_ladder(mass_ladder
, peptide_seq
, mass_list
,
579 fragment_N_fixed_mass
);
585 //for (unsigned int i=0; i<mass_ladder.size(); i++)
586 // std::cerr << "ladder step " << peptide_seq[i] << " " << mass_ladder[i].mz << std::endl;
588 for (int charge
=1; charge
<=max_fragment_charge
; charge
++) {
589 spectrum
&sp
= synth_sp
[charge
][ion_type
];
590 //sp.mass = fragment_peptide_mass;
592 sp
.peaks
.resize(mass_ladder
.size());
593 for (std::vector
<peak
>::size_type i
=0; i
<mass_ladder
.size(); i
++) {
594 sp
.peaks
[i
].intensity
= mass_ladder
[i
].intensity
;
595 sp
.peaks
[i
].mz
= peak::get_mz(mass_ladder
[i
].mz
, charge
);
602 // Return the similarity score between this spectrum and that, and also a
603 // count of common peaks in *peak_count.
605 // xtandem quantizes mz values into bins of width fragment_mass_error first,
606 // and adds a bin on either side. This makes the effective fragment mass
607 // error for each peak an arbitrary value between 1x and 2x the parameter,
608 // based on quantization of that peak. If quirks_mode is on, we'll try to
609 // emulate this behavior.
611 score_similarity_(const spectrum
&x
, const spectrum
&y
,
612 int *peak_count
) NOTHROW
{
613 const double frag_err
= parameters::the
.fragment_mass_error
;
614 const bool quirks_mode
= parameters::the
.quirks_mode
;
615 const double error_limit
= quirks_mode
? frag_err
*1.5 : frag_err
;
619 std::vector
<peak
>::const_iterator x_it
= x
.peaks
.begin();
620 std::vector
<peak
>::const_iterator y_it
= y
.peaks
.begin();
622 while (x_it
!= x
.peaks
.end() and y_it
!= y
.peaks
.end()) {
623 double x_mz
= x_it
->mz
, y_mz
= y_it
->mz
;
625 const float ffe
= frag_err
;
626 x_mz
= int(static_cast<float>(x_mz
) / ffe
) * ffe
;
627 y_mz
= int(static_cast<float>(y_mz
) / ffe
) * ffe
;
629 // in quirks mode, must be within one frag_err-wide bin
630 const double delta
= y_mz
- x_mz
;
631 if (std::abs(delta
) < error_limit
) {
633 score
+= (x_it
->intensity
* y_it
->intensity
);
634 //std::cerr << "hit " << x_mz << " vs " << y_mz << ", i = " << x_it->intensity * y_it->intensity << std::endl;
647 spectrum::score_similarity(const spectrum
&x
, const spectrum
&y
, int
649 return score_similarity_(x
, y
, peak_count
);
653 // IDEA: could we combine all these spectra and just do the correlation once?
655 // Score the specified spectrum against a group of synthetic fragment
658 score_spectrum(double &hyper_score
, double &convolution_score
,
659 std::vector
<double> &ion_scores
, std::vector
<int> &ion_peaks
,
660 spectrum synth_sp
[/* max_fragment_charge+1 */][ION_MAX
],
661 const int spectrum_id
, const int max_fragment_charge
) NOTHROW
{
662 const parameters
&CP
= parameters::the
;
663 const spectrum sp
= spectrum::searchable_spectra
[spectrum_id
];
664 const int spectrum_charge
= sp
.charge
;
666 convolution_score
= 0;
668 for (ion ion_type
=ION_MIN
; ion_type
<ION_MAX
; ion_type
++) {
670 double i_scores
= 0.0;
671 const int charge_limit
= std::max
<int>(1, spectrum_charge
-1);
672 for (int charge
=1; charge
<=charge_limit
; charge
++) {
673 assert(charge
<= max_fragment_charge
);
675 // convolution score is just sum over charges/ions
676 // hyperscore is product of p! over charges/ions (where p is corr peak
677 // count) times the convolution score (clipped to FLT_MAX)
679 int common_peak_count
; // FIX!
680 double conv_score
= score_similarity_(synth_sp
[charge
][ion_type
],
681 sp
, &common_peak_count
);
682 i_peaks
+= common_peak_count
;
683 i_scores
+= conv_score
;
684 hyper_score
*= CP
.factorial
[common_peak_count
];
685 convolution_score
+= conv_score
;
688 ion_peaks
[ion_type
] = i_peaks
;
689 ion_scores
[ion_type
] = i_scores
;
691 hyper_score
*= convolution_score
;
695 // FIX: does this actually help inlining?
696 // returns log-scaled hyperscore
698 scale_hyperscore_(double hyper_score
) NOTHROW
{
699 assert(hyper_score
>= 0); // otherwise check for EDOM, ERANGE
700 if (hyper_score
== 0)
701 return -DBL_MAX
; // FIX: this shouldn't be reached?
702 return 4 * std::log10(hyper_score
);
705 scale_hyperscore(double hyper_score
) { return scale_hyperscore_(hyper_score
); }
708 struct mass_trace_list
{
709 mass_trace_item item
;
710 const mass_trace_list
*next
;
712 mass_trace_list(const mass_trace_list
*p
=0) : next(p
) { }
716 // Search for matches of this particular peptide modification variation
717 // against the spectra. Updates score_stats and returns the number of
718 // candidate spectra found.
720 evaluate_peptide(const search_context
&context
, match
&m
,
721 const mass_trace_list
*mtlp
,
722 const std::vector
<double> &mass_list
,
723 const spmi_c_it
&candidate_spectra_info_begin
,
724 const spmi_c_it
&candidate_spectra_info_end
,
725 score_stats
&stats
) NOTHROW
{
726 const parameters
&CP
= parameters::the
;
727 assert(m
.peptide_sequence
.size() == mass_list
.size());
729 int max_candidate_charge
= 0;
730 for (spmi_c_it it
=candidate_spectra_info_begin
;
731 it
!= candidate_spectra_info_end
; it
++)
733 = std::max
<int>(max_candidate_charge
,
734 spectrum::searchable_spectra
[it
->second
].charge
);
735 assert(max_candidate_charge
>= 1);
737 int max_fragment_charge
= max_candidate_charge
;
738 if (not CP
.check_all_fragment_charges
)
739 max_fragment_charge
= std::max
<int>(1, max_candidate_charge
-1);
740 assert(max_fragment_charge
<= spectrum::max_supported_charge
);
741 // e.g. +2 -> 'B' -> spectrum
742 // FIX: should this be a std::vector??
743 static spectrum synth_sp
[spectrum::max_supported_charge
+1][ION_MAX
];
744 synthetic_spectra(synth_sp
, m
.peptide_sequence
, mass_list
,
745 context
.fragment_N_fixed_mass
,
746 context
.fragment_C_fixed_mass
, max_fragment_charge
);
748 for (spmi_c_it candidate_it
= candidate_spectra_info_begin
;
749 candidate_it
!= candidate_spectra_info_end
; candidate_it
++) {
750 m
.spectrum_index
= candidate_it
->second
;
751 stats
.candidate_spectrum_count
+= 1;
752 if (CP
.estimate_only
)
755 //std::cerr << "sp " << m.spectrum_index << std::endl;
756 score_spectrum(m
.hyper_score
, m
.convolution_score
, m
.ion_scores
,
757 m
.ion_peaks
, synth_sp
, m
.spectrum_index
,
758 max_fragment_charge
);
759 //std::cerr << "score " << m.convolution_score << std::endl;
760 if (m
.convolution_score
<= 2.0)
763 ////std::cerr << "histod: " << m.spectrum_index << " " << m.peptide_sequence
764 //<< " " << m.peptide_mass << " " <<
765 //spectrum::searchable_spectra[m.spectrum_index].mass << std::endl;
767 // update spectrum histograms
768 std::vector
<int> &hh
= stats
.hyperscore_histogram
[m
.spectrum_index
];
769 int binned_scaled_hyper_score
= int(0.5 + scale_hyperscore_(m
.hyper_score
));
770 assert(binned_scaled_hyper_score
>= 0);
772 if (static_cast<int>(hh
.size())-1 < binned_scaled_hyper_score
) {
773 assert(hh
.size() < INT_MAX
/2);
774 hh
.resize(std::max
<unsigned int>(binned_scaled_hyper_score
+1,
777 hh
[binned_scaled_hyper_score
] += 1;
778 // incr m_tPeptideScoredCount
779 // nyi: permute stuff
781 const int sp_ion_count
= m
.ion_peaks
[ION_B
] + m
.ion_peaks
[ION_Y
];
782 if (sp_ion_count
< CP
.minimum_ion_count
)
784 const bool has_b_and_y
= m
.ion_peaks
[ION_B
] and m
.ion_peaks
[ION_Y
];
788 // check that parent masses are within error range (isotope ni) already
789 // done above (why does xtandem do it here? something to do with isotope
790 // jitter?) if check fails, only eligible for 2nd-best record
792 // Remember all of the highest-hyper-scoring matches against each
793 // spectrum. These might be in multiple domains (pos, length, mods) in
794 // multiple proteins.
796 // To avoid the problems that xtandem has with inexact float-point
797 // calculations, we consider hyperscores within the "epsilon ratio" to be
798 // "equal". The best_match vector will need to be re-screened against
799 // this ratio, because it may finally contain entries which don't meet our
800 // criterion. (We don't take the time to get it exactly right here,
801 // because we will often end up discarding these later anyway.)
803 const double current_ratio
= (m
.hyper_score
804 / stats
.best_score
[m
.spectrum_index
]);
806 if (CP
.quirks_mode
and (m
.hyper_score
< stats
.best_score
[m
.spectrum_index
])
807 or (not CP
.quirks_mode
808 and (current_ratio
< CP
.hyper_score_epsilon_ratio
))) {
809 // This isn't a best score, so see if it's a second-best score.
810 if (m
.hyper_score
> stats
.second_best_score
[m
.spectrum_index
])
811 stats
.second_best_score
[m
.spectrum_index
] = m
.hyper_score
;
814 if (CP
.quirks_mode
and (m
.hyper_score
> stats
.best_score
[m
.spectrum_index
])
815 or (not CP
.quirks_mode
816 and (current_ratio
> (1/CP
.hyper_score_epsilon_ratio
)))) {
817 // This score is significantly better, so forget previous matches.
818 stats
.improved_candidates
+= 1;
819 //if (stats.best_match[m.spectrum_index].empty()) why doesn't this work?
820 if (stats
.best_score
[m
.spectrum_index
] == 100.0)
821 stats
.spectra_with_candidates
+= 1;
822 stats
.best_score
[m
.spectrum_index
] = m
.hyper_score
;
823 stats
.best_match
[m
.spectrum_index
].clear();
825 // Now remember this match because it's at least as good as those
827 m
.mass_trace
.clear();
828 for (const mass_trace_list
*p
=mtlp
; p
; p
=p
->next
)
829 m
.mass_trace
.push_back(p
->item
);
830 stats
.best_match
[m
.spectrum_index
].push_back(m
);
835 // IDEA FIX: Use a cost parameter to define the iterative search front.
838 // Choose one possible residue modification position. Once they're all
839 // chosen, then evaluate.
841 choose_residue_mod(const search_context
&context
, match
&m
,
842 const mass_trace_list
*mtlp
, std::vector
<double> &mass_list
,
843 const spmi_c_it
&candidate_spectra_info_begin
,
844 const spmi_c_it
&candidate_spectra_info_end
,
846 std::vector
<int> &db_remaining
,
847 const unsigned remaining_positions_to_choose
,
848 const unsigned next_position_to_consider
) NOTHROW
{
849 const parameters
&CP
= parameters::the
;
850 assert(remaining_positions_to_choose
851 <= m
.peptide_sequence
.size() - next_position_to_consider
);
853 if (CP
.maximum_modification_combinations_searched
854 and (stats
.combinations_searched
855 >= CP
.maximum_modification_combinations_searched
))
858 if (remaining_positions_to_choose
== 0) {
859 stats
.combinations_searched
+= 1;
860 evaluate_peptide(context
, m
, mtlp
, mass_list
, candidate_spectra_info_begin
,
861 candidate_spectra_info_end
, stats
);
863 mass_trace_list
mtl(mtlp
);
865 // consider all of the positions where we could next add a mod
866 for (unsigned int i
=next_position_to_consider
;
867 i
<= m
.peptide_sequence
.size()-remaining_positions_to_choose
; i
++) {
868 mtl
.item
.position
= i
;
869 const double save_pos_mass
=mass_list
[i
];
870 const char pos_res
= m
.peptide_sequence
[i
];
872 // consider the possibilities for this position
873 for (std::vector
<int>::const_iterator
874 it
=context
.delta_bag_lookup
[pos_res
].begin();
875 it
!= context
.delta_bag_lookup
[pos_res
].end(); it
++) {
876 const int db_index
= *it
;
877 if (db_remaining
[db_index
] < 1)
879 db_remaining
[db_index
] -= 1;
880 mass_list
[i
] = save_pos_mass
+ context
.delta_bag_delta
[db_index
];
881 mtl
.item
.delta
= context
.delta_bag_delta
[db_index
];
882 choose_residue_mod(context
, m
, &mtl
, mass_list
,
883 candidate_spectra_info_begin
,
884 candidate_spectra_info_end
, stats
, db_remaining
,
885 remaining_positions_to_choose
-1, i
+1);
886 db_remaining
[db_index
] += 1;
888 mass_list
[i
] = save_pos_mass
;
894 // p_begin (p_end) to begin_index (end_index), updating p_mass in the process.
895 // The sign determines whether mass increases or decreases as p_begin (p_end)
896 // is moved forward--so it should be -1 for the begin case and +1 for the end
898 // FIX: is this worth its complexity? kill or else add reset
900 update_p_mass(double &p_mass
, int &p_begin
, int begin_index
, int sign
,
901 const std::string
&run_sequence
,
902 const std::vector
<double> &fixed_residue_mass
) {
903 assert(sign
== +1 or sign
== -1);
904 assert(begin_index
>= 0);
905 const bool moving_forward
= begin_index
>= p_begin
;
906 int p0
=p_begin
, p1
=begin_index
;
907 if (not moving_forward
) {
911 for (int i
=p0
; i
<p1
; i
++)
912 p_mass
+= sign
* fixed_residue_mass
[run_sequence
[i
]];
913 p_begin
= begin_index
;
918 // Search a sequence run for matches according to the context against the
919 // spectra. Updates score_stats and the number of candidate spectra found.
921 // FIX: examine carefully for signed/unsigned problems
924 search_run(const search_context
&context
, const sequence_run
&sequence_run
,
925 score_stats
&stats
) {
926 const parameters
&CP
= parameters::the
;
927 const int min_peptide_length
= std::max
<int>(CP
.minimum_peptide_length
,
929 const std::vector
<double> &fixed_parent_mass \
930 = CP
.parent_mass_regime
[context
.mass_regime_index
].fixed_residue_mass
;
932 const std::string
&run_sequence
= sequence_run
.sequence
;
933 const std::vector
<int> &cleavage_points
= sequence_run
.cleavage_points
;
935 // This match will be passed inward and used to record information that we
936 // need to remember about a match when we finally see one. At that point, a
937 // copy of this match will be saved.
938 match m
; // FIX: move inward?
939 m
.sequence_index
= sequence_run
.sequence_index
;
940 m
.sequence_offset
= sequence_run
.sequence_offset
;
942 assert(context
.delta_bag_delta
.size()
943 == context
.delta_bag_count
.size());
944 // counts remaining as mod positions are chosen
945 std::vector
<int> db_remaining
= context
.delta_bag_count
;
947 // the rightmost 'end' seen when all spectra masses were too high
948 // (0 means none yet encountered.)
949 unsigned int next_end
= 0;
951 // p_mass is the parent mass of the peptide run_sequence[p_begin:p_end]
952 double p_mass
= context
.parent_fixed_mass
;
953 int p_begin
=0, p_end
=0;
955 // FIX: optimize the non-specific cleavage case?
956 for (unsigned int begin
=0; begin
<cleavage_points
.size()-1; begin
++) {
957 const int begin_index
= m
.peptide_begin
= cleavage_points
[begin
];
958 if (not context
.pca_residues
.empty())
959 if (context
.pca_residues
.find(run_sequence
[begin_index
])
960 == std::string::npos
)
962 update_p_mass(p_mass
, p_begin
, begin_index
, -1, run_sequence
,
964 unsigned int end
= std::max
<unsigned int>(begin
+ 1, next_end
);
965 for (; end
<cleavage_points
.size(); end
++) {
966 m
.missed_cleavage_count
= end
- begin
- 1;
967 if (m
.missed_cleavage_count
> context
.maximum_missed_cleavage_sites
)
970 const int end_index
= cleavage_points
[end
];
971 assert(end_index
- begin_index
> 0);
972 const int peptide_size
= end_index
- begin_index
;
973 assert(peptide_size
> 0);
974 if (peptide_size
< min_peptide_length
)
976 update_p_mass(p_mass
, p_end
, end_index
, +1, run_sequence
,
978 //std::cerr << "peptide: " << std::string(run_sequence, begin_index, peptide_size) << " p_mass: " << p_mass << std::endl;
981 double sp_mass_lb
= p_mass
+ CP
.parent_monoisotopic_mass_error_minus
;
982 double sp_mass_ub
= p_mass
+ CP
.parent_monoisotopic_mass_error_plus
;
984 const spmi_c_it candidate_spectra_info_begin
985 = spectrum::spectrum_mass_index
.lower_bound(sp_mass_lb
);
986 if (candidate_spectra_info_begin
== spectrum::spectrum_mass_index
.end()) {
987 // spectrum masses all too low to match peptide (peptide too long)
988 //std::cerr << "peptide: sp low" << std::endl;
992 const spmi_c_it candidate_spectra_info_end
993 = spectrum::spectrum_mass_index
.upper_bound(sp_mass_ub
);
994 if (candidate_spectra_info_end
== spectrum::spectrum_mass_index
.begin()) {
995 // spectrum masses all too high to match peptide
996 // (peptide is too short, so increase end, if possible, else return)
997 //std::cerr << "peptide: sp high" << std::endl;
998 if (end
== cleavage_points
.size() - 1)
1003 if (candidate_spectra_info_begin
== candidate_spectra_info_end
) {
1004 // no spectrum with close-enough parent mass
1005 //std::cerr << "peptide: sp none nearby" << std::endl;
1009 m
.parent_peptide_mass
= p_mass
;
1010 //m.fragment_peptide_mass = ???; unneeded???
1012 m
.peptide_sequence
.assign(run_sequence
, begin_index
, peptide_size
);
1014 std::vector
<double> mass_list(peptide_size
);
1015 for (std::vector
<double>::size_type i
=0; i
<m
.peptide_sequence
.size();
1017 mass_list
[i
] = (CP
.fragment_mass_regime
[context
.mass_regime_index
]
1018 .fixed_residue_mass
[m
.peptide_sequence
[i
]]);
1020 choose_residue_mod(context
, m
, NULL
, mass_list
,
1021 candidate_spectra_info_begin
,
1022 candidate_spectra_info_end
, stats
, db_remaining
,
1023 context
.mod_count
, 0);
1029 // Search all sequence runs for matches according to the context against the
1030 // spectra. Updates score_stats and the number of candidate spectra found.
1032 spectrum::search_runs(const search_context
&context
, score_stats
&stats
) {
1033 const parameters
&CP
= parameters::the
;
1035 const int no_runs
= context
.sequence_runs
.size();
1036 for (int i
=0; i
<no_runs
; i
++) {
1037 search_run(context
, context
.sequence_runs
[i
], stats
);
1039 if (CP
.show_progress
)
1040 std::cerr
<< i
+1 << " of " << no_runs
<< " sequences, "
1041 << stats
.candidate_spectrum_count
<< " cand for "
1042 << stats
.spectra_with_candidates
<< " sp, "
1043 << stats
.improved_candidates
<< "++\r" << std::flush
;
1046 if (CP
.show_progress
)
1047 std::cerr
<< std::endl
;