3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
36 class mass_regime_parameters
{
42 // residue char (incl '['/']') -> mass (per regime) with any fixed mod
43 std::vector
<double> fixed_residue_mass
;
46 // FIX: want these class members to all be static, but had SWIG trouble. The
47 // member 'the' is a handle to the singleton instance, for now.
50 bool estimate_only
; // just estimate work required
51 bool show_progress
; // running progress on stderr
53 std::vector
<double> ln_factorial
; // factorial[n] == ln(n!)
55 // deuterium not (yet) implemented
58 std::vector
<mass_regime_parameters
> parent_mass_regime
;
59 std::vector
<mass_regime_parameters
> fragment_mass_regime
;
62 double parent_mass_tolerance_1
; // for +1
63 double parent_mass_tolerance_max
; // for +N (typically +3)
65 int minimum_peptide_length
;
67 double fragment_mass_tolerance
;
68 int intensity_class_count
;
70 static parameters the
;
76 int sequence_index
; // index of this sequence in the database FIX:nn
77 int sequence_offset
; // offset of this run's start in sequence
79 std::vector
<int> cleavage_points
;
80 std::string name
; // e.g., initial defline word
83 sequence_run(const int sequence_index
, const int sequence_offset
,
84 const std::string sequence
, std::vector
<int> cleavage_points
,
85 const std::string name
)
86 : sequence_index(sequence_index
), sequence_offset(sequence_offset
),
87 sequence(sequence
), cleavage_points(cleavage_points
),
90 // generate non-specific cleavage points
91 assert(sequence
.size() < INT_MAX
);
92 if (this->cleavage_points
.empty())
93 for (int i
=0; i
<=static_cast<int>(sequence
.size()); i
++)
94 this->cleavage_points
.push_back(i
);
99 // information about the search that's passed in to--but not changed by--the
101 class search_context
{
104 int mass_regime_index
;
105 std::string pca_residues
;
108 double N_delta
; // 0 if none
109 double C_delta
; // 0 if none
111 // maps residue char to list of indices for _count/_delta
112 std::vector
< std::vector
<int> > delta_bag_lookup
;
113 std::vector
<double> delta_bag_delta
;
114 std::vector
<int> delta_bag_count
;
116 double parent_fixed_mass
;
117 double fragment_N_fixed_mass
;
118 double fragment_C_fixed_mass
;
120 // general search parameters
121 int maximum_missed_cleavage_sites
;
123 std::vector
<sequence_run
> sequence_runs
;
127 enum ion
{ ION_MIN
, ION_Y
=ION_MIN
, ION_B
, ION_MAX
}; // NB: [ION_MIN, ION_MAX)
129 inline void operator++(ion
&i
) { i
= ion(i
+ 1); }
130 inline void operator++(ion
&i
, int) { i
= ion(i
+ 1); }
137 int intensity_class
; // 0..N; -1 == unclassified
139 explicit peak(double mz
=0, double intensity
=0, int intensity_class
=-1)
140 : mz(mz
), intensity(intensity
), intensity_class(intensity_class
) {
143 char *__repr__() const;
145 static bool less_mz(peak x
, peak y
) { return x
.mz
< y
.mz
; }
146 // FIX: delete one of these
147 static bool less_intensity(peak x
, peak y
) {
148 return x
.intensity
< y
.intensity
;
150 static bool greater_intensity(peak x
, peak y
) {
151 return x
.intensity
> y
.intensity
;
154 static double get_mz(double mass
, int charge
) {
155 const parameters
&CP
= parameters::the
;
156 return mass
/charge
+ CP
.proton_mass
;
161 // A spectrum searched on multiple charges will be turned into separate
162 // spectra having differing id's, but the same physical id.
166 double mass
; // [M + H+], aka "neutral mass" (protonated)
168 static const int MAX_SUPPORTED_CHARGE
= 15;
169 static const int MAX_THEORETICAL_FRAGMENTS
= 256;
170 std::vector
<peak
> peaks
; // always ordered by increasing m/z!
172 std::vector
<int> intensity_class_counts
; // number of peaks in each class
174 // This is the mz range of peaks, but with a buffer on each end of size
175 // fragment_mass_tolerance.
179 double total_ion_current
;
181 // This is the total number of bins, 2*fragment_tolerance mz wide in peak
182 // range. So, for example, if the fragment tolerance is 1 and the min and
183 // max mz are 200 and 1800, total peak bins would be 800.
185 int empty_peak_bins
; // number of bins that aren't occupied by a peak
188 int file_id
; // file # of spectra's source
189 int id
; // unique for all spectra
190 int physical_id
; // FIX: unneeded?
192 unsigned long comparisons
; // times scored against theoretical spectra
194 // Construct an empty spectrum.
195 explicit spectrum(double mass
=0,
196 int charge
=0) : mass(mass
), charge(charge
), min_peak_mz(0),
197 max_peak_mz(0), total_ion_current(0),
198 total_peak_bins(0), empty_peak_bins(0),
199 file_id(-1), physical_id(-1),
202 if (charge
> MAX_SUPPORTED_CHARGE
)
203 throw std::invalid_argument("attempt to create a spectrum with greater"
204 " than supported charge");
208 char *__repr__() const;
211 void set_peaks_from_matrix(const std::vector
< std::vector
<double> > &m
) {
212 peaks
.resize(m
.size());
213 std::vector
<peak
>::iterator p_it
= peaks
.begin();
214 for (std::vector
< std::vector
<double> >::const_iterator it
= m
.begin();
215 it
!= m
.end(); it
++, p_it
++) {
217 throw std::invalid_argument("invalid matrix (must be size N x 2)");
219 p_it
->intensity
= (*it
)[1];
224 // Read spectra from file in ms2 format, tagging them with file_id. Before
225 // reading, seek to absolute position offset_begin. If offset_end != -1,
226 // any spectra that begin after position offset_end in the file are not
228 static std::vector
<spectrum
>
229 read_spectra_from_ms2(FILE *f
, const int file_id
, const long offset_begin
,
230 const long offset_end
);
233 // Filter peaks to limit their number according to TIC_cutoff_proportion,
234 // and to remove those too large to be fragment products. (Peaks are
235 // assumed to be ordered by increasing mz.)
236 void filter_peaks(double TIC_cutoff_proportion
,
237 double parent_mass_tolerance_max
);
239 // Classify peaks and update class_counts.
240 void classify(int intensity_class_count
, double intensity_class_ratio
,
241 double fragment_mass_tolerance
);
243 // Store the list of spectra that search_peptide will search against, and
244 // also build spectrum_mass_index.
245 static void set_searchable_spectra(const std::vector
<spectrum
> &spectra
);
247 // Search sequence runs for matches according to the context against the
248 // spectra. Updates score_stats and the number of candidate spectra found.
249 static void search_runs(const search_context
&context
, score_stats
&stats
);
251 // conceptually these are 'protected:', but we're taking it easy here
253 void set_id() { id
= next_id
++; assert(next_id
> 0); }
256 static int next_physical_id
;
258 static std::vector
<spectrum
> searchable_spectra
;
259 // parent mass -> searchable_spectra index
260 static std::multimap
<double,
261 std::vector
<spectrum
>::size_type
> spectrum_mass_index
;
265 struct mass_trace_item
{
271 bool operator==(const mass_trace_item
&x
) const {
272 return this->position
== x
.position
273 and this->delta
== x
.delta
274 and this->id
== x
.id
;
279 // This is everything we want to remember about a match, so that we can report
281 // FIX: Are any of these fields unneeded?
286 std::string peptide_sequence
;
287 double predicted_parent_mass
;
288 std::vector
<mass_trace_item
> mass_trace
;
290 // these are vectors of results for storing peptides found in multiple
291 // sequence locations
292 std::vector
<int> peptide_begin
; // absolute position within locus
293 std::vector
<std::string
> sequence_name
;
295 match() : score(0), spectrum_index(-1), predicted_parent_mass(0) { }
299 // This holds information about the best peptide/spectrum matches found, and
300 // some statistics. Essentially, it holds the results of the search process.
303 score_stats(int spectrum_count
, int best_result_count
)
304 : candidate_spectrum_count(0), combinations_searched(0) {
305 best_matches
.resize(spectrum_count
);
306 assert(best_result_count
>= 1);
307 for (int i
=0; i
<spectrum_count
; i
++)
308 best_matches
[i
].resize(best_result_count
);
311 // spectrum index -> list of match (only top N matches per spectrum) The
312 // best match lists are of fixed length (typically 5), and ordered
313 // best-first. Trailing entries with score 0 are null matches (to simplify
315 std::vector
< std::vector
<match
> > best_matches
;
318 // How many times was a spectrum scored against a peptide?
319 unsigned long long candidate_spectrum_count
; // may be > 2^32
320 // How many different mod combinations were searched? (e.g., A*ABC and AA*BC
321 // are two distinct combinations)
322 int combinations_searched
; // FIX: could be > 2^31?