Merge commit 'v0.1.2'
[greylag.git] / cgreylag.hpp
blob671c2cdf75b9fb25b9500566821cddd698355ab9
1 // cgreylag.hpp
3 // greylag, a collection of programs for MS/MS protein analysis
4 // Copyright (C) 2006-2008 Stowers Institute for Medical Research
5 //
6 // This program is free software: you can redistribute it and/or modify
7 // it under the terms of the GNU General Public License as published by
8 // the Free Software Foundation, either version 3 of the License, or
9 // (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19 // Contact: Mike Coleman
20 // Stowers Institute for Medical Research
21 // 1000 East 50th Street
22 // Kansas City, Missouri 64110
23 // USA
26 #ifndef CGREYLAG_H
27 #define CGREYLAG_H
29 #include <cassert>
30 #include <cmath>
31 #include <cstdio>
32 #include <map>
33 #include <vector>
34 #include <stdexcept>
35 #include <string>
36 #include <utility>
39 class score_stats;
42 class mass_regime_parameters {
43 public:
44 double hydroxyl_mass;
45 double water_mass;
46 double ammonia_mass;
48 // residue char (incl '['/']') -> mass (per regime) with any fixed mod
49 std::vector<double> fixed_residue_mass;
52 // FIX: want these class members to all be static, but had SWIG trouble. The
53 // member 'the' is a handle to the singleton instance, for now.
54 class parameters {
55 public:
56 bool estimate_only; // just estimate work required
57 bool show_progress; // running progress on stderr
59 std::vector<double> ln_factorial; // factorial[n] == ln(n!)
61 // deuterium not (yet) implemented
62 double proton_mass;
63 double hydrogen_mass;
64 std::vector<mass_regime_parameters> parent_mass_regime;
65 std::vector<mass_regime_parameters> fragment_mass_regime;
67 // from GLP
68 double parent_mass_tolerance_1; // for +1
69 double parent_mass_tolerance_max; // for +N (typically +3)
71 unsigned int minimum_peptide_length;
73 double fragment_mass_tolerance;
74 int intensity_class_count;
76 static parameters the;
80 class sequence_run {
81 public:
82 int sequence_index; // index of this sequence in the database FIX:nn
83 int sequence_offset; // offset of this run's start in sequence
84 std::string sequence;
85 std::vector<int> cleavage_points;
86 std::string name; // e.g., initial defline word
88 sequence_run() { }
89 sequence_run(const int sequence_index, const int sequence_offset,
90 const std::string sequence, std::vector<int> cleavage_points,
91 const std::string name)
92 : sequence_index(sequence_index), sequence_offset(sequence_offset),
93 sequence(sequence), cleavage_points(cleavage_points),
94 name(name) {
99 // information about the search that's passed in to--but not changed by--the
100 // C++ search code
101 class search_context {
102 public:
103 unsigned int mod_count;
104 int mass_regime_index;
105 int conjunct_index;
106 std::string pca_residues;
107 double pca_delta;
109 // maps residue char to list of indices for _count/_delta
110 std::vector< std::vector<int> > delta_bag_lookup;
111 std::vector<double> delta_bag_delta;
112 std::vector<int> delta_bag_count;
114 double parent_fixed_mass;
115 double fragment_N_fixed_mass;
116 double fragment_C_fixed_mass;
118 // general search parameters
119 int maximum_missed_cleavage_sites;
120 bool nonspecific_cleavage;
122 std::vector<sequence_run> sequence_runs;
126 class peak {
127 public:
128 double mz;
129 double intensity;
130 int intensity_class; // 0..N; -1 == unclassified
132 explicit peak(double mz=0, double intensity=0, int intensity_class=-1)
133 : mz(mz), intensity(intensity), intensity_class(intensity_class) {
136 char *__repr__() const;
138 static bool less_mz(peak x, peak y) { return x.mz < y.mz; }
140 static bool greater_intensity(peak x, peak y) {
141 return x.intensity > y.intensity;
144 static double get_mz(double mass, int charge) {
145 const parameters &CP = parameters::the;
146 return mass/charge + CP.proton_mass;
151 // A spectrum searched on multiple charges will be turned into separate
152 // spectra having differing id's, but the same physical id.
154 class spectrum {
155 public:
156 double mass; // [M + H+], aka "neutral mass" (protonated)
157 int charge;
158 static const int MAX_SUPPORTED_CHARGE = 15;
159 static const int MAX_THEORETICAL_FRAGMENTS = 256;
160 std::vector<peak> peaks; // always ordered by increasing m/z!
162 std::vector<int> intensity_class_counts; // number of peaks in each class
164 // This is the mz range of peaks, but with a buffer on each end of size
165 // fragment_mass_tolerance.
166 double min_peak_mz;
167 double max_peak_mz;
169 // These are caches of the data in peaks above. They speed up search.
170 double *peak_mz_cache;
171 int *peak_intensity_class_cache;
173 double total_ion_current;
175 // This is the total number of bins, 2*fragment_tolerance mz wide in peak
176 // range. So, for example, if the fragment tolerance is 1 and the min and
177 // max mz are 200 and 1800, total peak bins would be 800.
178 int total_peak_bins;
179 int empty_peak_bins; // number of bins that aren't occupied by a peak
181 std::string name;
182 int file_id; // file # of spectra's source
183 int id; // unique for all spectra
184 int physical_id; // FIX: unneeded?
186 unsigned long comparisons; // times scored against theoretical spectra
188 // Construct an empty spectrum.
189 explicit spectrum(double mass=0,
190 int charge=0) : mass(mass), charge(charge), min_peak_mz(0),
191 max_peak_mz(0), peak_mz_cache(0),
192 peak_intensity_class_cache(0),
193 total_ion_current(0),
194 total_peak_bins(0), empty_peak_bins(0),
195 file_id(-1), physical_id(-1),
196 comparisons(0) {
197 assert(charge >= 0);
198 if (charge > MAX_SUPPORTED_CHARGE)
199 throw std::invalid_argument("attempt to create a spectrum with greater"
200 " than supported charge");
201 set_id();
204 char *__repr__() const;
207 // Set peaks from a tuple of mz/intensity pairs.
208 void set_peaks_from_matrix(const std::vector< std::vector<double> > &m) {
209 peaks.resize(m.size());
210 std::vector<peak>::iterator p_it = peaks.begin();
211 for (std::vector< std::vector<double> >::const_iterator it = m.begin();
212 it != m.end(); it++, p_it++) {
213 if (it->size() != 2)
214 throw std::invalid_argument("invalid matrix (must be size N x 2)");
215 p_it->mz = (*it)[0];
216 p_it->intensity = (*it)[1];
221 // Read spectra from file in ms2 format, tagging them with file_id.
222 //static std::vector<spectrum>
223 //read_spectra_from_ms2(FILE *f, const int file_id);
226 // Filter peaks to limit their number according to TIC_cutoff_proportion,
227 // and to remove those too large to be fragment products. (Peaks are
228 // assumed to be ordered by increasing mz.)
229 void filter_peaks(double TIC_cutoff_proportion,
230 double parent_mass_tolerance_max);
232 // Classify peaks and update class_counts.
233 void classify(int intensity_class_count, double intensity_class_ratio,
234 double fragment_mass_tolerance);
236 // Update the *_cache fields from the peaks field.
237 void update_peak_cache();
239 void clear_peak_cache() const { // FIX: is this const dodgy?
240 if (peak_mz_cache)
241 delete[] peak_mz_cache;
242 if (peak_intensity_class_cache)
243 delete[] peak_intensity_class_cache;
246 // Strictly speaking, we should release the cache on destruction. This
247 // would mean implementing bug-prone copy constructor and assignment
248 // constructor functions. (C++, argh.) Instead, skip it. The potential
249 // memory leak can be avoided by calling clear_peak_cache() before
250 // destruction.
252 // ~spectrum() { clear_peak_cache(); }
254 // Store the list of spectra that search_peptide will search against, and
255 // also build spectrum_mass_index.
256 static void set_searchable_spectra(const std::vector<spectrum> &spectra);
258 // Search sequence runs for matches according to the context against the
259 // spectra. Updates score_stats and the number of candidate spectra found.
260 static void search_runs(const search_context &context, score_stats &stats);
262 // conceptually these are 'protected:', but we're taking it easy here
263 public:
264 // FIX: just delete id mgmt, as we're doing this from Python now
265 void set_id() { id = next_id++; assert(next_id > 0); }
267 static int next_id;
268 static int next_physical_id;
270 static std::vector<spectrum> searchable_spectra;
271 // parent mass -> searchable_spectra index
272 static std::multimap<double,
273 std::vector<spectrum>::size_type> spectrum_mass_index;
277 struct mass_trace_item {
278 int position; // residue position within peptide
279 int conjunct_item_index; // index within conjunct specified by conjunct_index
281 // grrrr
282 bool operator==(const mass_trace_item &x) const {
283 return this->position == x.position
284 and this->conjunct_item_index == x.conjunct_item_index;
289 // This is everything we want to remember about a match, so that we can report
290 // it later.
291 class match {
292 public:
293 double score;
294 int spectrum_index;
295 std::string peptide_sequence;
296 char N_peptide_flank, C_peptide_flank; // adjacent residues; '-' if none
297 double predicted_parent_mass;
299 int mass_regime_index;
300 int conjunct_index;
301 double pca_delta;
302 std::vector<mass_trace_item> mass_trace;
304 // these are vectors of results for storing peptides found in multiple
305 // sequence locations
306 std::vector<int> peptide_begin; // absolute position within locus
307 std::vector<std::string> sequence_name;
309 match() : score(0), spectrum_index(-1), N_peptide_flank('-'),
310 C_peptide_flank('-'), predicted_parent_mass(0),
311 mass_regime_index(-1), conjunct_index(-1), pca_delta(0) { }
315 // This holds information about the best peptide/spectrum matches found, and
316 // some statistics. Essentially, it holds the results of the search process.
317 class score_stats {
318 public:
319 score_stats(int spectrum_count, int best_result_count)
320 : candidate_spectrum_count(0), evaluation_count(0) {
321 best_matches.resize(spectrum_count);
322 assert(best_result_count >= 1);
323 for (int i=0; i<spectrum_count; i++)
324 best_matches[i].resize(best_result_count);
327 // spectrum index -> list of match (only top N matches per spectrum) The
328 // best match lists are of fixed length (typically 5), and ordered
329 // best-first. Trailing entries with score 0 are null matches (to simplify
330 // the code).
331 std::vector< std::vector<match> > best_matches;
333 // Statistics:
334 // How many times was a spectrum scored against a peptide?
335 unsigned long long candidate_spectrum_count; // may be > 2^32
336 // How many different peptides (with mod combinations) were scored?
337 // (e.g., A*ABC and AA*BC count as two distinct combinations)
338 int evaluation_count; // FIX: could be > 2^31?
342 #endif