Cleanups: dead code removal/etc.
[greylag.git] / cgreylag.hpp
blob297f39b0678b327cee0c8d04afbc064320460761
1 // cgreylag.hpp
3 // Copyright (C) 2006-2007, Stowers Institute for Medical Research
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #ifndef CGREYLAG_H
21 #define CGREYLAG_H
23 #include <cassert>
24 #include <cmath>
25 #include <cstdio>
26 #include <map>
27 #include <vector>
28 #include <stdexcept>
29 #include <string>
30 #include <utility>
33 class score_stats;
36 class mass_regime_parameters {
37 public:
38 double hydroxyl_mass;
39 double water_mass;
40 double ammonia_mass;
42 // residue char (incl '['/']') -> mass (per regime) with any fixed mod
43 std::vector<double> fixed_residue_mass;
46 // FIX: want these class members to all be static, but had SWIG trouble. The
47 // member 'the' is a handle to the singleton instance, for now.
48 class parameters {
49 public:
50 bool estimate_only; // just estimate work required
51 bool show_progress; // running progress on stderr
53 std::vector<double> ln_factorial; // factorial[n] == ln(n!)
55 // deuterium not (yet) implemented
56 double proton_mass;
57 double hydrogen_mass;
58 std::vector<mass_regime_parameters> parent_mass_regime;
59 std::vector<mass_regime_parameters> fragment_mass_regime;
61 // from XTP
62 double parent_mass_tolerance_1; // for +1
63 double parent_mass_tolerance_max; // for +N (typically +3)
65 int minimum_peptide_length;
67 double fragment_mass_tolerance;
68 int intensity_class_count;
70 static parameters the;
74 class sequence_run {
75 public:
76 int sequence_index; // index of this sequence in the database FIX:nn
77 int sequence_offset; // offset of this run's start in sequence
78 std::string sequence;
79 std::vector<int> cleavage_points;
80 std::string name; // e.g., initial defline word
82 sequence_run() { }
83 sequence_run(const int sequence_index, const int sequence_offset,
84 const std::string sequence, std::vector<int> cleavage_points,
85 const std::string name)
86 : sequence_index(sequence_index), sequence_offset(sequence_offset),
87 sequence(sequence), cleavage_points(cleavage_points),
88 name(name) {
89 // FIX
90 // generate non-specific cleavage points
91 assert(sequence.size() < INT_MAX);
92 if (this->cleavage_points.empty())
93 for (int i=0; i<=static_cast<int>(sequence.size()); i++)
94 this->cleavage_points.push_back(i);
99 // information about the search that's passed in to--but not changed by--the
100 // C++ search code
101 class search_context {
102 public:
103 int mod_count;
104 int mass_regime_index;
105 std::string pca_residues;
106 double pca_delta;
108 double N_delta; // 0 if none
109 double C_delta; // 0 if none
111 // maps residue char to list of indices for _count/_delta
112 std::vector< std::vector<int> > delta_bag_lookup;
113 std::vector<double> delta_bag_delta;
114 std::vector<int> delta_bag_count;
116 double parent_fixed_mass;
117 double fragment_N_fixed_mass;
118 double fragment_C_fixed_mass;
120 // general search parameters
121 int maximum_missed_cleavage_sites;
123 std::vector<sequence_run> sequence_runs;
127 enum ion { ION_MIN, ION_Y=ION_MIN, ION_B, ION_MAX }; // NB: [ION_MIN, ION_MAX)
129 inline void operator++(ion &i) { i = ion(i + 1); }
130 inline void operator++(ion &i, int) { i = ion(i + 1); }
133 class peak {
134 public:
135 double mz;
136 double intensity;
137 int intensity_class; // 0..N; -1 == unclassified
139 explicit peak(double mz=0, double intensity=0, int intensity_class=-1)
140 : mz(mz), intensity(intensity), intensity_class(intensity_class) {
143 char *__repr__() const;
145 static bool less_mz(peak x, peak y) { return x.mz < y.mz; }
146 // FIX: delete one of these
147 static bool less_intensity(peak x, peak y) {
148 return x.intensity < y.intensity;
150 static bool greater_intensity(peak x, peak y) {
151 return x.intensity > y.intensity;
154 static double get_mz(double mass, int charge) {
155 const parameters &CP = parameters::the;
156 return mass/charge + CP.proton_mass;
161 // A spectrum searched on multiple charges will be turned into separate
162 // spectra having differing id's, but the same physical id.
164 class spectrum {
165 public:
166 double mass; // [M + H+], aka "neutral mass" (protonated)
167 int charge;
168 static const int MAX_SUPPORTED_CHARGE = 10;
169 std::vector<peak> peaks; // always ordered by increasing m/z!
171 std::vector<int> intensity_class_counts; // number of peaks in each class
173 // This is the mz range of peaks, but with a buffer on each end of size
174 // fragment_mass_tolerance.
175 double min_peak_mz;
176 double max_peak_mz;
178 double total_ion_current;
180 // This is the total number of bins, 2*fragment_tolerance mz wide in peak
181 // range. So, for example, if the fragment tolerance is 1 and the min and
182 // max mz are 200 and 1800, total peak bins would be 800.
183 int total_peak_bins;
184 int empty_peak_bins; // number of bins that aren't occupied by a peak
186 std::string name;
187 int file_id; // file # of spectra's source
188 int id; // unique for all spectra
189 int physical_id; // FIX: unneeded?
191 unsigned long comparisons; // times scored against theoretical spectra
193 // Construct an empty spectrum.
194 explicit spectrum(double mass=0,
195 int charge=0) : mass(mass), charge(charge), min_peak_mz(0),
196 max_peak_mz(0), total_ion_current(0),
197 total_peak_bins(0), empty_peak_bins(0),
198 file_id(-1), physical_id(-1),
199 comparisons(0) {
200 assert(charge >= 0);
201 if (charge > MAX_SUPPORTED_CHARGE)
202 throw std::invalid_argument("attempt to create a spectrum with greater"
203 " than supported charge");
204 set_id();
207 char *__repr__() const;
209 // for tinkering
210 void set_peaks_from_matrix(const std::vector< std::vector<double> > &m) {
211 peaks.resize(m.size());
212 std::vector<peak>::iterator p_it = peaks.begin();
213 for (std::vector< std::vector<double> >::const_iterator it = m.begin();
214 it != m.end(); it++, p_it++) {
215 if (it->size() != 2)
216 throw std::invalid_argument("invalid matrix (must be size N x 2)");
217 p_it->mz = (*it)[0];
218 p_it->intensity = (*it)[1];
223 // Read spectra from file in ms2 format, tagging them with file_id. Before
224 // reading, seek to absolute position offset_begin. If offset_end != -1,
225 // any spectra that begin after position offset_end in the file are not
226 // read.
227 static std::vector<spectrum>
228 read_spectra_from_ms2(FILE *f, const int file_id, const long offset_begin,
229 const long offset_end);
232 // Filter peaks to limit their number according to TIC_cutoff_proportion,
233 // and to remove those too large to be fragment products. (Peaks are
234 // assumed to be ordered by increasing mz.)
235 void filter_peaks(double TIC_cutoff_proportion,
236 double parent_mass_tolerance_max);
238 // Classify peaks and update class_counts.
239 void classify(int intensity_class_count, double intensity_class_ratio,
240 double fragment_mass_tolerance);
242 // Store the list of spectra that search_peptide will search against, and
243 // also build spectrum_mass_index.
244 static void set_searchable_spectra(const std::vector<spectrum> &spectra);
246 // Search sequence runs for matches according to the context against the
247 // spectra. Updates score_stats and the number of candidate spectra found.
248 static void search_runs(const search_context &context, score_stats &stats);
250 // conceptually these are 'protected:', but we're taking it easy here
251 public:
252 void set_id() { id = next_id++; assert(next_id > 0); }
254 static int next_id;
255 static int next_physical_id;
257 static std::vector<spectrum> searchable_spectra;
258 // parent mass -> searchable_spectra index
259 static std::multimap<double,
260 std::vector<spectrum>::size_type> spectrum_mass_index;
264 struct mass_trace_item {
265 int position;
266 double delta;
267 int id; // FIX
269 // grrrr
270 bool operator==(const mass_trace_item &x) const {
271 return this->position == x.position
272 and this->delta == x.delta
273 and this->id == x.id;
278 // This is everything we want to remember about a match, so that we can report
279 // it later.
280 // FIX: Are any of these fields unneeded?
281 class match {
282 public:
283 double score;
284 int spectrum_index;
285 std::string peptide_sequence;
286 double predicted_parent_mass;
287 std::vector<mass_trace_item> mass_trace;
289 // these are vectors of results for storing peptides found in multiple
290 // sequence locations
291 std::vector<int> peptide_begin; // absolute position within locus
292 std::vector<std::string> sequence_name;
294 match() : score(0), spectrum_index(-1), predicted_parent_mass(0) { }
298 // This holds information about the best peptide/spectrum matches found, and
299 // some statistics. Essentially, it holds the results of the search process.
300 class score_stats {
301 public:
302 score_stats(int spectrum_count, int best_result_count)
303 : candidate_spectrum_count(0), combinations_searched(0) {
304 best_matches.resize(spectrum_count);
305 assert(best_result_count >= 1);
306 for (int i=0; i<spectrum_count; i++)
307 best_matches[i].resize(best_result_count);
310 // spectrum index -> list of match (only top N matches per spectrum) The
311 // best match lists are of fixed length (typically 5), and ordered
312 // best-first. Trailing entries with score 0 are null matches (to simplify
313 // the code).
314 std::vector< std::vector<match> > best_matches;
316 // Statistics:
317 // How many times was a spectrum scored against a peptide?
318 unsigned long long candidate_spectrum_count; // may be > 2^32
319 // How many different mod combinations were searched? (e.g., A*ABC and AA*BC
320 // are two distinct combinations)
321 int combinations_searched; // FIX: could be > 2^31?
325 #endif