Updating changelog, Adding test case for issue #5309
[cabal.git] / cabal-testsuite / PackageTests / Regression / T5309 / memoized-tcm / costMatrix.h
bloba7f1bc25e82efe0266ddd21a85a397e58ba46122
1 /** costMatrix object to provide for a memoizable cost lookup table. Table is indexed by two
2 * dcElement values, and returns an int, for the cost. In addition, an additional dcElement
3 * is passed in by reference, and the median value of the two input elements is placed there.
4 * The getCost function is designed to interface directly with C.
6 * The key lookup is an ordered pair, so when looking up transition a -> b, a must go in as
7 * first in pair
9 * WARNING: In the interest of speed this code does no "type checking" to make sure that the
10 * two passed deElements are of the same type, i.e. that they have the same alphabet length.
11 * Any such checks should be done exterior to this library.
14 #ifndef _COSTMATRIX_H
15 #define _COSTMATRIX_H
17 #define DEBUG 0
19 #include <climits>
20 #include <cstdlib>
21 #include <unordered_map>
23 #ifdef __cplusplus
24 extern "C" {
25 #endif
27 #include "dynamicCharacterOperations.h"
29 /** Next three fns defined here to use on C side. */
30 costMatrix_p construct_CostMatrix_C (size_t alphSize, int* tcm);
32 void destruct_CostMatrix_C (costMatrix_p mytype);
34 int call_getSetCost_C (costMatrix_p untyped_self, dcElement_t* left, dcElement_t* right, dcElement_t* retMedian);
36 #ifdef __cplusplus
38 #endif
40 typedef std::pair<dcElement_t, dcElement_t> keys_t;
41 typedef std::pair<int, packedChar*> costMedian_t;
42 typedef std::pair<keys_t, costMedian_t> mapAccessPair_t;
44 typedef void* costMatrix_p;
46 /** Allocate room for a costMedian_t. Assumes alphabetSize is already initialized. */
47 costMedian_t* allocCostMedian_t (size_t alphabetSize);
49 /** dealloc costMedian_t. */
50 void freeCostMedian_t (costMedian_t* toFree);
52 /** Allocate room for a keys_t. */
53 keys_t* allocKeys_t (size_t alphSize);
55 /** dealloc keys_t. Calls various other free fns. */
56 void freeKeys_t (const keys_t* toFree);
58 /** Allocate space for Pair<keys_t, costMedian_t>, calling allocators for both types. */
59 mapAccessPair_t* allocateMapAccessPair (size_t alphSize);
61 /** Hashes two `dcElement`s, and returns an order-dependent hash value. In this case
62 * "order dependent" means that the order of the arrays within the `dcElement`s matter,
63 * and the order that the `dcElement`s are sent in also matters, as is necessary for a
64 * non-symmetric tcm.
66 * First loops through each `dcElement` and combines all of the element values (recall that a
67 * `dcElement` has two fields, the second of which is the element, and is an array of `uint64_t`s)
68 * using two different seeds, then combines the two resulting values.
70 struct KeyHash {
71 /** Following hash_combine code modified from here (seems to be based on Boost):
72 * http://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
74 std::size_t hash_combine (const dcElement_t lhs, const dcElement_t rhs) const {
75 std::size_t left_seed = 3141592653; // PI used as arbitrarily random seed
76 std::size_t right_seed = 2718281828; // E used as arbitrarily random seed
78 std::hash<uint64_t> hasher;
79 size_t elemArrCount = dcElemSize(lhs.alphSize);
80 for (size_t i = 0; i < elemArrCount; i++) {
81 left_seed ^= hasher(lhs.element[i]) + 0x9e3779b9 + (left_seed << 6) + (left_seed >> 2);
82 right_seed ^= hasher(rhs.element[i]) + 0x9e3779b9 + (right_seed << 6) + (right_seed >> 2);
84 left_seed ^= hasher(right_seed) + 0x9e3779b9 + (left_seed << 6) + (left_seed >> 2);
85 return left_seed;
88 std::size_t operator()(const keys_t& k) const
90 return hash_combine (k.first, k.second);
94 struct KeyEqual {
95 // Return true if every `uint64_t` in lhs->element and rhs->element is equal, else false.
96 bool operator()(const keys_t& lhs, const keys_t& rhs) const
98 // Assert that all key components share the same alphSize value
99 if ( lhs.first.alphSize != rhs.first.alphSize
100 || lhs.first.alphSize != lhs.second.alphSize
101 || lhs.second.alphSize != rhs.second.alphSize) {
102 return false;
105 //Assert that the left key elements match the right key elements
106 size_t elemArrWidth = dcElemSize(lhs.first.alphSize);
107 for (size_t i = 0; i < elemArrWidth; i++) {
108 if (lhs.first.element[i] != rhs.first.element[i]) {
109 return false;
111 if (lhs.second.element[i] != rhs.second.element[i]) {
112 return false;
115 return true;
119 typedef std::unordered_map<keys_t, costMedian_t, KeyHash, KeyEqual>::const_iterator mapIterator;
122 class CostMatrix
124 public:
125 // CostMatrix();
127 CostMatrix(size_t alphSize, int* tcm);
129 ~CostMatrix();
131 /** Getter only for cost. Necessary for testing, to insure that particular
132 * key pair has, in fact, already been inserted into lookup table.
134 int getCostMedian(dcElement_t* left, dcElement_t* right, dcElement_t* retMedian);
136 /** Acts as both a setter and getter, mutating myMap.
138 * Receives two dcElements and computes the transformation cost as well as
139 * the median for the two. Puts the median and alphabet size into retMedian,
140 * which must therefore by necessity be allocated elsewhere.
142 * This functin allocates _if necessary_. So freeing inputs after a call will not
143 * cause invalid reads from the cost matrix.
145 int getSetCostMedian(dcElement_t* left, dcElement_t* right, dcElement_t* retMedian);
147 private:
148 std::unordered_map <keys_t, costMedian_t, KeyHash, KeyEqual> myMatrix;
150 std::unordered_map <keys_t, costMedian_t, KeyHash, KeyEqual> hasher;
152 size_t alphabetSize;
154 /** Stored unambiguous tcm, necessary to do first calls to findDistance() without having to rewrite findDistance()
155 * and computeCostMedian()
157 int *tcm;
159 /** Takes in a `keys_t` and a `costMedian_t` and updates myMap to store the new values,
160 * with @key as a key, and @median as the value.
162 void setValue(keys_t* key, costMedian_t* median);
164 /** Takes in a pair of keys_t (each of which is a single `dcElement`) and computes their lowest-cost median.
165 * Uses a Sankoff-like algorithm, where all bases are considered, and the lowest cost bases are included in the
166 * cost and median calculations. That means a base might appear in the median that is not present in either of
167 * the two elements being compared.
169 costMedian_t* computeCostMedian(keys_t key);
171 /** Find distance between an ambiguous nucleotide and an unambiguous ambElem. Return that value and the median.
172 * @param ambElem is ambiguous input.
173 * @param nucleotide is unambiguous.
174 * @param median is used to return the calculated median value.
176 * This fn is necessary because there isn't yet a cost matrix set up, so it's not possible to
177 * look up ambElems, therefore we must loop over possible values of the ambElem
178 * and find the lowest cost median.
180 * Nota bene: Requires symmetric, if not metric, matrix. TODO: Is this true? If so fix it?
182 int findDistance (keys_t* searchKey, dcElement_t* ambElem);
184 /** Takes in an initial TCM, which is actually just a row-major array, creates hash table of costs
185 * where cost is least cost between two elements, and medians, where median is union of characters.
187 * Nota bene:
188 * Can only be called once this.alphabetSize has been set.
190 void initializeMatrix ();
192 // DEPRECATED!!!
193 /** Takes in a pair of keys_t (each of which is a single `dcElement`) and computes their lowest-cost median.
194 * Contrast with computeCostMedian(). In this algorithm only bases which are present in at least one of
195 * the two elements being compared are considered.
197 /* costMedian_t* computeCostMedianFitchy(keys_t keys); */
201 #endif // COSTMATRIX_H