1 /** costMatrix object to provide for a memoizable cost lookup table. Table is indexed by two
2 * dcElement values, and returns an int, for the cost. In addition, an additional dcElement
3 * is passed in by reference, and the median value of the two input elements is placed there.
4 * The getCost function is designed to interface directly with C.
6 * The key lookup is an ordered pair, so when looking up transition a -> b, a must go in as
9 * WARNING: In the interest of speed this code does no "type checking" to make sure that the
10 * two passed deElements are of the same type, i.e. that they have the same alphabet length.
11 * Any such checks should be done exterior to this library.
21 #include <unordered_map>
27 #include "dynamicCharacterOperations.h"
29 /** Next three fns defined here to use on C side. */
30 costMatrix_p
construct_CostMatrix_C (size_t alphSize
, int* tcm
);
32 void destruct_CostMatrix_C (costMatrix_p mytype
);
34 int call_getSetCost_C (costMatrix_p untyped_self
, dcElement_t
* left
, dcElement_t
* right
, dcElement_t
* retMedian
);
40 typedef std::pair
<dcElement_t
, dcElement_t
> keys_t
;
41 typedef std::pair
<int, packedChar
*> costMedian_t
;
42 typedef std::pair
<keys_t
, costMedian_t
> mapAccessPair_t
;
44 typedef void* costMatrix_p
;
46 /** Allocate room for a costMedian_t. Assumes alphabetSize is already initialized. */
47 costMedian_t
* allocCostMedian_t (size_t alphabetSize
);
49 /** dealloc costMedian_t. */
50 void freeCostMedian_t (costMedian_t
* toFree
);
52 /** Allocate room for a keys_t. */
53 keys_t
* allocKeys_t (size_t alphSize
);
55 /** dealloc keys_t. Calls various other free fns. */
56 void freeKeys_t (const keys_t
* toFree
);
58 /** Allocate space for Pair<keys_t, costMedian_t>, calling allocators for both types. */
59 mapAccessPair_t
* allocateMapAccessPair (size_t alphSize
);
61 /** Hashes two `dcElement`s, and returns an order-dependent hash value. In this case
62 * "order dependent" means that the order of the arrays within the `dcElement`s matter,
63 * and the order that the `dcElement`s are sent in also matters, as is necessary for a
66 * First loops through each `dcElement` and combines all of the element values (recall that a
67 * `dcElement` has two fields, the second of which is the element, and is an array of `uint64_t`s)
68 * using two different seeds, then combines the two resulting values.
71 /** Following hash_combine code modified from here (seems to be based on Boost):
72 * http://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
74 std::size_t hash_combine (const dcElement_t lhs
, const dcElement_t rhs
) const {
75 std::size_t left_seed
= 3141592653; // PI used as arbitrarily random seed
76 std::size_t right_seed
= 2718281828; // E used as arbitrarily random seed
78 std::hash
<uint64_t> hasher
;
79 size_t elemArrCount
= dcElemSize(lhs
.alphSize
);
80 for (size_t i
= 0; i
< elemArrCount
; i
++) {
81 left_seed
^= hasher(lhs
.element
[i
]) + 0x9e3779b9 + (left_seed
<< 6) + (left_seed
>> 2);
82 right_seed
^= hasher(rhs
.element
[i
]) + 0x9e3779b9 + (right_seed
<< 6) + (right_seed
>> 2);
84 left_seed
^= hasher(right_seed
) + 0x9e3779b9 + (left_seed
<< 6) + (left_seed
>> 2);
88 std::size_t operator()(const keys_t
& k
) const
90 return hash_combine (k
.first
, k
.second
);
95 // Return true if every `uint64_t` in lhs->element and rhs->element is equal, else false.
96 bool operator()(const keys_t
& lhs
, const keys_t
& rhs
) const
98 // Assert that all key components share the same alphSize value
99 if ( lhs
.first
.alphSize
!= rhs
.first
.alphSize
100 || lhs
.first
.alphSize
!= lhs
.second
.alphSize
101 || lhs
.second
.alphSize
!= rhs
.second
.alphSize
) {
105 //Assert that the left key elements match the right key elements
106 size_t elemArrWidth
= dcElemSize(lhs
.first
.alphSize
);
107 for (size_t i
= 0; i
< elemArrWidth
; i
++) {
108 if (lhs
.first
.element
[i
] != rhs
.first
.element
[i
]) {
111 if (lhs
.second
.element
[i
] != rhs
.second
.element
[i
]) {
119 typedef std::unordered_map
<keys_t
, costMedian_t
, KeyHash
, KeyEqual
>::const_iterator mapIterator
;
127 CostMatrix(size_t alphSize
, int* tcm
);
131 /** Getter only for cost. Necessary for testing, to insure that particular
132 * key pair has, in fact, already been inserted into lookup table.
134 int getCostMedian(dcElement_t
* left
, dcElement_t
* right
, dcElement_t
* retMedian
);
136 /** Acts as both a setter and getter, mutating myMap.
138 * Receives two dcElements and computes the transformation cost as well as
139 * the median for the two. Puts the median and alphabet size into retMedian,
140 * which must therefore by necessity be allocated elsewhere.
142 * This functin allocates _if necessary_. So freeing inputs after a call will not
143 * cause invalid reads from the cost matrix.
145 int getSetCostMedian(dcElement_t
* left
, dcElement_t
* right
, dcElement_t
* retMedian
);
148 std::unordered_map
<keys_t
, costMedian_t
, KeyHash
, KeyEqual
> myMatrix
;
150 std::unordered_map
<keys_t
, costMedian_t
, KeyHash
, KeyEqual
> hasher
;
154 /** Stored unambiguous tcm, necessary to do first calls to findDistance() without having to rewrite findDistance()
155 * and computeCostMedian()
159 /** Takes in a `keys_t` and a `costMedian_t` and updates myMap to store the new values,
160 * with @key as a key, and @median as the value.
162 void setValue(keys_t
* key
, costMedian_t
* median
);
164 /** Takes in a pair of keys_t (each of which is a single `dcElement`) and computes their lowest-cost median.
165 * Uses a Sankoff-like algorithm, where all bases are considered, and the lowest cost bases are included in the
166 * cost and median calculations. That means a base might appear in the median that is not present in either of
167 * the two elements being compared.
169 costMedian_t
* computeCostMedian(keys_t key
);
171 /** Find distance between an ambiguous nucleotide and an unambiguous ambElem. Return that value and the median.
172 * @param ambElem is ambiguous input.
173 * @param nucleotide is unambiguous.
174 * @param median is used to return the calculated median value.
176 * This fn is necessary because there isn't yet a cost matrix set up, so it's not possible to
177 * look up ambElems, therefore we must loop over possible values of the ambElem
178 * and find the lowest cost median.
180 * Nota bene: Requires symmetric, if not metric, matrix. TODO: Is this true? If so fix it?
182 int findDistance (keys_t
* searchKey
, dcElement_t
* ambElem
);
184 /** Takes in an initial TCM, which is actually just a row-major array, creates hash table of costs
185 * where cost is least cost between two elements, and medians, where median is union of characters.
188 * Can only be called once this.alphabetSize has been set.
190 void initializeMatrix ();
193 /** Takes in a pair of keys_t (each of which is a single `dcElement`) and computes their lowest-cost median.
194 * Contrast with computeCostMedian(). In this algorithm only bases which are present in at least one of
195 * the two elements being compared are considered.
197 /* costMedian_t* computeCostMedianFitchy(keys_t keys); */
201 #endif // COSTMATRIX_H