no bug - Bumping Firefox l10n changesets r=release a=l10n-bump DONTBUILD CLOSED TREE
[gecko.git] / mfbt / BloomFilter.h
blob08882c4d6326068f8c84244660d3f12d128c31cb
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /*
8 * A counting Bloom filter implementation. This allows consumers to
9 * do fast probabilistic "is item X in set Y?" testing which will
10 * never answer "no" when the correct answer is "yes" (but might
11 * incorrectly answer "yes" when the correct answer is "no").
14 #ifndef mozilla_BloomFilter_h
15 #define mozilla_BloomFilter_h
17 #include "mozilla/Assertions.h"
18 #include "mozilla/Likely.h"
20 #include <stdint.h>
21 #include <string.h>
23 namespace mozilla {
26 * This class implements a classic Bloom filter as described at
27 * <http://en.wikipedia.org/wiki/Bloom_filter>. This allows quick
28 * probabilistic answers to the question "is object X in set Y?" where the
29 * contents of Y might not be time-invariant. The probabilistic nature of the
30 * test means that sometimes the answer will be "yes" when it should be "no".
31 * If the answer is "no", then X is guaranteed not to be in Y.
33 * The filter is parametrized on KeySize, which is the size of the key
34 * generated by each of hash functions used by the filter, in bits,
35 * and the type of object T being added and removed. T must implement
36 * a |uint32_t hash() const| method which returns a uint32_t hash key
37 * that will be used to generate the two separate hash functions for
38 * the Bloom filter. This hash key MUST be well-distributed for good
39 * results! KeySize is not allowed to be larger than 16.
41 * The filter uses exactly 2**KeySize bit (2**(KeySize-3) bytes) of memory.
42 * From now on we will refer to the memory used by the filter as M.
44 * The expected rate of incorrect "yes" answers depends on M and on
45 * the number N of objects in set Y. As long as N is small compared
46 * to M, the rate of such answers is expected to be approximately
47 * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
48 * elements then using a KeySize of 12 gives a reasonably low
49 * incorrect answer rate. A KeySize of 12 has the additional benefit
50 * of using exactly one page for the filter in typical hardware
51 * configurations.
53 template <unsigned KeySize, class T>
54 class BitBloomFilter {
56 * A counting Bloom filter with 8-bit counters. For now we assume
57 * that having two hash functions is enough, but we may revisit that
58 * decision later.
60 * The filter uses an array with 2**KeySize entries.
62 * Assuming a well-distributed hash function, a Bloom filter with
63 * array size M containing N elements and
64 * using k hash function has expected false positive rate exactly
66 * $ (1 - (1 - 1/M)^{kN})^k $
68 * because each array slot has a
70 * $ (1 - 1/M)^{kN} $
72 * chance of being 0, and the expected false positive rate is the
73 * probability that all of the k hash functions will hit a nonzero
74 * slot.
76 * For reasonable assumptions (M large, kN large, which should both
77 * hold if we're worried about false positives) about M and kN this
78 * becomes approximately
80 * $$ (1 - \exp(-kN/M))^k $$
82 * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
83 * or in other words
85 * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
87 * where r is the false positive rate. This can be used to compute
88 * the desired KeySize for a given load N and false positive rate r.
90 * If N/M is assumed small, then the false positive rate can
91 * further be approximated as 4*N^2/M^2. So increasing KeySize by
92 * 1, which doubles M, reduces the false positive rate by about a
93 * factor of 4, and a false positive rate of 1% corresponds to
94 * about M/N == 20.
96 * What this means in practice is that for a few hundred keys using a
97 * KeySize of 12 gives false positive rates on the order of 0.25-4%.
99 * Similarly, using a KeySize of 10 would lead to a 4% false
100 * positive rate for N == 100 and to quite bad false positive
101 * rates for larger N.
103 public:
104 BitBloomFilter() {
105 static_assert(KeySize >= 3, "KeySize too small");
106 static_assert(KeySize <= kKeyShift, "KeySize too big");
108 // XXX: Should we have a custom operator new using calloc instead and
109 // require that we're allocated via the operator?
110 clear();
114 * Clear the filter. This should be done before reusing it.
116 void clear();
119 * Add an item to the filter.
121 void add(const T* aValue);
124 * Check whether the filter might contain an item. This can
125 * sometimes return true even if the item is not in the filter,
126 * but will never return false for items that are actually in the
127 * filter.
129 bool mightContain(const T* aValue) const;
132 * Methods for add/contain when we already have a hash computed
134 void add(uint32_t aHash);
135 bool mightContain(uint32_t aHash) const;
137 private:
138 static const size_t kArraySize = (1 << (KeySize - 3));
139 static const uint32_t kKeyMask = (1 << KeySize) - 1;
140 static const uint32_t kKeyShift = 16;
142 static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
143 static uint32_t hash2(uint32_t aHash) {
144 return (aHash >> kKeyShift) & kKeyMask;
147 bool getSlot(uint32_t aHash) const {
148 uint32_t index = aHash / 8;
149 uint8_t shift = aHash % 8;
150 uint8_t mask = 1 << shift;
151 return !!(mBits[index] & mask);
154 void setSlot(uint32_t aHash) {
155 uint32_t index = aHash / 8;
156 uint8_t shift = aHash % 8;
157 uint8_t bit = 1 << shift;
158 mBits[index] |= bit;
161 bool getFirstSlot(uint32_t aHash) const { return getSlot(hash1(aHash)); }
162 bool getSecondSlot(uint32_t aHash) const { return getSlot(hash2(aHash)); }
164 void setFirstSlot(uint32_t aHash) { setSlot(hash1(aHash)); }
165 void setSecondSlot(uint32_t aHash) { setSlot(hash2(aHash)); }
167 uint8_t mBits[kArraySize];
170 template <unsigned KeySize, class T>
171 inline void BitBloomFilter<KeySize, T>::clear() {
172 memset(mBits, 0, kArraySize);
175 template <unsigned KeySize, class T>
176 inline void BitBloomFilter<KeySize, T>::add(uint32_t aHash) {
177 setFirstSlot(aHash);
178 setSecondSlot(aHash);
181 template <unsigned KeySize, class T>
182 MOZ_ALWAYS_INLINE void BitBloomFilter<KeySize, T>::add(const T* aValue) {
183 uint32_t hash = aValue->hash();
184 return add(hash);
187 template <unsigned KeySize, class T>
188 MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
189 uint32_t aHash) const {
190 // Check that all the slots for this hash contain something
191 return getFirstSlot(aHash) && getSecondSlot(aHash);
194 template <unsigned KeySize, class T>
195 MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
196 const T* aValue) const {
197 uint32_t hash = aValue->hash();
198 return mightContain(hash);
202 * This class implements a counting Bloom filter as described at
203 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
204 * 8-bit counters.
206 * Compared to `BitBloomFilter`, this class supports 'remove' operation.
208 * The filter uses exactly 2**KeySize bytes of memory.
210 * Other characteristics are the same as BitBloomFilter.
212 template <unsigned KeySize, class T>
213 class CountingBloomFilter {
214 public:
215 CountingBloomFilter() {
216 static_assert(KeySize <= kKeyShift, "KeySize too big");
218 clear();
222 * Clear the filter. This should be done before reusing it, because
223 * just removing all items doesn't clear counters that hit the upper
224 * bound.
226 void clear();
229 * Add an item to the filter.
231 void add(const T* aValue);
234 * Remove an item from the filter.
236 void remove(const T* aValue);
239 * Check whether the filter might contain an item. This can
240 * sometimes return true even if the item is not in the filter,
241 * but will never return false for items that are actually in the
242 * filter.
244 bool mightContain(const T* aValue) const;
247 * Methods for add/remove/contain when we already have a hash computed
249 void add(uint32_t aHash);
250 void remove(uint32_t aHash);
251 bool mightContain(uint32_t aHash) const;
253 private:
254 static const size_t kArraySize = (1 << KeySize);
255 static const uint32_t kKeyMask = (1 << KeySize) - 1;
256 static const uint32_t kKeyShift = 16;
258 static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
259 static uint32_t hash2(uint32_t aHash) {
260 return (aHash >> kKeyShift) & kKeyMask;
263 uint8_t& firstSlot(uint32_t aHash) { return mCounters[hash1(aHash)]; }
264 uint8_t& secondSlot(uint32_t aHash) { return mCounters[hash2(aHash)]; }
266 const uint8_t& firstSlot(uint32_t aHash) const {
267 return mCounters[hash1(aHash)];
269 const uint8_t& secondSlot(uint32_t aHash) const {
270 return mCounters[hash2(aHash)];
273 static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
275 uint8_t mCounters[kArraySize];
278 template <unsigned KeySize, class T>
279 inline void CountingBloomFilter<KeySize, T>::clear() {
280 memset(mCounters, 0, kArraySize);
283 template <unsigned KeySize, class T>
284 inline void CountingBloomFilter<KeySize, T>::add(uint32_t aHash) {
285 uint8_t& slot1 = firstSlot(aHash);
286 if (MOZ_LIKELY(!full(slot1))) {
287 ++slot1;
289 uint8_t& slot2 = secondSlot(aHash);
290 if (MOZ_LIKELY(!full(slot2))) {
291 ++slot2;
295 template <unsigned KeySize, class T>
296 MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::add(const T* aValue) {
297 uint32_t hash = aValue->hash();
298 return add(hash);
301 template <unsigned KeySize, class T>
302 inline void CountingBloomFilter<KeySize, T>::remove(uint32_t aHash) {
303 // If the slots are full, we don't know whether we bumped them to be
304 // there when we added or not, so just leave them full.
305 uint8_t& slot1 = firstSlot(aHash);
306 if (MOZ_LIKELY(!full(slot1))) {
307 --slot1;
309 uint8_t& slot2 = secondSlot(aHash);
310 if (MOZ_LIKELY(!full(slot2))) {
311 --slot2;
315 template <unsigned KeySize, class T>
316 MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::remove(
317 const T* aValue) {
318 uint32_t hash = aValue->hash();
319 remove(hash);
322 template <unsigned KeySize, class T>
323 MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
324 uint32_t aHash) const {
325 // Check that all the slots for this hash contain something
326 return firstSlot(aHash) && secondSlot(aHash);
329 template <unsigned KeySize, class T>
330 MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
331 const T* aValue) const {
332 uint32_t hash = aValue->hash();
333 return mightContain(hash);
336 } // namespace mozilla
338 #endif /* mozilla_BloomFilter_h */