Merge fx-team to m-c.
[gecko.git] / mfbt / BloomFilter.h
blobafe4b72b8090b8f81f6db60d7522362d7e8048e1
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /*
8 * A counting Bloom filter implementation. This allows consumers to
9 * do fast probabilistic "is item X in set Y?" testing which will
10 * never answer "no" when the correct answer is "yes" (but might
11 * incorrectly answer "yes" when the correct answer is "no").
14 #ifndef mozilla_BloomFilter_h
15 #define mozilla_BloomFilter_h
17 #include "mozilla/Assertions.h"
18 #include "mozilla/Likely.h"
19 #include "mozilla/Util.h"
21 #include <stdint.h>
22 #include <string.h>
24 namespace mozilla {
27 * This class implements a counting Bloom filter as described at
28 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
29 * 8-bit counters. This allows quick probabilistic answers to the
30 * question "is object X in set Y?" where the contents of Y might not
31 * be time-invariant. The probabilistic nature of the test means that
32 * sometimes the answer will be "yes" when it should be "no". If the
33 * answer is "no", then X is guaranteed not to be in Y.
35 * The filter is parametrized on KeySize, which is the size of the key
36 * generated by each of hash functions used by the filter, in bits,
37 * and the type of object T being added and removed. T must implement
38 * a |uint32_t hash() const| method which returns a uint32_t hash key
39 * that will be used to generate the two separate hash functions for
40 * the Bloom filter. This hash key MUST be well-distributed for good
41 * results! KeySize is not allowed to be larger than 16.
43 * The filter uses exactly 2**KeySize bytes of memory. From now on we
44 * will refer to the memory used by the filter as M.
46 * The expected rate of incorrect "yes" answers depends on M and on
47 * the number N of objects in set Y. As long as N is small compared
48 * to M, the rate of such answers is expected to be approximately
49 * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
50 * elements then using a KeySize of 12 gives a reasonably low
51 * incorrect answer rate. A KeySize of 12 has the additional benefit
52 * of using exactly one page for the filter in typical hardware
53 * configurations.
56 template<unsigned KeySize, class T>
57 class BloomFilter
60 * A counting Bloom filter with 8-bit counters. For now we assume
61 * that having two hash functions is enough, but we may revisit that
62 * decision later.
64 * The filter uses an array with 2**KeySize entries.
66 * Assuming a well-distributed hash function, a Bloom filter with
67 * array size M containing N elements and
68 * using k hash function has expected false positive rate exactly
70 * $ (1 - (1 - 1/M)^{kN})^k $
72 * because each array slot has a
74 * $ (1 - 1/M)^{kN} $
76 * chance of being 0, and the expected false positive rate is the
77 * probability that all of the k hash functions will hit a nonzero
78 * slot.
80 * For reasonable assumptions (M large, kN large, which should both
81 * hold if we're worried about false positives) about M and kN this
82 * becomes approximately
84 * $$ (1 - \exp(-kN/M))^k $$
86 * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
87 * or in other words
89 * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
91 * where r is the false positive rate. This can be used to compute
92 * the desired KeySize for a given load N and false positive rate r.
94 * If N/M is assumed small, then the false positive rate can
95 * further be approximated as 4*N^2/M^2. So increasing KeySize by
96 * 1, which doubles M, reduces the false positive rate by about a
97 * factor of 4, and a false positive rate of 1% corresponds to
98 * about M/N == 20.
100 * What this means in practice is that for a few hundred keys using a
101 * KeySize of 12 gives false positive rates on the order of 0.25-4%.
103 * Similarly, using a KeySize of 10 would lead to a 4% false
104 * positive rate for N == 100 and to quite bad false positive
105 * rates for larger N.
107 public:
108 BloomFilter() {
109 static_assert(KeySize <= keyShift, "KeySize too big");
111 // Should we have a custom operator new using calloc instead and
112 // require that we're allocated via the operator?
113 clear();
117 * Clear the filter. This should be done before reusing it, because
118 * just removing all items doesn't clear counters that hit the upper
119 * bound.
121 void clear();
124 * Add an item to the filter.
126 void add(const T* t);
129 * Remove an item from the filter.
131 void remove(const T* t);
134 * Check whether the filter might contain an item. This can
135 * sometimes return true even if the item is not in the filter,
136 * but will never return false for items that are actually in the
137 * filter.
139 bool mightContain(const T* t) const;
142 * Methods for add/remove/contain when we already have a hash computed
144 void add(uint32_t hash);
145 void remove(uint32_t hash);
146 bool mightContain(uint32_t hash) const;
148 private:
149 static const size_t arraySize = (1 << KeySize);
150 static const uint32_t keyMask = (1 << KeySize) - 1;
151 static const uint32_t keyShift = 16;
153 static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
154 static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
156 uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
157 uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
158 const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
159 const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
161 static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
163 uint8_t counters[arraySize];
166 template<unsigned KeySize, class T>
167 inline void
168 BloomFilter<KeySize, T>::clear()
170 memset(counters, 0, arraySize);
173 template<unsigned KeySize, class T>
174 inline void
175 BloomFilter<KeySize, T>::add(uint32_t hash)
177 uint8_t& slot1 = firstSlot(hash);
178 if (MOZ_LIKELY(!full(slot1)))
179 ++slot1;
181 uint8_t& slot2 = secondSlot(hash);
182 if (MOZ_LIKELY(!full(slot2)))
183 ++slot2;
186 template<unsigned KeySize, class T>
187 MOZ_ALWAYS_INLINE void
188 BloomFilter<KeySize, T>::add(const T* t)
190 uint32_t hash = t->hash();
191 return add(hash);
194 template<unsigned KeySize, class T>
195 inline void
196 BloomFilter<KeySize, T>::remove(uint32_t hash)
198 // If the slots are full, we don't know whether we bumped them to be
199 // there when we added or not, so just leave them full.
200 uint8_t& slot1 = firstSlot(hash);
201 if (MOZ_LIKELY(!full(slot1)))
202 --slot1;
204 uint8_t& slot2 = secondSlot(hash);
205 if (MOZ_LIKELY(!full(slot2)))
206 --slot2;
209 template<unsigned KeySize, class T>
210 MOZ_ALWAYS_INLINE void
211 BloomFilter<KeySize, T>::remove(const T* t)
213 uint32_t hash = t->hash();
214 remove(hash);
217 template<unsigned KeySize, class T>
218 MOZ_ALWAYS_INLINE bool
219 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
221 // Check that all the slots for this hash contain something
222 return firstSlot(hash) && secondSlot(hash);
225 template<unsigned KeySize, class T>
226 MOZ_ALWAYS_INLINE bool
227 BloomFilter<KeySize, T>::mightContain(const T* t) const
229 uint32_t hash = t->hash();
230 return mightContain(hash);
233 } // namespace mozilla
235 #endif /* mozilla_BloomFilter_h */