1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
8 * A counting Bloom filter implementation. This allows consumers to
9 * do fast probabilistic "is item X in set Y?" testing which will
10 * never answer "no" when the correct answer is "yes" (but might
11 * incorrectly answer "yes" when the correct answer is "no").
14 #ifndef mozilla_BloomFilter_h
15 #define mozilla_BloomFilter_h
17 #include "mozilla/Assertions.h"
18 #include "mozilla/Likely.h"
26 * This class implements a counting Bloom filter as described at
27 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
28 * 8-bit counters. This allows quick probabilistic answers to the
29 * question "is object X in set Y?" where the contents of Y might not
30 * be time-invariant. The probabilistic nature of the test means that
31 * sometimes the answer will be "yes" when it should be "no". If the
32 * answer is "no", then X is guaranteed not to be in Y.
34 * The filter is parametrized on KeySize, which is the size of the key
35 * generated by each of hash functions used by the filter, in bits,
36 * and the type of object T being added and removed. T must implement
37 * a |uint32_t hash() const| method which returns a uint32_t hash key
38 * that will be used to generate the two separate hash functions for
39 * the Bloom filter. This hash key MUST be well-distributed for good
40 * results! KeySize is not allowed to be larger than 16.
42 * The filter uses exactly 2**KeySize bytes of memory. From now on we
43 * will refer to the memory used by the filter as M.
45 * The expected rate of incorrect "yes" answers depends on M and on
46 * the number N of objects in set Y. As long as N is small compared
47 * to M, the rate of such answers is expected to be approximately
48 * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
49 * elements then using a KeySize of 12 gives a reasonably low
50 * incorrect answer rate. A KeySize of 12 has the additional benefit
51 * of using exactly one page for the filter in typical hardware
55 template <unsigned KeySize
, class T
>
58 * A counting Bloom filter with 8-bit counters. For now we assume
59 * that having two hash functions is enough, but we may revisit that
62 * The filter uses an array with 2**KeySize entries.
64 * Assuming a well-distributed hash function, a Bloom filter with
65 * array size M containing N elements and
66 * using k hash function has expected false positive rate exactly
68 * $ (1 - (1 - 1/M)^{kN})^k $
70 * because each array slot has a
74 * chance of being 0, and the expected false positive rate is the
75 * probability that all of the k hash functions will hit a nonzero
78 * For reasonable assumptions (M large, kN large, which should both
79 * hold if we're worried about false positives) about M and kN this
80 * becomes approximately
82 * $$ (1 - \exp(-kN/M))^k $$
84 * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
87 * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
89 * where r is the false positive rate. This can be used to compute
90 * the desired KeySize for a given load N and false positive rate r.
92 * If N/M is assumed small, then the false positive rate can
93 * further be approximated as 4*N^2/M^2. So increasing KeySize by
94 * 1, which doubles M, reduces the false positive rate by about a
95 * factor of 4, and a false positive rate of 1% corresponds to
98 * What this means in practice is that for a few hundred keys using a
99 * KeySize of 12 gives false positive rates on the order of 0.25-4%.
101 * Similarly, using a KeySize of 10 would lead to a 4% false
102 * positive rate for N == 100 and to quite bad false positive
103 * rates for larger N.
107 static_assert(KeySize
<= kKeyShift
, "KeySize too big");
109 // Should we have a custom operator new using calloc instead and
110 // require that we're allocated via the operator?
115 * Clear the filter. This should be done before reusing it, because
116 * just removing all items doesn't clear counters that hit the upper
122 * Add an item to the filter.
124 void add(const T
* aValue
);
127 * Remove an item from the filter.
129 void remove(const T
* aValue
);
132 * Check whether the filter might contain an item. This can
133 * sometimes return true even if the item is not in the filter,
134 * but will never return false for items that are actually in the
137 bool mightContain(const T
* aValue
) const;
140 * Methods for add/remove/contain when we already have a hash computed
142 void add(uint32_t aHash
);
143 void remove(uint32_t aHash
);
144 bool mightContain(uint32_t aHash
) const;
147 static const size_t kArraySize
= (1 << KeySize
);
148 static const uint32_t kKeyMask
= (1 << KeySize
) - 1;
149 static const uint32_t kKeyShift
= 16;
151 static uint32_t hash1(uint32_t aHash
) { return aHash
& kKeyMask
; }
152 static uint32_t hash2(uint32_t aHash
) {
153 return (aHash
>> kKeyShift
) & kKeyMask
;
156 uint8_t& firstSlot(uint32_t aHash
) { return mCounters
[hash1(aHash
)]; }
157 uint8_t& secondSlot(uint32_t aHash
) { return mCounters
[hash2(aHash
)]; }
159 const uint8_t& firstSlot(uint32_t aHash
) const {
160 return mCounters
[hash1(aHash
)];
162 const uint8_t& secondSlot(uint32_t aHash
) const {
163 return mCounters
[hash2(aHash
)];
166 static bool full(const uint8_t& aSlot
) { return aSlot
== UINT8_MAX
; }
168 uint8_t mCounters
[kArraySize
];
171 template <unsigned KeySize
, class T
>
172 inline void BloomFilter
<KeySize
, T
>::clear() {
173 memset(mCounters
, 0, kArraySize
);
176 template <unsigned KeySize
, class T
>
177 inline void BloomFilter
<KeySize
, T
>::add(uint32_t aHash
) {
178 uint8_t& slot1
= firstSlot(aHash
);
179 if (MOZ_LIKELY(!full(slot1
))) {
182 uint8_t& slot2
= secondSlot(aHash
);
183 if (MOZ_LIKELY(!full(slot2
))) {
188 template <unsigned KeySize
, class T
>
189 MOZ_ALWAYS_INLINE
void BloomFilter
<KeySize
, T
>::add(const T
* aValue
) {
190 uint32_t hash
= aValue
->hash();
194 template <unsigned KeySize
, class T
>
195 inline void BloomFilter
<KeySize
, T
>::remove(uint32_t aHash
) {
196 // If the slots are full, we don't know whether we bumped them to be
197 // there when we added or not, so just leave them full.
198 uint8_t& slot1
= firstSlot(aHash
);
199 if (MOZ_LIKELY(!full(slot1
))) {
202 uint8_t& slot2
= secondSlot(aHash
);
203 if (MOZ_LIKELY(!full(slot2
))) {
208 template <unsigned KeySize
, class T
>
209 MOZ_ALWAYS_INLINE
void BloomFilter
<KeySize
, T
>::remove(const T
* aValue
) {
210 uint32_t hash
= aValue
->hash();
214 template <unsigned KeySize
, class T
>
215 MOZ_ALWAYS_INLINE
bool BloomFilter
<KeySize
, T
>::mightContain(
216 uint32_t aHash
) const {
217 // Check that all the slots for this hash contain something
218 return firstSlot(aHash
) && secondSlot(aHash
);
221 template <unsigned KeySize
, class T
>
222 MOZ_ALWAYS_INLINE
bool BloomFilter
<KeySize
, T
>::mightContain(
223 const T
* aValue
) const {
224 uint32_t hash
= aValue
->hash();
225 return mightContain(hash
);
228 } // namespace mozilla
230 #endif /* mozilla_BloomFilter_h */