mfbt/BloomFilter.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 /*
   8  * A counting Bloom filter implementation.  This allows consumers to
   9  * do fast probabilistic "is item X in set Y?" testing which will
  10  * never answer "no" when the correct answer is "yes" (but might
  11  * incorrectly answer "yes" when the correct answer is "no").
  12  */
  13
  14 #ifndef mozilla_BloomFilter_h
  15 #define mozilla_BloomFilter_h
  16
  17 #include "mozilla/Assertions.h"
  18 #include "mozilla/Likely.h"
  19 #include "mozilla/Util.h"
  20
  21 #include <stdint.h>
  22 #include <string.h>
  23
  24 namespace mozilla {
  25
  26 /*
  27  * This class implements a counting Bloom filter as described at
  28  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
  29  * 8-bit counters.  This allows quick probabilistic answers to the
  30  * question "is object X in set Y?" where the contents of Y might not
  31  * be time-invariant.  The probabilistic nature of the test means that
  32  * sometimes the answer will be "yes" when it should be "no".  If the
  33  * answer is "no", then X is guaranteed not to be in Y.
  34  *
  35  * The filter is parametrized on KeySize, which is the size of the key
  36  * generated by each of hash functions used by the filter, in bits,
  37  * and the type of object T being added and removed.  T must implement
  38  * a |uint32_t hash() const| method which returns a uint32_t hash key
  39  * that will be used to generate the two separate hash functions for
  40  * the Bloom filter.  This hash key MUST be well-distributed for good
  41  * results!  KeySize is not allowed to be larger than 16.
  42  *
  43  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
  44  * will refer to the memory used by the filter as M.
  45  *
  46  * The expected rate of incorrect "yes" answers depends on M and on
  47  * the number N of objects in set Y.  As long as N is small compared
  48  * to M, the rate of such answers is expected to be approximately
  49  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
  50  * elements then using a KeySize of 12 gives a reasonably low
  51  * incorrect answer rate.  A KeySize of 12 has the additional benefit
  52  * of using exactly one page for the filter in typical hardware
  53  * configurations.
  54  */
  55
  56 template<unsigned KeySize, class T>
  57 class BloomFilter
  58 {
  59     /*
  60      * A counting Bloom filter with 8-bit counters.  For now we assume
  61      * that having two hash functions is enough, but we may revisit that
  62      * decision later.
  63      *
  64      * The filter uses an array with 2**KeySize entries.
  65      *
  66      * Assuming a well-distributed hash function, a Bloom filter with
  67      * array size M containing N elements and
  68      * using k hash function has expected false positive rate exactly
  69      *
  70      * $  (1 - (1 - 1/M)^{kN})^k  $
  71      *
  72      * because each array slot has a
  73      *
  74      * $  (1 - 1/M)^{kN}  $
  75      *
  76      * chance of being 0, and the expected false positive rate is the
  77      * probability that all of the k hash functions will hit a nonzero
  78      * slot.
  79      *
  80      * For reasonable assumptions (M large, kN large, which should both
  81      * hold if we're worried about false positives) about M and kN this
  82      * becomes approximately
  83      *
  84      * $$  (1 - \exp(-kN/M))^k   $$
  85      *
  86      * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
  87      * or in other words
  88      *
  89      * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
  90      *
  91      * where r is the false positive rate.  This can be used to compute
  92      * the desired KeySize for a given load N and false positive rate r.
  93      *
  94      * If N/M is assumed small, then the false positive rate can
  95      * further be approximated as 4*N^2/M^2.  So increasing KeySize by
  96      * 1, which doubles M, reduces the false positive rate by about a
  97      * factor of 4, and a false positive rate of 1% corresponds to
  98      * about M/N == 20.
  99      *
 100      * What this means in practice is that for a few hundred keys using a
 101      * KeySize of 12 gives false positive rates on the order of 0.25-4%.
 102      *
 103      * Similarly, using a KeySize of 10 would lead to a 4% false
 104      * positive rate for N == 100 and to quite bad false positive
 105      * rates for larger N.
 106      */
 107   public:
 108     BloomFilter() {
 109         static_assert(KeySize <= keyShift, "KeySize too big");
 110
 111         // Should we have a custom operator new using calloc instead and
 112         // require that we're allocated via the operator?
 113         clear();
 114     }
 115
 116     /*
 117      * Clear the filter.  This should be done before reusing it, because
 118      * just removing all items doesn't clear counters that hit the upper
 119      * bound.
 120      */
 121     void clear();
 122
 123     /*
 124      * Add an item to the filter.
 125      */
 126     void add(const T* t);
 127
 128     /*
 129      * Remove an item from the filter.
 130      */
 131     void remove(const T* t);
 132
 133     /*
 134      * Check whether the filter might contain an item.  This can
 135      * sometimes return true even if the item is not in the filter,
 136      * but will never return false for items that are actually in the
 137      * filter.
 138      */
 139     bool mightContain(const T* t) const;
 140
 141     /*
 142      * Methods for add/remove/contain when we already have a hash computed
 143      */
 144     void add(uint32_t hash);
 145     void remove(uint32_t hash);
 146     bool mightContain(uint32_t hash) const;
 147
 148   private:
 149     static const size_t arraySize = (1 << KeySize);
 150     static const uint32_t keyMask = (1 << KeySize) - 1;
 151     static const uint32_t keyShift = 16;
 152
 153     static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
 154     static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
 155
 156     uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
 157     uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
 158     const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
 159     const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
 160
 161     static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
 162
 163     uint8_t counters[arraySize];
 164 };
 165
 166 template<unsigned KeySize, class T>
 167 inline void
 168 BloomFilter<KeySize, T>::clear()
 169 {
 170   memset(counters, 0, arraySize);
 171 }
 172
 173 template<unsigned KeySize, class T>
 174 inline void
 175 BloomFilter<KeySize, T>::add(uint32_t hash)
 176 {
 177   uint8_t& slot1 = firstSlot(hash);
 178   if (MOZ_LIKELY(!full(slot1)))
 179     ++slot1;
 180
 181   uint8_t& slot2 = secondSlot(hash);
 182   if (MOZ_LIKELY(!full(slot2)))
 183     ++slot2;
 184 }
 185
 186 template<unsigned KeySize, class T>
 187 MOZ_ALWAYS_INLINE void
 188 BloomFilter<KeySize, T>::add(const T* t)
 189 {
 190   uint32_t hash = t->hash();
 191   return add(hash);
 192 }
 193
 194 template<unsigned KeySize, class T>
 195 inline void
 196 BloomFilter<KeySize, T>::remove(uint32_t hash)
 197 {
 198   // If the slots are full, we don't know whether we bumped them to be
 199   // there when we added or not, so just leave them full.
 200   uint8_t& slot1 = firstSlot(hash);
 201   if (MOZ_LIKELY(!full(slot1)))
 202     --slot1;
 203
 204   uint8_t& slot2 = secondSlot(hash);
 205   if (MOZ_LIKELY(!full(slot2)))
 206     --slot2;
 207 }
 208
 209 template<unsigned KeySize, class T>
 210 MOZ_ALWAYS_INLINE void
 211 BloomFilter<KeySize, T>::remove(const T* t)
 212 {
 213   uint32_t hash = t->hash();
 214   remove(hash);
 215 }
 216
 217 template<unsigned KeySize, class T>
 218 MOZ_ALWAYS_INLINE bool
 219 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
 220 {
 221   // Check that all the slots for this hash contain something
 222   return firstSlot(hash) && secondSlot(hash);
 223 }
 224
 225 template<unsigned KeySize, class T>
 226 MOZ_ALWAYS_INLINE bool
 227 BloomFilter<KeySize, T>::mightContain(const T* t) const
 228 {
 229   uint32_t hash = t->hash();
 230   return mightContain(hash);
 231 }
 232
 233 } // namespace mozilla
 234
 235 #endif /* mozilla_BloomFilter_h */