mfbt/BloomFilter.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 /*
   8  * A counting Bloom filter implementation.  This allows consumers to
   9  * do fast probabilistic "is item X in set Y?" testing which will
  10  * never answer "no" when the correct answer is "yes" (but might
  11  * incorrectly answer "yes" when the correct answer is "no").
  12  */
  13
  14 #ifndef mozilla_BloomFilter_h
  15 #define mozilla_BloomFilter_h
  16
  17 #include "mozilla/Assertions.h"
  18 #include "mozilla/Likely.h"
  19
  20 #include <stdint.h>
  21 #include <string.h>
  22
  23 namespace mozilla {
  24
  25 /*
  26  * This class implements a counting Bloom filter as described at
  27  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
  28  * 8-bit counters.  This allows quick probabilistic answers to the
  29  * question "is object X in set Y?" where the contents of Y might not
  30  * be time-invariant.  The probabilistic nature of the test means that
  31  * sometimes the answer will be "yes" when it should be "no".  If the
  32  * answer is "no", then X is guaranteed not to be in Y.
  33  *
  34  * The filter is parametrized on KeySize, which is the size of the key
  35  * generated by each of hash functions used by the filter, in bits,
  36  * and the type of object T being added and removed.  T must implement
  37  * a |uint32_t hash() const| method which returns a uint32_t hash key
  38  * that will be used to generate the two separate hash functions for
  39  * the Bloom filter.  This hash key MUST be well-distributed for good
  40  * results!  KeySize is not allowed to be larger than 16.
  41  *
  42  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
  43  * will refer to the memory used by the filter as M.
  44  *
  45  * The expected rate of incorrect "yes" answers depends on M and on
  46  * the number N of objects in set Y.  As long as N is small compared
  47  * to M, the rate of such answers is expected to be approximately
  48  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
  49  * elements then using a KeySize of 12 gives a reasonably low
  50  * incorrect answer rate.  A KeySize of 12 has the additional benefit
  51  * of using exactly one page for the filter in typical hardware
  52  * configurations.
  53  */
  54
  55 template<unsigned KeySize, class T>
  56 class BloomFilter
  57 {
  58   /*
  59    * A counting Bloom filter with 8-bit counters.  For now we assume
  60    * that having two hash functions is enough, but we may revisit that
  61    * decision later.
  62    *
  63    * The filter uses an array with 2**KeySize entries.
  64    *
  65    * Assuming a well-distributed hash function, a Bloom filter with
  66    * array size M containing N elements and
  67    * using k hash function has expected false positive rate exactly
  68    *
  69    * $  (1 - (1 - 1/M)^{kN})^k  $
  70    *
  71    * because each array slot has a
  72    *
  73    * $  (1 - 1/M)^{kN}  $
  74    *
  75    * chance of being 0, and the expected false positive rate is the
  76    * probability that all of the k hash functions will hit a nonzero
  77    * slot.
  78    *
  79    * For reasonable assumptions (M large, kN large, which should both
  80    * hold if we're worried about false positives) about M and kN this
  81    * becomes approximately
  82    *
  83    * $$  (1 - \exp(-kN/M))^k   $$
  84    *
  85    * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
  86    * or in other words
  87    *
  88    * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
  89    *
  90    * where r is the false positive rate.  This can be used to compute
  91    * the desired KeySize for a given load N and false positive rate r.
  92    *
  93    * If N/M is assumed small, then the false positive rate can
  94    * further be approximated as 4*N^2/M^2.  So increasing KeySize by
  95    * 1, which doubles M, reduces the false positive rate by about a
  96    * factor of 4, and a false positive rate of 1% corresponds to
  97    * about M/N == 20.
  98    *
  99    * What this means in practice is that for a few hundred keys using a
 100    * KeySize of 12 gives false positive rates on the order of 0.25-4%.
 101    *
 102    * Similarly, using a KeySize of 10 would lead to a 4% false
 103    * positive rate for N == 100 and to quite bad false positive
 104    * rates for larger N.
 105    */
 106 public:
 107   BloomFilter()
 108   {
 109     static_assert(KeySize <= kKeyShift, "KeySize too big");
 110
 111     // Should we have a custom operator new using calloc instead and
 112     // require that we're allocated via the operator?
 113     clear();
 114   }
 115
 116   /*
 117    * Clear the filter.  This should be done before reusing it, because
 118    * just removing all items doesn't clear counters that hit the upper
 119    * bound.
 120    */
 121   void clear();
 122
 123   /*
 124    * Add an item to the filter.
 125    */
 126   void add(const T* aValue);
 127
 128   /*
 129    * Remove an item from the filter.
 130    */
 131   void remove(const T* aValue);
 132
 133   /*
 134    * Check whether the filter might contain an item.  This can
 135    * sometimes return true even if the item is not in the filter,
 136    * but will never return false for items that are actually in the
 137    * filter.
 138    */
 139   bool mightContain(const T* aValue) const;
 140
 141   /*
 142    * Methods for add/remove/contain when we already have a hash computed
 143    */
 144   void add(uint32_t aHash);
 145   void remove(uint32_t aHash);
 146   bool mightContain(uint32_t aHash) const;
 147
 148 private:
 149   static const size_t kArraySize = (1 << KeySize);
 150   static const uint32_t kKeyMask = (1 << KeySize) - 1;
 151   static const uint32_t kKeyShift = 16;
 152
 153   static uint32_t hash1(uint32_t aHash)
 154   {
 155     return aHash & kKeyMask;
 156   }
 157   static uint32_t hash2(uint32_t aHash)
 158   {
 159     return (aHash >> kKeyShift) & kKeyMask;
 160   }
 161
 162   uint8_t& firstSlot(uint32_t aHash)
 163   {
 164     return mCounters[hash1(aHash)];
 165   }
 166   uint8_t& secondSlot(uint32_t aHash)
 167   {
 168     return mCounters[hash2(aHash)];
 169   }
 170
 171   const uint8_t& firstSlot(uint32_t aHash) const
 172   {
 173     return mCounters[hash1(aHash)];
 174   }
 175   const uint8_t& secondSlot(uint32_t aHash) const
 176   {
 177     return mCounters[hash2(aHash)];
 178   }
 179
 180   static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
 181
 182   uint8_t mCounters[kArraySize];
 183 };
 184
 185 template<unsigned KeySize, class T>
 186 inline void
 187 BloomFilter<KeySize, T>::clear()
 188 {
 189   memset(mCounters, 0, kArraySize);
 190 }
 191
 192 template<unsigned KeySize, class T>
 193 inline void
 194 BloomFilter<KeySize, T>::add(uint32_t aHash)
 195 {
 196   uint8_t& slot1 = firstSlot(aHash);
 197   if (MOZ_LIKELY(!full(slot1))) {
 198     ++slot1;
 199   }
 200   uint8_t& slot2 = secondSlot(aHash);
 201   if (MOZ_LIKELY(!full(slot2))) {
 202     ++slot2;
 203   }
 204 }
 205
 206 template<unsigned KeySize, class T>
 207 MOZ_ALWAYS_INLINE void
 208 BloomFilter<KeySize, T>::add(const T* aValue)
 209 {
 210   uint32_t hash = aValue->hash();
 211   return add(hash);
 212 }
 213
 214 template<unsigned KeySize, class T>
 215 inline void
 216 BloomFilter<KeySize, T>::remove(uint32_t aHash)
 217 {
 218   // If the slots are full, we don't know whether we bumped them to be
 219   // there when we added or not, so just leave them full.
 220   uint8_t& slot1 = firstSlot(aHash);
 221   if (MOZ_LIKELY(!full(slot1))) {
 222     --slot1;
 223   }
 224   uint8_t& slot2 = secondSlot(aHash);
 225   if (MOZ_LIKELY(!full(slot2))) {
 226     --slot2;
 227   }
 228 }
 229
 230 template<unsigned KeySize, class T>
 231 MOZ_ALWAYS_INLINE void
 232 BloomFilter<KeySize, T>::remove(const T* aValue)
 233 {
 234   uint32_t hash = aValue->hash();
 235   remove(hash);
 236 }
 237
 238 template<unsigned KeySize, class T>
 239 MOZ_ALWAYS_INLINE bool
 240 BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
 241 {
 242   // Check that all the slots for this hash contain something
 243   return firstSlot(aHash) && secondSlot(aHash);
 244 }
 245
 246 template<unsigned KeySize, class T>
 247 MOZ_ALWAYS_INLINE bool
 248 BloomFilter<KeySize, T>::mightContain(const T* aValue) const
 249 {
 250   uint32_t hash = aValue->hash();
 251   return mightContain(hash);
 252 }
 253
 254 } // namespace mozilla
 255
 256 #endif /* mozilla_BloomFilter_h */