src/backend/access/brin/brin_bloom.c

   1 /*
   2  * brin_bloom.c
   3  *              Implementation of Bloom opclass for BRIN
   4  *
   5  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
   6  * Portions Copyright (c) 1994, Regents of the University of California
   7  *
   8  *
   9  * A BRIN opclass summarizing page range into a bloom filter.
  10  *
  11  * Bloom filters allow efficient testing whether a given page range contains
  12  * a particular value. Therefore, if we summarize each page range into a small
  13  * bloom filter, we can easily (and cheaply) test whether it contains values
  14  * we get later.
  15  *
  16  * The index only supports equality operators, similarly to hash indexes.
  17  * Bloom indexes are however much smaller, and support only bitmap scans.
  18  *
  19  * Note: Don't confuse this with bloom indexes, implemented in a contrib
  20  * module. That extension implements an entirely new AM, building a bloom
  21  * filter on multiple columns in a single row. This opclass works with an
  22  * existing AM (BRIN) and builds bloom filter on a column.
  23  *
  24  *
  25  * values vs. hashes
  26  * -----------------
  27  *
  28  * The original column values are not used directly, but are first hashed
  29  * using the regular type-specific hash function, producing a uint32 hash.
  30  * And this hash value is then added to the summary - i.e. it's hashed
  31  * again and added to the bloom filter.
  32  *
  33  * This allows the code to treat all data types (byval/byref/...) the same
  34  * way, with only minimal space requirements, because we're working with
  35  * hashes and not the original values. Everything is uint32.
  36  *
  37  * Of course, this assumes the built-in hash function is reasonably good,
  38  * without too many collisions etc. But that does seem to be the case, at
  39  * least based on past experience. After all, the same hash functions are
  40  * used for hash indexes, hash partitioning and so on.
  41  *
  42  *
  43  * hashing scheme
  44  * --------------
  45  *
  46  * Bloom filters require a number of independent hash functions. There are
  47  * different schemes how to construct them - for example we might use
  48  * hash_uint32_extended with random seeds, but that seems fairly expensive.
  49  * We use a scheme requiring only two functions described in this paper:
  50  *
  51  * Less Hashing, Same Performance:Building a Better Bloom Filter
  52  * Adam Kirsch, Michael Mitzenmacher, Harvard School of Engineering and
  53  * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208]
  54  *
  55  * The two hash functions h1 and h2 are calculated using hard-coded seeds,
  56  * and then combined using (h1 + i * h2) to generate the hash functions.
  57  *
  58  *
  59  * sizing the bloom filter
  60  * -----------------------
  61  *
  62  * Size of a bloom filter depends on the number of distinct values we will
  63  * store in it, and the desired false positive rate. The higher the number
  64  * of distinct values and/or the lower the false positive rate, the larger
  65  * the bloom filter. On the other hand, we want to keep the index as small
  66  * as possible - that's one of the basic advantages of BRIN indexes.
  67  *
  68  * Although the number of distinct elements (in a page range) depends on
  69  * the data, we can consider it fixed. This simplifies the trade-off to
  70  * just false positive rate vs. size.
  71  *
  72  * At the page range level, false positive rate is a probability the bloom
  73  * filter matches a random value. For the whole index (with sufficiently
  74  * many page ranges) it represents the fraction of the index ranges (and
  75  * thus fraction of the table to be scanned) matching the random value.
  76  *
  77  * Furthermore, the size of the bloom filter is subject to implementation
  78  * limits - it has to fit onto a single index page (8kB by default). As
  79  * the bitmap is inherently random (when "full" about half the bits is set
  80  * to 1, randomly), compression can't help very much.
  81  *
  82  * To reduce the size of a filter (to fit to a page), we have to either
  83  * accept higher false positive rate (undesirable), or reduce the number
  84  * of distinct items to be stored in the filter. We can't alter the input
  85  * data, of course, but we may make the BRIN page ranges smaller - instead
  86  * of the default 128 pages (1MB) we may build index with 16-page ranges,
  87  * or something like that. This should reduce the number of distinct values
  88  * in the page range, making the filter smaller (with fixed false positive
  89  * rate). Even for random data sets this should help, as the number of rows
  90  * per heap page is limited (to ~290 with very narrow tables, likely ~20
  91  * in practice).
  92  *
  93  * Of course, good sizing decisions depend on having the necessary data,
  94  * i.e. number of distinct values in a page range (of a given size) and
  95  * table size (to estimate cost change due to change in false positive
  96  * rate due to having larger index vs. scanning larger indexes). We may
  97  * not have that data - for example when building an index on empty table
  98  * it's not really possible. And for some data we only have estimates for
  99  * the whole table and we can only estimate per-range values (ndistinct).
 100  *
 101  * Another challenge is that while the bloom filter is per-column, it's
 102  * the whole index tuple that has to fit into a page. And for multi-column
 103  * indexes that may include pieces we have no control over (not necessarily
 104  * bloom filters, the other columns may use other BRIN opclasses). So it's
 105  * not entirely clear how to distribute the space between those columns.
 106  *
 107  * The current logic, implemented in brin_bloom_get_ndistinct, attempts to
 108  * make some basic sizing decisions, based on the size of BRIN ranges, and
 109  * the maximum number of rows per range.
 110  *
 111  *
 112  * IDENTIFICATION
 113  *        src/backend/access/brin/brin_bloom.c
 114  */
 115 #include "postgres.h"
 116
 117 #include "access/genam.h"
 118 #include "access/brin.h"
 119 #include "access/brin_internal.h"
 120 #include "access/brin_page.h"
 121 #include "access/brin_tuple.h"
 122 #include "access/hash.h"
 123 #include "access/htup_details.h"
 124 #include "access/reloptions.h"
 125 #include "access/stratnum.h"
 126 #include "catalog/pg_type.h"
 127 #include "catalog/pg_amop.h"
 128 #include "utils/builtins.h"
 129 #include "utils/datum.h"
 130 #include "utils/lsyscache.h"
 131 #include "utils/rel.h"
 132 #include "utils/syscache.h"
 133
 134 #include <math.h>
 135
 136 #define BloomEqualStrategyNumber        1
 137
 138 /*
 139  * Additional SQL level support functions. We only have one, which is
 140  * used to calculate hash of the input value.
 141  *
 142  * Procedure numbers must not use values reserved for BRIN itself; see
 143  * brin_internal.h.
 144  */
 145 #define         BLOOM_MAX_PROCNUMS              1       /* maximum support procs we need */
 146 #define         PROCNUM_HASH                    11      /* required */
 147
 148 /*
 149  * Subtract this from procnum to obtain index in BloomOpaque arrays
 150  * (Must be equal to minimum of private procnums).
 151  */
 152 #define         PROCNUM_BASE                    11
 153
 154 /*
 155  * Storage type for BRIN's reloptions.
 156  */
 157 typedef struct BloomOptions
 158 {
 159         int32           vl_len_;                /* varlena header (do not touch directly!) */
 160         double          nDistinctPerRange;      /* number of distinct values per range */
 161         double          falsePositiveRate;      /* false positive for bloom filter */
 162 } BloomOptions;
 163
 164 /*
 165  * The current min value (16) is somewhat arbitrary, but it's based
 166  * on the fact that the filter header is ~20B alone, which is about
 167  * the same as the filter bitmap for 16 distinct items with 1% false
 168  * positive rate. So by allowing lower values we'd not gain much. In
 169  * any case, the min should not be larger than MaxHeapTuplesPerPage
 170  * (~290), which is the theoretical maximum for single-page ranges.
 171  */
 172 #define         BLOOM_MIN_NDISTINCT_PER_RANGE           16
 173
 174 /*
 175  * Used to determine number of distinct items, based on the number of rows
 176  * in a page range. The 10% is somewhat similar to what estimate_num_groups
 177  * does, so we use the same factor here.
 178  */
 179 #define         BLOOM_DEFAULT_NDISTINCT_PER_RANGE       -0.1    /* 10% of values */
 180
 181 /*
 182  * Allowed range and default value for the false positive range. The exact
 183  * values are somewhat arbitrary, but were chosen considering the various
 184  * parameters (size of filter vs. page size, etc.).
 185  *
 186  * The lower the false-positive rate, the more accurate the filter is, but
 187  * it also gets larger - at some point this eliminates the main advantage
 188  * of BRIN indexes, which is the tiny size. At 0.01% the index is about
 189  * 10% of the table (assuming 290 distinct values per 8kB page).
 190  *
 191  * On the other hand, as the false-positive rate increases, larger part of
 192  * the table has to be scanned due to mismatches - at 25% we're probably
 193  * close to sequential scan being cheaper.
 194  */
 195 #define         BLOOM_MIN_FALSE_POSITIVE_RATE   0.0001  /* 0.01% fp rate */
 196 #define         BLOOM_MAX_FALSE_POSITIVE_RATE   0.25    /* 25% fp rate */
 197 #define         BLOOM_DEFAULT_FALSE_POSITIVE_RATE       0.01    /* 1% fp rate */
 198
 199 #define BloomGetNDistinctPerRange(opts) \
 200         ((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \
 201          (((BloomOptions *) (opts))->nDistinctPerRange) : \
 202          BLOOM_DEFAULT_NDISTINCT_PER_RANGE)
 203
 204 #define BloomGetFalsePositiveRate(opts) \
 205         ((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \
 206          (((BloomOptions *) (opts))->falsePositiveRate) : \
 207          BLOOM_DEFAULT_FALSE_POSITIVE_RATE)
 208
 209 /*
 210  * And estimate of the largest bloom we can fit onto a page. This is not
 211  * a perfect guarantee, for a couple of reasons. For example, the row may
 212  * be larger because the index has multiple columns.
 213  */
 214 #define BloomMaxFilterSize \
 215         MAXALIGN_DOWN(BLCKSZ - \
 216                                   (MAXALIGN(SizeOfPageHeaderData + \
 217                                                         sizeof(ItemIdData)) + \
 218                                    MAXALIGN(sizeof(BrinSpecialSpace)) + \
 219                                    SizeOfBrinTuple))
 220
 221 /*
 222  * Seeds used to calculate two hash functions h1 and h2, which are then used
 223  * to generate k hashes using the (h1 + i * h2) scheme.
 224  */
 225 #define BLOOM_SEED_1    0x71d924af
 226 #define BLOOM_SEED_2    0xba48b314
 227
 228 /*
 229  * Bloom Filter
 230  *
 231  * Represents a bloom filter, built on hashes of the indexed values. That is,
 232  * we compute a uint32 hash of the value, and then store this hash into the
 233  * bloom filter (and compute additional hashes on it).
 234  *
 235  * XXX We could implement "sparse" bloom filters, keeping only the bytes that
 236  * are not entirely 0. But while indexes don't support TOAST, the varlena can
 237  * still be compressed. So this seems unnecessary, because the compression
 238  * should do the same job.
 239  *
 240  * XXX We can also watch the number of bits set in the bloom filter, and then
 241  * stop using it (and not store the bitmap, to save space) when the false
 242  * positive rate gets too high. But even if the false positive rate exceeds the
 243  * desired value, it still can eliminate some page ranges.
 244  */
 245 typedef struct BloomFilter
 246 {
 247         /* varlena header (do not touch directly!) */
 248         int32           vl_len_;
 249
 250         /* space for various flags (unused for now) */
 251         uint16          flags;
 252
 253         /* fields for the HASHED phase */
 254         uint8           nhashes;                /* number of hash functions */
 255         uint32          nbits;                  /* number of bits in the bitmap (size) */
 256         uint32          nbits_set;              /* number of bits set to 1 */
 257
 258         /* data of the bloom filter */
 259         char            data[FLEXIBLE_ARRAY_MEMBER];
 260
 261 } BloomFilter;
 262
 263
 264 /*
 265  * bloom_init
 266  *              Initialize the Bloom Filter, allocate all the memory.
 267  *
 268  * The filter is initialized with optimal size for ndistinct expected values
 269  * and the requested false positive rate. The filter is stored as varlena.
 270  */
 271 static BloomFilter *
 272 bloom_init(int ndistinct, double false_positive_rate)
 273 {
 274         Size            len;
 275         BloomFilter *filter;
 276
 277         int                     nbits;                  /* size of filter / number of bits */
 278         int                     nbytes;                 /* size of filter / number of bytes */
 279
 280         double          k;                              /* number of hash functions */
 281
 282         Assert(ndistinct > 0);
 283         Assert((false_positive_rate >= BLOOM_MIN_FALSE_POSITIVE_RATE) &&
 284                    (false_positive_rate < BLOOM_MAX_FALSE_POSITIVE_RATE));
 285
 286         /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */
 287         nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2));
 288
 289         /* round m to whole bytes */
 290         nbytes = ((nbits + 7) / 8);
 291         nbits = nbytes * 8;
 292
 293         /*
 294          * Reject filters that are obviously too large to store on a page.
 295          *
 296          * Initially the bloom filter is just zeroes and so very compressible, but
 297          * as we add values it gets more and more random, and so less and less
 298          * compressible. So initially everything fits on the page, but we might
 299          * get surprising failures later - we want to prevent that, so we reject
 300          * bloom filter that are obviously too large.
 301          *
 302          * XXX It's not uncommon to oversize the bloom filter a bit, to defend
 303          * against unexpected data anomalies (parts of table with more distinct
 304          * values per range etc.). But we still need to make sure even the
 305          * oversized filter fits on page, if such need arises.
 306          *
 307          * XXX This check is not perfect, because the index may have multiple
 308          * filters that are small individually, but too large when combined.
 309          */
 310         if (nbytes > BloomMaxFilterSize)
 311                 elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes,
 312                          BloomMaxFilterSize);
 313
 314         /*
 315          * round(log(2.0) * m / ndistinct), but assume round() may not be
 316          * available on Windows
 317          */
 318         k = log(2.0) * nbits / ndistinct;
 319         k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k);
 320
 321         /*
 322          * We allocate the whole filter. Most of it is going to be 0 bits, so the
 323          * varlena is easy to compress.
 324          */
 325         len = offsetof(BloomFilter, data) + nbytes;
 326
 327         filter = (BloomFilter *) palloc0(len);
 328
 329         filter->flags = 0;
 330         filter->nhashes = (int) k;
 331         filter->nbits = nbits;
 332
 333         SET_VARSIZE(filter, len);
 334
 335         return filter;
 336 }
 337
 338
 339 /*
 340  * bloom_add_value
 341  *              Add value to the bloom filter.
 342  */
 343 static BloomFilter *
 344 bloom_add_value(BloomFilter *filter, uint32 value, bool *updated)
 345 {
 346         int                     i;
 347         uint64          h1,
 348                                 h2;
 349
 350         /* compute the hashes, used for the bloom filter */
 351         h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
 352         h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;
 353
 354         /* compute the requested number of hashes */
 355         for (i = 0; i < filter->nhashes; i++)
 356         {
 357                 /* h1 + h2 + f(i) */
 358                 uint32          h = (h1 + i * h2) % filter->nbits;
 359                 uint32          byte = (h / 8);
 360                 uint32          bit = (h % 8);
 361
 362                 /* if the bit is not set, set it and remember we did that */
 363                 if (!(filter->data[byte] & (0x01 << bit)))
 364                 {
 365                         filter->data[byte] |= (0x01 << bit);
 366                         filter->nbits_set++;
 367                         if (updated)
 368                                 *updated = true;
 369                 }
 370         }
 371
 372         return filter;
 373 }
 374
 375
 376 /*
 377  * bloom_contains_value
 378  *              Check if the bloom filter contains a particular value.
 379  */
 380 static bool
 381 bloom_contains_value(BloomFilter *filter, uint32 value)
 382 {
 383         int                     i;
 384         uint64          h1,
 385                                 h2;
 386
 387         /* calculate the two hashes */
 388         h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
 389         h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;
 390
 391         /* compute the requested number of hashes */
 392         for (i = 0; i < filter->nhashes; i++)
 393         {
 394                 /* h1 + h2 + f(i) */
 395                 uint32          h = (h1 + i * h2) % filter->nbits;
 396                 uint32          byte = (h / 8);
 397                 uint32          bit = (h % 8);
 398
 399                 /* if the bit is not set, the value is not there */
 400                 if (!(filter->data[byte] & (0x01 << bit)))
 401                         return false;
 402         }
 403
 404         /* all hashes found in bloom filter */
 405         return true;
 406 }
 407
 408 typedef struct BloomOpaque
 409 {
 410         /*
 411          * XXX At this point we only need a single proc (to compute the hash), but
 412          * let's keep the array just like inclusion and minmax opclasses, for
 413          * consistency. We may need additional procs in the future.
 414          */
 415         FmgrInfo        extra_procinfos[BLOOM_MAX_PROCNUMS];
 416         bool            extra_proc_missing[BLOOM_MAX_PROCNUMS];
 417 } BloomOpaque;
 418
 419 static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno,
 420                                                                         uint16 procnum);
 421
 422
 423 Datum
 424 brin_bloom_opcinfo(PG_FUNCTION_ARGS)
 425 {
 426         BrinOpcInfo *result;
 427
 428         /*
 429          * opaque->strategy_procinfos is initialized lazily; here it is set to
 430          * all-uninitialized by palloc0 which sets fn_oid to InvalidOid.
 431          *
 432          * bloom indexes only store the filter as a single BYTEA column
 433          */
 434
 435         result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) +
 436                                          sizeof(BloomOpaque));
 437         result->oi_nstored = 1;
 438         result->oi_regular_nulls = true;
 439         result->oi_opaque = (BloomOpaque *)
 440                 MAXALIGN((char *) result + SizeofBrinOpcInfo(1));
 441         result->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0);
 442
 443         PG_RETURN_POINTER(result);
 444 }
 445
 446 /*
 447  * brin_bloom_get_ndistinct
 448  *              Determine the ndistinct value used to size bloom filter.
 449  *
 450  * Adjust the ndistinct value based on the pagesPerRange value. First,
 451  * if it's negative, it's assumed to be relative to maximum number of
 452  * tuples in the range (assuming each page gets MaxHeapTuplesPerPage
 453  * tuples, which is likely a significant over-estimate). We also clamp
 454  * the value, not to over-size the bloom filter unnecessarily.
 455  *
 456  * XXX We can only do this when the pagesPerRange value was supplied.
 457  * If it wasn't, it has to be a read-only access to the index, in which
 458  * case we don't really care. But perhaps we should fall-back to the
 459  * default pagesPerRange value?
 460  *
 461  * XXX We might also fetch info about ndistinct estimate for the column,
 462  * and compute the expected number of distinct values in a range. But
 463  * that may be tricky due to data being sorted in various ways, so it
 464  * seems better to rely on the upper estimate.
 465  *
 466  * XXX We might also calculate a better estimate of rows per BRIN range,
 467  * instead of using MaxHeapTuplesPerPage (which probably produces values
 468  * much higher than reality).
 469  */
 470 static int
 471 brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts)
 472 {
 473         double          ndistinct;
 474         double          maxtuples;
 475         BlockNumber pagesPerRange;
 476
 477         pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index);
 478         ndistinct = BloomGetNDistinctPerRange(opts);
 479
 480         Assert(BlockNumberIsValid(pagesPerRange));
 481
 482         maxtuples = MaxHeapTuplesPerPage * pagesPerRange;
 483
 484         /*
 485          * Similarly to n_distinct, negative values are relative - in this case to
 486          * maximum number of tuples in the page range (maxtuples).
 487          */
 488         if (ndistinct < 0)
 489                 ndistinct = (-ndistinct) * maxtuples;
 490
 491         /*
 492          * Positive values are to be used directly, but we still apply a couple of
 493          * safeties to avoid using unreasonably small bloom filters.
 494          */
 495         ndistinct = Max(ndistinct, BLOOM_MIN_NDISTINCT_PER_RANGE);
 496
 497         /*
 498          * And don't use more than the maximum possible number of tuples, in the
 499          * range, which would be entirely wasteful.
 500          */
 501         ndistinct = Min(ndistinct, maxtuples);
 502
 503         return (int) ndistinct;
 504 }
 505
 506 /*
 507  * Examine the given index tuple (which contains partial status of a certain
 508  * page range) by comparing it to the given value that comes from another heap
 509  * tuple.  If the new value is outside the bloom filter specified by the
 510  * existing tuple values, update the index tuple and return true.  Otherwise,
 511  * return false and do not modify in this case.
 512  */
 513 Datum
 514 brin_bloom_add_value(PG_FUNCTION_ARGS)
 515 {
 516         BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
 517         BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
 518         Datum           newval = PG_GETARG_DATUM(2);
 519         bool            isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3);
 520         BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS();
 521         Oid                     colloid = PG_GET_COLLATION();
 522         FmgrInfo   *hashFn;
 523         uint32          hashValue;
 524         bool            updated = false;
 525         AttrNumber      attno;
 526         BloomFilter *filter;
 527
 528         Assert(!isnull);
 529
 530         attno = column->bv_attno;
 531
 532         /*
 533          * If this is the first non-null value, we need to initialize the bloom
 534          * filter. Otherwise just extract the existing bloom filter from
 535          * BrinValues.
 536          */
 537         if (column->bv_allnulls)
 538         {
 539                 filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts),
 540                                                         BloomGetFalsePositiveRate(opts));
 541                 column->bv_values[0] = PointerGetDatum(filter);
 542                 column->bv_allnulls = false;
 543                 updated = true;
 544         }
 545         else
 546                 filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);
 547
 548         /*
 549          * Compute the hash of the new value, using the supplied hash function,
 550          * and then add the hash value to the bloom filter.
 551          */
 552         hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);
 553
 554         hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval));
 555
 556         filter = bloom_add_value(filter, hashValue, &updated);
 557
 558         column->bv_values[0] = PointerGetDatum(filter);
 559
 560         PG_RETURN_BOOL(updated);
 561 }
 562
 563 /*
 564  * Given an index tuple corresponding to a certain page range and a scan key,
 565  * return whether the scan key is consistent with the index tuple's bloom
 566  * filter.  Return true if so, false otherwise.
 567  */
 568 Datum
 569 brin_bloom_consistent(PG_FUNCTION_ARGS)
 570 {
 571         BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
 572         BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
 573         ScanKey    *keys = (ScanKey *) PG_GETARG_POINTER(2);
 574         int                     nkeys = PG_GETARG_INT32(3);
 575         Oid                     colloid = PG_GET_COLLATION();
 576         AttrNumber      attno;
 577         Datum           value;
 578         Datum           matches;
 579         FmgrInfo   *finfo;
 580         uint32          hashValue;
 581         BloomFilter *filter;
 582         int                     keyno;
 583
 584         filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);
 585
 586         Assert(filter);
 587
 588         matches = true;
 589
 590         for (keyno = 0; keyno < nkeys; keyno++)
 591         {
 592                 ScanKey         key = keys[keyno];
 593
 594                 /* NULL keys are handled and filtered-out in bringetbitmap */
 595                 Assert(!(key->sk_flags & SK_ISNULL));
 596
 597                 attno = key->sk_attno;
 598                 value = key->sk_argument;
 599
 600                 switch (key->sk_strategy)
 601                 {
 602                         case BloomEqualStrategyNumber:
 603
 604                                 /*
 605                                  * In the equality case (WHERE col = someval), we want to
 606                                  * return the current page range if the minimum value in the
 607                                  * range <= scan key, and the maximum value >= scan key.
 608                                  */
 609                                 finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);
 610
 611                                 hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value));
 612                                 matches &= bloom_contains_value(filter, hashValue);
 613
 614                                 break;
 615                         default:
 616                                 /* shouldn't happen */
 617                                 elog(ERROR, "invalid strategy number %d", key->sk_strategy);
 618                                 matches = 0;
 619                                 break;
 620                 }
 621
 622                 if (!matches)
 623                         break;
 624         }
 625
 626         PG_RETURN_DATUM(matches);
 627 }
 628
 629 /*
 630  * Given two BrinValues, update the first of them as a union of the summary
 631  * values contained in both.  The second one is untouched.
 632  *
 633  * XXX We assume the bloom filters have the same parameters for now. In the
 634  * future we should have 'can union' function, to decide if we can combine
 635  * two particular bloom filters.
 636  */
 637 Datum
 638 brin_bloom_union(PG_FUNCTION_ARGS)
 639 {
 640         int                     i;
 641         int                     nbytes;
 642         BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
 643         BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
 644         BloomFilter *filter_a;
 645         BloomFilter *filter_b;
 646
 647         Assert(col_a->bv_attno == col_b->bv_attno);
 648         Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
 649
 650         filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]);
 651         filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]);
 652
 653         /* make sure the filters use the same parameters */
 654         Assert(filter_a && filter_b);
 655         Assert(filter_a->nbits == filter_b->nbits);
 656         Assert(filter_a->nhashes == filter_b->nhashes);
 657         Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0));
 658
 659         nbytes = (filter_a->nbits) / 8;
 660
 661         /* simply OR the bitmaps */
 662         for (i = 0; i < nbytes; i++)
 663                 filter_a->data[i] |= filter_b->data[i];
 664
 665         PG_RETURN_VOID();
 666 }
 667
 668 /*
 669  * Cache and return inclusion opclass support procedure
 670  *
 671  * Return the procedure corresponding to the given function support number
 672  * or null if it does not exist.
 673  */
 674 static FmgrInfo *
 675 bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
 676 {
 677         BloomOpaque *opaque;
 678         uint16          basenum = procnum - PROCNUM_BASE;
 679
 680         /*
 681          * We cache these in the opaque struct, to avoid repetitive syscache
 682          * lookups.
 683          */
 684         opaque = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
 685
 686         /*
 687          * If we already searched for this proc and didn't find it, don't bother
 688          * searching again.
 689          */
 690         if (opaque->extra_proc_missing[basenum])
 691                 return NULL;
 692
 693         if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid)
 694         {
 695                 if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
 696                                                                                                 procnum)))
 697                 {
 698                         fmgr_info_copy(&opaque->extra_procinfos[basenum],
 699                                                    index_getprocinfo(bdesc->bd_index, attno, procnum),
 700                                                    bdesc->bd_context);
 701                 }
 702                 else
 703                 {
 704                         opaque->extra_proc_missing[basenum] = true;
 705                         return NULL;
 706                 }
 707         }
 708
 709         return &opaque->extra_procinfos[basenum];
 710 }
 711
 712 Datum
 713 brin_bloom_options(PG_FUNCTION_ARGS)
 714 {
 715         local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0);
 716
 717         init_local_reloptions(relopts, sizeof(BloomOptions));
 718
 719         add_local_real_reloption(relopts, "n_distinct_per_range",
 720                                                          "number of distinct items expected in a BRIN page range",
 721                                                          BLOOM_DEFAULT_NDISTINCT_PER_RANGE,
 722                                                          -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange));
 723
 724         add_local_real_reloption(relopts, "false_positive_rate",
 725                                                          "desired false-positive rate for the bloom filters",
 726                                                          BLOOM_DEFAULT_FALSE_POSITIVE_RATE,
 727                                                          BLOOM_MIN_FALSE_POSITIVE_RATE,
 728                                                          BLOOM_MAX_FALSE_POSITIVE_RATE,
 729                                                          offsetof(BloomOptions, falsePositiveRate));
 730
 731         PG_RETURN_VOID();
 732 }
 733
 734 /*
 735  * brin_bloom_summary_in
 736  *              - input routine for type brin_bloom_summary.
 737  *
 738  * brin_bloom_summary is only used internally to represent summaries
 739  * in BRIN bloom indexes, so it has no operations of its own, and we
 740  * disallow input too.
 741  */
 742 Datum
 743 brin_bloom_summary_in(PG_FUNCTION_ARGS)
 744 {
 745         /*
 746          * brin_bloom_summary stores the data in binary form and parsing text
 747          * input is not needed, so disallow this.
 748          */
 749         ereport(ERROR,
 750                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 751                          errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));
 752
 753         PG_RETURN_VOID();                       /* keep compiler quiet */
 754 }
 755
 756
 757 /*
 758  * brin_bloom_summary_out
 759  *              - output routine for type brin_bloom_summary.
 760  *
 761  * BRIN bloom summaries are serialized into a bytea value, but we want
 762  * to output something nicer humans can understand.
 763  */
 764 Datum
 765 brin_bloom_summary_out(PG_FUNCTION_ARGS)
 766 {
 767         BloomFilter *filter;
 768         StringInfoData str;
 769
 770         /* detoast the data to get value with a full 4B header */
 771         filter = (BloomFilter *) PG_DETOAST_DATUM(PG_GETARG_BYTEA_PP(0));
 772
 773         initStringInfo(&str);
 774         appendStringInfoChar(&str, '{');
 775
 776         appendStringInfo(&str, "mode: hashed  nhashes: %u  nbits: %u  nbits_set: %u",
 777                                          filter->nhashes, filter->nbits, filter->nbits_set);
 778
 779         appendStringInfoChar(&str, '}');
 780
 781         PG_RETURN_CSTRING(str.data);
 782 }
 783
 784 /*
 785  * brin_bloom_summary_recv
 786  *              - binary input routine for type brin_bloom_summary.
 787  */
 788 Datum
 789 brin_bloom_summary_recv(PG_FUNCTION_ARGS)
 790 {
 791         ereport(ERROR,
 792                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 793                          errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));
 794
 795         PG_RETURN_VOID();                       /* keep compiler quiet */
 796 }
 797
 798 /*
 799  * brin_bloom_summary_send
 800  *              - binary output routine for type brin_bloom_summary.
 801  *
 802  * BRIN bloom summaries are serialized in a bytea value (although the
 803  * type is named differently), so let's just send that.
 804  */
 805 Datum
 806 brin_bloom_summary_send(PG_FUNCTION_ARGS)
 807 {
 808         return byteasend(fcinfo);
 809 }