Update copyright for 2022
[pgsql.git] / src / backend / utils / adt / multirangetypes_selfuncs.c
blob919c8889d459f146ac2860c10d5a5094f4b9ebfd
1 /*-------------------------------------------------------------------------
3 * multirangetypes_selfuncs.c
4 * Functions for selectivity estimation of multirange operators
6 * Estimates are based on histograms of lower and upper bounds, and the
7 * fraction of empty multiranges.
9 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
13 * IDENTIFICATION
14 * src/backend/utils/adt/multirangetypes_selfuncs.c
16 *-------------------------------------------------------------------------
18 #include "postgres.h"
20 #include <math.h>
22 #include "access/htup_details.h"
23 #include "catalog/pg_operator.h"
24 #include "catalog/pg_statistic.h"
25 #include "catalog/pg_type.h"
26 #include "utils/float.h"
27 #include "utils/fmgrprotos.h"
28 #include "utils/lsyscache.h"
29 #include "utils/rangetypes.h"
30 #include "utils/multirangetypes.h"
31 #include "utils/selfuncs.h"
32 #include "utils/typcache.h"
34 static double calc_multirangesel(TypeCacheEntry *typcache,
35 VariableStatData *vardata,
36 const MultirangeType *constval, Oid operator);
37 static double default_multirange_selectivity(Oid operator);
38 static double default_multirange_selectivity(Oid operator);
39 static double calc_hist_selectivity(TypeCacheEntry *typcache,
40 VariableStatData *vardata,
41 const MultirangeType *constval,
42 Oid operator);
43 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
44 const RangeBound *constbound,
45 const RangeBound *hist,
46 int hist_nvalues, bool equal);
47 static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
48 const RangeBound *hist, int hist_length, bool equal);
49 static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
50 const RangeBound *hist1, const RangeBound *hist2);
51 static float8 get_len_position(double value, double hist1, double hist2);
52 static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
53 const RangeBound *bound2);
54 static int length_hist_bsearch(Datum *length_hist_values,
55 int length_hist_nvalues, double value,
56 bool equal);
57 static double calc_length_hist_frac(Datum *length_hist_values,
58 int length_hist_nvalues, double length1,
59 double length2, bool equal);
60 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
61 const RangeBound *lower,
62 RangeBound *upper,
63 const RangeBound *hist_lower,
64 int hist_nvalues,
65 Datum *length_hist_values,
66 int length_hist_nvalues);
67 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
68 const RangeBound *lower,
69 const RangeBound *upper,
70 const RangeBound *hist_lower,
71 int hist_nvalues,
72 Datum *length_hist_values,
73 int length_hist_nvalues);
76 * Returns a default selectivity estimate for given operator, when we don't
77 * have statistics or cannot use them for some reason.
79 static double
80 default_multirange_selectivity(Oid operator)
82 switch (operator)
84 case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
85 case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
86 case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
87 return 0.01;
89 case OID_RANGE_CONTAINS_MULTIRANGE_OP:
90 case OID_RANGE_MULTIRANGE_CONTAINED_OP:
91 case OID_MULTIRANGE_CONTAINS_RANGE_OP:
92 case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
93 case OID_MULTIRANGE_RANGE_CONTAINED_OP:
94 case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
95 return 0.005;
97 case OID_MULTIRANGE_CONTAINS_ELEM_OP:
98 case OID_MULTIRANGE_ELEM_CONTAINED_OP:
101 * "multirange @> elem" is more or less identical to a scalar
102 * inequality "A >= b AND A <= c".
104 return DEFAULT_MULTIRANGE_INEQ_SEL;
106 case OID_MULTIRANGE_LESS_OP:
107 case OID_MULTIRANGE_LESS_EQUAL_OP:
108 case OID_MULTIRANGE_GREATER_OP:
109 case OID_MULTIRANGE_GREATER_EQUAL_OP:
110 case OID_MULTIRANGE_LEFT_RANGE_OP:
111 case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
112 case OID_RANGE_LEFT_MULTIRANGE_OP:
113 case OID_MULTIRANGE_RIGHT_RANGE_OP:
114 case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
115 case OID_RANGE_RIGHT_MULTIRANGE_OP:
116 case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
117 case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
118 case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
119 case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
120 case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
121 case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
122 /* these are similar to regular scalar inequalities */
123 return DEFAULT_INEQ_SEL;
125 default:
128 * all multirange operators should be handled above, but just in
129 * case
131 return 0.01;
136 * multirangesel -- restriction selectivity for multirange operators
138 Datum
139 multirangesel(PG_FUNCTION_ARGS)
141 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
142 Oid operator = PG_GETARG_OID(1);
143 List *args = (List *) PG_GETARG_POINTER(2);
144 int varRelid = PG_GETARG_INT32(3);
145 VariableStatData vardata;
146 Node *other;
147 bool varonleft;
148 Selectivity selec;
149 TypeCacheEntry *typcache = NULL;
150 MultirangeType *constmultirange = NULL;
151 RangeType *constrange = NULL;
154 * If expression is not (variable op something) or (something op
155 * variable), then punt and return a default estimate.
157 if (!get_restriction_variable(root, args, varRelid,
158 &vardata, &other, &varonleft))
159 PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
162 * Can't do anything useful if the something is not a constant, either.
164 if (!IsA(other, Const))
166 ReleaseVariableStats(vardata);
167 PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
171 * All the multirange operators are strict, so we can cope with a NULL
172 * constant right away.
174 if (((Const *) other)->constisnull)
176 ReleaseVariableStats(vardata);
177 PG_RETURN_FLOAT8(0.0);
181 * If var is on the right, commute the operator, so that we can assume the
182 * var is on the left in what follows.
184 if (!varonleft)
186 /* we have other Op var, commute to make var Op other */
187 operator = get_commutator(operator);
188 if (!operator)
190 /* Use default selectivity (should we raise an error instead?) */
191 ReleaseVariableStats(vardata);
192 PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
197 * OK, there's a Var and a Const we're dealing with here. We need the
198 * Const to be of same multirange type as the column, else we can't do
199 * anything useful. (Such cases will likely fail at runtime, but here we'd
200 * rather just return a default estimate.)
202 * If the operator is "multirange @> element", the constant should be of
203 * the element type of the multirange column. Convert it to a multirange
204 * that includes only that single point, so that we don't need special
205 * handling for that in what follows.
207 if (operator == OID_MULTIRANGE_CONTAINS_ELEM_OP)
209 typcache = multirange_get_typcache(fcinfo, vardata.vartype);
211 if (((Const *) other)->consttype == typcache->rngtype->rngelemtype->type_id)
213 RangeBound lower,
214 upper;
216 lower.inclusive = true;
217 lower.val = ((Const *) other)->constvalue;
218 lower.infinite = false;
219 lower.lower = true;
220 upper.inclusive = true;
221 upper.val = ((Const *) other)->constvalue;
222 upper.infinite = false;
223 upper.lower = false;
224 constrange = range_serialize(typcache->rngtype, &lower, &upper, false);
225 constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
226 1, &constrange);
229 else if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
230 operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
231 operator == OID_MULTIRANGE_OVERLAPS_RANGE_OP ||
232 operator == OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP ||
233 operator == OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP ||
234 operator == OID_MULTIRANGE_LEFT_RANGE_OP ||
235 operator == OID_MULTIRANGE_RIGHT_RANGE_OP)
238 * Promote a range in "multirange OP range" just like we do an element
239 * in "multirange OP element".
241 typcache = multirange_get_typcache(fcinfo, vardata.vartype);
242 if (((Const *) other)->consttype == typcache->rngtype->type_id)
244 constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
245 constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
246 1, &constrange);
249 else if (operator == OID_RANGE_OVERLAPS_MULTIRANGE_OP ||
250 operator == OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP ||
251 operator == OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP ||
252 operator == OID_RANGE_LEFT_MULTIRANGE_OP ||
253 operator == OID_RANGE_RIGHT_MULTIRANGE_OP ||
254 operator == OID_RANGE_CONTAINS_MULTIRANGE_OP ||
255 operator == OID_MULTIRANGE_ELEM_CONTAINED_OP ||
256 operator == OID_MULTIRANGE_RANGE_CONTAINED_OP)
259 * Here, the Var is the elem/range, not the multirange. For now we
260 * just punt and return the default estimate. In future we could
261 * disassemble the multirange constant to do something more
262 * intelligent.
265 else if (((Const *) other)->consttype == vardata.vartype)
267 /* Both sides are the same multirange type */
268 typcache = multirange_get_typcache(fcinfo, vardata.vartype);
270 constmultirange = DatumGetMultirangeTypeP(((Const *) other)->constvalue);
274 * If we got a valid constant on one side of the operator, proceed to
275 * estimate using statistics. Otherwise punt and return a default constant
276 * estimate. Note that calc_multirangesel need not handle
277 * OID_MULTIRANGE_*_CONTAINED_OP.
279 if (constmultirange)
280 selec = calc_multirangesel(typcache, &vardata, constmultirange, operator);
281 else
282 selec = default_multirange_selectivity(operator);
284 ReleaseVariableStats(vardata);
286 CLAMP_PROBABILITY(selec);
288 PG_RETURN_FLOAT8((float8) selec);
291 static double
292 calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
293 const MultirangeType *constval, Oid operator)
295 double hist_selec;
296 double selec;
297 float4 empty_frac,
298 null_frac;
301 * First look up the fraction of NULLs and empty multiranges from
302 * pg_statistic.
304 if (HeapTupleIsValid(vardata->statsTuple))
306 Form_pg_statistic stats;
307 AttStatsSlot sslot;
309 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
310 null_frac = stats->stanullfrac;
312 /* Try to get fraction of empty multiranges */
313 if (get_attstatsslot(&sslot, vardata->statsTuple,
314 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
315 InvalidOid,
316 ATTSTATSSLOT_NUMBERS))
318 if (sslot.nnumbers != 1)
319 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
320 empty_frac = sslot.numbers[0];
321 free_attstatsslot(&sslot);
323 else
325 /* No empty fraction statistic. Assume no empty ranges. */
326 empty_frac = 0.0;
329 else
332 * No stats are available. Follow through the calculations below
333 * anyway, assuming no NULLs and no empty multiranges. This still
334 * allows us to give a better-than-nothing estimate based on whether
335 * the constant is an empty multirange or not.
337 null_frac = 0.0;
338 empty_frac = 0.0;
341 if (MultirangeIsEmpty(constval))
344 * An empty multirange matches all multiranges, all empty multiranges,
345 * or nothing, depending on the operator
347 switch (operator)
349 /* these return false if either argument is empty */
350 case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
351 case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
352 case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
353 case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
354 case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
355 case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
356 case OID_MULTIRANGE_LEFT_RANGE_OP:
357 case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
358 case OID_MULTIRANGE_RIGHT_RANGE_OP:
359 case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
360 /* nothing is less than an empty multirange */
361 case OID_MULTIRANGE_LESS_OP:
362 selec = 0.0;
363 break;
366 * only empty multiranges can be contained by an empty
367 * multirange
369 case OID_RANGE_MULTIRANGE_CONTAINED_OP:
370 case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
371 /* only empty ranges are <= an empty multirange */
372 case OID_MULTIRANGE_LESS_EQUAL_OP:
373 selec = empty_frac;
374 break;
376 /* everything contains an empty multirange */
377 case OID_MULTIRANGE_CONTAINS_RANGE_OP:
378 case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
379 /* everything is >= an empty multirange */
380 case OID_MULTIRANGE_GREATER_EQUAL_OP:
381 selec = 1.0;
382 break;
384 /* all non-empty multiranges are > an empty multirange */
385 case OID_MULTIRANGE_GREATER_OP:
386 selec = 1.0 - empty_frac;
387 break;
389 /* an element cannot be empty */
390 case OID_MULTIRANGE_CONTAINS_ELEM_OP:
392 /* filtered out by multirangesel() */
393 case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
394 case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
395 case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
396 case OID_RANGE_LEFT_MULTIRANGE_OP:
397 case OID_RANGE_RIGHT_MULTIRANGE_OP:
398 case OID_RANGE_CONTAINS_MULTIRANGE_OP:
399 case OID_MULTIRANGE_ELEM_CONTAINED_OP:
400 case OID_MULTIRANGE_RANGE_CONTAINED_OP:
402 default:
403 elog(ERROR, "unexpected operator %u", operator);
404 selec = 0.0; /* keep compiler quiet */
405 break;
408 else
411 * Calculate selectivity using bound histograms. If that fails for
412 * some reason, e.g no histogram in pg_statistic, use the default
413 * constant estimate for the fraction of non-empty values. This is
414 * still somewhat better than just returning the default estimate,
415 * because this still takes into account the fraction of empty and
416 * NULL tuples, if we had statistics for them.
418 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
419 operator);
420 if (hist_selec < 0.0)
421 hist_selec = default_multirange_selectivity(operator);
424 * Now merge the results for the empty multiranges and histogram
425 * calculations, realizing that the histogram covers only the
426 * non-null, non-empty values.
428 if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
429 operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
431 /* empty is contained by anything non-empty */
432 selec = (1.0 - empty_frac) * hist_selec + empty_frac;
434 else
436 /* with any other operator, empty Op non-empty matches nothing */
437 selec = (1.0 - empty_frac) * hist_selec;
441 /* all multirange operators are strict */
442 selec *= (1.0 - null_frac);
444 /* result should be in range, but make sure... */
445 CLAMP_PROBABILITY(selec);
447 return selec;
451 * Calculate multirange operator selectivity using histograms of multirange bounds.
453 * This estimate is for the portion of values that are not empty and not
454 * NULL.
456 static double
457 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
458 const MultirangeType *constval, Oid operator)
460 TypeCacheEntry *rng_typcache = typcache->rngtype;
461 AttStatsSlot hslot;
462 AttStatsSlot lslot;
463 int nhist;
464 RangeBound *hist_lower;
465 RangeBound *hist_upper;
466 int i;
467 RangeBound const_lower;
468 RangeBound const_upper;
469 RangeBound tmp;
470 double hist_selec;
472 /* Can't use the histogram with insecure multirange support functions */
473 if (!statistic_proc_security_check(vardata,
474 rng_typcache->rng_cmp_proc_finfo.fn_oid))
475 return -1;
476 if (OidIsValid(rng_typcache->rng_subdiff_finfo.fn_oid) &&
477 !statistic_proc_security_check(vardata,
478 rng_typcache->rng_subdiff_finfo.fn_oid))
479 return -1;
481 /* Try to get histogram of ranges */
482 if (!(HeapTupleIsValid(vardata->statsTuple) &&
483 get_attstatsslot(&hslot, vardata->statsTuple,
484 STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
485 ATTSTATSSLOT_VALUES)))
486 return -1.0;
488 /* check that it's a histogram, not just a dummy entry */
489 if (hslot.nvalues < 2)
491 free_attstatsslot(&hslot);
492 return -1.0;
496 * Convert histogram of ranges into histograms of its lower and upper
497 * bounds.
499 nhist = hslot.nvalues;
500 hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
501 hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
502 for (i = 0; i < nhist; i++)
504 bool empty;
506 range_deserialize(rng_typcache, DatumGetRangeTypeP(hslot.values[i]),
507 &hist_lower[i], &hist_upper[i], &empty);
508 /* The histogram should not contain any empty ranges */
509 if (empty)
510 elog(ERROR, "bounds histogram contains an empty range");
513 /* @> and @< also need a histogram of range lengths */
514 if (operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
515 operator == OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP ||
516 operator == OID_MULTIRANGE_RANGE_CONTAINED_OP ||
517 operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
519 if (!(HeapTupleIsValid(vardata->statsTuple) &&
520 get_attstatsslot(&lslot, vardata->statsTuple,
521 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
522 InvalidOid,
523 ATTSTATSSLOT_VALUES)))
525 free_attstatsslot(&hslot);
526 return -1.0;
529 /* check that it's a histogram, not just a dummy entry */
530 if (lslot.nvalues < 2)
532 free_attstatsslot(&lslot);
533 free_attstatsslot(&hslot);
534 return -1.0;
537 else
538 memset(&lslot, 0, sizeof(lslot));
540 /* Extract the bounds of the constant value. */
541 Assert(constval->rangeCount > 0);
542 multirange_get_bounds(rng_typcache, constval, 0,
543 &const_lower, &tmp);
544 multirange_get_bounds(rng_typcache, constval, constval->rangeCount - 1,
545 &tmp, &const_upper);
548 * Calculate selectivity comparing the lower or upper bound of the
549 * constant with the histogram of lower or upper bounds.
551 switch (operator)
553 case OID_MULTIRANGE_LESS_OP:
556 * The regular b-tree comparison operators (<, <=, >, >=) compare
557 * the lower bounds first, and the upper bounds for values with
558 * equal lower bounds. Estimate that by comparing the lower bounds
559 * only. This gives a fairly accurate estimate assuming there
560 * aren't many rows with a lower bound equal to the constant's
561 * lower bound.
563 hist_selec =
564 calc_hist_selectivity_scalar(rng_typcache, &const_lower,
565 hist_lower, nhist, false);
566 break;
568 case OID_MULTIRANGE_LESS_EQUAL_OP:
569 hist_selec =
570 calc_hist_selectivity_scalar(rng_typcache, &const_lower,
571 hist_lower, nhist, true);
572 break;
574 case OID_MULTIRANGE_GREATER_OP:
575 hist_selec =
576 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
577 hist_lower, nhist, false);
578 break;
580 case OID_MULTIRANGE_GREATER_EQUAL_OP:
581 hist_selec =
582 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
583 hist_lower, nhist, true);
584 break;
586 case OID_MULTIRANGE_LEFT_RANGE_OP:
587 case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
588 /* var << const when upper(var) < lower(const) */
589 hist_selec =
590 calc_hist_selectivity_scalar(rng_typcache, &const_lower,
591 hist_upper, nhist, false);
592 break;
594 case OID_MULTIRANGE_RIGHT_RANGE_OP:
595 case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
596 /* var >> const when lower(var) > upper(const) */
597 hist_selec =
598 1 - calc_hist_selectivity_scalar(rng_typcache, &const_upper,
599 hist_lower, nhist, true);
600 break;
602 case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
603 case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
604 /* compare lower bounds */
605 hist_selec =
606 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
607 hist_lower, nhist, false);
608 break;
610 case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
611 case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
612 /* compare upper bounds */
613 hist_selec =
614 calc_hist_selectivity_scalar(rng_typcache, &const_upper,
615 hist_upper, nhist, true);
616 break;
618 case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
619 case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
620 case OID_MULTIRANGE_CONTAINS_ELEM_OP:
623 * A && B <=> NOT (A << B OR A >> B).
625 * Since A << B and A >> B are mutually exclusive events we can
626 * sum their probabilities to find probability of (A << B OR A >>
627 * B).
629 * "multirange @> elem" is equivalent to "multirange &&
630 * {[elem,elem]}". The caller already constructed the singular
631 * range from the element constant, so just treat it the same as
632 * &&.
634 hist_selec =
635 calc_hist_selectivity_scalar(rng_typcache,
636 &const_lower, hist_upper,
637 nhist, false);
638 hist_selec +=
639 (1.0 - calc_hist_selectivity_scalar(rng_typcache,
640 &const_upper, hist_lower,
641 nhist, true));
642 hist_selec = 1.0 - hist_selec;
643 break;
645 case OID_MULTIRANGE_CONTAINS_RANGE_OP:
646 case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
647 hist_selec =
648 calc_hist_selectivity_contains(rng_typcache, &const_lower,
649 &const_upper, hist_lower, nhist,
650 lslot.values, lslot.nvalues);
651 break;
653 case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
654 case OID_RANGE_MULTIRANGE_CONTAINED_OP:
655 if (const_lower.infinite)
658 * Lower bound no longer matters. Just estimate the fraction
659 * with an upper bound <= const upper bound
661 hist_selec =
662 calc_hist_selectivity_scalar(rng_typcache, &const_upper,
663 hist_upper, nhist, true);
665 else if (const_upper.infinite)
667 hist_selec =
668 1.0 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
669 hist_lower, nhist, false);
671 else
673 hist_selec =
674 calc_hist_selectivity_contained(rng_typcache, &const_lower,
675 &const_upper, hist_lower, nhist,
676 lslot.values, lslot.nvalues);
678 break;
680 /* filtered out by multirangesel() */
681 case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
682 case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
683 case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
684 case OID_RANGE_LEFT_MULTIRANGE_OP:
685 case OID_RANGE_RIGHT_MULTIRANGE_OP:
686 case OID_RANGE_CONTAINS_MULTIRANGE_OP:
687 case OID_MULTIRANGE_ELEM_CONTAINED_OP:
688 case OID_MULTIRANGE_RANGE_CONTAINED_OP:
690 default:
691 elog(ERROR, "unknown multirange operator %u", operator);
692 hist_selec = -1.0; /* keep compiler quiet */
693 break;
696 free_attstatsslot(&lslot);
697 free_attstatsslot(&hslot);
699 return hist_selec;
704 * Look up the fraction of values less than (or equal, if 'equal' argument
705 * is true) a given const in a histogram of range bounds.
707 static double
708 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound,
709 const RangeBound *hist, int hist_nvalues, bool equal)
711 Selectivity selec;
712 int index;
715 * Find the histogram bin the given constant falls into. Estimate
716 * selectivity as the number of preceding whole bins.
718 index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
719 selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
721 /* Adjust using linear interpolation within the bin */
722 if (index >= 0 && index < hist_nvalues - 1)
723 selec += get_position(typcache, constbound, &hist[index],
724 &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
726 return selec;
730 * Binary search on an array of range bounds. Returns greatest index of range
731 * bound in array which is less(less or equal) than given range bound. If all
732 * range bounds in array are greater or equal(greater) than given range bound,
733 * return -1. When "equal" flag is set conditions in brackets are used.
735 * This function is used in scalar operator selectivity estimation. Another
736 * goal of this function is to find a histogram bin where to stop
737 * interpolation of portion of bounds which are less than or equal to given bound.
739 static int
740 rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist,
741 int hist_length, bool equal)
743 int lower = -1,
744 upper = hist_length - 1,
745 cmp,
746 middle;
748 while (lower < upper)
750 middle = (lower + upper + 1) / 2;
751 cmp = range_cmp_bounds(typcache, &hist[middle], value);
753 if (cmp < 0 || (equal && cmp == 0))
754 lower = middle;
755 else
756 upper = middle - 1;
758 return lower;
763 * Binary search on length histogram. Returns greatest index of range length in
764 * histogram which is less than (less than or equal) the given length value. If
765 * all lengths in the histogram are greater than (greater than or equal) the
766 * given length, returns -1.
768 static int
769 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
770 double value, bool equal)
772 int lower = -1,
773 upper = length_hist_nvalues - 1,
774 middle;
776 while (lower < upper)
778 double middleval;
780 middle = (lower + upper + 1) / 2;
782 middleval = DatumGetFloat8(length_hist_values[middle]);
783 if (middleval < value || (equal && middleval <= value))
784 lower = middle;
785 else
786 upper = middle - 1;
788 return lower;
792 * Get relative position of value in histogram bin in [0,1] range.
794 static float8
795 get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1,
796 const RangeBound *hist2)
798 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
799 float8 position;
801 if (!hist1->infinite && !hist2->infinite)
803 float8 bin_width;
806 * Both bounds are finite. Assuming the subtype's comparison function
807 * works sanely, the value must be finite, too, because it lies
808 * somewhere between the bounds. If it doesn't, arbitrarily return
809 * 0.5.
811 if (value->infinite)
812 return 0.5;
814 /* Can't interpolate without subdiff function */
815 if (!has_subdiff)
816 return 0.5;
818 /* Calculate relative position using subdiff function. */
819 bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
820 typcache->rng_collation,
821 hist2->val,
822 hist1->val));
823 if (isnan(bin_width) || bin_width <= 0.0)
824 return 0.5; /* punt for NaN or zero-width bin */
826 position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
827 typcache->rng_collation,
828 value->val,
829 hist1->val))
830 / bin_width;
832 if (isnan(position))
833 return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
835 /* Relative position must be in [0,1] range */
836 position = Max(position, 0.0);
837 position = Min(position, 1.0);
838 return position;
840 else if (hist1->infinite && !hist2->infinite)
843 * Lower bin boundary is -infinite, upper is finite. If the value is
844 * -infinite, return 0.0 to indicate it's equal to the lower bound.
845 * Otherwise return 1.0 to indicate it's infinitely far from the lower
846 * bound.
848 return ((value->infinite && value->lower) ? 0.0 : 1.0);
850 else if (!hist1->infinite && hist2->infinite)
852 /* same as above, but in reverse */
853 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
855 else
858 * If both bin boundaries are infinite, they should be equal to each
859 * other, and the value should also be infinite and equal to both
860 * bounds. (But don't Assert that, to avoid crashing if a user creates
861 * a datatype with a broken comparison function).
863 * Assume the value to lie in the middle of the infinite bounds.
865 return 0.5;
871 * Get relative position of value in a length histogram bin in [0,1] range.
873 static double
874 get_len_position(double value, double hist1, double hist2)
876 if (!isinf(hist1) && !isinf(hist2))
879 * Both bounds are finite. The value should be finite too, because it
880 * lies somewhere between the bounds. If it doesn't, just return
881 * something.
883 if (isinf(value))
884 return 0.5;
886 return 1.0 - (hist2 - value) / (hist2 - hist1);
888 else if (isinf(hist1) && !isinf(hist2))
891 * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
892 * indicate the value is infinitely far from the lower bound.
894 return 1.0;
896 else if (isinf(hist1) && isinf(hist2))
898 /* same as above, but in reverse */
899 return 0.0;
901 else
904 * If both bin boundaries are infinite, they should be equal to each
905 * other, and the value should also be infinite and equal to both
906 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
907 * the caller messes up)
909 * Assume the value to lie in the middle of the infinite bounds.
911 return 0.5;
916 * Measure distance between two range bounds.
918 static float8
919 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2)
921 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
923 if (!bound1->infinite && !bound2->infinite)
926 * Neither bound is infinite, use subdiff function or return default
927 * value of 1.0 if no subdiff is available.
929 if (has_subdiff)
931 float8 res;
933 res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
934 typcache->rng_collation,
935 bound2->val,
936 bound1->val));
937 /* Reject possible NaN result, also negative result */
938 if (isnan(res) || res < 0.0)
939 return 1.0;
940 else
941 return res;
943 else
944 return 1.0;
946 else if (bound1->infinite && bound2->infinite)
948 /* Both bounds are infinite */
949 if (bound1->lower == bound2->lower)
950 return 0.0;
951 else
952 return get_float8_infinity();
954 else
956 /* One bound is infinite, the other is not */
957 return get_float8_infinity();
962 * Calculate the average of function P(x), in the interval [length1, length2],
963 * where P(x) is the fraction of tuples with length < x (or length <= x if
964 * 'equal' is true).
966 static double
967 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
968 double length1, double length2, bool equal)
970 double frac;
971 double A,
975 double pos;
976 int i;
977 double area;
979 Assert(length2 >= length1);
981 if (length2 < 0.0)
982 return 0.0; /* shouldn't happen, but doesn't hurt to check */
984 /* All lengths in the table are <= infinite. */
985 if (isinf(length2) && equal)
986 return 1.0;
988 /*----------
989 * The average of a function between A and B can be calculated by the
990 * formula:
993 * 1 /
994 * ------- | P(x)dx
995 * B - A /
998 * The geometrical interpretation of the integral is the area under the
999 * graph of P(x). P(x) is defined by the length histogram. We calculate
1000 * the area in a piecewise fashion, iterating through the length histogram
1001 * bins. Each bin is a trapezoid:
1003 * P(x2)
1004 * /|
1005 * / |
1006 * P(x1)/ |
1007 * | |
1008 * | |
1009 * ---+---+--
1010 * x1 x2
1012 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
1013 * and P(x1) are the cumulative fraction of tuples at the boundaries.
1015 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
1017 * The first bin contains the lower bound passed by the caller, so we
1018 * use linear interpolation between the previous and next histogram bin
1019 * boundary to calculate P(x1). Likewise for the last bin: we use linear
1020 * interpolation to calculate P(x2). For the bins in between, x1 and x2
1021 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
1022 * P(x1) = (bin index) / (number of bins)
1023 * P(x2) = (bin index + 1 / (number of bins)
1026 /* First bin, the one that contains lower bound */
1027 i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
1028 if (i >= length_hist_nvalues - 1)
1029 return 1.0;
1031 if (i < 0)
1033 i = 0;
1034 pos = 0.0;
1036 else
1038 /* interpolate length1's position in the bin */
1039 pos = get_len_position(length1,
1040 DatumGetFloat8(length_hist_values[i]),
1041 DatumGetFloat8(length_hist_values[i + 1]));
1043 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1044 B = length1;
1047 * In the degenerate case that length1 == length2, simply return
1048 * P(length1). This is not merely an optimization: if length1 == length2,
1049 * we'd divide by zero later on.
1051 if (length2 == length1)
1052 return PB;
1055 * Loop through all the bins, until we hit the last bin, the one that
1056 * contains the upper bound. (if lower and upper bounds are in the same
1057 * bin, this falls out immediately)
1059 area = 0.0;
1060 for (; i < length_hist_nvalues - 1; i++)
1062 double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
1064 /* check if we've reached the last bin */
1065 if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
1066 break;
1068 /* the upper bound of previous bin is the lower bound of this bin */
1069 A = B;
1070 PA = PB;
1072 B = bin_upper;
1073 PB = (double) i / (double) (length_hist_nvalues - 1);
1076 * Add the area of this trapezoid to the total. The point of the
1077 * if-check is to avoid NaN, in the corner case that PA == PB == 0,
1078 * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
1079 * 0) is zero, regardless of the width (B - A).
1081 if (PA > 0 || PB > 0)
1082 area += 0.5 * (PB + PA) * (B - A);
1085 /* Last bin */
1086 A = B;
1087 PA = PB;
1089 B = length2; /* last bin ends at the query upper bound */
1090 if (i >= length_hist_nvalues - 1)
1091 pos = 0.0;
1092 else
1094 if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
1095 pos = 0.0;
1096 else
1097 pos = get_len_position(length2,
1098 DatumGetFloat8(length_hist_values[i]),
1099 DatumGetFloat8(length_hist_values[i + 1]));
1101 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1103 if (PA > 0 || PB > 0)
1104 area += 0.5 * (PB + PA) * (B - A);
1107 * Ok, we have calculated the area, ie. the integral. Divide by width to
1108 * get the requested average.
1110 * Avoid NaN arising from infinite / infinite. This happens at least if
1111 * length2 is infinite. It's not clear what the correct value would be in
1112 * that case, so 0.5 seems as good as any value.
1114 if (isinf(area) && isinf(length2))
1115 frac = 0.5;
1116 else
1117 frac = area / (length2 - length1);
1119 return frac;
1123 * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1124 * of multiranges that fall within the constant lower and upper bounds. This uses
1125 * the histograms of range lower bounds and range lengths, on the assumption
1126 * that the range lengths are independent of the lower bounds.
1128 * The caller has already checked that constant lower and upper bounds are
1129 * finite.
1131 static double
1132 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1133 const RangeBound *lower, RangeBound *upper,
1134 const RangeBound *hist_lower, int hist_nvalues,
1135 Datum *length_hist_values, int length_hist_nvalues)
1137 int i,
1138 upper_index;
1139 float8 prev_dist;
1140 double bin_width;
1141 double upper_bin_width;
1142 double sum_frac;
1145 * Begin by finding the bin containing the upper bound, in the lower bound
1146 * histogram. Any range with a lower bound > constant upper bound can't
1147 * match, ie. there are no matches in bins greater than upper_index.
1149 upper->inclusive = !upper->inclusive;
1150 upper->lower = true;
1151 upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1152 false);
1155 * If the upper bound value is below the histogram's lower limit, there
1156 * are no matches.
1158 if (upper_index < 0)
1159 return 0.0;
1162 * If the upper bound value is at or beyond the histogram's upper limit,
1163 * start our loop at the last actual bin, as though the upper bound were
1164 * within that bin; get_position will clamp its result to 1.0 anyway.
1165 * (This corresponds to assuming that the data population above the
1166 * histogram's upper limit is empty, exactly like what we just assumed for
1167 * the lower limit.)
1169 upper_index = Min(upper_index, hist_nvalues - 2);
1172 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1173 * upper_index + 1) bin which is greater than upper bound of query range
1174 * using linear interpolation of subdiff function.
1176 upper_bin_width = get_position(typcache, upper,
1177 &hist_lower[upper_index],
1178 &hist_lower[upper_index + 1]);
1181 * In the loop, dist and prev_dist are the distance of the "current" bin's
1182 * lower and upper bounds from the constant upper bound.
1184 * bin_width represents the width of the current bin. Normally it is 1.0,
1185 * meaning a full width bin, but can be less in the corner cases: start
1186 * and end of the loop. We start with bin_width = upper_bin_width, because
1187 * we begin at the bin containing the upper bound.
1189 prev_dist = 0.0;
1190 bin_width = upper_bin_width;
1192 sum_frac = 0.0;
1193 for (i = upper_index; i >= 0; i--)
1195 double dist;
1196 double length_hist_frac;
1197 bool final_bin = false;
1200 * dist -- distance from upper bound of query range to lower bound of
1201 * the current bin in the lower bound histogram. Or to the lower bound
1202 * of the constant range, if this is the final bin, containing the
1203 * constant lower bound.
1205 if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1207 dist = get_distance(typcache, lower, upper);
1210 * Subtract from bin_width the portion of this bin that we want to
1211 * ignore.
1213 bin_width -= get_position(typcache, lower, &hist_lower[i],
1214 &hist_lower[i + 1]);
1215 if (bin_width < 0.0)
1216 bin_width = 0.0;
1217 final_bin = true;
1219 else
1220 dist = get_distance(typcache, &hist_lower[i], upper);
1223 * Estimate the fraction of tuples in this bin that are narrow enough
1224 * to not exceed the distance to the upper bound of the query range.
1226 length_hist_frac = calc_length_hist_frac(length_hist_values,
1227 length_hist_nvalues,
1228 prev_dist, dist, true);
1231 * Add the fraction of tuples in this bin, with a suitable length, to
1232 * the total.
1234 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1236 if (final_bin)
1237 break;
1239 bin_width = 1.0;
1240 prev_dist = dist;
1243 return sum_frac;
1247 * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1248 * of multiranges that contain the constant lower and upper bounds. This uses
1249 * the histograms of range lower bounds and range lengths, on the assumption
1250 * that the range lengths are independent of the lower bounds.
1252 static double
1253 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1254 const RangeBound *lower, const RangeBound *upper,
1255 const RangeBound *hist_lower, int hist_nvalues,
1256 Datum *length_hist_values, int length_hist_nvalues)
1258 int i,
1259 lower_index;
1260 double bin_width,
1261 lower_bin_width;
1262 double sum_frac;
1263 float8 prev_dist;
1265 /* Find the bin containing the lower bound of query range. */
1266 lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1267 true);
1270 * If the lower bound value is below the histogram's lower limit, there
1271 * are no matches.
1273 if (lower_index < 0)
1274 return 0.0;
1277 * If the lower bound value is at or beyond the histogram's upper limit,
1278 * start our loop at the last actual bin, as though the upper bound were
1279 * within that bin; get_position will clamp its result to 1.0 anyway.
1280 * (This corresponds to assuming that the data population above the
1281 * histogram's upper limit is empty, exactly like what we just assumed for
1282 * the lower limit.)
1284 lower_index = Min(lower_index, hist_nvalues - 2);
1287 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1288 * lower_index + 1) bin which is greater than lower bound of query range
1289 * using linear interpolation of subdiff function.
1291 lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1292 &hist_lower[lower_index + 1]);
1295 * Loop through all the lower bound bins, smaller than the query lower
1296 * bound. In the loop, dist and prev_dist are the distance of the
1297 * "current" bin's lower and upper bounds from the constant upper bound.
1298 * We begin from query lower bound, and walk backwards, so the first bin's
1299 * upper bound is the query lower bound, and its distance to the query
1300 * upper bound is the length of the query range.
1302 * bin_width represents the width of the current bin. Normally it is 1.0,
1303 * meaning a full width bin, except for the first bin, which is only
1304 * counted up to the constant lower bound.
1306 prev_dist = get_distance(typcache, lower, upper);
1307 sum_frac = 0.0;
1308 bin_width = lower_bin_width;
1309 for (i = lower_index; i >= 0; i--)
1311 float8 dist;
1312 double length_hist_frac;
1315 * dist -- distance from upper bound of query range to current value
1316 * of lower bound histogram or lower bound of query range (if we've
1317 * reach it).
1319 dist = get_distance(typcache, &hist_lower[i], upper);
1322 * Get average fraction of length histogram which covers intervals
1323 * longer than (or equal to) distance to upper bound of query range.
1325 length_hist_frac =
1326 1.0 - calc_length_hist_frac(length_hist_values,
1327 length_hist_nvalues,
1328 prev_dist, dist, false);
1330 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1332 bin_width = 1.0;
1333 prev_dist = dist;
1336 return sum_frac;