1 /*-------------------------------------------------------------------------
3 * rangetypes_typanalyze.c
4 * Functions for gathering statistics from range columns
6 * For a range type column, histograms of lower and upper bounds, and
7 * the fraction of NULL and empty ranges are collected.
9 * Both histograms have the same length, and they are combined into a
10 * single array of ranges. This has the same shape as the histogram that
11 * std_typanalyze would collect, but the values are different. Each range
12 * in the array is a valid range, even though the lower and upper bounds
13 * come from different tuples. In theory, the standard scalar selectivity
14 * functions could be used with the combined histogram.
16 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
21 * src/backend/utils/adt/rangetypes_typanalyze.c
23 *-------------------------------------------------------------------------
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/float.h"
30 #include "utils/fmgrprotos.h"
31 #include "utils/lsyscache.h"
32 #include "utils/rangetypes.h"
33 #include "utils/multirangetypes.h"
35 static int float8_qsort_cmp(const void *a1
, const void *a2
);
36 static int range_bound_qsort_cmp(const void *a1
, const void *a2
, void *arg
);
37 static void compute_range_stats(VacAttrStats
*stats
,
38 AnalyzeAttrFetchFunc fetchfunc
, int samplerows
,
42 * range_typanalyze -- typanalyze function for range columns
45 range_typanalyze(PG_FUNCTION_ARGS
)
47 VacAttrStats
*stats
= (VacAttrStats
*) PG_GETARG_POINTER(0);
48 TypeCacheEntry
*typcache
;
49 Form_pg_attribute attr
= stats
->attr
;
51 /* Get information about range type; note column might be a domain */
52 typcache
= range_get_typcache(fcinfo
, getBaseType(stats
->attrtypid
));
54 if (attr
->attstattarget
< 0)
55 attr
->attstattarget
= default_statistics_target
;
57 stats
->compute_stats
= compute_range_stats
;
58 stats
->extra_data
= typcache
;
59 /* same as in std_typanalyze */
60 stats
->minrows
= 300 * attr
->attstattarget
;
66 * multirange_typanalyze -- typanalyze function for multirange columns
68 * We do the same analysis as for ranges, but on the smallest range that
69 * completely includes the multirange.
72 multirange_typanalyze(PG_FUNCTION_ARGS
)
74 VacAttrStats
*stats
= (VacAttrStats
*) PG_GETARG_POINTER(0);
75 TypeCacheEntry
*typcache
;
76 Form_pg_attribute attr
= stats
->attr
;
78 /* Get information about multirange type; note column might be a domain */
79 typcache
= multirange_get_typcache(fcinfo
, getBaseType(stats
->attrtypid
));
81 if (attr
->attstattarget
< 0)
82 attr
->attstattarget
= default_statistics_target
;
84 stats
->compute_stats
= compute_range_stats
;
85 stats
->extra_data
= typcache
;
86 /* same as in std_typanalyze */
87 stats
->minrows
= 300 * attr
->attstattarget
;
93 * Comparison function for sorting float8s, used for range lengths.
96 float8_qsort_cmp(const void *a1
, const void *a2
)
98 const float8
*f1
= (const float8
*) a1
;
99 const float8
*f2
= (const float8
*) a2
;
110 * Comparison function for sorting RangeBounds.
113 range_bound_qsort_cmp(const void *a1
, const void *a2
, void *arg
)
115 RangeBound
*b1
= (RangeBound
*) a1
;
116 RangeBound
*b2
= (RangeBound
*) a2
;
117 TypeCacheEntry
*typcache
= (TypeCacheEntry
*) arg
;
119 return range_cmp_bounds(typcache
, b1
, b2
);
123 * compute_range_stats() -- compute statistics for a range column
126 compute_range_stats(VacAttrStats
*stats
, AnalyzeAttrFetchFunc fetchfunc
,
127 int samplerows
, double totalrows
)
129 TypeCacheEntry
*typcache
= (TypeCacheEntry
*) stats
->extra_data
;
130 TypeCacheEntry
*mltrng_typcache
= NULL
;
133 int non_null_cnt
= 0;
134 int non_empty_cnt
= 0;
138 int num_bins
= stats
->attr
->attstattarget
;
143 double total_width
= 0;
145 if (typcache
->typtype
== TYPTYPE_MULTIRANGE
)
147 mltrng_typcache
= typcache
;
148 typcache
= typcache
->rngtype
;
151 Assert(typcache
->typtype
== TYPTYPE_RANGE
);
152 has_subdiff
= OidIsValid(typcache
->rng_subdiff_finfo
.fn_oid
);
154 /* Allocate memory to hold range bounds and lengths of the sample ranges. */
155 lowers
= (RangeBound
*) palloc(sizeof(RangeBound
) * samplerows
);
156 uppers
= (RangeBound
*) palloc(sizeof(RangeBound
) * samplerows
);
157 lengths
= (float8
*) palloc(sizeof(float8
) * samplerows
);
159 /* Loop over the sample ranges. */
160 for (range_no
= 0; range_no
< samplerows
; range_no
++)
165 MultirangeType
*multirange
;
171 vacuum_delay_point();
173 value
= fetchfunc(stats
, range_no
, &isnull
);
176 /* range is null, just count that */
182 * XXX: should we ignore wide values, like std_typanalyze does, to
183 * avoid bloating the statistics table?
185 total_width
+= VARSIZE_ANY(DatumGetPointer(value
));
187 /* Get range and deserialize it for further analysis. */
188 if (mltrng_typcache
!= NULL
)
190 /* Treat multiranges like a big range without gaps. */
191 multirange
= DatumGetMultirangeTypeP(value
);
192 if (!MultirangeIsEmpty(multirange
))
196 multirange_get_bounds(typcache
, multirange
, 0,
198 multirange_get_bounds(typcache
, multirange
,
199 multirange
->rangeCount
- 1,
210 range
= DatumGetRangeTypeP(value
);
211 range_deserialize(typcache
, range
, &lower
, &upper
, &empty
);
216 /* Remember bounds and length for further usage in histograms */
217 lowers
[non_empty_cnt
] = lower
;
218 uppers
[non_empty_cnt
] = upper
;
220 if (lower
.infinite
|| upper
.infinite
)
222 /* Length of any kind of an infinite range is infinite */
223 length
= get_float8_infinity();
225 else if (has_subdiff
)
228 * For an ordinary range, use subdiff function between upper
229 * and lower bound values.
231 length
= DatumGetFloat8(FunctionCall2Coll(&typcache
->rng_subdiff_finfo
,
232 typcache
->rng_collation
,
233 upper
.val
, lower
.val
));
237 /* Use default value of 1.0 if no subdiff is available. */
240 lengths
[non_empty_cnt
] = length
;
252 /* We can only compute real stats if we found some non-null values. */
253 if (non_null_cnt
> 0)
255 Datum
*bound_hist_values
;
256 Datum
*length_hist_values
;
262 MemoryContext old_cxt
;
265 stats
->stats_valid
= true;
266 /* Do the simple null-frac and width stats */
267 stats
->stanullfrac
= (double) null_cnt
/ (double) samplerows
;
268 stats
->stawidth
= total_width
/ (double) non_null_cnt
;
270 /* Estimate that non-null values are unique */
271 stats
->stadistinct
= -1.0 * (1.0 - stats
->stanullfrac
);
273 /* Must copy the target values into anl_context */
274 old_cxt
= MemoryContextSwitchTo(stats
->anl_context
);
277 * Generate a bounds histogram slot entry if there are at least two
280 if (non_empty_cnt
>= 2)
282 /* Sort bound values */
283 qsort_arg(lowers
, non_empty_cnt
, sizeof(RangeBound
),
284 range_bound_qsort_cmp
, typcache
);
285 qsort_arg(uppers
, non_empty_cnt
, sizeof(RangeBound
),
286 range_bound_qsort_cmp
, typcache
);
288 num_hist
= non_empty_cnt
;
289 if (num_hist
> num_bins
)
290 num_hist
= num_bins
+ 1;
292 bound_hist_values
= (Datum
*) palloc(num_hist
* sizeof(Datum
));
295 * The object of this loop is to construct ranges from first and
296 * last entries in lowers[] and uppers[] along with evenly-spaced
297 * values in between. So the i'th value is a range of lowers[(i *
298 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
299 * (num_hist - 1)]. But computing that subscript directly risks
300 * integer overflow when the stats target is more than a couple
301 * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
302 * at each step, tracking the integral and fractional parts of the
305 delta
= (non_empty_cnt
- 1) / (num_hist
- 1);
306 deltafrac
= (non_empty_cnt
- 1) % (num_hist
- 1);
309 for (i
= 0; i
< num_hist
; i
++)
311 bound_hist_values
[i
] = PointerGetDatum(range_serialize(typcache
,
316 posfrac
+= deltafrac
;
317 if (posfrac
>= (num_hist
- 1))
319 /* fractional part exceeds 1, carry to integer part */
321 posfrac
-= (num_hist
- 1);
325 stats
->stakind
[slot_idx
] = STATISTIC_KIND_BOUNDS_HISTOGRAM
;
326 stats
->stavalues
[slot_idx
] = bound_hist_values
;
327 stats
->numvalues
[slot_idx
] = num_hist
;
329 /* Store ranges even if we're analyzing a multirange column */
330 stats
->statypid
[slot_idx
] = typcache
->type_id
;
331 stats
->statyplen
[slot_idx
] = typcache
->typlen
;
332 stats
->statypbyval
[slot_idx
] = typcache
->typbyval
;
333 stats
->statypalign
[slot_idx
] = typcache
->typalign
;
339 * Generate a length histogram slot entry if there are at least two
342 if (non_empty_cnt
>= 2)
345 * Ascending sort of range lengths for further filling of
348 qsort(lengths
, non_empty_cnt
, sizeof(float8
), float8_qsort_cmp
);
350 num_hist
= non_empty_cnt
;
351 if (num_hist
> num_bins
)
352 num_hist
= num_bins
+ 1;
354 length_hist_values
= (Datum
*) palloc(num_hist
* sizeof(Datum
));
357 * The object of this loop is to copy the first and last lengths[]
358 * entries along with evenly-spaced values in between. So the i'th
359 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
360 * computing that subscript directly risks integer overflow when
361 * the stats target is more than a couple thousand. Instead we
362 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
363 * the integral and fractional parts of the sum separately.
365 delta
= (non_empty_cnt
- 1) / (num_hist
- 1);
366 deltafrac
= (non_empty_cnt
- 1) % (num_hist
- 1);
369 for (i
= 0; i
< num_hist
; i
++)
371 length_hist_values
[i
] = Float8GetDatum(lengths
[pos
]);
373 posfrac
+= deltafrac
;
374 if (posfrac
>= (num_hist
- 1))
376 /* fractional part exceeds 1, carry to integer part */
378 posfrac
-= (num_hist
- 1);
385 * Even when we don't create the histogram, store an empty array
386 * to mean "no histogram". We can't just leave stavalues NULL,
387 * because get_attstatsslot() errors if you ask for stavalues, and
388 * it's NULL. We'll still store the empty fraction in stanumbers.
390 length_hist_values
= palloc(0);
393 stats
->staop
[slot_idx
] = Float8LessOperator
;
394 stats
->stacoll
[slot_idx
] = InvalidOid
;
395 stats
->stavalues
[slot_idx
] = length_hist_values
;
396 stats
->numvalues
[slot_idx
] = num_hist
;
397 stats
->statypid
[slot_idx
] = FLOAT8OID
;
398 stats
->statyplen
[slot_idx
] = sizeof(float8
);
399 stats
->statypbyval
[slot_idx
] = FLOAT8PASSBYVAL
;
400 stats
->statypalign
[slot_idx
] = 'd';
402 /* Store the fraction of empty ranges */
403 emptyfrac
= (float4
*) palloc(sizeof(float4
));
404 *emptyfrac
= ((double) empty_cnt
) / ((double) non_null_cnt
);
405 stats
->stanumbers
[slot_idx
] = emptyfrac
;
406 stats
->numnumbers
[slot_idx
] = 1;
408 stats
->stakind
[slot_idx
] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM
;
411 MemoryContextSwitchTo(old_cxt
);
413 else if (null_cnt
> 0)
415 /* We found only nulls; assume the column is entirely null */
416 stats
->stats_valid
= true;
417 stats
->stanullfrac
= 1.0;
418 stats
->stawidth
= 0; /* "unknown" */
419 stats
->stadistinct
= 0.0; /* "unknown" */
423 * We don't need to bother cleaning up any of our temporary palloc's. The
424 * hashtable should also go away, as it used a child memory context.