1 /*-------------------------------------------------------------------------
4 * support routines for SYSTEM_TIME tablesample method
6 * The desire here is to produce a random sample with as many rows as possible
7 * in no more than the specified amount of time. We use a block-sampling
8 * approach. To ensure that the whole relation will be visited if necessary,
9 * we start at a randomly chosen block and then advance with a stride that
10 * is randomly chosen but is relatively prime to the relation's nblocks.
12 * Because of the time dependence, this method is necessarily unrepeatable.
13 * However, we do what we can to reduce surprising behavior by selecting
14 * the sampling pattern just once per query, much as in tsm_system_rows.
16 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
20 * contrib/tsm_system_time/tsm_system_time.c
22 *-------------------------------------------------------------------------
29 #include "access/relscan.h"
30 #include "access/tsmapi.h"
31 #include "catalog/pg_type.h"
32 #include "miscadmin.h"
33 #include "optimizer/optimizer.h"
34 #include "utils/sampling.h"
35 #include "utils/spccache.h"
39 PG_FUNCTION_INFO_V1(tsm_system_time_handler
);
45 uint32 seed
; /* random seed */
46 double millis
; /* time limit for sampling */
47 instr_time start_time
; /* scan start time */
48 OffsetNumber lt
; /* last tuple returned from current block */
49 BlockNumber doneblocks
; /* number of already-scanned blocks */
50 BlockNumber lb
; /* last block visited */
51 /* these three values are not changed during a rescan: */
52 BlockNumber nblocks
; /* number of blocks in relation */
53 BlockNumber firstblock
; /* first block to sample from */
54 BlockNumber step
; /* step size, or 0 if not set yet */
55 } SystemTimeSamplerData
;
57 static void system_time_samplescangetsamplesize(PlannerInfo
*root
,
62 static void system_time_initsamplescan(SampleScanState
*node
,
64 static void system_time_beginsamplescan(SampleScanState
*node
,
68 static BlockNumber
system_time_nextsampleblock(SampleScanState
*node
, BlockNumber nblocks
);
69 static OffsetNumber
system_time_nextsampletuple(SampleScanState
*node
,
71 OffsetNumber maxoffset
);
72 static uint32
random_relative_prime(uint32 n
, pg_prng_state
*randstate
);
76 * Create a TsmRoutine descriptor for the SYSTEM_TIME method.
79 tsm_system_time_handler(PG_FUNCTION_ARGS
)
81 TsmRoutine
*tsm
= makeNode(TsmRoutine
);
83 tsm
->parameterTypes
= list_make1_oid(FLOAT8OID
);
85 /* See notes at head of file */
86 tsm
->repeatable_across_queries
= false;
87 tsm
->repeatable_across_scans
= false;
89 tsm
->SampleScanGetSampleSize
= system_time_samplescangetsamplesize
;
90 tsm
->InitSampleScan
= system_time_initsamplescan
;
91 tsm
->BeginSampleScan
= system_time_beginsamplescan
;
92 tsm
->NextSampleBlock
= system_time_nextsampleblock
;
93 tsm
->NextSampleTuple
= system_time_nextsampletuple
;
94 tsm
->EndSampleScan
= NULL
;
96 PG_RETURN_POINTER(tsm
);
100 * Sample size estimation.
103 system_time_samplescangetsamplesize(PlannerInfo
*root
,
111 double spc_random_page_cost
;
115 /* Try to extract an estimate for the limit time spec */
116 limitnode
= (Node
*) linitial(paramexprs
);
117 limitnode
= estimate_expression_value(root
, limitnode
);
119 if (IsA(limitnode
, Const
) &&
120 !((Const
*) limitnode
)->constisnull
)
122 millis
= DatumGetFloat8(((Const
*) limitnode
)->constvalue
);
123 if (millis
< 0 || isnan(millis
))
125 /* Default millis if the value is bogus */
131 /* Default millis if we didn't obtain a non-null Const */
135 /* Get the planner's idea of cost per page read */
136 get_tablespace_page_costs(baserel
->reltablespace
,
137 &spc_random_page_cost
,
141 * Estimate the number of pages we can read by assuming that the cost
142 * figure is expressed in milliseconds. This is completely, unmistakably
143 * bogus, but we have to do something to produce an estimate and there's
146 if (spc_random_page_cost
> 0)
147 npages
= millis
/ spc_random_page_cost
;
149 npages
= millis
; /* even more bogus, but whatcha gonna do? */
151 /* Clamp to sane value */
152 npages
= clamp_row_est(Min((double) baserel
->pages
, npages
));
154 if (baserel
->tuples
> 0 && baserel
->pages
> 0)
156 /* Estimate number of tuples returned based on tuple density */
157 double density
= baserel
->tuples
/ (double) baserel
->pages
;
159 ntuples
= npages
* density
;
163 /* For lack of data, assume one tuple per page */
167 /* Clamp to the estimated relation size */
168 ntuples
= clamp_row_est(Min(baserel
->tuples
, ntuples
));
175 * Initialize during executor setup.
178 system_time_initsamplescan(SampleScanState
*node
, int eflags
)
180 node
->tsm_state
= palloc0(sizeof(SystemTimeSamplerData
));
181 /* Note the above leaves tsm_state->step equal to zero */
185 * Examine parameters and prepare for a sample scan.
188 system_time_beginsamplescan(SampleScanState
*node
,
193 SystemTimeSamplerData
*sampler
= (SystemTimeSamplerData
*) node
->tsm_state
;
194 double millis
= DatumGetFloat8(params
[0]);
196 if (millis
< 0 || isnan(millis
))
198 (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT
),
199 errmsg("sample collection time must not be negative")));
201 sampler
->seed
= seed
;
202 sampler
->millis
= millis
;
203 sampler
->lt
= InvalidOffsetNumber
;
204 sampler
->doneblocks
= 0;
205 /* start_time, lb will be initialized during first NextSampleBlock call */
206 /* we intentionally do not change nblocks/firstblock/step here */
210 * Select next block to sample.
212 * Uses linear probing algorithm for picking next block.
215 system_time_nextsampleblock(SampleScanState
*node
, BlockNumber nblocks
)
217 SystemTimeSamplerData
*sampler
= (SystemTimeSamplerData
*) node
->tsm_state
;
220 /* First call within scan? */
221 if (sampler
->doneblocks
== 0)
223 /* First scan within query? */
224 if (sampler
->step
== 0)
226 /* Initialize now that we have scan descriptor */
227 pg_prng_state randstate
;
229 /* If relation is empty, there's nothing to scan */
231 return InvalidBlockNumber
;
233 /* We only need an RNG during this setup step */
234 sampler_random_init_state(sampler
->seed
, &randstate
);
236 /* Compute nblocks/firstblock/step only once per query */
237 sampler
->nblocks
= nblocks
;
239 /* Choose random starting block within the relation */
240 /* (Actually this is the predecessor of the first block visited) */
241 sampler
->firstblock
= sampler_random_fract(&randstate
) *
244 /* Find relative prime as step size for linear probing */
245 sampler
->step
= random_relative_prime(sampler
->nblocks
, &randstate
);
248 /* Reinitialize lb and start_time */
249 sampler
->lb
= sampler
->firstblock
;
250 INSTR_TIME_SET_CURRENT(sampler
->start_time
);
253 /* If we've read all blocks in relation, we're done */
254 if (++sampler
->doneblocks
> sampler
->nblocks
)
255 return InvalidBlockNumber
;
257 /* If we've used up all the allotted time, we're done */
258 INSTR_TIME_SET_CURRENT(cur_time
);
259 INSTR_TIME_SUBTRACT(cur_time
, sampler
->start_time
);
260 if (INSTR_TIME_GET_MILLISEC(cur_time
) >= sampler
->millis
)
261 return InvalidBlockNumber
;
264 * It's probably impossible for scan->rs_nblocks to decrease between scans
265 * within a query; but just in case, loop until we select a block number
266 * less than scan->rs_nblocks. We don't care if scan->rs_nblocks has
267 * increased since the first scan.
271 /* Advance lb, using uint64 arithmetic to forestall overflow */
272 sampler
->lb
= ((uint64
) sampler
->lb
+ sampler
->step
) % sampler
->nblocks
;
273 } while (sampler
->lb
>= nblocks
);
279 * Select next sampled tuple in current block.
281 * In block sampling, we just want to sample all the tuples in each selected
284 * When we reach end of the block, return InvalidOffsetNumber which tells
285 * SampleScan to go to next block.
288 system_time_nextsampletuple(SampleScanState
*node
,
290 OffsetNumber maxoffset
)
292 SystemTimeSamplerData
*sampler
= (SystemTimeSamplerData
*) node
->tsm_state
;
293 OffsetNumber tupoffset
= sampler
->lt
;
295 /* Advance to next possible offset on page */
296 if (tupoffset
== InvalidOffsetNumber
)
297 tupoffset
= FirstOffsetNumber
;
302 if (tupoffset
> maxoffset
)
303 tupoffset
= InvalidOffsetNumber
;
305 sampler
->lt
= tupoffset
;
311 * Compute greatest common divisor of two uint32's.
314 gcd(uint32 a
, uint32 b
)
329 * Pick a random value less than and relatively prime to n, if possible
333 random_relative_prime(uint32 n
, pg_prng_state
*randstate
)
337 /* Safety check to avoid infinite loop or zero result for small n. */
342 * This should only take 2 or 3 iterations as the probability of 2 numbers
343 * being relatively prime is ~61%; but just in case, we'll include a
344 * CHECK_FOR_INTERRUPTS in the loop.
348 CHECK_FOR_INTERRUPTS();
349 r
= (uint32
) (sampler_random_fract(randstate
) * n
);
350 } while (r
== 0 || gcd(r
, n
) > 1);