1 /*-------------------------------------------------------------------------
3 * costsize.c
4 * Routines to compute (and set) relation sizes and path costs
6 * Path costs are measured in arbitrary units established by these basic
7 * parameters:
9 * seq_page_cost Cost of a sequential page fetch
10 * random_page_cost Cost of a non-sequential page fetch
11 * cpu_tuple_cost Cost of typical CPU time to process a tuple
12 * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
13 * cpu_operator_cost Cost of CPU time to execute an operator or function
14 * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to leader backend
15 * parallel_setup_cost Cost of setting up shared memory for parallelism
17 * We expect that the kernel will typically do some amount of read-ahead
18 * optimization; this in conjunction with seek costs means that seq_page_cost
19 * is normally considerably less than random_page_cost. (However, if the
20 * database is fully cached in RAM, it is reasonable to set them equal.)
22 * We also use a rough estimate "effective_cache_size" of the number of
23 * disk pages in Postgres + OS-level disk cache. (We can't simply use
24 * NBuffers for this purpose because that would ignore the effects of
25 * the kernel's disk cache.)
27 * Obviously, taking constants for these values is an oversimplification,
28 * but it's tough enough to get any useful estimates even at this level of
29 * detail. Note that all of these parameters are user-settable, in case
30 * the default values are drastically off for a particular platform.
32 * seq_page_cost and random_page_cost can also be overridden for an individual
33 * tablespace, in case some data is on a fast disk and other data is on a slow
34 * disk. Per-tablespace overrides never apply to temporary work files such as
35 * an external sort or a materialize node that overflows work_mem.
37 * We compute two separate costs for each path:
38 * total_cost: total estimated cost to fetch all tuples
39 * startup_cost: cost that is expended before first tuple is fetched
40 * In some scenarios, such as when there is a LIMIT or we are implementing
41 * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42 * path's result. A caller can estimate the cost of fetching a partial
43 * result by interpolating between startup_cost and total_cost. In detail:
44 * actual_cost = startup_cost +
45 * (total_cost - startup_cost) * tuples_to_fetch / path->rows;
46 * Note that a base relation's rows count (and, by extension, plan_rows for
47 * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
48 * that this equation works properly. (Note: while path->rows is never zero
49 * for ordinary relations, it is zero for paths for provably-empty relations,
50 * so beware of division-by-zero.) The LIMIT is applied as a top-level
51 * plan node.
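 *
 * As an illustrative worked example (arbitrary numbers, added here for
 * clarity rather than taken from the original comment): with
 * startup_cost = 0, total_cost = 1000 and path->rows = 10000, fetching
 * only 100 tuples would be estimated as
 *		actual_cost = 0 + (1000 - 0) * 100 / 10000 = 10
 * i.e. the startup cost plus one percent of the run cost.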
53 * For largely historical reasons, most of the routines in this module use
54 * the passed result Path only to store their results (rows, startup_cost and
55 * total_cost) into. All the input data they need is passed as separate
56 * parameters, even though much of it could be extracted from the Path.
57 * An exception is made for the cost_XXXjoin() routines, which expect all
58 * the other fields of the passed XXXPath to be filled in, and similarly
59 * cost_index() assumes the passed IndexPath is valid except for its output
60 * values.
63 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
64 * Portions Copyright (c) 1994, Regents of the University of California
66 * IDENTIFICATION
67 * src/backend/optimizer/path/costsize.c
69 *-------------------------------------------------------------------------
72 #include "postgres.h"
74 #include <math.h>
76 #include "access/amapi.h"
77 #include "access/htup_details.h"
78 #include "access/tsmapi.h"
79 #include "executor/executor.h"
80 #include "executor/nodeAgg.h"
81 #include "executor/nodeHash.h"
82 #include "executor/nodeMemoize.h"
83 #include "miscadmin.h"
84 #include "nodes/makefuncs.h"
85 #include "nodes/nodeFuncs.h"
86 #include "optimizer/clauses.h"
87 #include "optimizer/cost.h"
88 #include "optimizer/optimizer.h"
89 #include "optimizer/pathnode.h"
90 #include "optimizer/paths.h"
91 #include "optimizer/placeholder.h"
92 #include "optimizer/plancat.h"
93 #include "optimizer/planmain.h"
94 #include "optimizer/restrictinfo.h"
95 #include "parser/parsetree.h"
96 #include "utils/lsyscache.h"
97 #include "utils/selfuncs.h"
98 #include "utils/spccache.h"
99 #include "utils/tuplesort.h"
102 #define LOG2(x) (log(x) / 0.693147180559945)
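/*
 * For illustration (example added, not in the original file): LOG2(1024)
 * evaluates to log(1024) / 0.693147... = 10.0, since the divisor is the
 * natural logarithm of 2.
 */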
105 * Append and MergeAppend nodes are less expensive than some other operations
106 * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
107 * per-tuple cost as cpu_tuple_cost multiplied by this value.
109 #define APPEND_CPU_COST_MULTIPLIER 0.5
112 * Maximum value for row estimates. We cap row estimates to this to help
113 * ensure that costs based on these estimates remain within the range of what
114 * double can represent. add_path() wouldn't act sanely given infinite or NaN
115 * cost values.
117 #define MAXIMUM_ROWCOUNT 1e100
119 double seq_page_cost = DEFAULT_SEQ_PAGE_COST;
120 double random_page_cost = DEFAULT_RANDOM_PAGE_COST;
121 double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
122 double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
123 double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
124 double parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
125 double parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
127 int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
129 Cost disable_cost = 1.0e10;
131 int max_parallel_workers_per_gather = 2;
133 bool enable_seqscan = true;
134 bool enable_indexscan = true;
135 bool enable_indexonlyscan = true;
136 bool enable_bitmapscan = true;
137 bool enable_tidscan = true;
138 bool enable_sort = true;
139 bool enable_incremental_sort = true;
140 bool enable_hashagg = true;
141 bool enable_nestloop = true;
142 bool enable_material = true;
143 bool enable_memoize = true;
144 bool enable_mergejoin = true;
145 bool enable_hashjoin = true;
146 bool enable_gathermerge = true;
147 bool enable_partitionwise_join = false;
148 bool enable_partitionwise_aggregate = false;
149 bool enable_parallel_append = true;
150 bool enable_parallel_hash = true;
151 bool enable_partition_pruning = true;
152 bool enable_async_append = true;
154 typedef struct
156 PlannerInfo *root;
157 QualCost total;
158 } cost_qual_eval_context;
160 static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
161 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
162 RestrictInfo *rinfo,
163 PathKey *pathkey);
164 static void cost_rescan(PlannerInfo *root, Path *path,
165 Cost *rescan_startup_cost, Cost *rescan_total_cost);
166 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
167 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
168 ParamPathInfo *param_info,
169 QualCost *qpqual_cost);
170 static bool has_indexed_join_quals(NestPath *path);
171 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
172 List *quals);
173 static double calc_joinrel_size_estimate(PlannerInfo *root,
174 RelOptInfo *joinrel,
175 RelOptInfo *outer_rel,
176 RelOptInfo *inner_rel,
177 double outer_rows,
178 double inner_rows,
179 SpecialJoinInfo *sjinfo,
180 List *restrictlist);
181 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
182 Relids outer_relids,
183 Relids inner_relids,
184 SpecialJoinInfo *sjinfo,
185 List **restrictlist);
186 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
187 int parallel_workers);
188 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
189 static double relation_byte_size(double tuples, int width);
190 static double page_size(double tuples, int width);
191 static double get_parallel_divisor(Path *path);
195 * clamp_row_est
196 * Force a row-count estimate to a sane value.
198 double
199 clamp_row_est(double nrows)
202 * Avoid infinite and NaN row estimates. Costs derived from such values
203 * are going to be useless. Also force the estimate to be at least one
204 * row, to make explain output look better and to avoid possible
205 * divide-by-zero when interpolating costs. Make it an integer, too.
207 if (nrows > MAXIMUM_ROWCOUNT || isnan(nrows))
208 nrows = MAXIMUM_ROWCOUNT;
209 else if (nrows <= 1.0)
210 nrows = 1.0;
211 else
212 nrows = rint(nrows);
214 return nrows;
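/*
 * Illustrative behavior of clamp_row_est (hypothetical calls, shown here
 * only as an example of the rules above):
 *
 *		clamp_row_est(0.3)   -> 1.0     forced up to at least one row
 *		clamp_row_est(42.7)  -> 43.0    rounded to the nearest integer
 *		clamp_row_est(NAN)   -> 1e100   capped at MAXIMUM_ROWCOUNT
 */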
219 * cost_seqscan
220 * Determines and returns the cost of scanning a relation sequentially.
222 * 'baserel' is the relation to be scanned
223 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
225 void
226 cost_seqscan(Path *path, PlannerInfo *root,
227 RelOptInfo *baserel, ParamPathInfo *param_info)
229 Cost startup_cost = 0;
230 Cost cpu_run_cost;
231 Cost disk_run_cost;
232 double spc_seq_page_cost;
233 QualCost qpqual_cost;
234 Cost cpu_per_tuple;
236 /* Should only be applied to base relations */
237 Assert(baserel->relid > 0);
238 Assert(baserel->rtekind == RTE_RELATION);
240 /* Mark the path with the correct row estimate */
241 if (param_info)
242 path->rows = param_info->ppi_rows;
243 else
244 path->rows = baserel->rows;
246 if (!enable_seqscan)
247 startup_cost += disable_cost;
249 /* fetch estimated page cost for tablespace containing table */
250 get_tablespace_page_costs(baserel->reltablespace,
251 NULL,
252 &spc_seq_page_cost);
255 * disk costs
257 disk_run_cost = spc_seq_page_cost * baserel->pages;
259 /* CPU costs */
260 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
262 startup_cost += qpqual_cost.startup;
263 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
264 cpu_run_cost = cpu_per_tuple * baserel->tuples;
265 /* tlist eval costs are paid per output row, not per tuple scanned */
266 startup_cost += path->pathtarget->cost.startup;
267 cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
269 /* Adjust costing for parallelism, if used. */
270 if (path->parallel_workers > 0)
272 double parallel_divisor = get_parallel_divisor(path);
274 /* The CPU cost is divided among all the workers. */
275 cpu_run_cost /= parallel_divisor;
278 * It may be possible to amortize some of the I/O cost, but probably
279 * not very much, because most operating systems already do aggressive
280 * prefetching. For now, we assume that the disk run cost can't be
281 * amortized at all.
285 * In the case of a parallel plan, the row count needs to represent
286 * the number of tuples processed per worker.
288 path->rows = clamp_row_est(path->rows / parallel_divisor);
291 path->startup_cost = startup_cost;
292 path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
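/*
 * Rough worked example for cost_seqscan (illustrative only; assumes the
 * default seq_page_cost = 1.0 and cpu_tuple_cost = 0.01, no quals, no
 * pathtarget costs, no parallelism): a table of 1000 pages and 100000
 * tuples is charged 1000 * 1.0 = 1000 of disk cost plus
 * 100000 * 0.01 = 1000 of CPU cost, giving total_cost of about 2000 with
 * zero startup_cost.
 */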
296 * cost_samplescan
297 * Determines and returns the cost of scanning a relation using sampling.
299 * 'baserel' is the relation to be scanned
300 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
302 void
303 cost_samplescan(Path *path, PlannerInfo *root,
304 RelOptInfo *baserel, ParamPathInfo *param_info)
306 Cost startup_cost = 0;
307 Cost run_cost = 0;
308 RangeTblEntry *rte;
309 TableSampleClause *tsc;
310 TsmRoutine *tsm;
311 double spc_seq_page_cost,
312 spc_random_page_cost,
313 spc_page_cost;
314 QualCost qpqual_cost;
315 Cost cpu_per_tuple;
317 /* Should only be applied to base relations with tablesample clauses */
318 Assert(baserel->relid > 0);
319 rte = planner_rt_fetch(baserel->relid, root);
320 Assert(rte->rtekind == RTE_RELATION);
321 tsc = rte->tablesample;
322 Assert(tsc != NULL);
323 tsm = GetTsmRoutine(tsc->tsmhandler);
325 /* Mark the path with the correct row estimate */
326 if (param_info)
327 path->rows = param_info->ppi_rows;
328 else
329 path->rows = baserel->rows;
331 /* fetch estimated page cost for tablespace containing table */
332 get_tablespace_page_costs(baserel->reltablespace,
333 &spc_random_page_cost,
334 &spc_seq_page_cost);
336 /* if NextSampleBlock is used, assume random access, else sequential */
337 spc_page_cost = (tsm->NextSampleBlock != NULL) ?
338 spc_random_page_cost : spc_seq_page_cost;
341 * disk costs (recall that baserel->pages has already been set to the
342 * number of pages the sampling method will visit)
344 run_cost += spc_page_cost * baserel->pages;
347 * CPU costs (recall that baserel->tuples has already been set to the
348 * number of tuples the sampling method will select). Note that we ignore
349 * execution cost of the TABLESAMPLE parameter expressions; they will be
350 * evaluated only once per scan, and in most usages they'll likely be
351 * simple constants anyway. We also don't charge anything for the
352 * calculations the sampling method might do internally.
354 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
356 startup_cost += qpqual_cost.startup;
357 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
358 run_cost += cpu_per_tuple * baserel->tuples;
359 /* tlist eval costs are paid per output row, not per tuple scanned */
360 startup_cost += path->pathtarget->cost.startup;
361 run_cost += path->pathtarget->cost.per_tuple * path->rows;
363 path->startup_cost = startup_cost;
364 path->total_cost = startup_cost + run_cost;
368 * cost_gather
369 * Determines and returns the cost of gather path.
371 * 'rel' is the relation to be operated upon
372 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
373 * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
374 * both 'rel' and 'param_info'. This is useful when the path doesn't exactly
375 * correspond to any particular RelOptInfo.
377 void
378 cost_gather(GatherPath *path, PlannerInfo *root,
379 RelOptInfo *rel, ParamPathInfo *param_info,
380 double *rows)
382 Cost startup_cost = 0;
383 Cost run_cost = 0;
385 /* Mark the path with the correct row estimate */
386 if (rows)
387 path->path.rows = *rows;
388 else if (param_info)
389 path->path.rows = param_info->ppi_rows;
390 else
391 path->path.rows = rel->rows;
393 startup_cost = path->subpath->startup_cost;
395 run_cost = path->subpath->total_cost - path->subpath->startup_cost;
397 /* Parallel setup and communication cost. */
398 startup_cost += parallel_setup_cost;
399 run_cost += parallel_tuple_cost * path->path.rows;
401 path->path.startup_cost = startup_cost;
402 path->path.total_cost = (startup_cost + run_cost);
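/*
 * Illustrative numbers for the Gather overhead above (not from the original
 * source): with the default parallel_setup_cost = 1000 and
 * parallel_tuple_cost = 0.1, a Gather returning 10000 rows adds 1000 to the
 * subpath's startup cost and 0.1 * 10000 = 1000 to its run cost.
 */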
406 * cost_gather_merge
407 * Determines and returns the cost of gather merge path.
409 * GatherMerge merges several pre-sorted input streams, using a heap that at
410 * any given instant holds the next tuple from each stream. If there are N
411 * streams, we need about N*log2(N) tuple comparisons to construct the heap at
412 * startup, and then for each output tuple, about log2(N) comparisons to
413 * replace the top heap entry with the next tuple from the same stream.
415 void
416 cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
417 RelOptInfo *rel, ParamPathInfo *param_info,
418 Cost input_startup_cost, Cost input_total_cost,
419 double *rows)
421 Cost startup_cost = 0;
422 Cost run_cost = 0;
423 Cost comparison_cost;
424 double N;
425 double logN;
427 /* Mark the path with the correct row estimate */
428 if (rows)
429 path->path.rows = *rows;
430 else if (param_info)
431 path->path.rows = param_info->ppi_rows;
432 else
433 path->path.rows = rel->rows;
435 if (!enable_gathermerge)
436 startup_cost += disable_cost;
439 * Add one to the number of workers to account for the leader. This might
440 * be overgenerous since the leader will do less work than other workers
441 * in typical cases, but we'll go with it for now.
443 Assert(path->num_workers > 0);
444 N = (double) path->num_workers + 1;
445 logN = LOG2(N);
447 /* Assumed cost per tuple comparison */
448 comparison_cost = 2.0 * cpu_operator_cost;
450 /* Heap creation cost */
451 startup_cost += comparison_cost * N * logN;
453 /* Per-tuple heap maintenance cost */
454 run_cost += path->path.rows * comparison_cost * logN;
456 /* small cost for heap management, like cost_merge_append */
457 run_cost += cpu_operator_cost * path->path.rows;
460 * Parallel setup and communication cost. Since Gather Merge, unlike
461 * Gather, requires us to block until a tuple is available from every
462 * worker, we bump the IPC cost up a little bit as compared with Gather.
463 * For lack of a better idea, charge an extra 5%.
465 startup_cost += parallel_setup_cost;
466 run_cost += parallel_tuple_cost * path->path.rows * 1.05;
468 path->path.startup_cost = startup_cost + input_startup_cost;
469 path->path.total_cost = (startup_cost + run_cost + input_total_cost);
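/*
 * Illustrative arithmetic for the heap costs above (hypothetical values):
 * num_workers = 3 gives N = 4 and logN = 2; with the default
 * cpu_operator_cost = 0.0025, comparison_cost = 0.005, so heap creation
 * costs 0.005 * 4 * 2 = 0.04 and each output row pays about
 * 0.005 * 2 = 0.01 for heap maintenance plus 0.0025 for heap management.
 */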
473 * cost_index
474 * Determines and returns the cost of scanning a relation using an index.
476 * 'path' describes the indexscan under consideration, and is complete
477 * except for the fields to be set by this routine
478 * 'loop_count' is the number of repetitions of the indexscan to factor into
479 * estimates of caching behavior
481 * In addition to rows, startup_cost and total_cost, cost_index() sets the
482 * path's indextotalcost and indexselectivity fields. These values will be
483 * needed if the IndexPath is used in a BitmapIndexScan.
485 * NOTE: path->indexquals must contain only clauses usable as index
486 * restrictions. Any additional quals evaluated as qpquals may reduce the
487 * number of returned tuples, but they won't reduce the number of tuples
488 * we have to fetch from the table, so they don't reduce the scan cost.
490 void
491 cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
492 bool partial_path)
494 IndexOptInfo *index = path->indexinfo;
495 RelOptInfo *baserel = index->rel;
496 bool indexonly = (path->path.pathtype == T_IndexOnlyScan);
497 amcostestimate_function amcostestimate;
498 List *qpquals;
499 Cost startup_cost = 0;
500 Cost run_cost = 0;
501 Cost cpu_run_cost = 0;
502 Cost indexStartupCost;
503 Cost indexTotalCost;
504 Selectivity indexSelectivity;
505 double indexCorrelation,
506 csquared;
507 double spc_seq_page_cost,
508 spc_random_page_cost;
509 Cost min_IO_cost,
510 max_IO_cost;
511 QualCost qpqual_cost;
512 Cost cpu_per_tuple;
513 double tuples_fetched;
514 double pages_fetched;
515 double rand_heap_pages;
516 double index_pages;
518 /* Should only be applied to base relations */
519 Assert(IsA(baserel, RelOptInfo) &&
520 IsA(index, IndexOptInfo));
521 Assert(baserel->relid > 0);
522 Assert(baserel->rtekind == RTE_RELATION);
525 * Mark the path with the correct row estimate, and identify which quals
526 * will need to be enforced as qpquals. We need not check any quals that
527 * are implied by the index's predicate, so we can use indrestrictinfo not
528 * baserestrictinfo as the list of relevant restriction clauses for the
529 * rel.
531 if (path->path.param_info)
533 path->path.rows = path->path.param_info->ppi_rows;
534 /* qpquals come from the rel's restriction clauses and ppi_clauses */
535 qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
536 path->indexclauses),
537 extract_nonindex_conditions(path->path.param_info->ppi_clauses,
538 path->indexclauses));
540 else
542 path->path.rows = baserel->rows;
543 /* qpquals come from just the rel's restriction clauses */
544 qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
545 path->indexclauses);
548 if (!enable_indexscan)
549 startup_cost += disable_cost;
550 /* we don't need to check enable_indexonlyscan; indxpath.c does that */
553 * Call index-access-method-specific code to estimate the processing cost
554 * for scanning the index, as well as the selectivity of the index (ie,
555 * the fraction of main-table tuples we will have to retrieve) and its
556 * correlation to the main-table tuple order. We need a cast here because
557 * pathnodes.h uses a weak function type to avoid including amapi.h.
559 amcostestimate = (amcostestimate_function) index->amcostestimate;
560 amcostestimate(root, path, loop_count,
561 &indexStartupCost, &indexTotalCost,
562 &indexSelectivity, &indexCorrelation,
563 &index_pages);
566 * Save amcostestimate's results for possible use in bitmap scan planning.
567 * We don't bother to save indexStartupCost or indexCorrelation, because a
568 * bitmap scan doesn't care about either.
570 path->indextotalcost = indexTotalCost;
571 path->indexselectivity = indexSelectivity;
573 /* all costs for touching index itself included here */
574 startup_cost += indexStartupCost;
575 run_cost += indexTotalCost - indexStartupCost;
577 /* estimate number of main-table tuples fetched */
578 tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
580 /* fetch estimated page costs for tablespace containing table */
581 get_tablespace_page_costs(baserel->reltablespace,
582 &spc_random_page_cost,
583 &spc_seq_page_cost);
585 /*----------
586 * Estimate number of main-table pages fetched, and compute I/O cost.
588 * When the index ordering is uncorrelated with the table ordering,
589 * we use an approximation proposed by Mackert and Lohman (see
590 * index_pages_fetched() for details) to compute the number of pages
591 * fetched, and then charge spc_random_page_cost per page fetched.
593 * When the index ordering is exactly correlated with the table ordering
594 * (just after a CLUSTER, for example), the number of pages fetched should
595 * be exactly selectivity * table_size. What's more, all but the first
596 * will be sequential fetches, not the random fetches that occur in the
597 * uncorrelated case. So if the number of pages is more than 1, we
598 * ought to charge
599 * spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
600 * For partially-correlated indexes, we ought to charge somewhere between
601 * these two estimates. We currently interpolate linearly between the
602 * estimates based on the correlation squared (XXX is that appropriate?).
604 * If it's an index-only scan, then we will not need to fetch any heap
605 * pages for which the visibility map shows all tuples are visible.
606 * Hence, reduce the estimated number of heap fetches accordingly.
607 * We use the measured fraction of the entire heap that is all-visible,
608 * which might not be particularly relevant to the subset of the heap
609 * that this query will fetch; but it's not clear how to do better.
610 *----------
612 if (loop_count > 1)
615 * For repeated indexscans, the appropriate estimate for the
616 * uncorrelated case is to scale up the number of tuples fetched in
617 * the Mackert and Lohman formula by the number of scans, so that we
618 * estimate the number of pages fetched by all the scans; then
619 * pro-rate the costs for one scan. In this case we assume all the
620 * fetches are random accesses.
622 pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
623 baserel->pages,
624 (double) index->pages,
625 root);
627 if (indexonly)
628 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
630 rand_heap_pages = pages_fetched;
632 max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
635 * In the perfectly correlated case, the number of pages touched by
636 * each scan is selectivity * table_size, and we can use the Mackert
637 * and Lohman formula at the page level to estimate how much work is
638 * saved by caching across scans. We still assume all the fetches are
639 * random, though, which is an overestimate that's hard to correct for
640 * without double-counting the cache effects. (But in most cases
641 * where such a plan is actually interesting, only one page would get
642 * fetched per scan anyway, so it shouldn't matter much.)
644 pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
646 pages_fetched = index_pages_fetched(pages_fetched * loop_count,
647 baserel->pages,
648 (double) index->pages,
649 root);
651 if (indexonly)
652 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
654 min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
656 else
659 * Normal case: apply the Mackert and Lohman formula, and then
660 * interpolate between that and the correlation-derived result.
662 pages_fetched = index_pages_fetched(tuples_fetched,
663 baserel->pages,
664 (double) index->pages,
665 root);
667 if (indexonly)
668 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
670 rand_heap_pages = pages_fetched;
672 /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
673 max_IO_cost = pages_fetched * spc_random_page_cost;
675 /* min_IO_cost is for the perfectly correlated case (csquared=1) */
676 pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
678 if (indexonly)
679 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
681 if (pages_fetched > 0)
683 min_IO_cost = spc_random_page_cost;
684 if (pages_fetched > 1)
685 min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
687 else
688 min_IO_cost = 0;
691 if (partial_path)
694 * For index only scans compute workers based on number of index pages
695 * fetched; the number of heap pages we fetch might be so small as to
696 * effectively rule out parallelism, which we don't want to do.
698 if (indexonly)
699 rand_heap_pages = -1;
702 * Estimate the number of parallel workers required to scan index. Use
703 * the number of heap pages computed considering heap fetches won't be
704 * sequential as for parallel scans the pages are accessed in random
705 * order.
707 path->path.parallel_workers = compute_parallel_worker(baserel,
708 rand_heap_pages,
709 index_pages,
710 max_parallel_workers_per_gather);
713 * Fall out if workers can't be assigned for parallel scan, because in
714 * such a case this path will be rejected. So there is no benefit in
715 * doing extra computation.
717 if (path->path.parallel_workers <= 0)
718 return;
720 path->path.parallel_aware = true;
724 * Now interpolate based on estimated index order correlation to get total
725 * disk I/O cost for main table accesses.
727 csquared = indexCorrelation * indexCorrelation;
729 run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
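/*
 * For example (illustrative numbers only): with max_IO_cost = 400,
 * min_IO_cost = 103 and an index correlation of 0.5, csquared = 0.25 and
 * the charged I/O cost is 400 + 0.25 * (103 - 400) = 325.75, i.e. a quarter
 * of the way from the uncorrelated toward the fully correlated estimate.
 */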
732 * Estimate CPU costs per tuple.
734 * What we want here is cpu_tuple_cost plus the evaluation costs of any
735 * qual clauses that we have to evaluate as qpquals.
737 cost_qual_eval(&qpqual_cost, qpquals, root);
739 startup_cost += qpqual_cost.startup;
740 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
742 cpu_run_cost += cpu_per_tuple * tuples_fetched;
744 /* tlist eval costs are paid per output row, not per tuple scanned */
745 startup_cost += path->path.pathtarget->cost.startup;
746 cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
748 /* Adjust costing for parallelism, if used. */
749 if (path->path.parallel_workers > 0)
751 double parallel_divisor = get_parallel_divisor(&path->path);
753 path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
755 /* The CPU cost is divided among all the workers. */
756 cpu_run_cost /= parallel_divisor;
759 run_cost += cpu_run_cost;
761 path->path.startup_cost = startup_cost;
762 path->path.total_cost = startup_cost + run_cost;
766 * extract_nonindex_conditions
768 * Given a list of quals to be enforced in an indexscan, extract the ones that
769 * will have to be applied as qpquals (ie, the index machinery won't handle
770 * them). Here we detect only whether a qual clause is directly redundant
771 * with some indexclause. If the index path is chosen for use, createplan.c
772 * will try a bit harder to get rid of redundant qual conditions; specifically
773 * it will see if quals can be proven to be implied by the indexquals. But
774 * it does not seem worth the cycles to try to factor that in at this stage,
775 * since we're only trying to estimate qual eval costs. Otherwise this must
776 * match the logic in create_indexscan_plan().
778 * qual_clauses, and the result, are lists of RestrictInfos.
779 * indexclauses is a list of IndexClauses.
781 static List *
782 extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
784 List *result = NIL;
785 ListCell *lc;
787 foreach(lc, qual_clauses)
789 RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
791 if (rinfo->pseudoconstant)
792 continue; /* we may drop pseudoconstants here */
793 if (is_redundant_with_indexclauses(rinfo, indexclauses))
794 continue; /* dup or derived from same EquivalenceClass */
795 /* ... skip the predicate proof attempt createplan.c will try ... */
796 result = lappend(result, rinfo);
798 return result;
802 * index_pages_fetched
803 * Estimate the number of pages actually fetched after accounting for
804 * cache effects.
806 * We use an approximation proposed by Mackert and Lohman, "Index Scans
807 * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
808 * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
809 * The Mackert and Lohman approximation is that the number of pages
810 * fetched is
811 * PF =
812 * min(2TNs/(2T+Ns), T) when T <= b
813 * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b)
814 * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b)
815 * where
816 * T = # pages in table
817 * N = # tuples in table
818 * s = selectivity = fraction of table to be scanned
819 * b = # buffer pages available (we include kernel space here)
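 *
 * Worked example of the formula (illustrative numbers, not part of the
 * original comment): with T = 10000 pages, a pro-rated cache share of
 * b = 16384 pages (so T <= b), and Ns = 1000 tuples fetched,
 *	PF = min(2*10000*1000 / (2*10000 + 1000), 10000) = 952.4
 * which the code below rounds up to 953 pages.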
821 * We assume that effective_cache_size is the total number of buffer pages
822 * available for the whole query, and pro-rate that space across all the
823 * tables in the query and the index currently under consideration. (This
824 * ignores space needed for other indexes used by the query, but since we
825 * don't know which indexes will get used, we can't estimate that very well;
826 * and in any case counting all the tables may well be an overestimate, since
827 * depending on the join plan not all the tables may be scanned concurrently.)
829 * The product Ns is the number of tuples fetched; we pass in that
830 * product rather than calculating it here. "pages" is the number of pages
831 * in the object under consideration (either an index or a table).
832 * "index_pages" is the amount to add to the total table space, which was
833 * computed for us by make_one_rel.
835 * Caller is expected to have ensured that tuples_fetched is greater than zero
836 * and rounded to integer (see clamp_row_est). The result will likewise be
837 * greater than zero and integral.
839 double
840 index_pages_fetched(double tuples_fetched, BlockNumber pages,
841 double index_pages, PlannerInfo *root)
843 double pages_fetched;
844 double total_pages;
845 double T,
846 b;
848 /* T is # pages in table, but don't allow it to be zero */
849 T = (pages > 1) ? (double) pages : 1.0;
851 /* Compute number of pages assumed to be competing for cache space */
852 total_pages = root->total_table_pages + index_pages;
853 total_pages = Max(total_pages, 1.0);
854 Assert(T <= total_pages);
856 /* b is pro-rated share of effective_cache_size */
857 b = (double) effective_cache_size * T / total_pages;
859 /* force it positive and integral */
860 if (b <= 1.0)
861 b = 1.0;
862 else
863 b = ceil(b);
865 /* This part is the Mackert and Lohman formula */
866 if (T <= b)
868 pages_fetched =
869 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
870 if (pages_fetched >= T)
871 pages_fetched = T;
872 else
873 pages_fetched = ceil(pages_fetched);
875 else
877 double lim;
879 lim = (2.0 * T * b) / (2.0 * T - b);
880 if (tuples_fetched <= lim)
882 pages_fetched =
883 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
885 else
887 pages_fetched =
888 b + (tuples_fetched - lim) * (T - b) / T;
890 pages_fetched = ceil(pages_fetched);
892 return pages_fetched;
896 * get_indexpath_pages
897 * Determine the total size of the indexes used in a bitmap index path.
899 * Note: if the same index is used more than once in a bitmap tree, we will
900 * count it multiple times, which perhaps is the wrong thing ... but it's
901 * not completely clear, and detecting duplicates is difficult, so ignore it
902 * for now.
904 static double
905 get_indexpath_pages(Path *bitmapqual)
907 double result = 0;
908 ListCell *l;
910 if (IsA(bitmapqual, BitmapAndPath))
912 BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
914 foreach(l, apath->bitmapquals)
916 result += get_indexpath_pages((Path *) lfirst(l));
919 else if (IsA(bitmapqual, BitmapOrPath))
921 BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
923 foreach(l, opath->bitmapquals)
925 result += get_indexpath_pages((Path *) lfirst(l));
928 else if (IsA(bitmapqual, IndexPath))
930 IndexPath *ipath = (IndexPath *) bitmapqual;
932 result = (double) ipath->indexinfo->pages;
934 else
935 elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
937 return result;
941 * cost_bitmap_heap_scan
942 * Determines and returns the cost of scanning a relation using a bitmap
943 * index-then-heap plan.
945 * 'baserel' is the relation to be scanned
946 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
947 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
948 * 'loop_count' is the number of repetitions of the indexscan to factor into
949 * estimates of caching behavior
951 * Note: the component IndexPaths in bitmapqual should have been costed
952 * using the same loop_count.
954 void
955 cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
956 ParamPathInfo *param_info,
957 Path *bitmapqual, double loop_count)
959 Cost startup_cost = 0;
960 Cost run_cost = 0;
961 Cost indexTotalCost;
962 QualCost qpqual_cost;
963 Cost cpu_per_tuple;
964 Cost cost_per_page;
965 Cost cpu_run_cost;
966 double tuples_fetched;
967 double pages_fetched;
968 double spc_seq_page_cost,
969 spc_random_page_cost;
970 double T;
972 /* Should only be applied to base relations */
973 Assert(IsA(baserel, RelOptInfo));
974 Assert(baserel->relid > 0);
975 Assert(baserel->rtekind == RTE_RELATION);
977 /* Mark the path with the correct row estimate */
978 if (param_info)
979 path->rows = param_info->ppi_rows;
980 else
981 path->rows = baserel->rows;
983 if (!enable_bitmapscan)
984 startup_cost += disable_cost;
986 pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
987 loop_count, &indexTotalCost,
988 &tuples_fetched);
990 startup_cost += indexTotalCost;
991 T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
993 /* Fetch estimated page costs for tablespace containing table. */
994 get_tablespace_page_costs(baserel->reltablespace,
995 &spc_random_page_cost,
996 &spc_seq_page_cost);
999 * For small numbers of pages we should charge spc_random_page_cost
1000 * apiece, while if nearly all the table's pages are being read, it's more
1001 * appropriate to charge spc_seq_page_cost apiece. The effect is
1002 * nonlinear, too. For lack of a better idea, interpolate like this to
1003 * determine the cost per page.
1005 if (pages_fetched >= 2.0)
1006 cost_per_page = spc_random_page_cost -
1007 (spc_random_page_cost - spc_seq_page_cost)
1008 * sqrt(pages_fetched / T);
1009 else
1010 cost_per_page = spc_random_page_cost;
1012 run_cost += pages_fetched * cost_per_page;
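/*
 * For example (illustrative numbers): with spc_random_page_cost = 4.0,
 * spc_seq_page_cost = 1.0, T = 10000 and pages_fetched = 100, the
 * interpolation gives cost_per_page = 4.0 - 3.0 * sqrt(100/10000) = 3.7,
 * so the 100 pages are charged 370 rather than the fully random 400.
 */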
1015 * Estimate CPU costs per tuple.
1017 * Often the indexquals don't need to be rechecked at each tuple ... but
1018 * not always, especially not if there are enough tuples involved that the
1019 * bitmaps become lossy. For the moment, just assume they will be
1020 * rechecked always. This means we charge the full freight for all the
1021 * scan clauses.
1023 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1025 startup_cost += qpqual_cost.startup;
1026 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1027 cpu_run_cost = cpu_per_tuple * tuples_fetched;
1029 /* Adjust costing for parallelism, if used. */
1030 if (path->parallel_workers > 0)
1032 double parallel_divisor = get_parallel_divisor(path);
1034 /* The CPU cost is divided among all the workers. */
1035 cpu_run_cost /= parallel_divisor;
1037 path->rows = clamp_row_est(path->rows / parallel_divisor);
1041 run_cost += cpu_run_cost;
1043 /* tlist eval costs are paid per output row, not per tuple scanned */
1044 startup_cost += path->pathtarget->cost.startup;
1045 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1047 path->startup_cost = startup_cost;
1048 path->total_cost = startup_cost + run_cost;
1052 * cost_bitmap_tree_node
1053 * Extract cost and selectivity from a bitmap tree node (index/and/or)
1055 void
1056 cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1058 if (IsA(path, IndexPath))
1060 *cost = ((IndexPath *) path)->indextotalcost;
1061 *selec = ((IndexPath *) path)->indexselectivity;
1064 * Charge a small amount per retrieved tuple to reflect the costs of
1065 * manipulating the bitmap. This is mostly to make sure that a bitmap
1066 * scan doesn't look to be the same cost as an indexscan to retrieve a
1067 * single tuple.
1069 *cost += 0.1 * cpu_operator_cost * path->rows;
1071 else if (IsA(path, BitmapAndPath))
1073 *cost = path->total_cost;
1074 *selec = ((BitmapAndPath *) path)->bitmapselectivity;
1076 else if (IsA(path, BitmapOrPath))
1078 *cost = path->total_cost;
1079 *selec = ((BitmapOrPath *) path)->bitmapselectivity;
1081 else
1083 elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1084 *cost = *selec = 0; /* keep compiler quiet */
1089 * cost_bitmap_and_node
1090 * Estimate the cost of a BitmapAnd node
1092 * Note that this considers only the costs of index scanning and bitmap
1093 * creation, not the eventual heap access. In that sense the object isn't
1094 * truly a Path, but it has enough path-like properties (costs in particular)
1095 * to warrant treating it as one. We don't bother to set the path rows field,
1096 * however.
1098 void
1099 cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1101 Cost totalCost;
1102 Selectivity selec;
1103 ListCell *l;
1106 * We estimate AND selectivity on the assumption that the inputs are
1107 * independent. This is probably often wrong, but we don't have the info
1108 * to do better.
1110 * The runtime cost of the BitmapAnd itself is estimated at 100x
1111 * cpu_operator_cost for each tbm_intersect needed. Probably too small,
1112 * definitely too simplistic?
1114 totalCost = 0.0;
1115 selec = 1.0;
1116 foreach(l, path->bitmapquals)
1118 Path *subpath = (Path *) lfirst(l);
1119 Cost subCost;
1120 Selectivity subselec;
1122 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1124 selec *= subselec;
1126 totalCost += subCost;
1127 if (l != list_head(path->bitmapquals))
1128 totalCost += 100.0 * cpu_operator_cost;
1130 path->bitmapselectivity = selec;
1131 path->path.rows = 0; /* per above, not used */
1132 path->path.startup_cost = totalCost;
1133 path->path.total_cost = totalCost;
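/*
 * Illustrative arithmetic (hypothetical inputs): ANDing two bitmap inputs
 * with selectivities 0.1 and 0.2 yields an assumed-independent selectivity
 * of 0.1 * 0.2 = 0.02, and the single tbm_intersect needed adds
 * 100 * 0.0025 = 0.25 to the summed input costs at default
 * cpu_operator_cost.
 */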
1137 * cost_bitmap_or_node
1138 * Estimate the cost of a BitmapOr node
1140 * See comments for cost_bitmap_and_node.
1142 void
1143 cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1145 Cost totalCost;
1146 Selectivity selec;
1147 ListCell *l;
1150 * We estimate OR selectivity on the assumption that the inputs are
1151 * non-overlapping, since that's often the case in "x IN (list)" type
1152 * situations. Of course, we clamp to 1.0 at the end.
1154 * The runtime cost of the BitmapOr itself is estimated at 100x
1155 * cpu_operator_cost for each tbm_union needed. Probably too small,
1156 * definitely too simplistic? We are aware that the tbm_unions are
1157 * optimized out when the inputs are BitmapIndexScans.
1159 totalCost = 0.0;
1160 selec = 0.0;
1161 foreach(l, path->bitmapquals)
1163 Path *subpath = (Path *) lfirst(l);
1164 Cost subCost;
1165 Selectivity subselec;
1167 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1169 selec += subselec;
1171 totalCost += subCost;
1172 if (l != list_head(path->bitmapquals) &&
1173 !IsA(subpath, IndexPath))
1174 totalCost += 100.0 * cpu_operator_cost;
1176 path->bitmapselectivity = Min(selec, 1.0);
1177 path->path.rows = 0; /* per above, not used */
1178 path->path.startup_cost = totalCost;
1179 path->path.total_cost = totalCost;
1183 * cost_tidscan
1184 * Determines and returns the cost of scanning a relation using TIDs.
1186 * 'baserel' is the relation to be scanned
1187 * 'tidquals' is the list of TID-checkable quals
1188 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1190 void
1191 cost_tidscan(Path *path, PlannerInfo *root,
1192 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1194 Cost startup_cost = 0;
1195 Cost run_cost = 0;
1196 bool isCurrentOf = false;
1197 QualCost qpqual_cost;
1198 Cost cpu_per_tuple;
1199 QualCost tid_qual_cost;
1200 int ntuples;
1201 ListCell *l;
1202 double spc_random_page_cost;
1204 /* Should only be applied to base relations */
1205 Assert(baserel->relid > 0);
1206 Assert(baserel->rtekind == RTE_RELATION);
1208 /* Mark the path with the correct row estimate */
1209 if (param_info)
1210 path->rows = param_info->ppi_rows;
1211 else
1212 path->rows = baserel->rows;
1214 /* Count how many tuples we expect to retrieve */
1215 ntuples = 0;
1216 foreach(l, tidquals)
1218 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
1219 Expr *qual = rinfo->clause;
1221 if (IsA(qual, ScalarArrayOpExpr))
1223 /* Each element of the array yields 1 tuple */
1224 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
1225 Node *arraynode = (Node *) lsecond(saop->args);
1227 ntuples += estimate_array_length(arraynode);
1229 else if (IsA(qual, CurrentOfExpr))
1231 /* CURRENT OF yields 1 tuple */
1232 isCurrentOf = true;
1233 ntuples++;
1235 else
1237 /* It's just CTID = something, count 1 tuple */
1238 ntuples++;
1243 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1244 * understands how to do it correctly. Therefore, honor enable_tidscan
1245 * only when CURRENT OF isn't present. Also note that cost_qual_eval
1246 * counts a CurrentOfExpr as having startup cost disable_cost, which we
1247 * subtract off here; that's to prevent other plan types such as seqscan
1248 * from winning.
1250 if (isCurrentOf)
1252 Assert(baserel->baserestrictcost.startup >= disable_cost);
1253 startup_cost -= disable_cost;
1255 else if (!enable_tidscan)
1256 startup_cost += disable_cost;
1259 * The TID qual expressions will be computed once, any other baserestrict
1260 * quals once per retrieved tuple.
1262 cost_qual_eval(&tid_qual_cost, tidquals, root);
1264 /* fetch estimated page cost for tablespace containing table */
1265 get_tablespace_page_costs(baserel->reltablespace,
1266 &spc_random_page_cost,
1267 NULL);
1269 /* disk costs --- assume each tuple on a different page */
1270 run_cost += spc_random_page_cost * ntuples;
1272 /* Add scanning CPU costs */
1273 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1275 /* XXX currently we assume TID quals are a subset of qpquals */
1276 startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1277 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1278 tid_qual_cost.per_tuple;
1279 run_cost += cpu_per_tuple * ntuples;
1281 /* tlist eval costs are paid per output row, not per tuple scanned */
1282 startup_cost += path->pathtarget->cost.startup;
1283 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1285 path->startup_cost = startup_cost;
1286 path->total_cost = startup_cost + run_cost;
1290 * cost_tidrangescan
1291 * Determines and sets the costs of scanning a relation using a range of
1292 * TIDs for 'path'
1294 * 'baserel' is the relation to be scanned
1295 * 'tidrangequals' is the list of TID-checkable range quals
1296 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1298 void
1299 cost_tidrangescan(Path *path, PlannerInfo *root,
1300 RelOptInfo *baserel, List *tidrangequals,
1301 ParamPathInfo *param_info)
1303 Selectivity selectivity;
1304 double pages;
1305 Cost startup_cost = 0;
1306 Cost run_cost = 0;
1307 QualCost qpqual_cost;
1308 Cost cpu_per_tuple;
1309 QualCost tid_qual_cost;
1310 double ntuples;
1311 double nseqpages;
1312 double spc_random_page_cost;
1313 double spc_seq_page_cost;
1315 /* Should only be applied to base relations */
1316 Assert(baserel->relid > 0);
1317 Assert(baserel->rtekind == RTE_RELATION);
1319 /* Mark the path with the correct row estimate */
1320 if (param_info)
1321 path->rows = param_info->ppi_rows;
1322 else
1323 path->rows = baserel->rows;
1325 /* Count how many tuples and pages we expect to scan */
1326 selectivity = clauselist_selectivity(root, tidrangequals, baserel->relid,
1327 JOIN_INNER, NULL);
1328 pages = ceil(selectivity * baserel->pages);
1330 if (pages <= 0.0)
1331 pages = 1.0;
1334 * The first page in a range requires a random seek, but each subsequent
1335 * page is just a normal sequential page read. NOTE: it's desirable for
1336 * TID Range Scans to cost more than the equivalent Sequential Scans,
1337 * because Seq Scans have some performance advantages such as scan
1338 * synchronization and parallelizability, and we'd prefer one of them to
1339 * be picked unless a TID Range Scan really is better.
1341 ntuples = selectivity * baserel->tuples;
1342 nseqpages = pages - 1.0;
1344 if (!enable_tidscan)
1345 startup_cost += disable_cost;
1348 * The TID qual expressions will be computed once, any other baserestrict
1349 * quals once per retrieved tuple.
1351 cost_qual_eval(&tid_qual_cost, tidrangequals, root);
1353 /* fetch estimated page cost for tablespace containing table */
1354 get_tablespace_page_costs(baserel->reltablespace,
1355 &spc_random_page_cost,
1356 &spc_seq_page_cost);
1358 /* disk costs; 1 random page and the remainder as seq pages */
1359 run_cost += spc_random_page_cost + spc_seq_page_cost * nseqpages;
1361 /* Add scanning CPU costs */
1362 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1365 * XXX currently we assume TID quals are a subset of qpquals at this
1366 * point; they will be removed (if possible) when we create the plan, so
1367 * we subtract their cost from the total qpqual cost. (If the TID quals
1368 * can't be removed, this is a mistake and we're going to underestimate
1369 * the CPU cost a bit.)
1371 startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1372 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1373 tid_qual_cost.per_tuple;
1374 run_cost += cpu_per_tuple * ntuples;
1376 /* tlist eval costs are paid per output row, not per tuple scanned */
1377 startup_cost += path->pathtarget->cost.startup;
1378 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1380 path->startup_cost = startup_cost;
1381 path->total_cost = startup_cost + run_cost;
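/*
 * A hedged example of the disk-cost model above (made-up numbers): a TID
 * range covering 1% of a 1000-page table touches ceil(0.01 * 1000) = 10
 * pages, charged as one random page plus nine sequential pages, i.e.
 * 4.0 + 9 * 1.0 = 13.0 with the default page-cost settings.
 */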
1385 * cost_subqueryscan
1386 * Determines and returns the cost of scanning a subquery RTE.
1388 * 'baserel' is the relation to be scanned
1389 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1391 void
1392 cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1393 RelOptInfo *baserel, ParamPathInfo *param_info)
1395 Cost startup_cost;
1396 Cost run_cost;
1397 QualCost qpqual_cost;
1398 Cost cpu_per_tuple;
1400 /* Should only be applied to base relations that are subqueries */
1401 Assert(baserel->relid > 0);
1402 Assert(baserel->rtekind == RTE_SUBQUERY);
1404 /* Mark the path with the correct row estimate */
1405 if (param_info)
1406 path->path.rows = param_info->ppi_rows;
1407 else
1408 path->path.rows = baserel->rows;
1411 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1412 * any restriction clauses and tlist that will be attached to the
1413 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1414 * projection overhead.
1416 path->path.startup_cost = path->subpath->startup_cost;
1417 path->path.total_cost = path->subpath->total_cost;
1419 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1421 startup_cost = qpqual_cost.startup;
1422 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1423 run_cost = cpu_per_tuple * baserel->tuples;
1425 /* tlist eval costs are paid per output row, not per tuple scanned */
1426 startup_cost += path->path.pathtarget->cost.startup;
1427 run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1429 path->path.startup_cost += startup_cost;
1430 path->path.total_cost += startup_cost + run_cost;
1434 * cost_functionscan
1435 * Determines and returns the cost of scanning a function RTE.
1437 * 'baserel' is the relation to be scanned
1438 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1440 void
1441 cost_functionscan(Path *path, PlannerInfo *root,
1442 RelOptInfo *baserel, ParamPathInfo *param_info)
1444 Cost startup_cost = 0;
1445 Cost run_cost = 0;
1446 QualCost qpqual_cost;
1447 Cost cpu_per_tuple;
1448 RangeTblEntry *rte;
1449 QualCost exprcost;
1451 /* Should only be applied to base relations that are functions */
1452 Assert(baserel->relid > 0);
1453 rte = planner_rt_fetch(baserel->relid, root);
1454 Assert(rte->rtekind == RTE_FUNCTION);
1456 /* Mark the path with the correct row estimate */
1457 if (param_info)
1458 path->rows = param_info->ppi_rows;
1459 else
1460 path->rows = baserel->rows;
1463 * Estimate costs of executing the function expression(s).
1465 * Currently, nodeFunctionscan.c always executes the functions to
1466 * completion before returning any rows, and caches the results in a
1467 * tuplestore. So the function eval cost is all startup cost, and per-row
1468 * costs are minimal.
1470 * XXX in principle we ought to charge tuplestore spill costs if the
1471 * number of rows is large. However, given how phony our rowcount
1472 * estimates for functions tend to be, there's not a lot of point in that
1473 * refinement right now.
1475 cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1477 startup_cost += exprcost.startup + exprcost.per_tuple;
1479 /* Add scanning CPU costs */
1480 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1482 startup_cost += qpqual_cost.startup;
1483 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1484 run_cost += cpu_per_tuple * baserel->tuples;
1486 /* tlist eval costs are paid per output row, not per tuple scanned */
1487 startup_cost += path->pathtarget->cost.startup;
1488 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1490 path->startup_cost = startup_cost;
1491 path->total_cost = startup_cost + run_cost;
1495 * cost_tablefuncscan
1496 * Determines and returns the cost of scanning a table function.
1498 * 'baserel' is the relation to be scanned
1499 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1501 void
1502 cost_tablefuncscan(Path *path, PlannerInfo *root,
1503 RelOptInfo *baserel, ParamPathInfo *param_info)
1505 Cost startup_cost = 0;
1506 Cost run_cost = 0;
1507 QualCost qpqual_cost;
1508 Cost cpu_per_tuple;
1509 RangeTblEntry *rte;
1510 QualCost exprcost;
1512 /* Should only be applied to base relations that are functions */
1513 Assert(baserel->relid > 0);
1514 rte = planner_rt_fetch(baserel->relid, root);
1515 Assert(rte->rtekind == RTE_TABLEFUNC);
1517 /* Mark the path with the correct row estimate */
1518 if (param_info)
1519 path->rows = param_info->ppi_rows;
1520 else
1521 path->rows = baserel->rows;
1524 * Estimate costs of executing the table func expression(s).
1526 * XXX in principle we ought to charge tuplestore spill costs if the
1527 * number of rows is large. However, given how phony our rowcount
1528 * estimates for tablefuncs tend to be, there's not a lot of point in that
1529 * refinement right now.
1531 cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1533 startup_cost += exprcost.startup + exprcost.per_tuple;
1535 /* Add scanning CPU costs */
1536 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1538 startup_cost += qpqual_cost.startup;
1539 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1540 run_cost += cpu_per_tuple * baserel->tuples;
1542 /* tlist eval costs are paid per output row, not per tuple scanned */
1543 startup_cost += path->pathtarget->cost.startup;
1544 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1546 path->startup_cost = startup_cost;
1547 path->total_cost = startup_cost + run_cost;
1551 * cost_valuesscan
1552 * Determines and returns the cost of scanning a VALUES RTE.
1554 * 'baserel' is the relation to be scanned
1555 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1557 void
1558 cost_valuesscan(Path *path, PlannerInfo *root,
1559 RelOptInfo *baserel, ParamPathInfo *param_info)
1561 Cost startup_cost = 0;
1562 Cost run_cost = 0;
1563 QualCost qpqual_cost;
1564 Cost cpu_per_tuple;
1566 /* Should only be applied to base relations that are values lists */
1567 Assert(baserel->relid > 0);
1568 Assert(baserel->rtekind == RTE_VALUES);
1570 /* Mark the path with the correct row estimate */
1571 if (param_info)
1572 path->rows = param_info->ppi_rows;
1573 else
1574 path->rows = baserel->rows;
1577 * For now, estimate list evaluation cost at one operator eval per list
1578 * (probably pretty bogus, but is it worth being smarter?)
1580 cpu_per_tuple = cpu_operator_cost;
1582 /* Add scanning CPU costs */
1583 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1585 startup_cost += qpqual_cost.startup;
1586 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1587 run_cost += cpu_per_tuple * baserel->tuples;
1589 /* tlist eval costs are paid per output row, not per tuple scanned */
1590 startup_cost += path->pathtarget->cost.startup;
1591 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1593 path->startup_cost = startup_cost;
1594 path->total_cost = startup_cost + run_cost;
1598 * cost_ctescan
1599 * Determines and returns the cost of scanning a CTE RTE.
1601 * Note: this is used for both self-reference and regular CTEs; the
1602 * possible cost differences are below the threshold of what we could
1603 * estimate accurately anyway. Note that the costs of evaluating the
1604 * referenced CTE query are added into the final plan as initplan costs,
1605 * and should NOT be counted here.
1607 void
1608 cost_ctescan(Path *path, PlannerInfo *root,
1609 RelOptInfo *baserel, ParamPathInfo *param_info)
1611 Cost startup_cost = 0;
1612 Cost run_cost = 0;
1613 QualCost qpqual_cost;
1614 Cost cpu_per_tuple;
1616 /* Should only be applied to base relations that are CTEs */
1617 Assert(baserel->relid > 0);
1618 Assert(baserel->rtekind == RTE_CTE);
1620 /* Mark the path with the correct row estimate */
1621 if (param_info)
1622 path->rows = param_info->ppi_rows;
1623 else
1624 path->rows = baserel->rows;
1626 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1627 cpu_per_tuple = cpu_tuple_cost;
1629 /* Add scanning CPU costs */
1630 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1632 startup_cost += qpqual_cost.startup;
1633 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1634 run_cost += cpu_per_tuple * baserel->tuples;
1636 /* tlist eval costs are paid per output row, not per tuple scanned */
1637 startup_cost += path->pathtarget->cost.startup;
1638 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1640 path->startup_cost = startup_cost;
1641 path->total_cost = startup_cost + run_cost;
1645 * cost_namedtuplestorescan
1646 * Determines and returns the cost of scanning a named tuplestore.
1648 void
1649 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1650 RelOptInfo *baserel, ParamPathInfo *param_info)
1652 Cost startup_cost = 0;
1653 Cost run_cost = 0;
1654 QualCost qpqual_cost;
1655 Cost cpu_per_tuple;
1657 /* Should only be applied to base relations that are Tuplestores */
1658 Assert(baserel->relid > 0);
1659 Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1661 /* Mark the path with the correct row estimate */
1662 if (param_info)
1663 path->rows = param_info->ppi_rows;
1664 else
1665 path->rows = baserel->rows;
1667 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1668 cpu_per_tuple = cpu_tuple_cost;
1670 /* Add scanning CPU costs */
1671 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1673 startup_cost += qpqual_cost.startup;
1674 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1675 run_cost += cpu_per_tuple * baserel->tuples;
1677 path->startup_cost = startup_cost;
1678 path->total_cost = startup_cost + run_cost;
1682 * cost_resultscan
1683 * Determines and returns the cost of scanning an RTE_RESULT relation.
1685 void
1686 cost_resultscan(Path *path, PlannerInfo *root,
1687 RelOptInfo *baserel, ParamPathInfo *param_info)
1689 Cost startup_cost = 0;
1690 Cost run_cost = 0;
1691 QualCost qpqual_cost;
1692 Cost cpu_per_tuple;
1694 /* Should only be applied to RTE_RESULT base relations */
1695 Assert(baserel->relid > 0);
1696 Assert(baserel->rtekind == RTE_RESULT);
1698 /* Mark the path with the correct row estimate */
1699 if (param_info)
1700 path->rows = param_info->ppi_rows;
1701 else
1702 path->rows = baserel->rows;
1704 /* We charge qual cost plus cpu_tuple_cost */
1705 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1707 startup_cost += qpqual_cost.startup;
1708 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1709 run_cost += cpu_per_tuple * baserel->tuples;
1711 path->startup_cost = startup_cost;
1712 path->total_cost = startup_cost + run_cost;
1716 * cost_recursive_union
1717 * Determines and returns the cost of performing a recursive union,
1718 * and also the estimated output size.
1720 * We are given Paths for the nonrecursive and recursive terms.
1722 void
1723 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1725 Cost startup_cost;
1726 Cost total_cost;
1727 double total_rows;
1729 /* We probably have decent estimates for the non-recursive term */
1730 startup_cost = nrterm->startup_cost;
1731 total_cost = nrterm->total_cost;
1732 total_rows = nrterm->rows;
1735 * We arbitrarily assume that about 10 recursive iterations will be
1736 * needed, and that we've managed to get a good fix on the cost and output
1737 * size of each one of them. These are mighty shaky assumptions but it's
1738 * hard to see how to do better.
1740 total_cost += 10 * rterm->total_cost;
1741 total_rows += 10 * rterm->rows;
1744 * Also charge cpu_tuple_cost per row to account for the costs of
1745 * manipulating the tuplestores. (We don't worry about possible
1746 * spill-to-disk costs.)
1748 total_cost += cpu_tuple_cost * total_rows;
1750 runion->startup_cost = startup_cost;
1751 runion->total_cost = total_cost;
1752 runion->rows = total_rows;
1753 runion->pathtarget->width = Max(nrterm->pathtarget->width,
1754 rterm->pathtarget->width);
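/*
 * A rough worked example of the accounting above (purely illustrative
 * numbers, not taken from any actual plan): if the nonrecursive term costs
 * 100 units and yields 1000 rows, and the recursive term costs 50 units and
 * yields 200 rows per iteration, then with the assumed 10 iterations we get
 *     total_cost = 100 + 10 * 50 = 600
 *     total_rows = 1000 + 10 * 200 = 3000
 * plus a tuplestore charge of cpu_tuple_cost * 3000 = 30 at the default
 * cpu_tuple_cost of 0.01, for a final total_cost of 630.
 */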
1758 * cost_tuplesort
1759 * Determines and returns the cost of sorting a relation using tuplesort,
1760 * not including the cost of reading the input data.
1762 * If the total volume of data to sort is less than sort_mem, we will do
1763 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1764 * comparisons for t tuples.
1766 * If the total volume exceeds sort_mem, we switch to a tape-style merge
1767 * algorithm. There will still be about t*log2(t) tuple comparisons in
1768 * total, but we will also need to write and read each tuple once per
1769 * merge pass. We expect about ceil(logM(r)) merge passes where r is the
1770 * number of initial runs formed and M is the merge order used by tuplesort.c.
1771 * Since the average initial run should be about sort_mem, we have
1772 * disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem))
1773 * cpu = comparison_cost * t * log2(t)
1775 * If the sort is bounded (i.e., only the first k result tuples are needed)
1776 * and k tuples can fit into sort_mem, we use a heap method that keeps only
1777 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1779 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1780 * accesses (XXX can't we refine that guess?)
1782 * By default, we charge two operator evals per tuple comparison, which should
1783 * be in the right ballpark in most cases. The caller can tweak this by
1784 * specifying nonzero comparison_cost; typically that's used for any extra
1785 * work that has to be done to prepare the inputs to the comparison operators.
1787 * 'tuples' is the number of tuples in the relation
1788 * 'width' is the average tuple width in bytes
1789 * 'comparison_cost' is the extra cost per comparison, if any
1790 * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1791 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1793 static void
1794 cost_tuplesort(Cost *startup_cost, Cost *run_cost,
1795 double tuples, int width,
1796 Cost comparison_cost, int sort_mem,
1797 double limit_tuples)
1799 double input_bytes = relation_byte_size(tuples, width);
1800 double output_bytes;
1801 double output_tuples;
1802 long sort_mem_bytes = sort_mem * 1024L;
1805 * We want to be sure the cost of a sort is never estimated as zero, even
1806 * if passed-in tuple count is zero. Besides, mustn't do log(0)...
1808 if (tuples < 2.0)
1809 tuples = 2.0;
1811 /* Include the default cost-per-comparison */
1812 comparison_cost += 2.0 * cpu_operator_cost;
1814 /* Do we have a useful LIMIT? */
1815 if (limit_tuples > 0 && limit_tuples < tuples)
1817 output_tuples = limit_tuples;
1818 output_bytes = relation_byte_size(output_tuples, width);
1820 else
1822 output_tuples = tuples;
1823 output_bytes = input_bytes;
1826 if (output_bytes > sort_mem_bytes)
1829 * We'll have to use a disk-based sort of all the tuples
1831 double npages = ceil(input_bytes / BLCKSZ);
1832 double nruns = input_bytes / sort_mem_bytes;
1833 double mergeorder = tuplesort_merge_order(sort_mem_bytes);
1834 double log_runs;
1835 double npageaccesses;
1838 * CPU costs
1840 * Assume about N log2 N comparisons
1842 *startup_cost = comparison_cost * tuples * LOG2(tuples);
1844 /* Disk costs */
1846 /* Compute logM(r) as log(r) / log(M) */
1847 if (nruns > mergeorder)
1848 log_runs = ceil(log(nruns) / log(mergeorder));
1849 else
1850 log_runs = 1.0;
1851 npageaccesses = 2.0 * npages * log_runs;
1852 /* Assume 3/4ths of accesses are sequential, 1/4th are not */
1853 *startup_cost += npageaccesses *
1854 (seq_page_cost * 0.75 + random_page_cost * 0.25);
1856 else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1859 * We'll use a bounded heap-sort keeping just K tuples in memory, for
1860 * a total number of tuple comparisons of N log2 K; but the constant
1861 * factor is a bit higher than for quicksort. Tweak it so that the
1862 * cost curve is continuous at the crossover point.
1864 *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
1866 else
1868 /* We'll use plain quicksort on all the input tuples */
1869 *startup_cost = comparison_cost * tuples * LOG2(tuples);
1873 * Also charge a small amount (arbitrarily set equal to operator cost) per
1874 * extracted tuple. We don't charge cpu_tuple_cost because a Sort node
1875 * doesn't do qual-checking or projection, so it has less overhead than
1876 * most plan nodes. Note it's correct to use tuples not output_tuples
1877 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1878 * counting the LIMIT otherwise.
1880 *run_cost = cpu_operator_cost * tuples;
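/*
 * A rough worked example of the disk-sort branch above, using made-up
 * numbers and the default cost parameters (seq_page_cost = 1.0,
 * random_page_cost = 4.0, cpu_operator_cost = 0.0025, so comparison_cost =
 * 0.005 when no extra cost is passed in): sorting t = 1 million tuples whose
 * total data volume is about 100 MB with sort_mem = 4 MB forces an external
 * sort.  The comparison CPU charge is
 *     0.005 * 1e6 * log2(1e6) ~= 99,700,
 * and with npages = ceil(1e8 / 8192) = 12208 and nruns ~= 24, which we
 * assume is below tuplesort's merge order so a single merge pass suffices,
 * the disk charge is
 *     2 * 12208 * (1.0 * 0.75 + 4.0 * 0.25) ~= 42,700,
 * giving a startup_cost of roughly 142,400, plus a run_cost of
 * 0.0025 * 1e6 = 2500 for extracting the sorted tuples.
 */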
1884 * cost_incremental_sort
1885 * Determines and returns the cost of sorting a relation incrementally, when
1886 * the input path is presorted by a prefix of the pathkeys.
1888 * 'presorted_keys' is the number of leading pathkeys by which the input path
1889 * is sorted.
1891 * We estimate the number of groups into which the relation is divided by the
1892 * leading pathkeys, and then calculate the cost of sorting a single group
1893 * with tuplesort using cost_tuplesort().
1895 void
1896 cost_incremental_sort(Path *path,
1897 PlannerInfo *root, List *pathkeys, int presorted_keys,
1898 Cost input_startup_cost, Cost input_total_cost,
1899 double input_tuples, int width, Cost comparison_cost, int sort_mem,
1900 double limit_tuples)
1902 Cost startup_cost = 0,
1903 run_cost = 0,
1904 input_run_cost = input_total_cost - input_startup_cost;
1905 double group_tuples,
1906 input_groups;
1907 Cost group_startup_cost,
1908 group_run_cost,
1909 group_input_run_cost;
1910 List *presortedExprs = NIL;
1911 ListCell *l;
1912 int i = 0;
1913 bool unknown_varno = false;
1915 Assert(presorted_keys != 0);
1918 * We want to be sure the cost of a sort is never estimated as zero, even
1919 * if passed-in tuple count is zero. Besides, mustn't do log(0)...
1921 if (input_tuples < 2.0)
1922 input_tuples = 2.0;
1924 /* Default estimate of number of groups, capped to one group per row. */
1925 input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT);
1928 * Extract presorted keys as list of expressions.
1930 * We need to be careful about Vars containing "varno 0" which might have
1931 * been introduced by generate_append_tlist, which would confuse
1932 * estimate_num_groups (in fact it'd fail for such expressions). See
1933 * recurse_set_operations which has to deal with the same issue.
1935 * Unlike recurse_set_operations we can't access the original target list
1936 * here, and even if we could, it's not very clear how useful that would be
1937 * for a set operation combining multiple tables. So we simply detect if
1938 * there are any expressions with "varno 0" and use the default
1939 * DEFAULT_NUM_DISTINCT in that case.
1941 * We might also use either 1.0 (a single group) or input_tuples (each row
1942 * being a separate group), pretty much the worst and best case for
1943 * incremental sort. But those are extreme cases and using something in
1944 * between seems reasonable. Furthermore, generate_append_tlist is used
1945 * for set operations, which are likely to produce mostly unique output
1946 * anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive
1947 * while maintaining lower startup cost.
1949 foreach(l, pathkeys)
1951 PathKey *key = (PathKey *) lfirst(l);
1952 EquivalenceMember *member = (EquivalenceMember *)
1953 linitial(key->pk_eclass->ec_members);
1956 * Check if the expression contains Var with "varno 0" so that we
1957 * don't call estimate_num_groups in that case.
1959 if (bms_is_member(0, pull_varnos(root, (Node *) member->em_expr)))
1961 unknown_varno = true;
1962 break;
1965 /* expression not containing any Vars with "varno 0" */
1966 presortedExprs = lappend(presortedExprs, member->em_expr);
1968 i++;
1969 if (i >= presorted_keys)
1970 break;
1973 /* Estimate number of groups with equal presorted keys. */
1974 if (!unknown_varno)
1975 input_groups = estimate_num_groups(root, presortedExprs, input_tuples,
1976 NULL, NULL);
1978 group_tuples = input_tuples / input_groups;
1979 group_input_run_cost = input_run_cost / input_groups;
1982 * Estimate the average cost of sorting one group where the presorted keys
1983 * are equal. Incremental sort is sensitive to the distribution of tuples
1984 * across groups, for which we rely on quite rough assumptions. Thus, we're
1985 * pessimistic about incremental sort performance and increase its average
1986 * group size by half.
1988 cost_tuplesort(&group_startup_cost, &group_run_cost,
1989 1.5 * group_tuples, width, comparison_cost, sort_mem,
1990 limit_tuples);
1993 * Startup cost of incremental sort is the startup cost of its first group
1994 * plus the cost of its input.
1996 startup_cost += group_startup_cost
1997 + input_startup_cost + group_input_run_cost;
2000 * Once we start producing tuples from the first group, the cost of
2001 * producing all the tuples is given by the cost to finish processing this
2002 * group, plus the total cost to process the remaining groups, plus the
2003 * remaining cost of input.
2005 run_cost += group_run_cost
2006 + (group_run_cost + group_startup_cost) * (input_groups - 1)
2007 + group_input_run_cost * (input_groups - 1);
2010 * Incremental sort adds some overhead by itself. Firstly, it has to
2011 * detect the sort groups. This is roughly equal to one extra copy and
2012 * comparison per tuple. Secondly, it has to reset the tuplesort context
2013 * for every group.
2015 run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
2016 run_cost += 2.0 * cpu_tuple_cost * input_groups;
2018 path->rows = input_tuples;
2019 path->startup_cost = startup_cost;
2020 path->total_cost = startup_cost + run_cost;
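/*
 * A rough worked example of the accounting above (arbitrary, made-up
 * numbers): with input_groups = 4, group_startup_cost = 10,
 * group_run_cost = 5, input_startup_cost = 2 and group_input_run_cost = 10,
 * we get
 *     startup_cost = 10 + 2 + 10 = 22
 *     run_cost     = 5 + (5 + 10) * 3 + 10 * 3 = 80
 * before the per-tuple group-detection and tuplesort-reset overheads are
 * added, so the node starts returning rows after 22 units and finishes after
 * roughly 102 units plus those overheads.
 */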
2024 * cost_sort
2025 * Determines and returns the cost of sorting a relation, including
2026 * the cost of reading the input data.
2028 * NOTE: some callers currently pass NIL for pathkeys because they
2029 * can't conveniently supply the sort keys. Since this routine doesn't
2030 * currently do anything with pathkeys anyway, that doesn't matter...
2031 * but if it ever does, it should react gracefully to lack of key data.
2032 * (Actually, the thing we'd most likely be interested in is just the number
2033 * of sort keys, which all callers *could* supply.)
2035 void
2036 cost_sort(Path *path, PlannerInfo *root,
2037 List *pathkeys, Cost input_cost, double tuples, int width,
2038 Cost comparison_cost, int sort_mem,
2039 double limit_tuples)
2042 Cost startup_cost;
2043 Cost run_cost;
2045 cost_tuplesort(&startup_cost, &run_cost,
2046 tuples, width,
2047 comparison_cost, sort_mem,
2048 limit_tuples);
2050 if (!enable_sort)
2051 startup_cost += disable_cost;
2053 startup_cost += input_cost;
2055 path->rows = tuples;
2056 path->startup_cost = startup_cost;
2057 path->total_cost = startup_cost + run_cost;
2061 * append_nonpartial_cost
2062 * Estimate the cost of the non-partial paths in a Parallel Append.
2063 * The non-partial paths are assumed to be the first "numpaths" paths
2064 * from the subpaths list, and to be in order of decreasing cost.
2066 static Cost
2067 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
2069 Cost *costarr;
2070 int arrlen;
2071 ListCell *l;
2072 ListCell *cell;
2073 int i;
2074 int path_index;
2075 int min_index;
2076 int max_index;
2078 if (numpaths == 0)
2079 return 0;
2082 * Array length is number of workers or number of relevant paths,
2083 * whichever is less.
2085 arrlen = Min(parallel_workers, numpaths);
2086 costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
2088 /* The first few paths will each be claimed by a different worker. */
2089 path_index = 0;
2090 foreach(cell, subpaths)
2092 Path *subpath = (Path *) lfirst(cell);
2094 if (path_index == arrlen)
2095 break;
2096 costarr[path_index++] = subpath->total_cost;
2100 * Since subpaths are sorted by decreasing cost, the last one will have
2101 * the minimum cost.
2103 min_index = arrlen - 1;
2106 * For each of the remaining subpaths, add its cost to the array element
2107 * with minimum cost.
2109 for_each_cell(l, subpaths, cell)
2111 Path *subpath = (Path *) lfirst(l);
2112 int i;
2114 /* Consider only the non-partial paths */
2115 if (path_index++ == numpaths)
2116 break;
2118 costarr[min_index] += subpath->total_cost;
2120 /* Update the new min cost array index */
2121 for (min_index = i = 0; i < arrlen; i++)
2123 if (costarr[i] < costarr[min_index])
2124 min_index = i;
2128 /* Return the highest cost from the array */
2129 for (max_index = i = 0; i < arrlen; i++)
2131 if (costarr[i] > costarr[max_index])
2132 max_index = i;
2135 return costarr[max_index];
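/*
 * A short worked example of the assignment logic above (arbitrary costs):
 * with non-partial subpath costs {20, 16, 10, 8, 5} and 3 workers, the first
 * three paths seed the array as {20, 16, 10}; the path costing 8 is added to
 * the cheapest slot, giving {20, 16, 18}; the path costing 5 then goes to
 * the slot holding 16, giving {20, 21, 18}.  The function returns the
 * largest slot, 21, as the estimated completion cost of the slowest worker.
 */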
2139 * cost_append
2140 * Determines and returns the cost of an Append node.
2142 void
2143 cost_append(AppendPath *apath)
2145 ListCell *l;
2147 apath->path.startup_cost = 0;
2148 apath->path.total_cost = 0;
2149 apath->path.rows = 0;
2151 if (apath->subpaths == NIL)
2152 return;
2154 if (!apath->path.parallel_aware)
2156 List *pathkeys = apath->path.pathkeys;
2158 if (pathkeys == NIL)
2160 Path *subpath = (Path *) linitial(apath->subpaths);
2163 * For an unordered, non-parallel-aware Append we take the startup
2164 * cost as the startup cost of the first subpath.
2166 apath->path.startup_cost = subpath->startup_cost;
2168 /* Compute rows and costs as sums of subplan rows and costs. */
2169 foreach(l, apath->subpaths)
2171 Path *subpath = (Path *) lfirst(l);
2173 apath->path.rows += subpath->rows;
2174 apath->path.total_cost += subpath->total_cost;
2177 else
2180 * For an ordered, non-parallel-aware Append we take the startup
2181 * cost as the sum of the subpath startup costs. This ensures
2182 * that we don't underestimate the startup cost when a query's
2183 * LIMIT is such that several of the children have to be run to
2184 * satisfy it. This might be overkill --- another plausible hack
2185 * would be to take the Append's startup cost as the maximum of
2186 * the child startup costs. But we don't want to risk believing
2187 * that an ORDER BY LIMIT query can be satisfied at small cost
2188 * when the first child has small startup cost but later ones
2189 * don't. (If we had the ability to deal with nonlinear cost
2190 * interpolation for partial retrievals, we would not need to be
2191 * so conservative about this.)
2193 * This case is also different from the above in that we have to
2194 * account for possibly injecting sorts into subpaths that aren't
2195 * natively ordered.
2197 foreach(l, apath->subpaths)
2199 Path *subpath = (Path *) lfirst(l);
2200 Path sort_path; /* dummy for result of cost_sort */
2202 if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
2205 * We'll need to insert a Sort node, so include costs for
2206 * that. We can use the parent's LIMIT if any, since we
2207 * certainly won't pull more than that many tuples from
2208 * any child.
2210 cost_sort(&sort_path,
2211 NULL, /* doesn't currently need root */
2212 pathkeys,
2213 subpath->total_cost,
2214 subpath->rows,
2215 subpath->pathtarget->width,
2216 0.0,
2217 work_mem,
2218 apath->limit_tuples);
2219 subpath = &sort_path;
2222 apath->path.rows += subpath->rows;
2223 apath->path.startup_cost += subpath->startup_cost;
2224 apath->path.total_cost += subpath->total_cost;
2228 else /* parallel-aware */
2230 int i = 0;
2231 double parallel_divisor = get_parallel_divisor(&apath->path);
2233 /* Parallel-aware Append never produces ordered output. */
2234 Assert(apath->path.pathkeys == NIL);
2236 /* Calculate startup cost. */
2237 foreach(l, apath->subpaths)
2239 Path *subpath = (Path *) lfirst(l);
2242 * Append will start returning tuples when the child node having
2243 * lowest startup cost is done setting up. We consider only the
2244 * first few subplans that immediately get a worker assigned.
2246 if (i == 0)
2247 apath->path.startup_cost = subpath->startup_cost;
2248 else if (i < apath->path.parallel_workers)
2249 apath->path.startup_cost = Min(apath->path.startup_cost,
2250 subpath->startup_cost);
2253 * Apply parallel divisor to subpaths. Scale the number of rows
2254 * for each partial subpath based on the ratio of the parallel
2255 * divisor originally used for the subpath to the one we adopted.
2256 * Also add the cost of partial paths to the total cost, but
2257 * ignore non-partial paths for now.
2259 if (i < apath->first_partial_path)
2260 apath->path.rows += subpath->rows / parallel_divisor;
2261 else
2263 double subpath_parallel_divisor;
2265 subpath_parallel_divisor = get_parallel_divisor(subpath);
2266 apath->path.rows += subpath->rows * (subpath_parallel_divisor /
2267 parallel_divisor);
2268 apath->path.total_cost += subpath->total_cost;
2271 apath->path.rows = clamp_row_est(apath->path.rows);
2273 i++;
2276 /* Add cost for non-partial subpaths. */
2277 apath->path.total_cost +=
2278 append_nonpartial_cost(apath->subpaths,
2279 apath->first_partial_path,
2280 apath->path.parallel_workers);
2284 * Although Append does not do any selection or projection, it's not free;
2285 * add a small per-tuple overhead.
2287 apath->path.total_cost +=
2288 cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
2292 * cost_merge_append
2293 * Determines and returns the cost of a MergeAppend node.
2295 * MergeAppend merges several pre-sorted input streams, using a heap that
2296 * at any given instant holds the next tuple from each stream. If there
2297 * are N streams, we need about N*log2(N) tuple comparisons to construct
2298 * the heap at startup, and then for each output tuple, about log2(N)
2299 * comparisons to replace the top entry.
2301 * (The effective value of N will drop once some of the input streams are
2302 * exhausted, but it seems unlikely to be worth trying to account for that.)
2304 * The heap is never spilled to disk, since we assume N is not very large.
2305 * So this is much simpler than cost_sort.
2307 * As in cost_sort, we charge two operator evals per tuple comparison.
2309 * 'pathkeys' is a list of sort keys
2310 * 'n_streams' is the number of input streams
2311 * 'input_startup_cost' is the sum of the input streams' startup costs
2312 * 'input_total_cost' is the sum of the input streams' total costs
2313 * 'tuples' is the number of tuples in all the streams
2315 void
2316 cost_merge_append(Path *path, PlannerInfo *root,
2317 List *pathkeys, int n_streams,
2318 Cost input_startup_cost, Cost input_total_cost,
2319 double tuples)
2321 Cost startup_cost = 0;
2322 Cost run_cost = 0;
2323 Cost comparison_cost;
2324 double N;
2325 double logN;
2328 * Avoid log(0)...
2330 N = (n_streams < 2) ? 2.0 : (double) n_streams;
2331 logN = LOG2(N);
2333 /* Assumed cost per tuple comparison */
2334 comparison_cost = 2.0 * cpu_operator_cost;
2336 /* Heap creation cost */
2337 startup_cost += comparison_cost * N * logN;
2339 /* Per-tuple heap maintenance cost */
2340 run_cost += tuples * comparison_cost * logN;
2343 * Although MergeAppend does not do any selection or projection, it's not
2344 * free; add a small per-tuple overhead.
2346 run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2348 path->startup_cost = startup_cost + input_startup_cost;
2349 path->total_cost = startup_cost + run_cost + input_total_cost;
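/*
 * A rough worked example (made-up numbers, default cost parameters): merging
 * N = 8 presorted streams gives logN = 3, so heap construction costs
 * 0.005 * 8 * 3 = 0.12 of startup, each output tuple costs 0.005 * 3 = 0.015
 * in heap maintenance, and 10,000 output tuples add 150 for the heap plus
 * 0.01 * 0.5 * 10000 = 50 of per-tuple overhead at the default
 * APPEND_CPU_COST_MULTIPLIER of 0.5, i.e. about 200 of run cost on top of
 * the inputs' own costs.
 */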
2353 * cost_material
2354 * Determines and returns the cost of materializing a relation, including
2355 * the cost of reading the input data.
2357 * If the total volume of data to materialize exceeds work_mem, we will need
2358 * to write it to disk, so the cost is much higher in that case.
2360 * Note that here we are estimating the costs for the first scan of the
2361 * relation, so the materialization is all overhead --- any savings will
2362 * occur only on rescan, which is estimated in cost_rescan.
2364 void
2365 cost_material(Path *path,
2366 Cost input_startup_cost, Cost input_total_cost,
2367 double tuples, int width)
2369 Cost startup_cost = input_startup_cost;
2370 Cost run_cost = input_total_cost - input_startup_cost;
2371 double nbytes = relation_byte_size(tuples, width);
2372 long work_mem_bytes = work_mem * 1024L;
2374 path->rows = tuples;
2377 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2378 * reflect bookkeeping overhead. (This rate must be more than what
2379 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2380 * if it is exactly the same then there will be a cost tie between
2381 * nestloop with A outer, materialized B inner and nestloop with B outer,
2382 * materialized A inner. The extra cost ensures we'll prefer
2383 * materializing the smaller rel.) Note that this is normally a good deal
2384 * less than cpu_tuple_cost, which is OK because a Material plan node
2385 * doesn't do qual-checking or projection, so it's got less overhead than
2386 * most plan nodes.
2388 run_cost += 2 * cpu_operator_cost * tuples;
2391 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2392 * This cost is assumed to be evenly spread through the plan run phase,
2393 * which isn't exactly accurate but our cost model doesn't allow for
2394 * nonuniform costs within the run phase.
2396 if (nbytes > work_mem_bytes)
2398 double npages = ceil(nbytes / BLCKSZ);
2400 run_cost += seq_page_cost * npages;
2403 path->startup_cost = startup_cost;
2404 path->total_cost = startup_cost + run_cost;
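/*
 * A rough worked example (arbitrary numbers, default cost parameters):
 * materializing 100,000 tuples that occupy about 160 MB with work_mem set to
 * 64 MB will spill, so on top of the input cost we charge
 *     2 * 0.0025 * 100000 = 500
 * of bookkeeping plus seq_page_cost * ceil(160 MB / 8 kB) = 20480 of page
 * writes, spread over the run phase.
 */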
2408 * cost_memoize_rescan
2409 * Determines the estimated cost of rescanning a Memoize node.
2411 * In order to estimate this, we must gain knowledge of how often we expect to
2412 * be called and how many distinct sets of parameters we are likely to be
2413 * called with. If we expect a good cache hit ratio, then we can set our
2414 * costs to account for that hit ratio, plus a little bit of cost for the
2415 * caching itself. Caching will not work out well if we expect to be called
2416 * with too many distinct parameter values. The worst-case here is that we
2417 * never see any parameter value twice, in which case we'd never get a cache
2418 * hit and caching would be a complete waste of effort.
2420 static void
2421 cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
2422 Cost *rescan_startup_cost, Cost *rescan_total_cost)
2424 EstimationInfo estinfo;
2425 Cost input_startup_cost = mpath->subpath->startup_cost;
2426 Cost input_total_cost = mpath->subpath->total_cost;
2427 double tuples = mpath->subpath->rows;
2428 double calls = mpath->calls;
2429 int width = mpath->subpath->pathtarget->width;
2431 double hash_mem_bytes;
2432 double est_entry_bytes;
2433 double est_cache_entries;
2434 double ndistinct;
2435 double evict_ratio;
2436 double hit_ratio;
2437 Cost startup_cost;
2438 Cost total_cost;
2440 /* available cache space */
2441 hash_mem_bytes = get_hash_memory_limit();
2444 * Set the number of bytes each cache entry should consume in the cache.
2445 * To provide us with better estimations on how many cache entries we can
2446 * store at once, we make a call to the executor here to ask it what
2447 * memory overheads there are for a single cache entry.
2449 * XXX we also store the cache key, but that's not accounted for here.
2451 est_entry_bytes = relation_byte_size(tuples, width) +
2452 ExecEstimateCacheEntryOverheadBytes(tuples);
2454 /* estimate on the upper limit of cache entries we can hold at once */
2455 est_cache_entries = floor(hash_mem_bytes / est_entry_bytes);
2457 /* estimate on the distinct number of parameter values */
2458 ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL,
2459 &estinfo);
2462 * When the estimation fell back on using a default value, it's a bit too
2463 * risky to assume that it's ok to use a Memoize node. The use of a
2464 * default could cause us to use a Memoize node when it's really
2465 * inappropriate to do so. If we see that this has been done, then we'll
2466 * assume that every call will have unique parameters, which will almost
2467 * certainly mean a MemoizePath will never survive add_path().
2469 if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0)
2470 ndistinct = calls;
2473 * Since we've already estimated the maximum number of entries we can
2474 * store at once and know the estimated number of distinct values we'll be
2475 * called with, we'll take this opportunity to set the path's est_entries.
2476 * This will ultimately determine the hash table size that the executor
2477 * will use. If we leave this at zero, the executor will just choose the
2478 * size itself. Really this is not the right place to do this, but it's
2479 * convenient since everything is already calculated.
2481 mpath->est_entries = Min(Min(ndistinct, est_cache_entries),
2482 PG_UINT32_MAX);
2485 * When the number of distinct parameter values is above the amount we can
2486 * store in the cache, then we'll have to evict some entries from the
2487 * cache. This is not free. Here we estimate how often we'll incur the
2488 * cost of that eviction.
2490 evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct;
2493 * In order to estimate how costly a single scan will be, we need to
2494 * attempt to estimate what the cache hit ratio will be. To do that we
2495 * must look at how many scans are estimated in total for this node and
2496 * how many of those scans we expect to get a cache hit.
2498 hit_ratio = 1.0 / ndistinct * Min(est_cache_entries, ndistinct) -
2499 (ndistinct / calls);
2501 /* Ensure we don't go negative */
2502 hit_ratio = Max(hit_ratio, 0.0);
2505 * Set the total_cost accounting for the expected cache hit ratio. We
2506 * also add on a cpu_operator_cost to account for a cache lookup. This
2507 * will happen regardless of whether it's a cache hit or not.
2509 total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost;
2511 /* Now adjust the total cost to account for cache evictions */
2513 /* Charge a cpu_tuple_cost for evicting the actual cache entry */
2514 total_cost += cpu_tuple_cost * evict_ratio;
2517 * Charge a 10th of cpu_operator_cost to evict every tuple in that entry.
2518 * The per-tuple eviction is really just a pfree, so charging a whole
2519 * cpu_operator_cost seems a little excessive.
2521 total_cost += cpu_operator_cost / 10.0 * evict_ratio * tuples;
2524 * Now adjust for storing things in the cache, since that's not free
2525 * either. Everything must go in the cache. We don't proportion this
2526 * over any ratio, just apply it once for the scan. We charge a
2527 * cpu_tuple_cost for the creation of the cache entry and also a
2528 * cpu_operator_cost for each tuple we expect to cache.
2530 total_cost += cpu_tuple_cost + cpu_operator_cost * tuples;
2533 * Getting the first row must also be proportioned according to the
2534 * expected cache hit ratio.
2536 startup_cost = input_startup_cost * (1.0 - hit_ratio);
2539 * Additionally we charge a cpu_tuple_cost to account for cache lookups,
2540 * which we'll do regardless of whether it was a cache hit or not.
2542 startup_cost += cpu_tuple_cost;
2544 *rescan_startup_cost = startup_cost;
2545 *rescan_total_cost = total_cost;
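/*
 * A rough worked example of the ratios above (made-up numbers): if the node
 * expects calls = 1000 rescans with ndistinct = 100 distinct parameter sets
 * and room for est_cache_entries = 80 entries, then
 *     evict_ratio = 1.0 - 80/100 = 0.2
 *     hit_ratio   = 80/100 - 100/1000 = 0.7
 * so 70% of rescans are costed as (nearly free) cache hits and the remaining
 * 30% at the subpath's full cost, plus the caching overheads charged above.
 */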
2549 * cost_agg
2550 * Determines and returns the cost of performing an Agg plan node,
2551 * including the cost of its input.
2553 * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2554 * we are using a hashed Agg node just to do grouping).
2556 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2557 * are for appropriately-sorted input.
2559 void
2560 cost_agg(Path *path, PlannerInfo *root,
2561 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2562 int numGroupCols, double numGroups,
2563 List *quals,
2564 Cost input_startup_cost, Cost input_total_cost,
2565 double input_tuples, double input_width)
2567 double output_tuples;
2568 Cost startup_cost;
2569 Cost total_cost;
2570 AggClauseCosts dummy_aggcosts;
2572 /* Use all-zero per-aggregate costs if NULL is passed */
2573 if (aggcosts == NULL)
2575 Assert(aggstrategy == AGG_HASHED);
2576 MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2577 aggcosts = &dummy_aggcosts;
2581 * The transCost.per_tuple component of aggcosts should be charged once
2582 * per input tuple, corresponding to the costs of evaluating the aggregate
2583 * transfns and their input expressions. The finalCost.per_tuple component
2584 * is charged once per output tuple, corresponding to the costs of
2585 * evaluating the finalfns. Startup costs are of course charged but once.
2587 * If we are grouping, we charge an additional cpu_operator_cost per
2588 * grouping column per input tuple for grouping comparisons.
2590 * We will produce a single output tuple if not grouping, and a tuple per
2591 * group otherwise. We charge cpu_tuple_cost for each output tuple.
2593 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2594 * same total CPU cost, but AGG_SORTED has lower startup cost. If the
2595 * input path is already sorted appropriately, AGG_SORTED should be
2596 * preferred (since it has no risk of memory overflow). This will happen
2597 * as long as the computed total costs are indeed exactly equal --- but if
2598 * there's roundoff error we might do the wrong thing. So be sure that
2599 * the computations below form the same intermediate values in the same
2600 * order.
2602 if (aggstrategy == AGG_PLAIN)
2604 startup_cost = input_total_cost;
2605 startup_cost += aggcosts->transCost.startup;
2606 startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2607 startup_cost += aggcosts->finalCost.startup;
2608 startup_cost += aggcosts->finalCost.per_tuple;
2609 /* we aren't grouping */
2610 total_cost = startup_cost + cpu_tuple_cost;
2611 output_tuples = 1;
2613 else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2615 /* Here we are able to deliver output on-the-fly */
2616 startup_cost = input_startup_cost;
2617 total_cost = input_total_cost;
2618 if (aggstrategy == AGG_MIXED && !enable_hashagg)
2620 startup_cost += disable_cost;
2621 total_cost += disable_cost;
2623 /* calcs phrased this way to match HASHED case, see note above */
2624 total_cost += aggcosts->transCost.startup;
2625 total_cost += aggcosts->transCost.per_tuple * input_tuples;
2626 total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2627 total_cost += aggcosts->finalCost.startup;
2628 total_cost += aggcosts->finalCost.per_tuple * numGroups;
2629 total_cost += cpu_tuple_cost * numGroups;
2630 output_tuples = numGroups;
2632 else
2634 /* must be AGG_HASHED */
2635 startup_cost = input_total_cost;
2636 if (!enable_hashagg)
2637 startup_cost += disable_cost;
2638 startup_cost += aggcosts->transCost.startup;
2639 startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2640 /* cost of computing hash value */
2641 startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2642 startup_cost += aggcosts->finalCost.startup;
2644 total_cost = startup_cost;
2645 total_cost += aggcosts->finalCost.per_tuple * numGroups;
2646 /* cost of retrieving from hash table */
2647 total_cost += cpu_tuple_cost * numGroups;
2648 output_tuples = numGroups;
2652 * Add the disk costs of hash aggregation that spills to disk.
2654 * Groups that go into the hash table stay in memory until finalized, so
2655 * spilling and reprocessing tuples doesn't incur additional invocations
2656 * of transCost or finalCost. Furthermore, the computed hash value is
2657 * stored with the spilled tuples, so we don't incur extra invocations of
2658 * the hash function.
2660 * Hash Agg begins returning tuples after the first batch is complete.
2661 * Accrue writes (spilled tuples) to startup_cost and to total_cost;
2662 * accrue reads only to total_cost.
2664 if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
2666 double pages;
2667 double pages_written = 0.0;
2668 double pages_read = 0.0;
2669 double spill_cost;
2670 double hashentrysize;
2671 double nbatches;
2672 Size mem_limit;
2673 uint64 ngroups_limit;
2674 int num_partitions;
2675 int depth;
2678 * Estimate number of batches based on the computed limits. If less
2679 * than or equal to one, all groups are expected to fit in memory;
2680 * otherwise we expect to spill.
2682 hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos),
2683 input_width,
2684 aggcosts->transitionSpace);
2685 hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit,
2686 &ngroups_limit, &num_partitions);
2688 nbatches = Max((numGroups * hashentrysize) / mem_limit,
2689 numGroups / ngroups_limit);
2691 nbatches = Max(ceil(nbatches), 1.0);
2692 num_partitions = Max(num_partitions, 2);
2695 * The number of partitions can change at different levels of
2696 * recursion; but for the purposes of this calculation assume it stays
2697 * constant.
2699 depth = ceil(log(nbatches) / log(num_partitions));
2702 * Estimate number of pages read and written. For each level of
2703 * recursion, a tuple must be written and then later read.
2705 pages = relation_byte_size(input_tuples, input_width) / BLCKSZ;
2706 pages_written = pages_read = pages * depth;
2709 * HashAgg has somewhat worse IO behavior than Sort on typical
2710 * hardware/OS combinations. Account for this with a generic penalty.
2712 pages_read *= 2.0;
2713 pages_written *= 2.0;
2715 startup_cost += pages_written * random_page_cost;
2716 total_cost += pages_written * random_page_cost;
2717 total_cost += pages_read * seq_page_cost;
2719 /* account for CPU cost of spilling a tuple and reading it back */
2720 spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost;
2721 startup_cost += spill_cost;
2722 total_cost += spill_cost;
2726 * If there are quals (HAVING quals), account for their cost and
2727 * selectivity.
2729 if (quals)
2731 QualCost qual_cost;
2733 cost_qual_eval(&qual_cost, quals, root);
2734 startup_cost += qual_cost.startup;
2735 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2737 output_tuples = clamp_row_est(output_tuples *
2738 clauselist_selectivity(root,
2739 quals,
2741 JOIN_INNER,
2742 NULL));
2745 path->rows = output_tuples;
2746 path->startup_cost = startup_cost;
2747 path->total_cost = total_cost;
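/*
 * A rough worked example of the spill accounting above (made-up numbers):
 * suppose the limits work out to nbatches = 16 and num_partitions = 4, so
 * depth = ceil(log(16) / log(4)) = 2, and the input is about 1 GB, i.e.
 * pages = 131072.  Then pages_written = pages_read = 131072 * 2 = 262144,
 * doubled again by the IO penalty to 524288; the writes are charged at
 * random_page_cost and the reads at seq_page_cost, plus a CPU spill charge
 * of 2 * 2.0 * cpu_tuple_cost per input tuple.
 */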
2751 * cost_windowagg
2752 * Determines and returns the cost of performing a WindowAgg plan node,
2753 * including the cost of its input.
2755 * Input is assumed already properly sorted.
2757 void
2758 cost_windowagg(Path *path, PlannerInfo *root,
2759 List *windowFuncs, int numPartCols, int numOrderCols,
2760 Cost input_startup_cost, Cost input_total_cost,
2761 double input_tuples)
2763 Cost startup_cost;
2764 Cost total_cost;
2765 ListCell *lc;
2767 startup_cost = input_startup_cost;
2768 total_cost = input_total_cost;
2771 * Window functions are assumed to cost their stated execution cost, plus
2772 * the cost of evaluating their input expressions, per tuple. Since they
2773 * may in fact evaluate their inputs at multiple rows during each cycle,
2774 * this could be a drastic underestimate; but without a way to know how
2775 * many rows the window function will fetch, it's hard to do better. In
2776 * any case, it's a good estimate for all the built-in window functions,
2777 * so we'll just do this for now.
2779 foreach(lc, windowFuncs)
2781 WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2782 Cost wfunccost;
2783 QualCost argcosts;
2785 argcosts.startup = argcosts.per_tuple = 0;
2786 add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
2787 &argcosts);
2788 startup_cost += argcosts.startup;
2789 wfunccost = argcosts.per_tuple;
2791 /* also add the input expressions' cost to per-input-row costs */
2792 cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2793 startup_cost += argcosts.startup;
2794 wfunccost += argcosts.per_tuple;
2797 * Add the filter's cost to per-input-row costs. XXX We should reduce
2798 * input expression costs according to filter selectivity.
2800 cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2801 startup_cost += argcosts.startup;
2802 wfunccost += argcosts.per_tuple;
2804 total_cost += wfunccost * input_tuples;
2808 * We also charge cpu_operator_cost per grouping column per tuple for
2809 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2810 * overhead.
2812 * XXX this neglects costs of spooling the data to disk when it overflows
2813 * work_mem. Sooner or later that should get accounted for.
2815 total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2816 total_cost += cpu_tuple_cost * input_tuples;
2818 path->rows = input_tuples;
2819 path->startup_cost = startup_cost;
2820 path->total_cost = total_cost;
2824 * cost_group
2825 * Determines and returns the cost of performing a Group plan node,
2826 * including the cost of its input.
2828 * Note: caller must ensure that input costs are for appropriately-sorted
2829 * input.
2831 void
2832 cost_group(Path *path, PlannerInfo *root,
2833 int numGroupCols, double numGroups,
2834 List *quals,
2835 Cost input_startup_cost, Cost input_total_cost,
2836 double input_tuples)
2838 double output_tuples;
2839 Cost startup_cost;
2840 Cost total_cost;
2842 output_tuples = numGroups;
2843 startup_cost = input_startup_cost;
2844 total_cost = input_total_cost;
2847 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2848 * all the grouping columns get compared for most of the tuples.
2850 total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2853 * If there are quals (HAVING quals), account for their cost and
2854 * selectivity.
2856 if (quals)
2858 QualCost qual_cost;
2860 cost_qual_eval(&qual_cost, quals, root);
2861 startup_cost += qual_cost.startup;
2862 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2864 output_tuples = clamp_row_est(output_tuples *
2865 clauselist_selectivity(root,
2866 quals,
2868 JOIN_INNER,
2869 NULL));
2872 path->rows = output_tuples;
2873 path->startup_cost = startup_cost;
2874 path->total_cost = total_cost;
2878 * initial_cost_nestloop
2879 * Preliminary estimate of the cost of a nestloop join path.
2881 * This must quickly produce lower-bound estimates of the path's startup and
2882 * total costs. If we are unable to eliminate the proposed path from
2883 * consideration using the lower bounds, final_cost_nestloop will be called
2884 * to obtain the final estimates.
2886 * The exact division of labor between this function and final_cost_nestloop
2887 * is private to them, and represents a tradeoff between speed of the initial
2888 * estimate and getting a tight lower bound. We choose to not examine the
2889 * join quals here, since that's by far the most expensive part of the
2890 * calculations. The end result is that CPU-cost considerations must be
2891 * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2892 * incorporation of the inner path's run cost.
2894 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2895 * other data to be used by final_cost_nestloop
2896 * 'jointype' is the type of join to be performed
2897 * 'outer_path' is the outer input to the join
2898 * 'inner_path' is the inner input to the join
2899 * 'extra' contains miscellaneous information about the join
2901 void
2902 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2903 JoinType jointype,
2904 Path *outer_path, Path *inner_path,
2905 JoinPathExtraData *extra)
2907 Cost startup_cost = 0;
2908 Cost run_cost = 0;
2909 double outer_path_rows = outer_path->rows;
2910 Cost inner_rescan_start_cost;
2911 Cost inner_rescan_total_cost;
2912 Cost inner_run_cost;
2913 Cost inner_rescan_run_cost;
2915 /* estimate costs to rescan the inner relation */
2916 cost_rescan(root, inner_path,
2917 &inner_rescan_start_cost,
2918 &inner_rescan_total_cost);
2920 /* cost of source data */
2923 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2924 * before we can start returning tuples, so the join's startup cost is
2925 * their sum. We'll also pay the inner path's rescan startup cost
2926 * multiple times.
2928 startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2929 run_cost += outer_path->total_cost - outer_path->startup_cost;
2930 if (outer_path_rows > 1)
2931 run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2933 inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2934 inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2936 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2937 extra->inner_unique)
2940 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2941 * executor will stop after the first match.
2943 * Getting decent estimates requires inspection of the join quals,
2944 * which we choose to postpone to final_cost_nestloop.
2947 /* Save private data for final_cost_nestloop */
2948 workspace->inner_run_cost = inner_run_cost;
2949 workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2951 else
2953 /* Normal case; we'll scan whole input rel for each outer row */
2954 run_cost += inner_run_cost;
2955 if (outer_path_rows > 1)
2956 run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
2959 /* CPU costs left for later */
2961 /* Public result fields */
2962 workspace->startup_cost = startup_cost;
2963 workspace->total_cost = startup_cost + run_cost;
2964 /* Save private data for final_cost_nestloop */
2965 workspace->run_cost = run_cost;
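/*
 * A rough worked example of the non-SEMI/ANTI arithmetic above (arbitrary
 * numbers): with an outer path of startup 5, total 105 and 100 rows, an
 * inner path of startup 2 and total 52, and inner rescan costs of 0 startup
 * and 25 total, the preliminary estimates are
 *     startup_cost = 5 + 2 = 7
 *     run_cost     = 100 + 99 * 0 + 50 + 99 * 25 = 2625
 * so workspace->total_cost = 2632 before any join-qual CPU costs are added
 * in final_cost_nestloop.
 */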
2969 * final_cost_nestloop
2970 * Final estimate of the cost and result size of a nestloop join path.
2972 * 'path' is already filled in except for the rows and cost fields
2973 * 'workspace' is the result from initial_cost_nestloop
2974 * 'extra' contains miscellaneous information about the join
2976 void
2977 final_cost_nestloop(PlannerInfo *root, NestPath *path,
2978 JoinCostWorkspace *workspace,
2979 JoinPathExtraData *extra)
2981 Path *outer_path = path->jpath.outerjoinpath;
2982 Path *inner_path = path->jpath.innerjoinpath;
2983 double outer_path_rows = outer_path->rows;
2984 double inner_path_rows = inner_path->rows;
2985 Cost startup_cost = workspace->startup_cost;
2986 Cost run_cost = workspace->run_cost;
2987 Cost cpu_per_tuple;
2988 QualCost restrict_qual_cost;
2989 double ntuples;
2991 /* Protect some assumptions below that rowcounts aren't zero */
2992 if (outer_path_rows <= 0)
2993 outer_path_rows = 1;
2994 if (inner_path_rows <= 0)
2995 inner_path_rows = 1;
2996 /* Mark the path with the correct row estimate */
2997 if (path->jpath.path.param_info)
2998 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
2999 else
3000 path->jpath.path.rows = path->jpath.path.parent->rows;
3002 /* For partial paths, scale row estimate. */
3003 if (path->jpath.path.parallel_workers > 0)
3005 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3007 path->jpath.path.rows =
3008 clamp_row_est(path->jpath.path.rows / parallel_divisor);
3012 * We could include disable_cost in the preliminary estimate, but that
3013 * would amount to optimizing for the case where the join method is
3014 * disabled, which doesn't seem like the way to bet.
3016 if (!enable_nestloop)
3017 startup_cost += disable_cost;
3019 /* cost of inner-relation source data (we already dealt with outer rel) */
3021 if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI ||
3022 extra->inner_unique)
3025 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3026 * executor will stop after the first match.
3028 Cost inner_run_cost = workspace->inner_run_cost;
3029 Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost;
3030 double outer_matched_rows;
3031 double outer_unmatched_rows;
3032 Selectivity inner_scan_frac;
3035 * For an outer-rel row that has at least one match, we can expect the
3036 * inner scan to stop after a fraction 1/(match_count+1) of the inner
3037 * rows, if the matches are evenly distributed. Since they probably
3038 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
3039 * that fraction. (If we used a larger fuzz factor, we'd have to
3040 * clamp inner_scan_frac to at most 1.0; but since match_count is at
3041 * least 1, no such clamp is needed now.)
3043 outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3044 outer_unmatched_rows = outer_path_rows - outer_matched_rows;
3045 inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
3048 * Compute number of tuples processed (not number emitted!). First,
3049 * account for successfully-matched outer rows.
3051 ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
3054 * Now we need to estimate the actual costs of scanning the inner
3055 * relation, which may be quite a bit less than N times inner_run_cost
3056 * due to early scan stops. We consider two cases. If the inner path
3057 * is an indexscan using all the joinquals as indexquals, then an
3058 * unmatched outer row results in an indexscan returning no rows,
3059 * which is probably quite cheap. Otherwise, the executor will have
3060 * to scan the whole inner rel for an unmatched row; not so cheap.
3062 if (has_indexed_join_quals(path))
3065 * Successfully-matched outer rows will only require scanning
3066 * inner_scan_frac of the inner relation. In this case, we don't
3067 * need to charge the full inner_run_cost even when that's more
3068 * than inner_rescan_run_cost, because we can assume that none of
3069 * the inner scans ever scan the whole inner relation. So it's
3070 * okay to assume that all the inner scan executions can be
3071 * fractions of the full cost, even if materialization is reducing
3072 * the rescan cost. At this writing, it's impossible to get here
3073 * for a materialized inner scan, so inner_run_cost and
3074 * inner_rescan_run_cost will be the same anyway; but just in
3075 * case, use inner_run_cost for the first matched tuple and
3076 * inner_rescan_run_cost for additional ones.
3078 run_cost += inner_run_cost * inner_scan_frac;
3079 if (outer_matched_rows > 1)
3080 run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
3083 * Add the cost of inner-scan executions for unmatched outer rows.
3084 * We estimate this as the same cost as returning the first tuple
3085 * of a nonempty scan. We consider that these are all rescans,
3086 * since we used inner_run_cost once already.
3088 run_cost += outer_unmatched_rows *
3089 inner_rescan_run_cost / inner_path_rows;
3092 * We won't be evaluating any quals at all for unmatched rows, so
3093 * don't add them to ntuples.
3096 else
3099 * Here, a complicating factor is that rescans may be cheaper than
3100 * first scans. If we never scan all the way to the end of the
3101 * inner rel, it might be (depending on the plan type) that we'd
3102 * never pay the whole inner first-scan run cost. However it is
3103 * difficult to estimate whether that will happen (and it could
3104 * not happen if there are any unmatched outer rows!), so be
3105 * conservative and always charge the whole first-scan cost once.
3106 * We consider this charge to correspond to the first unmatched
3107 * outer row, unless there isn't one in our estimate, in which
3108 * case blame it on the first matched row.
3111 /* First, count all unmatched join tuples as being processed */
3112 ntuples += outer_unmatched_rows * inner_path_rows;
3114 /* Now add the forced full scan, and decrement appropriate count */
3115 run_cost += inner_run_cost;
3116 if (outer_unmatched_rows >= 1)
3117 outer_unmatched_rows -= 1;
3118 else
3119 outer_matched_rows -= 1;
3121 /* Add inner run cost for additional outer tuples having matches */
3122 if (outer_matched_rows > 0)
3123 run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
3125 /* Add inner run cost for additional unmatched outer tuples */
3126 if (outer_unmatched_rows > 0)
3127 run_cost += outer_unmatched_rows * inner_rescan_run_cost;
3130 else
3132 /* Normal-case source costs were included in preliminary estimate */
3134 /* Compute number of tuples processed (not number emitted!) */
3135 ntuples = outer_path_rows * inner_path_rows;
3138 /* CPU costs */
3139 cost_qual_eval(&restrict_qual_cost, path->jpath.joinrestrictinfo, root);
3140 startup_cost += restrict_qual_cost.startup;
3141 cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
3142 run_cost += cpu_per_tuple * ntuples;
3144 /* tlist eval costs are paid per output row, not per tuple scanned */
3145 startup_cost += path->jpath.path.pathtarget->cost.startup;
3146 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3148 path->jpath.path.startup_cost = startup_cost;
3149 path->jpath.path.total_cost = startup_cost + run_cost;
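/*
 * A rough worked example of the SEMI/ANTI arithmetic above (arbitrary
 * numbers): with 1000 outer rows, outer_match_frac = 0.6 and
 * match_count = 4, we expect outer_matched_rows = 600,
 * outer_unmatched_rows = 400 and inner_scan_frac = 2.0 / 5.0 = 0.4.  If the
 * inner side has 50 rows and is an indexscan using all the join quals, the
 * quals are evaluated on about 600 * 50 * 0.4 = 12000 tuples, and each
 * unmatched outer row is charged only 1/50th of an inner rescan.
 */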
3153 * initial_cost_mergejoin
3154 * Preliminary estimate of the cost of a mergejoin path.
3156 * This must quickly produce lower-bound estimates of the path's startup and
3157 * total costs. If we are unable to eliminate the proposed path from
3158 * consideration using the lower bounds, final_cost_mergejoin will be called
3159 * to obtain the final estimates.
3161 * The exact division of labor between this function and final_cost_mergejoin
3162 * is private to them, and represents a tradeoff between speed of the initial
3163 * estimate and getting a tight lower bound. We choose to not examine the
3164 * join quals here, except for obtaining the scan selectivity estimate which
3165 * is really essential (but fortunately, use of caching keeps the cost of
3166 * getting that down to something reasonable).
3167 * We also assume that cost_sort is cheap enough to use here.
3169 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3170 * other data to be used by final_cost_mergejoin
3171 * 'jointype' is the type of join to be performed
3172 * 'mergeclauses' is the list of joinclauses to be used as merge clauses
3173 * 'outer_path' is the outer input to the join
3174 * 'inner_path' is the inner input to the join
3175 * 'outersortkeys' is the list of sort keys for the outer path
3176 * 'innersortkeys' is the list of sort keys for the inner path
3177 * 'extra' contains miscellaneous information about the join
3179 * Note: outersortkeys and innersortkeys should be NIL if no explicit
3180 * sort is needed because the respective source path is already ordered.
3182 void
3183 initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3184 JoinType jointype,
3185 List *mergeclauses,
3186 Path *outer_path, Path *inner_path,
3187 List *outersortkeys, List *innersortkeys,
3188 JoinPathExtraData *extra)
3190 Cost startup_cost = 0;
3191 Cost run_cost = 0;
3192 double outer_path_rows = outer_path->rows;
3193 double inner_path_rows = inner_path->rows;
3194 Cost inner_run_cost;
3195 double outer_rows,
3196 inner_rows,
3197 outer_skip_rows,
3198 inner_skip_rows;
3199 Selectivity outerstartsel,
3200 outerendsel,
3201 innerstartsel,
3202 innerendsel;
3203 Path sort_path; /* dummy for result of cost_sort */
3205 /* Protect some assumptions below that rowcounts aren't zero */
3206 if (outer_path_rows <= 0)
3207 outer_path_rows = 1;
3208 if (inner_path_rows <= 0)
3209 inner_path_rows = 1;
3212 * A merge join will stop as soon as it exhausts either input stream
3213 * (unless it's an outer join, in which case the outer side has to be
3214 * scanned all the way anyway). Estimate fraction of the left and right
3215 * inputs that will actually need to be scanned. Likewise, we can
3216 * estimate the number of rows that will be skipped before the first join
3217 * pair is found, which should be factored into startup cost. We use only
3218 * the first (most significant) merge clause for this purpose. Since
3219 * mergejoinscansel() is a fairly expensive computation, we cache the
3220 * results in the merge clause RestrictInfo.
3222 if (mergeclauses && jointype != JOIN_FULL)
3224 RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
3225 List *opathkeys;
3226 List *ipathkeys;
3227 PathKey *opathkey;
3228 PathKey *ipathkey;
3229 MergeScanSelCache *cache;
3231 /* Get the input pathkeys to determine the sort-order details */
3232 opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
3233 ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
3234 Assert(opathkeys);
3235 Assert(ipathkeys);
3236 opathkey = (PathKey *) linitial(opathkeys);
3237 ipathkey = (PathKey *) linitial(ipathkeys);
3238 /* debugging check */
3239 if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
3240 opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
3241 opathkey->pk_strategy != ipathkey->pk_strategy ||
3242 opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
3243 elog(ERROR, "left and right pathkeys do not match in mergejoin");
3245 /* Get the selectivity with caching */
3246 cache = cached_scansel(root, firstclause, opathkey);
3248 if (bms_is_subset(firstclause->left_relids,
3249 outer_path->parent->relids))
3251 /* left side of clause is outer */
3252 outerstartsel = cache->leftstartsel;
3253 outerendsel = cache->leftendsel;
3254 innerstartsel = cache->rightstartsel;
3255 innerendsel = cache->rightendsel;
3257 else
3259 /* left side of clause is inner */
3260 outerstartsel = cache->rightstartsel;
3261 outerendsel = cache->rightendsel;
3262 innerstartsel = cache->leftstartsel;
3263 innerendsel = cache->leftendsel;
3265 if (jointype == JOIN_LEFT ||
3266 jointype == JOIN_ANTI)
3268 outerstartsel = 0.0;
3269 outerendsel = 1.0;
3271 else if (jointype == JOIN_RIGHT)
3273 innerstartsel = 0.0;
3274 innerendsel = 1.0;
3277 else
3279 /* cope with clauseless or full mergejoin */
3280 outerstartsel = innerstartsel = 0.0;
3281 outerendsel = innerendsel = 1.0;
3285 * Convert selectivities to row counts. We force outer_rows and
3286 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
3288 outer_skip_rows = rint(outer_path_rows * outerstartsel);
3289 inner_skip_rows = rint(inner_path_rows * innerstartsel);
3290 outer_rows = clamp_row_est(outer_path_rows * outerendsel);
3291 inner_rows = clamp_row_est(inner_path_rows * innerendsel);
3293 Assert(outer_skip_rows <= outer_rows);
3294 Assert(inner_skip_rows <= inner_rows);
3297 * Readjust scan selectivities to account for above rounding. This is
3298 * normally an insignificant effect, but when there are only a few rows in
3299 * the inputs, failing to do this makes for a large percentage error.
3301 outerstartsel = outer_skip_rows / outer_path_rows;
3302 innerstartsel = inner_skip_rows / inner_path_rows;
3303 outerendsel = outer_rows / outer_path_rows;
3304 innerendsel = inner_rows / inner_path_rows;
3306 Assert(outerstartsel <= outerendsel);
3307 Assert(innerstartsel <= innerendsel);
3309 /* cost of source data */
3311 if (outersortkeys) /* do we need to sort outer? */
3313 cost_sort(&sort_path,
3314 root,
3315 outersortkeys,
3316 outer_path->total_cost,
3317 outer_path_rows,
3318 outer_path->pathtarget->width,
3319 0.0,
3320 work_mem,
3321 -1.0);
3322 startup_cost += sort_path.startup_cost;
3323 startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3324 * outerstartsel;
3325 run_cost += (sort_path.total_cost - sort_path.startup_cost)
3326 * (outerendsel - outerstartsel);
3328 else
3330 startup_cost += outer_path->startup_cost;
3331 startup_cost += (outer_path->total_cost - outer_path->startup_cost)
3332 * outerstartsel;
3333 run_cost += (outer_path->total_cost - outer_path->startup_cost)
3334 * (outerendsel - outerstartsel);
3337 if (innersortkeys) /* do we need to sort inner? */
3339 cost_sort(&sort_path,
3340 root,
3341 innersortkeys,
3342 inner_path->total_cost,
3343 inner_path_rows,
3344 inner_path->pathtarget->width,
3345 0.0,
3346 work_mem,
3347 -1.0);
3348 startup_cost += sort_path.startup_cost;
3349 startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3350 * innerstartsel;
3351 inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
3352 * (innerendsel - innerstartsel);
3354 else
3356 startup_cost += inner_path->startup_cost;
3357 startup_cost += (inner_path->total_cost - inner_path->startup_cost)
3358 * innerstartsel;
3359 inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
3360 * (innerendsel - innerstartsel);
3364 * We can't yet determine whether rescanning occurs, or whether
3365 * materialization of the inner input should be done. The minimum
3366 * possible inner input cost, regardless of rescan and materialization
3367 * considerations, is inner_run_cost. We include that in
3368 * workspace->total_cost, but not yet in run_cost.
3371 /* CPU costs left for later */
3373 /* Public result fields */
3374 workspace->startup_cost = startup_cost;
3375 workspace->total_cost = startup_cost + run_cost + inner_run_cost;
3376 /* Save private data for final_cost_mergejoin */
3377 workspace->run_cost = run_cost;
3378 workspace->inner_run_cost = inner_run_cost;
3379 workspace->outer_rows = outer_rows;
3380 workspace->inner_rows = inner_rows;
3381 workspace->outer_skip_rows = outer_skip_rows;
3382 workspace->inner_skip_rows = inner_skip_rows;
3386 * final_cost_mergejoin
3387 * Final estimate of the cost and result size of a mergejoin path.
3389 * Unlike other costsize functions, this routine makes two actual decisions:
3390 * whether the executor will need to do mark/restore, and whether we should
3391 * materialize the inner path. It would be logically cleaner to build
3392 * separate paths testing these alternatives, but that would require repeating
3393 * most of the cost calculations, which are not all that cheap. Since the
3394 * choice will not affect output pathkeys or startup cost, only total cost,
3395 * there is no possibility of wanting to keep more than one path. So it seems
3396 * best to make the decisions here and record them in the path's
3397 * skip_mark_restore and materialize_inner fields.
3399 * Mark/restore overhead is usually required, but can be skipped if we know
3400 * that the executor need find only one match per outer tuple, and that the
3401 * mergeclauses are sufficient to identify a match.
3403 * We materialize the inner path if we need mark/restore and either the inner
3404 * path can't support mark/restore, or it's cheaper to use an interposed
3405 * Material node to handle mark/restore.
3407 * 'path' is already filled in except for the rows and cost fields and
3408 * skip_mark_restore and materialize_inner
3409 * 'workspace' is the result from initial_cost_mergejoin
3410 * 'extra' contains miscellaneous information about the join
3412 void
3413 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
3414 JoinCostWorkspace *workspace,
3415 JoinPathExtraData *extra)
3417 Path *outer_path = path->jpath.outerjoinpath;
3418 Path *inner_path = path->jpath.innerjoinpath;
3419 double inner_path_rows = inner_path->rows;
3420 List *mergeclauses = path->path_mergeclauses;
3421 List *innersortkeys = path->innersortkeys;
3422 Cost startup_cost = workspace->startup_cost;
3423 Cost run_cost = workspace->run_cost;
3424 Cost inner_run_cost = workspace->inner_run_cost;
3425 double outer_rows = workspace->outer_rows;
3426 double inner_rows = workspace->inner_rows;
3427 double outer_skip_rows = workspace->outer_skip_rows;
3428 double inner_skip_rows = workspace->inner_skip_rows;
3429 Cost cpu_per_tuple,
3430 bare_inner_cost,
3431 mat_inner_cost;
3432 QualCost merge_qual_cost;
3433 QualCost qp_qual_cost;
3434 double mergejointuples,
3435 rescannedtuples;
3436 double rescanratio;
3438 /* Protect some assumptions below that rowcounts aren't zero */
3439 if (inner_path_rows <= 0)
3440 inner_path_rows = 1;
3442 /* Mark the path with the correct row estimate */
3443 if (path->jpath.path.param_info)
3444 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3445 else
3446 path->jpath.path.rows = path->jpath.path.parent->rows;
3448 /* For partial paths, scale row estimate. */
3449 if (path->jpath.path.parallel_workers > 0)
3451 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3453 path->jpath.path.rows =
3454 clamp_row_est(path->jpath.path.rows / parallel_divisor);
3458 * We could include disable_cost in the preliminary estimate, but that
3459 * would amount to optimizing for the case where the join method is
3460 * disabled, which doesn't seem like the way to bet.
3462 if (!enable_mergejoin)
3463 startup_cost += disable_cost;
3466 * Compute cost of the mergequals and qpquals (other restriction clauses)
3467 * separately.
3469 cost_qual_eval(&merge_qual_cost, mergeclauses, root);
3470 cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3471 qp_qual_cost.startup -= merge_qual_cost.startup;
3472 qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
3475 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3476 * executor will stop scanning for matches after the first match. When
3477 * all the joinclauses are merge clauses, this means we don't ever need to
3478 * back up the merge, and so we can skip mark/restore overhead.
3480 if ((path->jpath.jointype == JOIN_SEMI ||
3481 path->jpath.jointype == JOIN_ANTI ||
3482 extra->inner_unique) &&
3483 (list_length(path->jpath.joinrestrictinfo) ==
3484 list_length(path->path_mergeclauses)))
3485 path->skip_mark_restore = true;
3486 else
3487 path->skip_mark_restore = false;
3490 * Get approx # tuples passing the mergequals. We use approx_tuple_count
3491 * here because we need an estimate done with JOIN_INNER semantics.
3493 mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
3496 * When there are equal merge keys in the outer relation, the mergejoin
3497 * must rescan any matching tuples in the inner relation. This means
3498 * re-fetching inner tuples; we have to estimate how often that happens.
3500 * For regular inner and outer joins, the number of re-fetches can be
3501 * estimated approximately as size of merge join output minus size of
3502 * inner relation. Assume that the distinct key values are 1, 2, ..., and
3503 * denote the number of values of each key in the outer relation as m1,
3504 * m2, ...; in the inner relation, n1, n2, ... Then we have
3506 * size of join = m1 * n1 + m2 * n2 + ...
3508 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ...
3509 *                            = m1 * n1 + m2 * n2 + ... - (n1 + n2 + ...)
3510 *                            = size of join - size of inner relation
3512 * This equation works correctly for outer tuples having no inner match
3513 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
3514 * are effectively subtracting those from the number of rescanned tuples,
3515 * when we should not. Can we do better without expensive selectivity
3516 * computations?
3518 * The whole issue is moot if we are working from a unique-ified outer
3519 * input, or if we know we don't need to mark/restore at all.
3521 if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
3522 rescannedtuples = 0;
3523 else
3525 rescannedtuples = mergejointuples - inner_path_rows;
3526 /* Must clamp because of possible underestimate */
3527 if (rescannedtuples < 0)
3528 rescannedtuples = 0;
3532 * We'll inflate various costs this much to account for rescanning. Note
3533 * that this is to be multiplied by something involving inner_rows, or
3534 * another number related to the portion of the inner rel we'll scan.
3536 rescanratio = 1.0 + (rescannedtuples / inner_rows);
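/*
 * (Editorial worked example, not part of costsize.c; figures are
 * hypothetical.)  With duplicate keys on both sides -- say the outer key
 * counts are m = {2, 3} and the inner key counts are n = {4, 1} -- the join
 * produces 2*4 + 3*1 = 11 rows while the inner input holds only 5, so about
 * 11 - 5 = 6 inner tuples get fetched a second (or later) time.  If
 * inner_rows = 5, rescanratio = 1.0 + 6/5 = 2.2, i.e. the inner side is
 * effectively scanned a bit more than twice.
 */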
3539 * Decide whether we want to materialize the inner input to shield it from
3540 * mark/restore and from performing re-fetches.  Our cost model for regular
3541 * re-fetches is that a re-fetch costs the same as an original fetch,
3542 * which is probably an overestimate; but on the other hand we ignore the
3543 * bookkeeping costs of mark/restore. Not clear if it's worth developing
3544 * a more refined model. So we just need to inflate the inner run cost by
3545 * rescanratio.
3547 bare_inner_cost = inner_run_cost * rescanratio;
3550 * When we interpose a Material node the re-fetch cost is assumed to be
3551 * just cpu_operator_cost per tuple, independently of the underlying
3552 * plan's cost; and we charge an extra cpu_operator_cost per original
3553 * fetch as well. Note that we're assuming the materialize node will
3554 * never spill to disk, since it only has to remember tuples back to the
3555 * last mark. (If there are a huge number of duplicates, our other cost
3556 * factors will make the path so expensive that it probably won't get
3557 * chosen anyway.) So we don't use cost_rescan here.
3559 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
3560 * of the generated Material node.
3562 mat_inner_cost = inner_run_cost +
3563 cpu_operator_cost * inner_rows * rescanratio;
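/*
 * (Editorial worked example, not part of costsize.c; assumes the default
 * cpu_operator_cost = 0.0025 and hypothetical inputs.)  Suppose
 * inner_run_cost = 100, inner_rows = 1000 and rescanratio = 2.0.  Then
 *     bare_inner_cost = 100 * 2.0                  = 200
 *     mat_inner_cost  = 100 + 0.0025 * 1000 * 2.0  = 105
 * so interposing a Material node looks clearly cheaper here; the choice
 * between the two alternatives is made just below.
 */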
3566 * If we don't need mark/restore at all, we don't need materialization.
3568 if (path->skip_mark_restore)
3569 path->materialize_inner = false;
3572 * Prefer materializing if it looks cheaper, unless the user has asked to
3573 * suppress materialization.
3575 else if (enable_material && mat_inner_cost < bare_inner_cost)
3576 path->materialize_inner = true;
3579 * Even if materializing doesn't look cheaper, we *must* do it if the
3580 * inner path is to be used directly (without sorting) and it doesn't
3581 * support mark/restore.
3583 * Since the inner side must be ordered, and only Sorts and IndexScans can
3584 * create order to begin with, and they both support mark/restore, you
3585 * might think there's no problem --- but you'd be wrong. Nestloop and
3586 * merge joins can *preserve* the order of their inputs, so they can be
3587 * selected as the input of a mergejoin, and they don't support
3588 * mark/restore at present.
3590 * We don't test the value of enable_material here, because
3591 * materialization is required for correctness in this case, and turning
3592 * it off does not entitle us to deliver an invalid plan.
3594 else if (innersortkeys == NIL &&
3595 !ExecSupportsMarkRestore(inner_path))
3596 path->materialize_inner = true;
3599 * Also, force materializing if the inner path is to be sorted and the
3600 * sort is expected to spill to disk. This is because the final merge
3601 * pass can be done on-the-fly if it doesn't have to support mark/restore.
3602 * We don't try to adjust the cost estimates for this consideration,
3603 * though.
3605 * Since materialization is a performance optimization in this case,
3606 * rather than necessary for correctness, we skip it if enable_material is
3607 * off.
3609 else if (enable_material && innersortkeys != NIL &&
3610 relation_byte_size(inner_path_rows,
3611 inner_path->pathtarget->width) >
3612 (work_mem * 1024L))
3613 path->materialize_inner = true;
3614 else
3615 path->materialize_inner = false;
3617 /* Charge the right incremental cost for the chosen case */
3618 if (path->materialize_inner)
3619 run_cost += mat_inner_cost;
3620 else
3621 run_cost += bare_inner_cost;
3623 /* CPU costs */
3626 * The number of tuple comparisons needed is approximately number of outer
3627 * rows plus number of inner rows plus number of rescanned tuples (can we
3628 * refine this?). At each one, we need to evaluate the mergejoin quals.
3630 startup_cost += merge_qual_cost.startup;
3631 startup_cost += merge_qual_cost.per_tuple *
3632 (outer_skip_rows + inner_skip_rows * rescanratio);
3633 run_cost += merge_qual_cost.per_tuple *
3634 ((outer_rows - outer_skip_rows) +
3635 (inner_rows - inner_skip_rows) * rescanratio);
3638 * For each tuple that gets through the mergejoin proper, we charge
3639 * cpu_tuple_cost plus the cost of evaluating additional restriction
3640 * clauses that are to be applied at the join. (This is pessimistic since
3641 * not all of the quals may get evaluated at each tuple.)
3643 * Note: we could adjust for SEMI/ANTI joins skipping some qual
3644 * evaluations here, but it's probably not worth the trouble.
3646 startup_cost += qp_qual_cost.startup;
3647 cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3648 run_cost += cpu_per_tuple * mergejointuples;
3650 /* tlist eval costs are paid per output row, not per tuple scanned */
3651 startup_cost += path->jpath.path.pathtarget->cost.startup;
3652 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3654 path->jpath.path.startup_cost = startup_cost;
3655 path->jpath.path.total_cost = startup_cost + run_cost;
3659 * run mergejoinscansel() with caching
3661 static MergeScanSelCache *
3662 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3664 MergeScanSelCache *cache;
3665 ListCell *lc;
3666 Selectivity leftstartsel,
3667 leftendsel,
3668 rightstartsel,
3669 rightendsel;
3670 MemoryContext oldcontext;
3672 /* Do we have this result already? */
3673 foreach(lc, rinfo->scansel_cache)
3675 cache = (MergeScanSelCache *) lfirst(lc);
3676 if (cache->opfamily == pathkey->pk_opfamily &&
3677 cache->collation == pathkey->pk_eclass->ec_collation &&
3678 cache->strategy == pathkey->pk_strategy &&
3679 cache->nulls_first == pathkey->pk_nulls_first)
3680 return cache;
3683 /* Nope, do the computation */
3684 mergejoinscansel(root,
3685 (Node *) rinfo->clause,
3686 pathkey->pk_opfamily,
3687 pathkey->pk_strategy,
3688 pathkey->pk_nulls_first,
3689 &leftstartsel,
3690 &leftendsel,
3691 &rightstartsel,
3692 &rightendsel);
3694 /* Cache the result in suitably long-lived workspace */
3695 oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3697 cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3698 cache->opfamily = pathkey->pk_opfamily;
3699 cache->collation = pathkey->pk_eclass->ec_collation;
3700 cache->strategy = pathkey->pk_strategy;
3701 cache->nulls_first = pathkey->pk_nulls_first;
3702 cache->leftstartsel = leftstartsel;
3703 cache->leftendsel = leftendsel;
3704 cache->rightstartsel = rightstartsel;
3705 cache->rightendsel = rightendsel;
3707 rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3709 MemoryContextSwitchTo(oldcontext);
3711 return cache;
3715 * initial_cost_hashjoin
3716 * Preliminary estimate of the cost of a hashjoin path.
3718 * This must quickly produce lower-bound estimates of the path's startup and
3719 * total costs. If we are unable to eliminate the proposed path from
3720 * consideration using the lower bounds, final_cost_hashjoin will be called
3721 * to obtain the final estimates.
3723 * The exact division of labor between this function and final_cost_hashjoin
3724 * is private to them, and represents a tradeoff between speed of the initial
3725 * estimate and getting a tight lower bound. We choose to not examine the
3726 * join quals here (other than by counting the number of hash clauses),
3727 * so we can't do much with CPU costs. We do assume that
3728 * ExecChooseHashTableSize is cheap enough to use here.
3730 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3731 * other data to be used by final_cost_hashjoin
3732 * 'jointype' is the type of join to be performed
3733 * 'hashclauses' is the list of joinclauses to be used as hash clauses
3734 * 'outer_path' is the outer input to the join
3735 * 'inner_path' is the inner input to the join
3736 * 'extra' contains miscellaneous information about the join
3737 * 'parallel_hash' indicates that inner_path is partial and that a shared
3738 * hash table will be built in parallel
3740 void
3741 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3742 JoinType jointype,
3743 List *hashclauses,
3744 Path *outer_path, Path *inner_path,
3745 JoinPathExtraData *extra,
3746 bool parallel_hash)
3748 Cost startup_cost = 0;
3749 Cost run_cost = 0;
3750 double outer_path_rows = outer_path->rows;
3751 double inner_path_rows = inner_path->rows;
3752 double inner_path_rows_total = inner_path_rows;
3753 int num_hashclauses = list_length(hashclauses);
3754 int numbuckets;
3755 int numbatches;
3756 int num_skew_mcvs;
3757 size_t space_allowed; /* unused */
3759 /* cost of source data */
3760 startup_cost += outer_path->startup_cost;
3761 run_cost += outer_path->total_cost - outer_path->startup_cost;
3762 startup_cost += inner_path->total_cost;
3765 * Cost of computing hash function: must do it once per input tuple. We
3766 * charge one cpu_operator_cost for each column's hash function. Also,
3767 * tack on one cpu_tuple_cost per inner row, to model the costs of
3768 * inserting the row into the hashtable.
3770 * XXX when a hashclause is more complex than a single operator, we really
3771 * should charge the extra eval costs of the left or right side, as
3772 * appropriate, here. This seems more work than it's worth at the moment.
3774 startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3775 * inner_path_rows;
3776 run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
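/*
 * (Editorial worked example, not part of costsize.c; assumes the default
 * cpu_operator_cost = 0.0025 and cpu_tuple_cost = 0.01.)  A single-clause
 * hash join over 1,000,000 inner rows and 100,000 outer rows charges
 *     startup: (0.0025 * 1 + 0.01) * 1000000 = 12500
 *     run:      0.0025 * 1 * 100000          =   250
 * for hashing alone, on top of the source-path costs added above.
 */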
3779 * If this is a parallel hash build, then the value we have for
3780 * inner_rows_total currently refers only to the rows returned by each
3781 * participant. For shared hash table size estimation, we need the total
3782 * number, so we need to undo the division.
3784 if (parallel_hash)
3785 inner_path_rows_total *= get_parallel_divisor(inner_path);
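/*
 * (Editorial note, not part of costsize.c; the divisor value is an
 * assumption based on the usual get_parallel_divisor() behavior.)  For
 * example, a partial inner path planned for two workers reports
 * per-participant rows; with parallel_leader_participation on, the divisor
 * works out to roughly 2.4, so an estimate of 100,000 rows per participant
 * is scaled back up to about 240,000 rows for sizing the shared hash table.
 */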
3788 * Get hash table size that executor would use for inner relation.
3790 * XXX for the moment, always assume that skew optimization will be
3791 * performed. As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
3792 * trying to determine that for sure.
3794 * XXX at some point it might be interesting to try to account for skew
3795 * optimization in the cost estimate, but for now, we don't.
3797 ExecChooseHashTableSize(inner_path_rows_total,
3798 inner_path->pathtarget->width,
3799 true, /* useskew */
3800 parallel_hash, /* try_combined_hash_mem */
3801 outer_path->parallel_workers,
3802 &space_allowed,
3803 &numbuckets,
3804 &numbatches,
3805 &num_skew_mcvs);
3808 * If inner relation is too big then we will need to "batch" the join,
3809 * which implies writing and reading most of the tuples to disk an extra
3810 * time. Charge seq_page_cost per page, since the I/O should be nice and
3811 * sequential. Writing the inner rel counts as startup cost, all the rest
3812 * as run cost.
3814 if (numbatches > 1)
3816 double outerpages = page_size(outer_path_rows,
3817 outer_path->pathtarget->width);
3818 double innerpages = page_size(inner_path_rows,
3819 inner_path->pathtarget->width);
3821 startup_cost += seq_page_cost * innerpages;
3822 run_cost += seq_page_cost * (innerpages + 2 * outerpages);
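/*
 * (Editorial worked example, not part of costsize.c; assumes the default
 * seq_page_cost = 1.0 and hypothetical page counts.)  If batching is needed
 * and the inner and outer inputs occupy about 1,000 and 4,000 pages, this
 * adds
 *     startup: 1.0 * 1000              = 1000   (write out the inner rel)
 *     run:     1.0 * (1000 + 2 * 4000) = 9000   (re-read inner, write and
 *                                                re-read outer)
 * which is why multi-batch hash joins are costed noticeably higher than
 * single-batch ones.
 */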
3825 /* CPU costs left for later */
3827 /* Public result fields */
3828 workspace->startup_cost = startup_cost;
3829 workspace->total_cost = startup_cost + run_cost;
3830 /* Save private data for final_cost_hashjoin */
3831 workspace->run_cost = run_cost;
3832 workspace->numbuckets = numbuckets;
3833 workspace->numbatches = numbatches;
3834 workspace->inner_rows_total = inner_path_rows_total;
3838 * final_cost_hashjoin
3839 * Final estimate of the cost and result size of a hashjoin path.
3841 * Note: the numbatches estimate is also saved into 'path' for use later
3843 * 'path' is already filled in except for the rows and cost fields and
3844 * num_batches
3845 * 'workspace' is the result from initial_cost_hashjoin
3846 * 'extra' contains miscellaneous information about the join
3848 void
3849 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
3850 JoinCostWorkspace *workspace,
3851 JoinPathExtraData *extra)
3853 Path *outer_path = path->jpath.outerjoinpath;
3854 Path *inner_path = path->jpath.innerjoinpath;
3855 double outer_path_rows = outer_path->rows;
3856 double inner_path_rows = inner_path->rows;
3857 double inner_path_rows_total = workspace->inner_rows_total;
3858 List *hashclauses = path->path_hashclauses;
3859 Cost startup_cost = workspace->startup_cost;
3860 Cost run_cost = workspace->run_cost;
3861 int numbuckets = workspace->numbuckets;
3862 int numbatches = workspace->numbatches;
3863 Cost cpu_per_tuple;
3864 QualCost hash_qual_cost;
3865 QualCost qp_qual_cost;
3866 double hashjointuples;
3867 double virtualbuckets;
3868 Selectivity innerbucketsize;
3869 Selectivity innermcvfreq;
3870 ListCell *hcl;
3872 /* Mark the path with the correct row estimate */
3873 if (path->jpath.path.param_info)
3874 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3875 else
3876 path->jpath.path.rows = path->jpath.path.parent->rows;
3878 /* For partial paths, scale row estimate. */
3879 if (path->jpath.path.parallel_workers > 0)
3881 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3883 path->jpath.path.rows =
3884 clamp_row_est(path->jpath.path.rows / parallel_divisor);
3888 * We could include disable_cost in the preliminary estimate, but that
3889 * would amount to optimizing for the case where the join method is
3890 * disabled, which doesn't seem like the way to bet.
3892 if (!enable_hashjoin)
3893 startup_cost += disable_cost;
3895 /* mark the path with estimated # of batches */
3896 path->num_batches = numbatches;
3898 /* store the total number of tuples (sum of partial row estimates) */
3899 path->inner_rows_total = inner_path_rows_total;
3901 /* and compute the number of "virtual" buckets in the whole join */
3902 virtualbuckets = (double) numbuckets * (double) numbatches;
3905 * Determine bucketsize fraction and MCV frequency for the inner relation.
3906 * We use the smallest bucketsize or MCV frequency estimated for any
3907 * individual hashclause; this is undoubtedly conservative.
3909 * BUT: if inner relation has been unique-ified, we can assume it's good
3910 * for hashing. This is important both because it's the right answer, and
3911 * because we avoid contaminating the cache with a value that's wrong for
3912 * non-unique-ified paths.
3914 if (IsA(inner_path, UniquePath))
3916 innerbucketsize = 1.0 / virtualbuckets;
3917 innermcvfreq = 0.0;
3919 else
3921 innerbucketsize = 1.0;
3922 innermcvfreq = 1.0;
3923 foreach(hcl, hashclauses)
3925 RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
3926 Selectivity thisbucketsize;
3927 Selectivity thismcvfreq;
3930 * First we have to figure out which side of the hashjoin clause
3931 * is the inner side.
3933 * Since we tend to visit the same clauses over and over when
3934 * planning a large query, we cache the bucket stats estimates in
3935 * the RestrictInfo node to avoid repeated lookups of statistics.
3937 if (bms_is_subset(restrictinfo->right_relids,
3938 inner_path->parent->relids))
3940 /* righthand side is inner */
3941 thisbucketsize = restrictinfo->right_bucketsize;
3942 if (thisbucketsize < 0)
3944 /* not cached yet */
3945 estimate_hash_bucket_stats(root,
3946 get_rightop(restrictinfo->clause),
3947 virtualbuckets,
3948 &restrictinfo->right_mcvfreq,
3949 &restrictinfo->right_bucketsize);
3950 thisbucketsize = restrictinfo->right_bucketsize;
3952 thismcvfreq = restrictinfo->right_mcvfreq;
3954 else
3956 Assert(bms_is_subset(restrictinfo->left_relids,
3957 inner_path->parent->relids));
3958 /* lefthand side is inner */
3959 thisbucketsize = restrictinfo->left_bucketsize;
3960 if (thisbucketsize < 0)
3962 /* not cached yet */
3963 estimate_hash_bucket_stats(root,
3964 get_leftop(restrictinfo->clause),
3965 virtualbuckets,
3966 &restrictinfo->left_mcvfreq,
3967 &restrictinfo->left_bucketsize);
3968 thisbucketsize = restrictinfo->left_bucketsize;
3970 thismcvfreq = restrictinfo->left_mcvfreq;
3973 if (innerbucketsize > thisbucketsize)
3974 innerbucketsize = thisbucketsize;
3975 if (innermcvfreq > thismcvfreq)
3976 innermcvfreq = thismcvfreq;
3981 * If the bucket holding the inner MCV would exceed hash_mem, we don't
3982 * want to hash unless there is really no other alternative, so apply
3983 * disable_cost. (The executor normally copes with excessive memory usage
3984 * by splitting batches, but obviously it cannot separate equal values
3985 * that way, so it will be unable to drive the batch size below hash_mem
3986 * when this is true.)
3988 if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
3989 inner_path->pathtarget->width) > get_hash_memory_limit())
3990 startup_cost += disable_cost;
3993 * Compute cost of the hashquals and qpquals (other restriction clauses)
3994 * separately.
3996 cost_qual_eval(&hash_qual_cost, hashclauses, root);
3997 cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3998 qp_qual_cost.startup -= hash_qual_cost.startup;
3999 qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
4001 /* CPU costs */
4003 if (path->jpath.jointype == JOIN_SEMI ||
4004 path->jpath.jointype == JOIN_ANTI ||
4005 extra->inner_unique)
4007 double outer_matched_rows;
4008 Selectivity inner_scan_frac;
4011 * With a SEMI or ANTI join, or if the innerrel is known unique, the
4012 * executor will stop after the first match.
4014 * For an outer-rel row that has at least one match, we can expect the
4015 * bucket scan to stop after a fraction 1/(match_count+1) of the
4016 * bucket's rows, if the matches are evenly distributed. Since they
4017 * probably aren't quite evenly distributed, we apply a fuzz factor of
4018 * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have
4019 * to clamp inner_scan_frac to at most 1.0; but since match_count is
4020 * at least 1, no such clamp is needed now.)
4022 outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
4023 inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
4025 startup_cost += hash_qual_cost.startup;
4026 run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
4027 clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
4030 * For unmatched outer-rel rows, the picture is quite a lot different.
4031 * In the first place, there is no reason to assume that these rows
4032 * preferentially hit heavily-populated buckets; instead assume they
4033 * are uncorrelated with the inner distribution and so they see an
4034 * average bucket size of inner_path_rows / virtualbuckets. In the
4035 * second place, it seems likely that they will have few if any exact
4036 * hash-code matches and so very few of the tuples in the bucket will
4037 * actually require eval of the hash quals. We don't have any good
4038 * way to estimate how many will, but for the moment assume that the
4039 * effective cost per bucket entry is one-tenth what it is for
4040 * matchable tuples.
4042 run_cost += hash_qual_cost.per_tuple *
4043 (outer_path_rows - outer_matched_rows) *
4044 clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
4046 /* Get # of tuples that will pass the basic join */
4047 if (path->jpath.jointype == JOIN_ANTI)
4048 hashjointuples = outer_path_rows - outer_matched_rows;
4049 else
4050 hashjointuples = outer_matched_rows;
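/*
 * (Editorial worked example, not part of costsize.c; figures are
 * hypothetical.)  Say 60% of 10,000 outer rows have at least one match
 * (outer_match_frac = 0.6) and matched rows average 3 inner matches
 * (match_count = 3).  Then
 *     outer_matched_rows = rint(10000 * 0.6) = 6000
 *     inner_scan_frac    = 2.0 / (3 + 1)     = 0.5
 * so each matched outer row is costed as probing half of its bucket (with
 * the further 0.5 discount for hash-value mismatches), while the 4,000
 * unmatched rows are charged against an average-sized bucket at one-tenth
 * of that rate, per the 0.05 factor above.
 */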
4052 else
4055 * The number of tuple comparisons needed is the number of outer
4056 * tuples times the typical number of tuples in a hash bucket, which
4057 * is the inner relation size times its bucketsize fraction. At each
4058 * one, we need to evaluate the hashjoin quals. But actually,
4059 * charging the full qual eval cost at each tuple is pessimistic,
4060 * since we don't evaluate the quals unless the hash values match
4061 * exactly. For lack of a better idea, halve the cost estimate to
4062 * allow for that.
4064 startup_cost += hash_qual_cost.startup;
4065 run_cost += hash_qual_cost.per_tuple * outer_path_rows *
4066 clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
4069 * Get approx # tuples passing the hashquals. We use
4070 * approx_tuple_count here because we need an estimate done with
4071 * JOIN_INNER semantics.
4073 hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
4077 * For each tuple that gets through the hashjoin proper, we charge
4078 * cpu_tuple_cost plus the cost of evaluating additional restriction
4079 * clauses that are to be applied at the join. (This is pessimistic since
4080 * not all of the quals may get evaluated at each tuple.)
4082 startup_cost += qp_qual_cost.startup;
4083 cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
4084 run_cost += cpu_per_tuple * hashjointuples;
4086 /* tlist eval costs are paid per output row, not per tuple scanned */
4087 startup_cost += path->jpath.path.pathtarget->cost.startup;
4088 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
4090 path->jpath.path.startup_cost = startup_cost;
4091 path->jpath.path.total_cost = startup_cost + run_cost;
4096 * cost_subplan
4097 * Figure the costs for a SubPlan (or initplan).
4099 * Note: we could dig the subplan's Plan out of the root list, but in practice
4100 * all callers have it handy already, so we make them pass it.
4102 void
4103 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
4105 QualCost sp_cost;
4107 /* Figure any cost for evaluating the testexpr */
4108 cost_qual_eval(&sp_cost,
4109 make_ands_implicit((Expr *) subplan->testexpr),
4110 root);
4112 if (subplan->useHashTable)
4115 * If we are using a hash table for the subquery outputs, then the
4116 * cost of evaluating the query is a one-time cost. We charge one
4117 * cpu_operator_cost per tuple for the work of loading the hashtable,
4118 * too.
4120 sp_cost.startup += plan->total_cost +
4121 cpu_operator_cost * plan->plan_rows;
4124 * The per-tuple costs include the cost of evaluating the lefthand
4125 * expressions, plus the cost of probing the hashtable. We already
4126 * accounted for the lefthand expressions as part of the testexpr, and
4127 * will also have counted one cpu_operator_cost for each comparison
4128 * operator. That is probably too low for the probing cost, but it's
4129 * hard to make a better estimate, so live with it for now.
4132 else
4135 * Otherwise we will be rescanning the subplan output on each
4136 * evaluation. We need to estimate how much of the output we will
4137 * actually need to scan. NOTE: this logic should agree with the
4138 * tuple_fraction estimates used by make_subplan() in
4139 * plan/subselect.c.
4141 Cost plan_run_cost = plan->total_cost - plan->startup_cost;
4143 if (subplan->subLinkType == EXISTS_SUBLINK)
4145 /* we only need to fetch 1 tuple; clamp to avoid zero divide */
4146 sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
4148 else if (subplan->subLinkType == ALL_SUBLINK ||
4149 subplan->subLinkType == ANY_SUBLINK)
4151 /* assume we need 50% of the tuples */
4152 sp_cost.per_tuple += 0.50 * plan_run_cost;
4153 /* also charge a cpu_operator_cost per row examined */
4154 sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
4156 else
4158 /* assume we need all tuples */
4159 sp_cost.per_tuple += plan_run_cost;
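/*
 * (Editorial worked example, not part of costsize.c; assumes the default
 * cpu_operator_cost = 0.0025 and hypothetical plan numbers.)  For a subplan
 * whose run cost (total minus startup) is 400 over 1,000 rows:
 *     EXISTS (...)        charges 400 / 1000 = 0.4 per call
 *     = ANY (...) / ALL   charges 0.5 * 400 + 0.5 * 1000 * 0.0025 = 201.25
 *     other sublink types charge the full 400 per call
 * mirroring the tuple_fraction assumptions in plan/subselect.c noted above.
 */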
4163 * Also account for subplan's startup cost. If the subplan is
4164 * uncorrelated or undirect correlated, AND its topmost node is one
4165 * that materializes its output, assume that we'll only need to pay
4166 * its startup cost once; otherwise assume we pay the startup cost
4167 * every time.
4169 if (subplan->parParam == NIL &&
4170 ExecMaterializesOutput(nodeTag(plan)))
4171 sp_cost.startup += plan->startup_cost;
4172 else
4173 sp_cost.per_tuple += plan->startup_cost;
4176 subplan->startup_cost = sp_cost.startup;
4177 subplan->per_call_cost = sp_cost.per_tuple;
4182 * cost_rescan
4183 * Given a finished Path, estimate the costs of rescanning it after
4184 * having done so the first time. For some Path types a rescan is
4185 * cheaper than an original scan (if no parameters change), and this
4186 * function embodies knowledge about that. The default is to return
4187 * the same costs stored in the Path. (Note that the cost estimates
4188 * actually stored in Paths are always for first scans.)
4190 * This function is not currently intended to model effects such as rescans
4191 * being cheaper due to disk block caching; what we are concerned with is
4192 * plan types wherein the executor caches results explicitly, or doesn't
4193 * redo startup calculations, etc.
4195 static void
4196 cost_rescan(PlannerInfo *root, Path *path,
4197 Cost *rescan_startup_cost, /* output parameters */
4198 Cost *rescan_total_cost)
4200 switch (path->pathtype)
4202 case T_FunctionScan:
4205 * Currently, nodeFunctionscan.c always executes the function to
4206 * completion before returning any rows, and caches the results in
4207 * a tuplestore. So the function eval cost is all startup cost
4208 * and isn't paid over again on rescans. However, all run costs
4209 * will be paid over again.
4211 *rescan_startup_cost = 0;
4212 *rescan_total_cost = path->total_cost - path->startup_cost;
4213 break;
4214 case T_HashJoin:
4217 * If it's a single-batch join, we don't need to rebuild the hash
4218 * table during a rescan.
4220 if (((HashPath *) path)->num_batches == 1)
4222 /* Startup cost is exactly the cost of hash table building */
4223 *rescan_startup_cost = 0;
4224 *rescan_total_cost = path->total_cost - path->startup_cost;
4226 else
4228 /* Otherwise, no special treatment */
4229 *rescan_startup_cost = path->startup_cost;
4230 *rescan_total_cost = path->total_cost;
4232 break;
4233 case T_CteScan:
4234 case T_WorkTableScan:
4237 * These plan types materialize their final result in a
4238 * tuplestore or tuplesort object. So the rescan cost is only
4239 * cpu_tuple_cost per tuple, unless the result is large enough
4240 * to spill to disk.
4242 Cost run_cost = cpu_tuple_cost * path->rows;
4243 double nbytes = relation_byte_size(path->rows,
4244 path->pathtarget->width);
4245 long work_mem_bytes = work_mem * 1024L;
4247 if (nbytes > work_mem_bytes)
4249 /* It will spill, so account for re-read cost */
4250 double npages = ceil(nbytes / BLCKSZ);
4252 run_cost += seq_page_cost * npages;
4254 *rescan_startup_cost = 0;
4255 *rescan_total_cost = run_cost;
4257 break;
4258 case T_Material:
4259 case T_Sort:
4262 * These plan types not only materialize their results, but do
4263 * not implement qual filtering or projection. So they are
4264 * even cheaper to rescan than the ones above. We charge only
4265 * cpu_operator_cost per tuple. (Note: keep that in sync with
4266 * the run_cost charge in cost_sort, and also see comments in
4267 * cost_material before you change it.)
4269 Cost run_cost = cpu_operator_cost * path->rows;
4270 double nbytes = relation_byte_size(path->rows,
4271 path->pathtarget->width);
4272 long work_mem_bytes = work_mem * 1024L;
4274 if (nbytes > work_mem_bytes)
4276 /* It will spill, so account for re-read cost */
4277 double npages = ceil(nbytes / BLCKSZ);
4279 run_cost += seq_page_cost * npages;
4281 *rescan_startup_cost = 0;
4282 *rescan_total_cost = run_cost;
4284 break;
4285 case T_Memoize:
4286 /* All the hard work is done by cost_memoize_rescan */
4287 cost_memoize_rescan(root, (MemoizePath *) path,
4288 rescan_startup_cost, rescan_total_cost);
4289 break;
4290 default:
4291 *rescan_startup_cost = path->startup_cost;
4292 *rescan_total_cost = path->total_cost;
4293 break;
4299 * cost_qual_eval
4300 * Estimate the CPU costs of evaluating a WHERE clause.
4301 * The input can be either an implicitly-ANDed list of boolean
4302 * expressions, or a list of RestrictInfo nodes. (The latter is
4303 * preferred since it allows caching of the results.)
4304 * The result includes both a one-time (startup) component,
4305 * and a per-evaluation component.
4307 void
4308 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
4310 cost_qual_eval_context context;
4311 ListCell *l;
4313 context.root = root;
4314 context.total.startup = 0;
4315 context.total.per_tuple = 0;
4317 /* We don't charge any cost for the implicit ANDing at top level ... */
4319 foreach(l, quals)
4321 Node *qual = (Node *) lfirst(l);
4323 cost_qual_eval_walker(qual, &context);
4326 *cost = context.total;
4330 * cost_qual_eval_node
4331 * As above, for a single RestrictInfo or expression.
4333 void
4334 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
4336 cost_qual_eval_context context;
4338 context.root = root;
4339 context.total.startup = 0;
4340 context.total.per_tuple = 0;
4342 cost_qual_eval_walker(qual, &context);
4344 *cost = context.total;
4347 static bool
4348 cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
4350 if (node == NULL)
4351 return false;
4354 * RestrictInfo nodes contain an eval_cost field reserved for this
4355 * routine's use, so that it's not necessary to evaluate the qual clause's
4356 * cost more than once. If the clause's cost hasn't been computed yet,
4357 * the field's startup value will contain -1.
4359 if (IsA(node, RestrictInfo))
4361 RestrictInfo *rinfo = (RestrictInfo *) node;
4363 if (rinfo->eval_cost.startup < 0)
4365 cost_qual_eval_context locContext;
4367 locContext.root = context->root;
4368 locContext.total.startup = 0;
4369 locContext.total.per_tuple = 0;
4372 * For an OR clause, recurse into the marked-up tree so that we
4373 * set the eval_cost for contained RestrictInfos too.
4375 if (rinfo->orclause)
4376 cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
4377 else
4378 cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
4381 * If the RestrictInfo is marked pseudoconstant, it will be tested
4382 * only once, so treat its cost as all startup cost.
4384 if (rinfo->pseudoconstant)
4386 /* count one execution during startup */
4387 locContext.total.startup += locContext.total.per_tuple;
4388 locContext.total.per_tuple = 0;
4390 rinfo->eval_cost = locContext.total;
4392 context->total.startup += rinfo->eval_cost.startup;
4393 context->total.per_tuple += rinfo->eval_cost.per_tuple;
4394 /* do NOT recurse into children */
4395 return false;
4399 * For each operator or function node in the given tree, we charge the
4400 * estimated execution cost given by pg_proc.procost (remember to multiply
4401 * this by cpu_operator_cost).
4403 * Vars and Consts are charged zero, and so are boolean operators (AND,
4404 * OR, NOT). Simplistic, but a lot better than no model at all.
4406 * Should we try to account for the possibility of short-circuit
4407 * evaluation of AND/OR? Probably *not*, because that would make the
4408 * results depend on the clause ordering, and we are not in any position
4409 * to expect that the current ordering of the clauses is the one that's
4410 * going to end up being used. The above per-RestrictInfo caching would
4411 * not mix well with trying to re-order clauses anyway.
4413 * Another issue that is entirely ignored here is that if a set-returning
4414 * function is below top level in the tree, the functions/operators above
4415 * it will need to be evaluated multiple times. In practical use, such
4416 * cases arise so seldom as to not be worth the added complexity needed;
4417 * moreover, since our rowcount estimates for functions tend to be pretty
4418 * phony, the results would also be pretty phony.
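/*
 * (Editorial worked example, not part of costsize.c; assumes the built-in
 * functions involved have the default pg_proc.procost = 1 and the default
 * cpu_operator_cost = 0.0025.)  A qual such as
 *     lower(name) = 'foo' AND id < 1000
 * contains one function call and two operators; each is charged
 * procost * cpu_operator_cost, so the per-tuple estimate is about
 * 3 * 0.0025 = 0.0075, while the Vars, Consts and the implicit AND add
 * nothing.
 */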
4420 if (IsA(node, FuncExpr))
4422 add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
4423 &context->total);
4425 else if (IsA(node, OpExpr) ||
4426 IsA(node, DistinctExpr) ||
4427 IsA(node, NullIfExpr))
4429 /* rely on struct equivalence to treat these all alike */
4430 set_opfuncid((OpExpr *) node);
4431 add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
4432 &context->total);
4434 else if (IsA(node, ScalarArrayOpExpr))
4436 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
4437 Node *arraynode = (Node *) lsecond(saop->args);
4438 QualCost sacosts;
4439 QualCost hcosts;
4440 int estarraylen = estimate_array_length(arraynode);
4442 set_sa_opfuncid(saop);
4443 sacosts.startup = sacosts.per_tuple = 0;
4444 add_function_cost(context->root, saop->opfuncid, NULL,
4445 &sacosts);
4447 if (OidIsValid(saop->hashfuncid))
4449 /* Handle costs for hashed ScalarArrayOpExpr */
4450 hcosts.startup = hcosts.per_tuple = 0;
4452 add_function_cost(context->root, saop->hashfuncid, NULL, &hcosts);
4453 context->total.startup += sacosts.startup + hcosts.startup;
4455 /* Estimate the cost of building the hashtable. */
4456 context->total.startup += estarraylen * hcosts.per_tuple;
4459 * XXX should we charge a little bit for sacosts.per_tuple when
4460 * building the table, or is it ok to assume there will be zero
4461 * hash collisions?

4465 * Charge for hashtable lookups. Charge a single hash and a
4466 * single comparison.
4468 context->total.per_tuple += hcosts.per_tuple + sacosts.per_tuple;
4470 else
4473 * Estimate that the operator will be applied to about half of the
4474 * array elements before the answer is determined.
4476 context->total.startup += sacosts.startup;
4477 context->total.per_tuple += sacosts.per_tuple *
4478 estimate_array_length(arraynode) * 0.5;
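/*
 * (Editorial worked example, not part of costsize.c; assumes the default
 * cpu_operator_cost = 0.0025 and default procost for the operator and hash
 * function.)  For "x = ANY (ARRAY[...])" with a 10-element constant array,
 * the unhashed branch above charges roughly 10 * 0.5 * 0.0025 = 0.0125 per
 * tuple (half the array, on average), while the hashed branch charges about
 * one hash plus one comparison per tuple (~0.005) after a startup charge of
 * roughly 10 * 0.0025 = 0.025 for building the in-memory hash table.
 */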
4481 else if (IsA(node, Aggref) ||
4482 IsA(node, WindowFunc))
4485 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
4486 * ie, zero execution cost in the current model, because they behave
4487 * essentially like Vars at execution. We disregard the costs of
4488 * their input expressions for the same reason. The actual execution
4489 * costs of the aggregate/window functions and their arguments have to
4490 * be factored into plan-node-specific costing of the Agg or WindowAgg
4491 * plan node.
4493 return false; /* don't recurse into children */
4495 else if (IsA(node, CoerceViaIO))
4497 CoerceViaIO *iocoerce = (CoerceViaIO *) node;
4498 Oid iofunc;
4499 Oid typioparam;
4500 bool typisvarlena;
4502 /* check the result type's input function */
4503 getTypeInputInfo(iocoerce->resulttype,
4504 &iofunc, &typioparam);
4505 add_function_cost(context->root, iofunc, NULL,
4506 &context->total);
4507 /* check the input type's output function */
4508 getTypeOutputInfo(exprType((Node *) iocoerce->arg),
4509 &iofunc, &typisvarlena);
4510 add_function_cost(context->root, iofunc, NULL,
4511 &context->total);
4513 else if (IsA(node, ArrayCoerceExpr))
4515 ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
4516 QualCost perelemcost;
4518 cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
4519 context->root);
4520 context->total.startup += perelemcost.startup;
4521 if (perelemcost.per_tuple > 0)
4522 context->total.per_tuple += perelemcost.per_tuple *
4523 estimate_array_length((Node *) acoerce->arg);
4525 else if (IsA(node, RowCompareExpr))
4527 /* Conservatively assume we will check all the columns */
4528 RowCompareExpr *rcexpr = (RowCompareExpr *) node;
4529 ListCell *lc;
4531 foreach(lc, rcexpr->opnos)
4533 Oid opid = lfirst_oid(lc);
4535 add_function_cost(context->root, get_opcode(opid), NULL,
4536 &context->total);
4539 else if (IsA(node, MinMaxExpr) ||
4540 IsA(node, SQLValueFunction) ||
4541 IsA(node, XmlExpr) ||
4542 IsA(node, CoerceToDomain) ||
4543 IsA(node, NextValueExpr))
4545 /* Treat all these as having cost 1 */
4546 context->total.per_tuple += cpu_operator_cost;
4548 else if (IsA(node, CurrentOfExpr))
4550 /* Report high cost to prevent selection of anything but TID scan */
4551 context->total.startup += disable_cost;
4553 else if (IsA(node, SubLink))
4555 /* This routine should not be applied to un-planned expressions */
4556 elog(ERROR, "cannot handle unplanned sub-select");
4558 else if (IsA(node, SubPlan))
4561 * A subplan node in an expression typically indicates that the
4562 * subplan will be executed on each evaluation, so charge accordingly.
4563 * (Sub-selects that can be executed as InitPlans have already been
4564 * removed from the expression.)
4566 SubPlan *subplan = (SubPlan *) node;
4568 context->total.startup += subplan->startup_cost;
4569 context->total.per_tuple += subplan->per_call_cost;
4572 * We don't want to recurse into the testexpr, because it was already
4573 * counted in the SubPlan node's costs. So we're done.
4575 return false;
4577 else if (IsA(node, AlternativeSubPlan))
4580 * Arbitrarily use the first alternative plan for costing. (We should
4581 * certainly only include one alternative, and we don't yet have
4582 * enough information to know which one the executor is most likely to
4583 * use.)
4585 AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
4587 return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
4588 context);
4590 else if (IsA(node, PlaceHolderVar))
4593 * A PlaceHolderVar should be given cost zero when considering general
4594 * expression evaluation costs. The expense of doing the contained
4595 * expression is charged as part of the tlist eval costs of the scan
4596 * or join where the PHV is first computed (see set_rel_width and
4597 * add_placeholders_to_joinrel). If we charged it again here, we'd be
4598 * double-counting the cost for each level of plan that the PHV
4599 * bubbles up through. Hence, return without recursing into the
4600 * phexpr.
4602 return false;
4605 /* recurse into children */
4606 return expression_tree_walker(node, cost_qual_eval_walker,
4607 (void *) context);
4611 * get_restriction_qual_cost
4612 * Compute evaluation costs of a baserel's restriction quals, plus any
4613 * movable join quals that have been pushed down to the scan.
4614 * Results are returned into *qpqual_cost.
4616 * This is a convenience subroutine that works for seqscans and other cases
4617 * where all the given quals will be evaluated the hard way. It's not useful
4618 * for cost_index(), for example, where the index machinery takes care of
4619 * some of the quals. We assume baserestrictcost was previously set by
4620 * set_baserel_size_estimates().
4622 static void
4623 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4624 ParamPathInfo *param_info,
4625 QualCost *qpqual_cost)
4627 if (param_info)
4629 /* Include costs of pushed-down clauses */
4630 cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4632 qpqual_cost->startup += baserel->baserestrictcost.startup;
4633 qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4635 else
4636 *qpqual_cost = baserel->baserestrictcost;
4641 * compute_semi_anti_join_factors
4642 * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4643 * can be expected to scan.
4645 * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4646 * inner rows as soon as it finds a match to the current outer row.
4647 * The same happens if we have detected the inner rel is unique.
4648 * We should therefore adjust some of the cost components for this effect.
4649 * This function computes some estimates needed for these adjustments.
4650 * These estimates will be the same regardless of the particular paths used
4651 * for the outer and inner relation, so we compute these once and then pass
4652 * them to all the join cost estimation functions.
4654 * Input parameters:
4655 * joinrel: join relation under consideration
4656 * outerrel: outer relation under consideration
4657 * innerrel: inner relation under consideration
4658 * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4659 * sjinfo: SpecialJoinInfo relevant to this join
4660 * restrictlist: join quals
4661 * Output parameters:
4662 * *semifactors is filled in (see pathnodes.h for field definitions)
4664 void
4665 compute_semi_anti_join_factors(PlannerInfo *root,
4666 RelOptInfo *joinrel,
4667 RelOptInfo *outerrel,
4668 RelOptInfo *innerrel,
4669 JoinType jointype,
4670 SpecialJoinInfo *sjinfo,
4671 List *restrictlist,
4672 SemiAntiJoinFactors *semifactors)
4674 Selectivity jselec;
4675 Selectivity nselec;
4676 Selectivity avgmatch;
4677 SpecialJoinInfo norm_sjinfo;
4678 List *joinquals;
4679 ListCell *l;
4682 * In an ANTI join, we must ignore clauses that are "pushed down", since
4683 * those won't affect the match logic. In a SEMI join, we do not
4684 * distinguish joinquals from "pushed down" quals, so just use the whole
4685 * restrictinfo list. For other outer join types, we should consider only
4686 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4688 if (IS_OUTER_JOIN(jointype))
4690 joinquals = NIL;
4691 foreach(l, restrictlist)
4693 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4695 if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4696 joinquals = lappend(joinquals, rinfo);
4699 else
4700 joinquals = restrictlist;
4703 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4705 jselec = clauselist_selectivity(root,
4706 joinquals,
4708 (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4709 sjinfo);
4712 * Also get the normal inner-join selectivity of the join clauses.
4714 norm_sjinfo.type = T_SpecialJoinInfo;
4715 norm_sjinfo.min_lefthand = outerrel->relids;
4716 norm_sjinfo.min_righthand = innerrel->relids;
4717 norm_sjinfo.syn_lefthand = outerrel->relids;
4718 norm_sjinfo.syn_righthand = innerrel->relids;
4719 norm_sjinfo.jointype = JOIN_INNER;
4720 /* we don't bother trying to make the remaining fields valid */
4721 norm_sjinfo.lhs_strict = false;
4722 norm_sjinfo.delay_upper_joins = false;
4723 norm_sjinfo.semi_can_btree = false;
4724 norm_sjinfo.semi_can_hash = false;
4725 norm_sjinfo.semi_operators = NIL;
4726 norm_sjinfo.semi_rhs_exprs = NIL;
4728 nselec = clauselist_selectivity(root,
4729 joinquals,
4731 JOIN_INNER,
4732 &norm_sjinfo);
4734 /* Avoid leaking a lot of ListCells */
4735 if (IS_OUTER_JOIN(jointype))
4736 list_free(joinquals);
4739 * jselec can be interpreted as the fraction of outer-rel rows that have
4740 * any matches (this is true for both SEMI and ANTI cases). And nselec is
4741 * the fraction of the Cartesian product that matches. So, the average
4742 * number of matches for each outer-rel row that has at least one match is
4743 * nselec * inner_rows / jselec.
4745 * Note: it is correct to use the inner rel's "rows" count here, even
4746 * though we might later be considering a parameterized inner path with
4747 * fewer rows. This is because we have included all the join clauses in
4748 * the selectivity estimate.
4750 if (jselec > 0) /* protect against zero divide */
4752 avgmatch = nselec * innerrel->rows / jselec;
4753 /* Clamp to sane range */
4754 avgmatch = Max(1.0, avgmatch);
4756 else
4757 avgmatch = 1.0;
4759 semifactors->outer_match_frac = jselec;
4760 semifactors->match_count = avgmatch;
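/*
 * (Editorial worked example, not part of costsize.c; figures are
 * hypothetical.)  Suppose the SEMI-join selectivity of the quals is
 * jselec = 0.2 (20% of outer rows have a match), the plain inner-join
 * selectivity is nselec = 0.001, and the inner rel has 10,000 rows.  Then
 *     avgmatch = 0.001 * 10000 / 0.2 = 50
 * inner matches per matched outer row, and the join cost estimators can
 * assume scanning stops, on average, after about 1/(50+1) of a bucket or
 * inner scan for such rows.
 */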
4764 * has_indexed_join_quals
4765 * Check whether all the joinquals of a nestloop join are used as
4766 * inner index quals.
4768 * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4769 * indexscan) that uses all the joinquals as indexquals, we can assume that an
4770 * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4771 * expensive.
4773 static bool
4774 has_indexed_join_quals(NestPath *path)
4776 JoinPath *joinpath = &path->jpath;
4777 Relids joinrelids = joinpath->path.parent->relids;
4778 Path *innerpath = joinpath->innerjoinpath;
4779 List *indexclauses;
4780 bool found_one;
4781 ListCell *lc;
4783 /* If join still has quals to evaluate, it's not fast */
4784 if (joinpath->joinrestrictinfo != NIL)
4785 return false;
4786 /* Nor if the inner path isn't parameterized at all */
4787 if (innerpath->param_info == NULL)
4788 return false;
4790 /* Find the indexclauses list for the inner scan */
4791 switch (innerpath->pathtype)
4793 case T_IndexScan:
4794 case T_IndexOnlyScan:
4795 indexclauses = ((IndexPath *) innerpath)->indexclauses;
4796 break;
4797 case T_BitmapHeapScan:
4799 /* Accept only a simple bitmap scan, not AND/OR cases */
4800 Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4802 if (IsA(bmqual, IndexPath))
4803 indexclauses = ((IndexPath *) bmqual)->indexclauses;
4804 else
4805 return false;
4806 break;
4808 default:
4811 * If it's not a simple indexscan, it probably doesn't run quickly
4812 * for zero rows out, even if it's a parameterized path using all
4813 * the joinquals.
4815 return false;
4819 * Examine the inner path's param clauses. Any that are from the outer
4820 * path must be found in the indexclauses list, either exactly or in an
4821 * equivalent form generated by equivclass.c. Also, we must find at least
4822 * one such clause, else it's a clauseless join which isn't fast.
4824 found_one = false;
4825 foreach(lc, innerpath->param_info->ppi_clauses)
4827 RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4829 if (join_clause_is_movable_into(rinfo,
4830 innerpath->parent->relids,
4831 joinrelids))
4833 if (!is_redundant_with_indexclauses(rinfo, indexclauses))
4834 return false;
4835 found_one = true;
4838 return found_one;
4843 * approx_tuple_count
4844 * Quick-and-dirty estimation of the number of join rows passing
4845 * a set of qual conditions.
4847 * The quals can be either an implicitly-ANDed list of boolean expressions,
4848 * or a list of RestrictInfo nodes (typically the latter).
4850 * We intentionally compute the selectivity under JOIN_INNER rules, even
4851 * if it's some type of outer join. This is appropriate because we are
4852 * trying to figure out how many tuples pass the initial merge or hash
4853 * join step.
4855 * This is quick-and-dirty because we bypass clauselist_selectivity, and
4856 * simply multiply the independent clause selectivities together. Now
4857 * clauselist_selectivity often can't do any better than that anyhow, but
4858 * for some situations (such as range constraints) it is smarter. However,
4859 * we can't effectively cache the results of clauselist_selectivity, whereas
4860 * the individual clause selectivities can be and are cached.
4862 * Since we are only using the results to estimate how many potential
4863 * output tuples are generated and passed through qpqual checking, it
4864 * seems OK to live with the approximation.
4866 static double
4867 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4869 double tuples;
4870 double outer_tuples = path->outerjoinpath->rows;
4871 double inner_tuples = path->innerjoinpath->rows;
4872 SpecialJoinInfo sjinfo;
4873 Selectivity selec = 1.0;
4874 ListCell *l;
4877 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4879 sjinfo.type = T_SpecialJoinInfo;
4880 sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4881 sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4882 sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4883 sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4884 sjinfo.jointype = JOIN_INNER;
4885 /* we don't bother trying to make the remaining fields valid */
4886 sjinfo.lhs_strict = false;
4887 sjinfo.delay_upper_joins = false;
4888 sjinfo.semi_can_btree = false;
4889 sjinfo.semi_can_hash = false;
4890 sjinfo.semi_operators = NIL;
4891 sjinfo.semi_rhs_exprs = NIL;
4893 /* Get the approximate selectivity */
4894 foreach(l, quals)
4896 Node *qual = (Node *) lfirst(l);
4898 /* Note that clause_selectivity will be able to cache its result */
4899 selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4902 /* Apply it to the input relation sizes */
4903 tuples = selec * outer_tuples * inner_tuples;
4905 return clamp_row_est(tuples);
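/*
 * (Editorial worked example, not part of costsize.c; figures are
 * hypothetical.)  With 1,000 outer rows, 2,000 inner rows and two join
 * clauses whose individual selectivities are 0.001 and 0.5, this returns
 *     clamp_row_est(0.001 * 0.5 * 1000 * 2000) = 1000
 * -- deliberately ignoring any cross-clause correlation that
 * clauselist_selectivity() might have detected.
 */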
4910 * set_baserel_size_estimates
4911 * Set the size estimates for the given base relation.
4913 * The rel's targetlist and restrictinfo list must have been constructed
4914 * already, and rel->tuples must be set.
4916 * We set the following fields of the rel node:
4917 * rows: the estimated number of output tuples (after applying
4918 * restriction clauses).
4919 * width: the estimated average output tuple width in bytes.
4920 * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4922 void
4923 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4925 double nrows;
4927 /* Should only be applied to base relations */
4928 Assert(rel->relid > 0);
4930 nrows = rel->tuples *
4931 clauselist_selectivity(root,
4932 rel->baserestrictinfo,
4934 JOIN_INNER,
4935 NULL);
4937 rel->rows = clamp_row_est(nrows);
4939 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4941 set_rel_width(root, rel);
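/*
 * (Editorial worked example, not part of costsize.c; figures are
 * hypothetical.)  A table with rel->tuples = 1,000,000 and a restriction
 * list whose combined selectivity comes out as 0.0003 gets
 * rel->rows = clamp_row_est(300) = 300.  Had the selectivity been
 * 0.0000001, the result would be clamped up to 1, never 0, so later
 * divisions by rel->rows stay safe.
 */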
4945 * get_parameterized_baserel_size
4946 * Make a size estimate for a parameterized scan of a base relation.
4948 * 'param_clauses' lists the additional join clauses to be used.
4950 * set_baserel_size_estimates must have been applied already.
4952 double
4953 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4954 List *param_clauses)
4956 List *allclauses;
4957 double nrows;
4960 * Estimate the number of rows returned by the parameterized scan, knowing
4961 * that it will apply all the extra join clauses as well as the rel's own
4962 * restriction clauses. Note that we force the clauses to be treated as
4963 * non-join clauses during selectivity estimation.
4965 allclauses = list_concat_copy(param_clauses, rel->baserestrictinfo);
4966 nrows = rel->tuples *
4967 clauselist_selectivity(root,
4968 allclauses,
4969 rel->relid, /* do not use 0! */
4970 JOIN_INNER,
4971 NULL);
4972 nrows = clamp_row_est(nrows);
4973 /* For safety, make sure result is not more than the base estimate */
4974 if (nrows > rel->rows)
4975 nrows = rel->rows;
4976 return nrows;
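/*
 * Illustration (editorial addition, hypothetical numbers): if the
 * unparameterized estimate is rel->rows = 500 but the extra join clauses push
 * nrows down to 3, the function returns 3; if a poor estimate instead produced
 * nrows = 700, the final clamp would return 500, never more than the base
 * estimate.
 */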
4980 * set_joinrel_size_estimates
4981 * Set the size estimates for the given join relation.
4983 * The rel's targetlist must have been constructed already, and a
4984 * restriction clause list that matches the given component rels must
4985 * be provided.
4987 * Since there is more than one way to make a joinrel for more than two
4988 * base relations, the results we get here could depend on which component
4989 * rel pair is provided. In theory we should get the same answers no matter
4990 * which pair is provided; in practice, since the selectivity estimation
4991 * routines don't handle all cases equally well, we might not. But there's
4992 * not much to be done about it. (Would it make sense to repeat the
4993 * calculations for each pair of input rels that's encountered, and somehow
4994 * average the results? Probably way more trouble than it's worth, and
4995 * anyway we must keep the rowcount estimate the same for all paths for the
4996 * joinrel.)
4998 * We set only the rows field here. The reltarget field was already set by
4999 * build_joinrel_tlist, and baserestrictcost is not used for join rels.
5001 void
5002 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
5003 RelOptInfo *outer_rel,
5004 RelOptInfo *inner_rel,
5005 SpecialJoinInfo *sjinfo,
5006 List *restrictlist)
5008 rel->rows = calc_joinrel_size_estimate(root,
5009 rel,
5010 outer_rel,
5011 inner_rel,
5012 outer_rel->rows,
5013 inner_rel->rows,
5014 sjinfo,
5015 restrictlist);
5019 * get_parameterized_joinrel_size
5020 * Make a size estimate for a parameterized scan of a join relation.
5022 * 'rel' is the joinrel under consideration.
5023 * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
5024 * produce the relations being joined.
5025 * 'sjinfo' is any SpecialJoinInfo relevant to this join.
5026 * 'restrict_clauses' lists the join clauses that need to be applied at the
5027 * join node (including any movable clauses that were moved down to this join,
5028 * and not including any movable clauses that were pushed down into the
5029 * child paths).
5031 * set_joinrel_size_estimates must have been applied already.
5033 double
5034 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
5035 Path *outer_path,
5036 Path *inner_path,
5037 SpecialJoinInfo *sjinfo,
5038 List *restrict_clauses)
5040 double nrows;
5043 * Estimate the number of rows returned by the parameterized join as the
5044 * sizes of the input paths times the selectivity of the clauses that have
5045 * ended up at this join node.
5047 * As with set_joinrel_size_estimates, the rowcount estimate could depend
5048 * on the pair of input paths provided, though ideally we'd get the same
5049 * estimate for any pair with the same parameterization.
5051 nrows = calc_joinrel_size_estimate(root,
5052 rel,
5053 outer_path->parent,
5054 inner_path->parent,
5055 outer_path->rows,
5056 inner_path->rows,
5057 sjinfo,
5058 restrict_clauses);
5059 /* For safety, make sure result is not more than the base estimate */
5060 if (nrows > rel->rows)
5061 nrows = rel->rows;
5062 return nrows;
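/*
 * Illustration (editorial addition, hypothetical numbers): suppose the
 * unparameterized joinrel was estimated at rel->rows = 10000 from 1000 x 1000
 * inputs and a combined clause selectivity of 0.01.  If a parameterized inner
 * path returns only 10 rows, the calculation above yields
 *     1000 * 10 * 0.01 = 100
 * which is below the 10000 cap and is returned as-is.
 */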
5066 * calc_joinrel_size_estimate
5067 * Workhorse for set_joinrel_size_estimates and
5068 * get_parameterized_joinrel_size.
5070 * outer_rel/inner_rel are the relations being joined, but they should be
5071 * assumed to have sizes outer_rows/inner_rows; those numbers might be less
5072 * than what rel->rows says, when we are considering parameterized paths.
5074 static double
5075 calc_joinrel_size_estimate(PlannerInfo *root,
5076 RelOptInfo *joinrel,
5077 RelOptInfo *outer_rel,
5078 RelOptInfo *inner_rel,
5079 double outer_rows,
5080 double inner_rows,
5081 SpecialJoinInfo *sjinfo,
5082 List *restrictlist_in)
5084 /* This apparently-useless variable dodges a compiler bug in VS2013: */
5085 List *restrictlist = restrictlist_in;
5086 JoinType jointype = sjinfo->jointype;
5087 Selectivity fkselec;
5088 Selectivity jselec;
5089 Selectivity pselec;
5090 double nrows;
5093 * Compute joinclause selectivity. Note that we are only considering
5094 * clauses that become restriction clauses at this join level; we are not
5095 * double-counting them because they were not considered in estimating the
5096 * sizes of the component rels.
5098 * First, see whether any of the joinclauses can be matched to known FK
5099 * constraints. If so, drop those clauses from the restrictlist, and
5100 * instead estimate their selectivity using FK semantics. (We do this
5101 * without regard to whether said clauses are local or "pushed down".
5102 * Probably, an FK-matching clause could never be seen as pushed down at
5103 * an outer join, since it would be strict and hence would be grounds for
5104 * join strength reduction.) fkselec gets the net selectivity for
5105 * FK-matching clauses, or 1.0 if there are none.
5107 fkselec = get_foreign_key_join_selectivity(root,
5108 outer_rel->relids,
5109 inner_rel->relids,
5110 sjinfo,
5111 &restrictlist);
5114 * For an outer join, we have to distinguish the selectivity of the join's
5115 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
5116 * down". For inner joins we just count them all as joinclauses.
5118 if (IS_OUTER_JOIN(jointype))
5120 List *joinquals = NIL;
5121 List *pushedquals = NIL;
5122 ListCell *l;
5124 /* Grovel through the clauses to separate into two lists */
5125 foreach(l, restrictlist)
5127 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
5129 if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
5130 pushedquals = lappend(pushedquals, rinfo);
5131 else
5132 joinquals = lappend(joinquals, rinfo);
5135 /* Get the separate selectivities */
5136 jselec = clauselist_selectivity(root,
5137 joinquals,
5138 0,
5139 jointype,
5140 sjinfo);
5141 pselec = clauselist_selectivity(root,
5142 pushedquals,
5143 0,
5144 jointype,
5145 sjinfo);
5147 /* Avoid leaking a lot of ListCells */
5148 list_free(joinquals);
5149 list_free(pushedquals);
5151 else
5153 jselec = clauselist_selectivity(root,
5154 restrictlist,
5155 0,
5156 jointype,
5157 sjinfo);
5158 pselec = 0.0; /* not used, keep compiler quiet */
5162 * Basically, we multiply size of Cartesian product by selectivity.
5164 * If we are doing an outer join, take that into account: the joinqual
5165 * selectivity has to be clamped using the knowledge that the output must
5166 * be at least as large as the non-nullable input. However, any
5167 * pushed-down quals are applied after the outer join, so their
5168 * selectivity applies fully.
5170 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
5171 * of LHS rows that have matches, and we apply that straightforwardly.
5173 switch (jointype)
5175 case JOIN_INNER:
5176 nrows = outer_rows * inner_rows * fkselec * jselec;
5177 /* pselec not used */
5178 break;
5179 case JOIN_LEFT:
5180 nrows = outer_rows * inner_rows * fkselec * jselec;
5181 if (nrows < outer_rows)
5182 nrows = outer_rows;
5183 nrows *= pselec;
5184 break;
5185 case JOIN_FULL:
5186 nrows = outer_rows * inner_rows * fkselec * jselec;
5187 if (nrows < outer_rows)
5188 nrows = outer_rows;
5189 if (nrows < inner_rows)
5190 nrows = inner_rows;
5191 nrows *= pselec;
5192 break;
5193 case JOIN_SEMI:
5194 nrows = outer_rows * fkselec * jselec;
5195 /* pselec not used */
5196 break;
5197 case JOIN_ANTI:
5198 nrows = outer_rows * (1.0 - fkselec * jselec);
5199 nrows *= pselec;
5200 break;
5201 default:
5202 /* other values not expected here */
5203 elog(ERROR, "unrecognized join type: %d", (int) jointype);
5204 nrows = 0; /* keep compiler quiet */
5205 break;
5208 return clamp_row_est(nrows);
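/*
 * Worked example (editorial addition, hypothetical numbers): for a LEFT join
 * with outer_rows = 1000, inner_rows = 100, fkselec = 1.0, jselec = 0.0005,
 * and one pushed-down qual with pselec = 0.5, the raw product
 *     1000 * 100 * 0.0005 = 50
 * is clamped up to the outer size (1000), since the output must be at least as
 * large as the non-nullable input; the pushed-down qual applies afterwards,
 * giving clamp_row_est(1000 * 0.5) = 500.
 */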
5212 * get_foreign_key_join_selectivity
5213 * Estimate join selectivity for foreign-key-related clauses.
5215 * Remove any clauses that can be matched to FK constraints from *restrictlist,
5216 * and return a substitute estimate of their selectivity. 1.0 is returned
5217 * when there are no such clauses.
5219 * The reason for treating such clauses specially is that we can get better
5220 * estimates this way than by relying on clauselist_selectivity(), especially
5221 * for multi-column FKs where that function's assumption that the clauses are
5222 * independent falls down badly. But even with single-column FKs, we may be
5223 * able to get a better answer when the pg_statistic stats are missing or out
5224 * of date.
5226 static Selectivity
5227 get_foreign_key_join_selectivity(PlannerInfo *root,
5228 Relids outer_relids,
5229 Relids inner_relids,
5230 SpecialJoinInfo *sjinfo,
5231 List **restrictlist)
5233 Selectivity fkselec = 1.0;
5234 JoinType jointype = sjinfo->jointype;
5235 List *worklist = *restrictlist;
5236 ListCell *lc;
5238 /* Consider each FK constraint that is known to match the query */
5239 foreach(lc, root->fkey_list)
5241 ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
5242 bool ref_is_outer;
5243 List *removedlist;
5244 ListCell *cell;
5247 * This FK is not relevant unless it connects a baserel on one side of
5248 * this join to a baserel on the other side.
5250 if (bms_is_member(fkinfo->con_relid, outer_relids) &&
5251 bms_is_member(fkinfo->ref_relid, inner_relids))
5252 ref_is_outer = false;
5253 else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
5254 bms_is_member(fkinfo->con_relid, inner_relids))
5255 ref_is_outer = true;
5256 else
5257 continue;
5260 * If we're dealing with a semi/anti join, and the FK's referenced
5261 * relation is on the outside, then knowledge of the FK doesn't help
5262 * us figure out what we need to know (which is the fraction of outer
5263 * rows that have matches). On the other hand, if the referenced rel
5264 * is on the inside, then all outer rows must have matches in the
5265 * referenced table (ignoring nulls). But any restriction or join
5266 * clauses that filter that table will reduce the fraction of matches.
5267 * We can account for restriction clauses, but it's too hard to guess
5268 * how many table rows would get through a join that's inside the RHS.
5269 * Hence, if either case applies, punt and ignore the FK.
5271 if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
5272 (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
5273 continue;
5276 * Modify the restrictlist by removing clauses that match the FK (and
5277 * putting them into removedlist instead). It seems unsafe to modify
5278 * the originally-passed List structure, so we make a shallow copy the
5279 * first time through.
5281 if (worklist == *restrictlist)
5282 worklist = list_copy(worklist);
5284 removedlist = NIL;
5285 foreach(cell, worklist)
5287 RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
5288 bool remove_it = false;
5289 int i;
5291 /* Drop this clause if it matches any column of the FK */
5292 for (i = 0; i < fkinfo->nkeys; i++)
5294 if (rinfo->parent_ec)
5297 * EC-derived clauses can only match by EC. It is okay to
5298 * consider any clause derived from the same EC as
5299 * matching the FK: even if equivclass.c chose to generate
5300 * a clause equating some other pair of Vars, it could
5301 * have generated one equating the FK's Vars. So for
5302 * purposes of estimation, we can act as though it did so.
5304 * Note: checking parent_ec is a bit of a cheat because
5305 * there are EC-derived clauses that don't have parent_ec
5306 * set; but such clauses must compare expressions that
5307 * aren't just Vars, so they cannot match the FK anyway.
5309 if (fkinfo->eclass[i] == rinfo->parent_ec)
5311 remove_it = true;
5312 break;
5315 else
5318 * Otherwise, see if rinfo was previously matched to FK as
5319 * a "loose" clause.
5321 if (list_member_ptr(fkinfo->rinfos[i], rinfo))
5323 remove_it = true;
5324 break;
5328 if (remove_it)
5330 worklist = foreach_delete_current(worklist, cell);
5331 removedlist = lappend(removedlist, rinfo);
5336 * If we failed to remove all the matching clauses we expected to
5337 * find, chicken out and ignore this FK; applying its selectivity
5338 * might result in double-counting. Put any clauses we did manage to
5339 * remove back into the worklist.
5341 * Since the matching clauses are known not outerjoin-delayed, they
5342 * would normally have appeared in the initial joinclause list. If we
5343 * didn't find them, there are two possibilities:
5345 * 1. If the FK match is based on an EC that is ec_has_const, it won't
5346 * have generated any join clauses at all. We discount such ECs while
5347 * checking to see if we have "all" the clauses. (Below, we'll adjust
5348 * the selectivity estimate for this case.)
5350 * 2. The clauses were matched to some other FK in a previous
5351 * iteration of this loop, and thus removed from worklist. (A likely
5352 * case is that two FKs are matched to the same EC; there will be only
5353 * one EC-derived clause in the initial list, so the first FK will
5354 * consume it.) Applying both FKs' selectivity independently risks
5355 * underestimating the join size; in particular, this would undo one
5356 * of the main things that ECs were invented for, namely to avoid
5357 * double-counting the selectivity of redundant equality conditions.
5358 * Later we might think of a reasonable way to combine the estimates,
5359 * but for now, just punt, since this is a fairly uncommon situation.
5361 if (removedlist == NIL ||
5362 list_length(removedlist) !=
5363 (fkinfo->nmatched_ec - fkinfo->nconst_ec + fkinfo->nmatched_ri))
5365 worklist = list_concat(worklist, removedlist);
5366 continue;
5370 * Finally we get to the payoff: estimate selectivity using the
5371 * knowledge that each referencing row will match exactly one row in
5372 * the referenced table.
5374 * XXX that's not true in the presence of nulls in the referencing
5375 * column(s), so in principle we should derate the estimate for those.
5376 * However (1) if there are any strict restriction clauses for the
5377 * referencing column(s) elsewhere in the query, derating here would
5378 * be double-counting the null fraction, and (2) it's not very clear
5379 * how to combine null fractions for multiple referencing columns. So
5380 * we do nothing for now about correcting for nulls.
5382 * XXX another point here is that if either side of an FK constraint
5383 * is an inheritance parent, we estimate as though the constraint
5384 * covers all its children as well. This is not an unreasonable
5385 * assumption for a referencing table, ie the user probably applied
5386 * identical constraints to all child tables (though perhaps we ought
5387 * to check that). But it's not possible to have done that for a
5388 * referenced table. Fortunately, precisely because that doesn't
5389 * work, it is uncommon in practice to have an FK referencing a parent
5390 * table. So, at least for now, disregard inheritance here.
5392 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
5395 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
5396 * referenced table is exactly the inside of the join. The join
5397 * selectivity is defined as the fraction of LHS rows that have
5398 * matches. The FK implies that every LHS row has a match *in the
5399 * referenced table*; but any restriction clauses on it will
5400 * reduce the number of matches. Hence we take the join
5401 * selectivity as equal to the selectivity of the table's
5402 * restriction clauses, which is rows / tuples; but we must guard
5403 * against tuples == 0.
5405 RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5406 double ref_tuples = Max(ref_rel->tuples, 1.0);
5408 fkselec *= ref_rel->rows / ref_tuples;
5410 else
5413 * Otherwise, selectivity is exactly 1/referenced-table-size; but
5414 * guard against tuples == 0. Note we should use the raw table
5415 * tuple count, not any estimate of its filtered or joined size.
5417 RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5418 double ref_tuples = Max(ref_rel->tuples, 1.0);
5420 fkselec *= 1.0 / ref_tuples;
5424 * If any of the FK columns participated in ec_has_const ECs, then
5425 * equivclass.c will have generated "var = const" restrictions for
5426 * each side of the join, thus reducing the sizes of both input
5427 * relations. Taking the fkselec at face value would amount to
5428 * double-counting the selectivity of the constant restriction for the
5429 * referencing Var. Hence, look for the restriction clause(s) that
5430 * were applied to the referencing Var(s), and divide out their
5431 * selectivity to correct for this.
5433 if (fkinfo->nconst_ec > 0)
5435 for (int i = 0; i < fkinfo->nkeys; i++)
5437 EquivalenceClass *ec = fkinfo->eclass[i];
5439 if (ec && ec->ec_has_const)
5441 EquivalenceMember *em = fkinfo->fk_eclass_member[i];
5442 RestrictInfo *rinfo = find_derived_clause_for_ec_member(ec,
5443 em);
5445 if (rinfo)
5447 Selectivity s0;
5449 s0 = clause_selectivity(root,
5450 (Node *) rinfo,
5451 0,
5452 jointype,
5453 sjinfo);
5454 if (s0 > 0)
5455 fkselec /= s0;
5462 *restrictlist = worklist;
5463 CLAMP_PROBABILITY(fkselec);
5464 return fkselec;
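/*
 * Illustration (editorial addition, hypothetical numbers): for a plain inner
 * join whose only join clause matches a single-column FK referencing a table
 * with 1,000,000 tuples, that clause is removed from the restrictlist and
 * fkselec = 1.0 / 1000000 is returned.  calc_joinrel_size_estimate then
 * computes outer_rows * inner_rows * fkselec, reflecting the FK guarantee that
 * each referencing row matches exactly one referenced row, even when
 * pg_statistic data for the join columns is missing or stale.
 */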
5468 * set_subquery_size_estimates
5469 * Set the size estimates for a base relation that is a subquery.
5471 * The rel's targetlist and restrictinfo list must have been constructed
5472 * already, and the Paths for the subquery must have been completed.
5473 * We look at the subquery's PlannerInfo to extract data.
5475 * We set the same fields as set_baserel_size_estimates.
5477 void
5478 set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5480 PlannerInfo *subroot = rel->subroot;
5481 RelOptInfo *sub_final_rel;
5482 ListCell *lc;
5484 /* Should only be applied to base relations that are subqueries */
5485 Assert(rel->relid > 0);
5486 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
5489 * Copy raw number of output rows from subquery. All of its paths should
5490 * have the same output rowcount, so just look at cheapest-total.
5492 sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
5493 rel->tuples = sub_final_rel->cheapest_total_path->rows;
5496 * Compute per-output-column width estimates by examining the subquery's
5497 * targetlist. For any output that is a plain Var, get the width estimate
5498 * that was made while planning the subquery. Otherwise, we leave it to
5499 * set_rel_width to fill in a datatype-based default estimate.
5501 foreach(lc, subroot->parse->targetList)
5503 TargetEntry *te = lfirst_node(TargetEntry, lc);
5504 Node *texpr = (Node *) te->expr;
5505 int32 item_width = 0;
5507 /* junk columns aren't visible to upper query */
5508 if (te->resjunk)
5509 continue;
5512 * The subquery could be an expansion of a view that's had columns
5513 * added to it since the current query was parsed, so that there are
5514 * non-junk tlist columns in it that don't correspond to any column
5515 * visible at our query level. Ignore such columns.
5517 if (te->resno < rel->min_attr || te->resno > rel->max_attr)
5518 continue;
5521 * XXX This currently doesn't work for subqueries containing set
5522 * operations, because the Vars in their tlists are bogus references
5523 * to the first leaf subquery, which wouldn't give the right answer
5524 * even if we could still get to its PlannerInfo.
5526 * Also, the subquery could be an appendrel for which all branches are
5527 * known empty due to constraint exclusion, in which case
5528 * set_append_rel_pathlist will have left the attr_widths set to zero.
5530 * In either case, we just leave the width estimate zero until
5531 * set_rel_width fixes it.
5533 if (IsA(texpr, Var) &&
5534 subroot->parse->setOperations == NULL)
5536 Var *var = (Var *) texpr;
5537 RelOptInfo *subrel = find_base_rel(subroot, var->varno);
5539 item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
5541 rel->attr_widths[te->resno - rel->min_attr] = item_width;
5544 /* Now estimate number of output rows, etc */
5545 set_baserel_size_estimates(root, rel);
5549 * set_function_size_estimates
5550 * Set the size estimates for a base relation that is a function call.
5552 * The rel's targetlist and restrictinfo list must have been constructed
5553 * already.
5555 * We set the same fields as set_baserel_size_estimates.
5557 void
5558 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5560 RangeTblEntry *rte;
5561 ListCell *lc;
5563 /* Should only be applied to base relations that are functions */
5564 Assert(rel->relid > 0);
5565 rte = planner_rt_fetch(rel->relid, root);
5566 Assert(rte->rtekind == RTE_FUNCTION);
5569 * Estimate number of rows the functions will return. The rowcount of the
5570 * node is that of the largest function result.
5572 rel->tuples = 0;
5573 foreach(lc, rte->functions)
5575 RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5576 double ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5578 if (ntup > rel->tuples)
5579 rel->tuples = ntup;
5582 /* Now estimate number of output rows, etc */
5583 set_baserel_size_estimates(root, rel);
5587 * set_tablefunc_size_estimates
5588 * Set the size estimates for a base relation that is a table function call.
5590 * The rel's targetlist and restrictinfo list must have been constructed
5591 * already.
5593 * We set the same fields as set_baserel_size_estimates.
5595 void
5596 set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5598 /* Should only be applied to base relations that are functions */
5599 Assert(rel->relid > 0);
5600 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
5602 rel->tuples = 100;
5604 /* Now estimate number of output rows, etc */
5605 set_baserel_size_estimates(root, rel);
5609 * set_values_size_estimates
5610 * Set the size estimates for a base relation that is a values list.
5612 * The rel's targetlist and restrictinfo list must have been constructed
5613 * already.
5615 * We set the same fields as set_baserel_size_estimates.
5617 void
5618 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5620 RangeTblEntry *rte;
5622 /* Should only be applied to base relations that are values lists */
5623 Assert(rel->relid > 0);
5624 rte = planner_rt_fetch(rel->relid, root);
5625 Assert(rte->rtekind == RTE_VALUES);
5628 * Estimate number of rows the values list will return. We know this
5629 * precisely based on the list length (well, barring set-returning
5630 * functions in list items, but that's a refinement not catered for
5631 * anywhere else either).
5633 rel->tuples = list_length(rte->values_lists);
5635 /* Now estimate number of output rows, etc */
5636 set_baserel_size_estimates(root, rel);
5640 * set_cte_size_estimates
5641 * Set the size estimates for a base relation that is a CTE reference.
5643 * The rel's targetlist and restrictinfo list must have been constructed
5644 * already, and we need an estimate of the number of rows returned by the CTE
5645 * (if a regular CTE) or the non-recursive term (if a self-reference).
5647 * We set the same fields as set_baserel_size_estimates.
5649 void
5650 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5652 RangeTblEntry *rte;
5654 /* Should only be applied to base relations that are CTE references */
5655 Assert(rel->relid > 0);
5656 rte = planner_rt_fetch(rel->relid, root);
5657 Assert(rte->rtekind == RTE_CTE);
5659 if (rte->self_reference)
5662 * In a self-reference, arbitrarily assume the average worktable size
5663 * is about 10 times the nonrecursive term's size.
5665 rel->tuples = 10 * cte_rows;
5667 else
5669 /* Otherwise just believe the CTE's rowcount estimate */
5670 rel->tuples = cte_rows;
5673 /* Now estimate number of output rows, etc */
5674 set_baserel_size_estimates(root, rel);
5678 * set_namedtuplestore_size_estimates
5679 * Set the size estimates for a base relation that is a tuplestore reference.
5681 * The rel's targetlist and restrictinfo list must have been constructed
5682 * already.
5684 * We set the same fields as set_baserel_size_estimates.
5686 void
5687 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5689 RangeTblEntry *rte;
5691 /* Should only be applied to base relations that are tuplestore references */
5692 Assert(rel->relid > 0);
5693 rte = planner_rt_fetch(rel->relid, root);
5694 Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5697 * Use the estimate provided by the code which is generating the named
5698 * tuplestore. In some cases, the actual number might be available; in
5699 * others the same plan will be re-used, so a "typical" value might be
5700 * estimated and used.
5702 rel->tuples = rte->enrtuples;
5703 if (rel->tuples < 0)
5704 rel->tuples = 1000;
5706 /* Now estimate number of output rows, etc */
5707 set_baserel_size_estimates(root, rel);
5711 * set_result_size_estimates
5712 * Set the size estimates for an RTE_RESULT base relation
5714 * The rel's targetlist and restrictinfo list must have been constructed
5715 * already.
5717 * We set the same fields as set_baserel_size_estimates.
5719 void
5720 set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5722 /* Should only be applied to RTE_RESULT base relations */
5723 Assert(rel->relid > 0);
5724 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
5726 /* RTE_RESULT always generates a single row, natively */
5727 rel->tuples = 1;
5729 /* Now estimate number of output rows, etc */
5730 set_baserel_size_estimates(root, rel);
5734 * set_foreign_size_estimates
5735 * Set the size estimates for a base relation that is a foreign table.
5737 * There is not a whole lot that we can do here; the foreign-data wrapper
5738 * is responsible for producing useful estimates. We can do a decent job
5739 * of estimating baserestrictcost, so we set that, and we also set up width
5740 * using what will be purely datatype-driven estimates from the targetlist.
5741 * There is no way to do anything sane with the rows value, so we just put
5742 * a default estimate and hope that the wrapper can improve on it. The
5743 * wrapper's GetForeignRelSize function will be called momentarily.
5745 * The rel's targetlist and restrictinfo list must have been constructed
5746 * already.
5748 void
5749 set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5751 /* Should only be applied to base relations */
5752 Assert(rel->relid > 0);
5754 rel->rows = 1000; /* entirely bogus default estimate */
5756 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5758 set_rel_width(root, rel);
5763 * set_rel_width
5764 * Set the estimated output width of a base relation.
5766 * The estimated output width is the sum of the per-attribute width estimates
5767 * for the actually-referenced columns, plus any PHVs or other expressions
5768 * that have to be calculated at this relation. This is the amount of data
5769 * we'd need to pass upwards in case of a sort, hash, etc.
5771 * This function also sets reltarget->cost, so it's a bit misnamed now.
5773 * NB: this works best on plain relations because it prefers to look at
5774 * real Vars. For subqueries, set_subquery_size_estimates will already have
5775 * copied up whatever per-column estimates were made within the subquery,
5776 * and for other types of rels there isn't much we can do anyway. We fall
5777 * back on (fairly stupid) datatype-based width estimates if we can't get
5778 * any better number.
5780 * The per-attribute width estimates are cached for possible re-use while
5781 * building join relations or post-scan/join pathtargets.
5783 static void
5784 set_rel_width(PlannerInfo *root, RelOptInfo *rel)
5786 Oid reloid = planner_rt_fetch(rel->relid, root)->relid;
5787 int32 tuple_width = 0;
5788 bool have_wholerow_var = false;
5789 ListCell *lc;
5791 /* Vars are assumed to have cost zero, but other exprs do not */
5792 rel->reltarget->cost.startup = 0;
5793 rel->reltarget->cost.per_tuple = 0;
5795 foreach(lc, rel->reltarget->exprs)
5797 Node *node = (Node *) lfirst(lc);
5800 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
5801 * but there are corner cases involving LATERAL references where that
5802 * isn't so. If the Var has the wrong varno, fall through to the
5803 * generic case (it doesn't seem worth the trouble to be any smarter).
5805 if (IsA(node, Var) &&
5806 ((Var *) node)->varno == rel->relid)
5808 Var *var = (Var *) node;
5809 int ndx;
5810 int32 item_width;
5812 Assert(var->varattno >= rel->min_attr);
5813 Assert(var->varattno <= rel->max_attr);
5815 ndx = var->varattno - rel->min_attr;
5818 * If it's a whole-row Var, we'll deal with it below after we have
5819 * already cached as many attr widths as possible.
5821 if (var->varattno == 0)
5823 have_wholerow_var = true;
5824 continue;
5828 * The width may have been cached already (especially if it's a
5829 * subquery), so don't duplicate effort.
5831 if (rel->attr_widths[ndx] > 0)
5833 tuple_width += rel->attr_widths[ndx];
5834 continue;
5837 /* Try to get column width from statistics */
5838 if (reloid != InvalidOid && var->varattno > 0)
5840 item_width = get_attavgwidth(reloid, var->varattno);
5841 if (item_width > 0)
5843 rel->attr_widths[ndx] = item_width;
5844 tuple_width += item_width;
5845 continue;
5850 * Not a plain relation, or can't find statistics for it. Estimate
5851 * using just the type info.
5853 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5854 Assert(item_width > 0);
5855 rel->attr_widths[ndx] = item_width;
5856 tuple_width += item_width;
5858 else if (IsA(node, PlaceHolderVar))
5861 * We will need to evaluate the PHV's contained expression while
5862 * scanning this rel, so be sure to include it in reltarget->cost.
5864 PlaceHolderVar *phv = (PlaceHolderVar *) node;
5865 PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
5866 QualCost cost;
5868 tuple_width += phinfo->ph_width;
5869 cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
5870 rel->reltarget->cost.startup += cost.startup;
5871 rel->reltarget->cost.per_tuple += cost.per_tuple;
5873 else
5876 * We could be looking at an expression pulled up from a subquery,
5877 * or a ROW() representing a whole-row child Var, etc. Do what we
5878 * can using the expression type information.
5880 int32 item_width;
5881 QualCost cost;
5883 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5884 Assert(item_width > 0);
5885 tuple_width += item_width;
5886 /* Not entirely clear if we need to account for cost, but do so */
5887 cost_qual_eval_node(&cost, node, root);
5888 rel->reltarget->cost.startup += cost.startup;
5889 rel->reltarget->cost.per_tuple += cost.per_tuple;
5894 * If we have a whole-row reference, estimate its width as the sum of
5895 * per-column widths plus heap tuple header overhead.
5897 if (have_wholerow_var)
5899 int32 wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
5901 if (reloid != InvalidOid)
5903 /* Real relation, so estimate true tuple width */
5904 wholerow_width += get_relation_data_width(reloid,
5905 rel->attr_widths - rel->min_attr);
5907 else
5909 /* Do what we can with info for a phony rel */
5910 AttrNumber i;
5912 for (i = 1; i <= rel->max_attr; i++)
5913 wholerow_width += rel->attr_widths[i - rel->min_attr];
5916 rel->attr_widths[0 - rel->min_attr] = wholerow_width;
5919 * Include the whole-row Var as part of the output tuple. Yes, that
5920 * really is what happens at runtime.
5922 tuple_width += wholerow_width;
5925 Assert(tuple_width >= 0);
5926 rel->reltarget->width = tuple_width;
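/*
 * Illustration (editorial addition, hypothetical numbers): a scan referencing
 * an int4 column (4 bytes) and a text column whose average width from
 * pg_statistic is 32 bytes gets reltarget->width = 36, and reltarget->cost
 * stays zero because plain Vars are assumed free to emit.  A whole-row Var
 * would additionally contribute MAXALIGN(SizeofHeapTupleHeader) plus the
 * estimated data width of all the table's columns.
 */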
5930 * set_pathtarget_cost_width
5931 * Set the estimated eval cost and output width of a PathTarget tlist.
5933 * As a notational convenience, returns the same PathTarget pointer passed in.
5935 * Most, though not quite all, uses of this function occur after we've run
5936 * set_rel_width() for base relations; so we can usually obtain cached width
5937 * estimates for Vars. If we can't, fall back on datatype-based width
5938 * estimates. Present early-planning uses of PathTargets don't need accurate
5939 * widths badly enough to justify going to the catalogs for better data.
5941 PathTarget *
5942 set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5944 int32 tuple_width = 0;
5945 ListCell *lc;
5947 /* Vars are assumed to have cost zero, but other exprs do not */
5948 target->cost.startup = 0;
5949 target->cost.per_tuple = 0;
5951 foreach(lc, target->exprs)
5953 Node *node = (Node *) lfirst(lc);
5955 if (IsA(node, Var))
5957 Var *var = (Var *) node;
5958 int32 item_width;
5960 /* We should not see any upper-level Vars here */
5961 Assert(var->varlevelsup == 0);
5963 /* Try to get data from RelOptInfo cache */
5964 if (!IS_SPECIAL_VARNO(var->varno) &&
5965 var->varno < root->simple_rel_array_size)
5967 RelOptInfo *rel = root->simple_rel_array[var->varno];
5969 if (rel != NULL &&
5970 var->varattno >= rel->min_attr &&
5971 var->varattno <= rel->max_attr)
5973 int ndx = var->varattno - rel->min_attr;
5975 if (rel->attr_widths[ndx] > 0)
5977 tuple_width += rel->attr_widths[ndx];
5978 continue;
5984 * No cached data available, so estimate using just the type info.
5986 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5987 Assert(item_width > 0);
5988 tuple_width += item_width;
5990 else
5993 * Handle general expressions using type info.
5995 int32 item_width;
5996 QualCost cost;
5998 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5999 Assert(item_width > 0);
6000 tuple_width += item_width;
6002 /* Account for cost, too */
6003 cost_qual_eval_node(&cost, node, root);
6004 target->cost.startup += cost.startup;
6005 target->cost.per_tuple += cost.per_tuple;
6009 Assert(tuple_width >= 0);
6010 target->width = tuple_width;
6012 return target;
6016 * relation_byte_size
6017 * Estimate the storage space in bytes for a given number of tuples
6018 * of a given width (size in bytes).
6020 static double
6021 relation_byte_size(double tuples, int width)
6023 return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
6027 * page_size
6028 * Returns an estimate of the number of pages covered by a given
6029 * number of tuples of a given width (size in bytes).
6031 static double
6032 page_size(double tuples, int width)
6034 return ceil(relation_byte_size(tuples, width) / BLCKSZ);
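/*
 * Worked example (editorial addition; assumes 8-byte MAXALIGN,
 * SizeofHeapTupleHeader = 23, and the default BLCKSZ of 8192):
 *     relation_byte_size(1000, 40) = 1000 * (40 + 24)   = 64000 bytes
 *     page_size(1000, 40)          = ceil(64000 / 8192) = 8 pages
 * Elsewhere in this file these helpers feed, for example, the comparisons of
 * estimated data volume against work_mem.
 */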
6038 * Estimate the fraction of the work that each worker will do given the
6039 * number of workers budgeted for the path.
6041 static double
6042 get_parallel_divisor(Path *path)
6044 double parallel_divisor = path->parallel_workers;
6047 * Early experience with parallel query suggests that when there is only
6048 * one worker, the leader often makes a very substantial contribution to
6049 * executing the parallel portion of the plan, but as more workers are
6050 * added, it does less and less, because it's busy reading tuples from the
6051 * workers and doing whatever non-parallel post-processing is needed. By
6052 * the time we reach 4 workers, the leader no longer makes a meaningful
6053 * contribution. Thus, for now, estimate that the leader spends 30% of
6054 * its time servicing each worker, and the remainder executing the
6055 * parallel plan.
6057 if (parallel_leader_participation)
6059 double leader_contribution;
6061 leader_contribution = 1.0 - (0.3 * path->parallel_workers);
6062 if (leader_contribution > 0)
6063 parallel_divisor += leader_contribution;
6066 return parallel_divisor;
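/*
 * Worked example (editorial addition, derived from the formula above, with
 * parallel_leader_participation on): with 2 planned workers the leader is
 * assumed to spend 2 * 30% of its time servicing workers, so it contributes
 * 0.4 of a worker and the divisor is 2.4; with 3 workers the divisor is 3.1;
 * from 4 workers upward the leader contribution is not positive, so the
 * divisor is simply the worker count.
 */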
6070 * compute_bitmap_pages
6072 * compute number of pages fetched from heap in bitmap heap scan.
6074 double
6075 compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
6076 int loop_count, Cost *cost, double *tuple)
6078 Cost indexTotalCost;
6079 Selectivity indexSelectivity;
6080 double T;
6081 double pages_fetched;
6082 double tuples_fetched;
6083 double heap_pages;
6084 long maxentries;
6087 * Fetch total cost of obtaining the bitmap, as well as its total
6088 * selectivity.
6090 cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
6093 * Estimate number of main-table pages fetched.
6095 tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
6097 T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
6100 * For a single scan, the number of heap pages that need to be fetched is
6101 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
6102 * re-reads needed).
6104 pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
6107 * Calculate the number of pages fetched from the heap. Then based on
6108 * current work_mem estimate get the estimated maxentries in the bitmap.
6109 * (Note that we always do this calculation based on the number of pages
6110 * that would be fetched in a single iteration, even if loop_count > 1.
6111 * That's correct, because only that number of entries will be stored in
6112 * the bitmap at one time.)
6114 heap_pages = Min(pages_fetched, baserel->pages);
6115 maxentries = tbm_calculate_entries(work_mem * 1024L);
6117 if (loop_count > 1)
6120 * For repeated bitmap scans, scale up the number of tuples fetched in
6121 * the Mackert and Lohman formula by the number of scans, so that we
6122 * estimate the number of pages fetched by all the scans. Then
6123 * pro-rate for one scan.
6125 pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
6126 baserel->pages,
6127 get_indexpath_pages(bitmapqual),
6128 root);
6129 pages_fetched /= loop_count;
6132 if (pages_fetched >= T)
6133 pages_fetched = T;
6134 else
6135 pages_fetched = ceil(pages_fetched);
6137 if (maxentries < heap_pages)
6139 double exact_pages;
6140 double lossy_pages;
6143 * Crude approximation of the number of lossy pages. Because of the
6144 * way tbm_lossify() is coded, the number of lossy pages increases
6145 * very sharply as soon as we run short of memory; this formula has
6146 * that property and seems to perform adequately in testing, but it's
6147 * possible we could do better somehow.
6149 lossy_pages = Max(0, heap_pages - maxentries / 2);
6150 exact_pages = heap_pages - lossy_pages;
6153 * If there are lossy pages then recompute the number of tuples
6154 * processed by the bitmap heap node. We assume here that the chance
6155 * of a given tuple coming from an exact page is the same as the
6156 * chance that a given page is exact. This might not be true, but
6157 * it's not clear how we can do any better.
6159 if (lossy_pages > 0)
6160 tuples_fetched =
6161 clamp_row_est(indexSelectivity *
6162 (exact_pages / heap_pages) * baserel->tuples +
6163 (lossy_pages / heap_pages) * baserel->tuples);
6166 if (cost)
6167 *cost = indexTotalCost;
6168 if (tuple)
6169 *tuple = tuples_fetched;
6171 return pages_fetched;
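/*
 * Worked example (editorial addition, hypothetical numbers): for a single scan
 * of a 1000-page table in which the bitmap selects 500 tuples, the
 * Mackert-Lohman style formula above gives
 *     pages_fetched = (2 * 1000 * 500) / (2 * 1000 + 500) = 400
 * If work_mem only allows maxentries = 300, the lossy-page approximation
 * estimates Max(0, 400 - 300/2) = 250 lossy pages and 150 exact pages, and
 * tuples_fetched is recomputed on the assumption that every tuple on a lossy
 * page must be examined, not just the ones the index quals select.
 */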