src/backend/optimizer/plan/analyzejoins.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * analyzejoins.c
   4  *        Routines for simplifying joins after initial query analysis
   5  *
   6  * While we do a great deal of join simplification in prep/prepjointree.c,
   7  * certain optimizations cannot be performed at that stage for lack of
   8  * detailed information about the query.  The routines here are invoked
   9  * after initsplan.c has done its work, and can do additional join removal
  10  * and simplification steps based on the information extracted.  The penalty
  11  * is that we have to work harder to clean up after ourselves when we modify
  12  * the query, since the derived data structures have to be updated too.
  13  *
  14  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
  15  * Portions Copyright (c) 1994, Regents of the University of California
  16  *
  17  *
  18  * IDENTIFICATION
  19  *        src/backend/optimizer/plan/analyzejoins.c
  20  *
  21  *-------------------------------------------------------------------------
  22  */
  23 #include "postgres.h"
  24
  25 #include "nodes/nodeFuncs.h"
  26 #include "optimizer/clauses.h"
  27 #include "optimizer/joininfo.h"
  28 #include "optimizer/optimizer.h"
  29 #include "optimizer/pathnode.h"
  30 #include "optimizer/paths.h"
  31 #include "optimizer/planmain.h"
  32 #include "optimizer/tlist.h"
  33 #include "utils/lsyscache.h"
  34
  35 /* local functions */
  36 static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
  37 static void remove_rel_from_query(PlannerInfo *root, int relid,
  38                                                                   Relids joinrelids);
  39 static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
  40 static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
  41 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
  42                                                                 List *clause_list);
  43 static Oid      distinct_col_search(int colno, List *colnos, List *opids);
  44 static bool is_innerrel_unique_for(PlannerInfo *root,
  45                                                                    Relids joinrelids,
  46                                                                    Relids outerrelids,
  47                                                                    RelOptInfo *innerrel,
  48                                                                    JoinType jointype,
  49                                                                    List *restrictlist);
  50
  51
  52 /*
  53  * remove_useless_joins
  54  *              Check for relations that don't actually need to be joined at all,
  55  *              and remove them from the query.
  56  *
  57  * We are passed the current joinlist and return the updated list.  Other
  58  * data structures that have to be updated are accessible via "root".
  59  */
  60 List *
  61 remove_useless_joins(PlannerInfo *root, List *joinlist)
  62 {
  63         ListCell   *lc;
  64
  65         /*
  66          * We are only interested in relations that are left-joined to, so we can
  67          * scan the join_info_list to find them easily.
  68          */
  69 restart:
  70         foreach(lc, root->join_info_list)
  71         {
  72                 SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
  73                 int                     innerrelid;
  74                 int                     nremoved;
  75
  76                 /* Skip if not removable */
  77                 if (!join_is_removable(root, sjinfo))
  78                         continue;
  79
  80                 /*
  81                  * Currently, join_is_removable can only succeed when the sjinfo's
  82                  * righthand is a single baserel.  Remove that rel from the query and
  83                  * joinlist.
  84                  */
  85                 innerrelid = bms_singleton_member(sjinfo->min_righthand);
  86
  87                 remove_rel_from_query(root, innerrelid,
  88                                                           bms_union(sjinfo->min_lefthand,
  89                                                                                 sjinfo->min_righthand));
  90
  91                 /* We verify that exactly one reference gets removed from joinlist */
  92                 nremoved = 0;
  93                 joinlist = remove_rel_from_joinlist(joinlist, innerrelid, &nremoved);
  94                 if (nremoved != 1)
  95                         elog(ERROR, "failed to find relation %d in joinlist", innerrelid);
  96
  97                 /*
  98                  * We can delete this SpecialJoinInfo from the list too, since it's no
  99                  * longer of interest.  (Since we'll restart the foreach loop
 100                  * immediately, we don't bother with foreach_delete_current.)
 101                  */
 102                 root->join_info_list = list_delete_cell(root->join_info_list, lc);
 103
 104                 /*
 105                  * Restart the scan.  This is necessary to ensure we find all
 106                  * removable joins independently of ordering of the join_info_list
 107                  * (note that removal of attr_needed bits may make a join appear
 108                  * removable that did not before).
 109                  */
 110                 goto restart;
 111         }
 112
 113         return joinlist;
 114 }
 115
 116 /*
 117  * clause_sides_match_join
 118  *        Determine whether a join clause is of the right form to use in this join.
 119  *
 120  * We already know that the clause is a binary opclause referencing only the
 121  * rels in the current join.  The point here is to check whether it has the
 122  * form "outerrel_expr op innerrel_expr" or "innerrel_expr op outerrel_expr",
 123  * rather than mixing outer and inner vars on either side.  If it matches,
 124  * we set the transient flag outer_is_left to identify which side is which.
 125  */
 126 static inline bool
 127 clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids,
 128                                                 Relids innerrelids)
 129 {
 130         if (bms_is_subset(rinfo->left_relids, outerrelids) &&
 131                 bms_is_subset(rinfo->right_relids, innerrelids))
 132         {
 133                 /* lefthand side is outer */
 134                 rinfo->outer_is_left = true;
 135                 return true;
 136         }
 137         else if (bms_is_subset(rinfo->left_relids, innerrelids) &&
 138                          bms_is_subset(rinfo->right_relids, outerrelids))
 139         {
 140                 /* righthand side is outer */
 141                 rinfo->outer_is_left = false;
 142                 return true;
 143         }
 144         return false;                           /* no good for these input relations */
 145 }
 146
 147 /*
 148  * join_is_removable
 149  *        Check whether we need not perform this special join at all, because
 150  *        it will just duplicate its left input.
 151  *
 152  * This is true for a left join for which the join condition cannot match
 153  * more than one inner-side row.  (There are other possibly interesting
 154  * cases, but we don't have the infrastructure to prove them.)  We also
 155  * have to check that the inner side doesn't generate any variables needed
 156  * above the join.
 157  */
 158 static bool
 159 join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 160 {
 161         int                     innerrelid;
 162         RelOptInfo *innerrel;
 163         Relids          joinrelids;
 164         List       *clause_list = NIL;
 165         ListCell   *l;
 166         int                     attroff;
 167
 168         /*
 169          * Must be a non-delaying left join to a single baserel, else we aren't
 170          * going to be able to do anything with it.
 171          */
 172         if (sjinfo->jointype != JOIN_LEFT ||
 173                 sjinfo->delay_upper_joins)
 174                 return false;
 175
 176         if (!bms_get_singleton_member(sjinfo->min_righthand, &innerrelid))
 177                 return false;
 178
 179         innerrel = find_base_rel(root, innerrelid);
 180
 181         /*
 182          * Before we go to the effort of checking whether any innerrel variables
 183          * are needed above the join, make a quick check to eliminate cases in
 184          * which we will surely be unable to prove uniqueness of the innerrel.
 185          */
 186         if (!rel_supports_distinctness(root, innerrel))
 187                 return false;
 188
 189         /* Compute the relid set for the join we are considering */
 190         joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
 191
 192         /*
 193          * We can't remove the join if any inner-rel attributes are used above the
 194          * join.
 195          *
 196          * Note that this test only detects use of inner-rel attributes in higher
 197          * join conditions and the target list.  There might be such attributes in
 198          * pushed-down conditions at this join, too.  We check that case below.
 199          *
 200          * As a micro-optimization, it seems better to start with max_attr and
 201          * count down rather than starting with min_attr and counting up, on the
 202          * theory that the system attributes are somewhat less likely to be wanted
 203          * and should be tested last.
 204          */
 205         for (attroff = innerrel->max_attr - innerrel->min_attr;
 206                  attroff >= 0;
 207                  attroff--)
 208         {
 209                 if (!bms_is_subset(innerrel->attr_needed[attroff], joinrelids))
 210                         return false;
 211         }
 212
 213         /*
 214          * Similarly check that the inner rel isn't needed by any PlaceHolderVars
 215          * that will be used above the join.  We only need to fail if such a PHV
 216          * actually references some inner-rel attributes; but the correct check
 217          * for that is relatively expensive, so we first check against ph_eval_at,
 218          * which must mention the inner rel if the PHV uses any inner-rel attrs as
 219          * non-lateral references.  Note that if the PHV's syntactic scope is just
 220          * the inner rel, we can't drop the rel even if the PHV is variable-free.
 221          */
 222         foreach(l, root->placeholder_list)
 223         {
 224                 PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l);
 225
 226                 if (bms_overlap(phinfo->ph_lateral, innerrel->relids))
 227                         return false;           /* it references innerrel laterally */
 228                 if (bms_is_subset(phinfo->ph_needed, joinrelids))
 229                         continue;                       /* PHV is not used above the join */
 230                 if (!bms_overlap(phinfo->ph_eval_at, innerrel->relids))
 231                         continue;                       /* it definitely doesn't reference innerrel */
 232                 if (bms_is_subset(phinfo->ph_eval_at, innerrel->relids))
 233                         return false;           /* there isn't any other place to eval PHV */
 234                 if (bms_overlap(pull_varnos(root, (Node *) phinfo->ph_var->phexpr),
 235                                                 innerrel->relids))
 236                         return false;           /* it does reference innerrel */
 237         }
 238
 239         /*
 240          * Search for mergejoinable clauses that constrain the inner rel against
 241          * either the outer rel or a pseudoconstant.  If an operator is
 242          * mergejoinable then it behaves like equality for some btree opclass, so
 243          * it's what we want.  The mergejoinability test also eliminates clauses
 244          * containing volatile functions, which we couldn't depend on.
 245          */
 246         foreach(l, innerrel->joininfo)
 247         {
 248                 RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
 249
 250                 /*
 251                  * If it's not a join clause for this outer join, we can't use it.
 252                  * Note that if the clause is pushed-down, then it is logically from
 253                  * above the outer join, even if it references no other rels (it might
 254                  * be from WHERE, for example).
 255                  */
 256                 if (RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids))
 257                 {
 258                         /*
 259                          * If such a clause actually references the inner rel then join
 260                          * removal has to be disallowed.  We have to check this despite
 261                          * the previous attr_needed checks because of the possibility of
 262                          * pushed-down clauses referencing the rel.
 263                          */
 264                         if (bms_is_member(innerrelid, restrictinfo->clause_relids))
 265                                 return false;
 266                         continue;                       /* else, ignore; not useful here */
 267                 }
 268
 269                 /* Ignore if it's not a mergejoinable clause */
 270                 if (!restrictinfo->can_join ||
 271                         restrictinfo->mergeopfamilies == NIL)
 272                         continue;                       /* not mergejoinable */
 273
 274                 /*
 275                  * Check if clause has the form "outer op inner" or "inner op outer",
 276                  * and if so mark which side is inner.
 277                  */
 278                 if (!clause_sides_match_join(restrictinfo, sjinfo->min_lefthand,
 279                                                                          innerrel->relids))
 280                         continue;                       /* no good for these input relations */
 281
 282                 /* OK, add to list */
 283                 clause_list = lappend(clause_list, restrictinfo);
 284         }
 285
 286         /*
 287          * Now that we have the relevant equality join clauses, try to prove the
 288          * innerrel distinct.
 289          */
 290         if (rel_is_distinct_for(root, innerrel, clause_list))
 291                 return true;
 292
 293         /*
 294          * Some day it would be nice to check for other methods of establishing
 295          * distinctness.
 296          */
 297         return false;
 298 }
 299
 300
 301 /*
 302  * Remove the target relid from the planner's data structures, having
 303  * determined that there is no need to include it in the query.
 304  *
 305  * We are not terribly thorough here.  We must make sure that the rel is
 306  * no longer treated as a baserel, and that attributes of other baserels
 307  * are no longer marked as being needed at joins involving this rel.
 308  * Also, join quals involving the rel have to be removed from the joininfo
 309  * lists, but only if they belong to the outer join identified by joinrelids.
 310  */
 311 static void
 312 remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids)
 313 {
 314         RelOptInfo *rel = find_base_rel(root, relid);
 315         List       *joininfos;
 316         Index           rti;
 317         ListCell   *l;
 318
 319         /*
 320          * Mark the rel as "dead" to show it is no longer part of the join tree.
 321          * (Removing it from the baserel array altogether seems too risky.)
 322          */
 323         rel->reloptkind = RELOPT_DEADREL;
 324
 325         /*
 326          * Remove references to the rel from other baserels' attr_needed arrays.
 327          */
 328         for (rti = 1; rti < root->simple_rel_array_size; rti++)
 329         {
 330                 RelOptInfo *otherrel = root->simple_rel_array[rti];
 331                 int                     attroff;
 332
 333                 /* there may be empty slots corresponding to non-baserel RTEs */
 334                 if (otherrel == NULL)
 335                         continue;
 336
 337                 Assert(otherrel->relid == rti); /* sanity check on array */
 338
 339                 /* no point in processing target rel itself */
 340                 if (otherrel == rel)
 341                         continue;
 342
 343                 for (attroff = otherrel->max_attr - otherrel->min_attr;
 344                          attroff >= 0;
 345                          attroff--)
 346                 {
 347                         otherrel->attr_needed[attroff] =
 348                                 bms_del_member(otherrel->attr_needed[attroff], relid);
 349                 }
 350         }
 351
 352         /*
 353          * Likewise remove references from SpecialJoinInfo data structures.
 354          *
 355          * This is relevant in case the outer join we're deleting is nested inside
 356          * other outer joins: the upper joins' relid sets have to be adjusted. The
 357          * RHS of the target outer join will be made empty here, but that's OK
 358          * since caller will delete that SpecialJoinInfo entirely.
 359          */
 360         foreach(l, root->join_info_list)
 361         {
 362                 SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l);
 363
 364                 sjinfo->min_lefthand = bms_del_member(sjinfo->min_lefthand, relid);
 365                 sjinfo->min_righthand = bms_del_member(sjinfo->min_righthand, relid);
 366                 sjinfo->syn_lefthand = bms_del_member(sjinfo->syn_lefthand, relid);
 367                 sjinfo->syn_righthand = bms_del_member(sjinfo->syn_righthand, relid);
 368         }
 369
 370         /*
 371          * Likewise remove references from PlaceHolderVar data structures,
 372          * removing any no-longer-needed placeholders entirely.
 373          *
 374          * Removal is a bit trickier than it might seem: we can remove PHVs that
 375          * are used at the target rel and/or in the join qual, but not those that
 376          * are used at join partner rels or above the join.  It's not that easy to
 377          * distinguish PHVs used at partner rels from those used in the join qual,
 378          * since they will both have ph_needed sets that are subsets of
 379          * joinrelids.  However, a PHV used at a partner rel could not have the
 380          * target rel in ph_eval_at, so we check that while deciding whether to
 381          * remove or just update the PHV.  There is no corresponding test in
 382          * join_is_removable because it doesn't need to distinguish those cases.
 383          */
 384         foreach(l, root->placeholder_list)
 385         {
 386                 PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l);
 387
 388                 Assert(!bms_is_member(relid, phinfo->ph_lateral));
 389                 if (bms_is_subset(phinfo->ph_needed, joinrelids) &&
 390                         bms_is_member(relid, phinfo->ph_eval_at))
 391                         root->placeholder_list = foreach_delete_current(root->placeholder_list,
 392                                                                                                                         l);
 393                 else
 394                 {
 395                         phinfo->ph_eval_at = bms_del_member(phinfo->ph_eval_at, relid);
 396                         Assert(!bms_is_empty(phinfo->ph_eval_at));
 397                         phinfo->ph_needed = bms_del_member(phinfo->ph_needed, relid);
 398                 }
 399         }
 400
 401         /*
 402          * Remove any joinquals referencing the rel from the joininfo lists.
 403          *
 404          * In some cases, a joinqual has to be put back after deleting its
 405          * reference to the target rel.  This can occur for pseudoconstant and
 406          * outerjoin-delayed quals, which can get marked as requiring the rel in
 407          * order to force them to be evaluated at or above the join.  We can't
 408          * just discard them, though.  Only quals that logically belonged to the
 409          * outer join being discarded should be removed from the query.
 410          *
 411          * We must make a copy of the rel's old joininfo list before starting the
 412          * loop, because otherwise remove_join_clause_from_rels would destroy the
 413          * list while we're scanning it.
 414          */
 415         joininfos = list_copy(rel->joininfo);
 416         foreach(l, joininfos)
 417         {
 418                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
 419
 420                 remove_join_clause_from_rels(root, rinfo, rinfo->required_relids);
 421
 422                 if (RINFO_IS_PUSHED_DOWN(rinfo, joinrelids))
 423                 {
 424                         /* Recheck that qual doesn't actually reference the target rel */
 425                         Assert(!bms_is_member(relid, rinfo->clause_relids));
 426
 427                         /*
 428                          * The required_relids probably aren't shared with anything else,
 429                          * but let's copy them just to be sure.
 430                          */
 431                         rinfo->required_relids = bms_copy(rinfo->required_relids);
 432                         rinfo->required_relids = bms_del_member(rinfo->required_relids,
 433                                                                                                         relid);
 434                         distribute_restrictinfo_to_rels(root, rinfo);
 435                 }
 436         }
 437
 438         /*
 439          * There may be references to the rel in root->fkey_list, but if so,
 440          * match_foreign_keys_to_quals() will get rid of them.
 441          */
 442 }
 443
 444 /*
 445  * Remove any occurrences of the target relid from a joinlist structure.
 446  *
 447  * It's easiest to build a whole new list structure, so we handle it that
 448  * way.  Efficiency is not a big deal here.
 449  *
 450  * *nremoved is incremented by the number of occurrences removed (there
 451  * should be exactly one, but the caller checks that).
 452  */
 453 static List *
 454 remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved)
 455 {
 456         List       *result = NIL;
 457         ListCell   *jl;
 458
 459         foreach(jl, joinlist)
 460         {
 461                 Node       *jlnode = (Node *) lfirst(jl);
 462
 463                 if (IsA(jlnode, RangeTblRef))
 464                 {
 465                         int                     varno = ((RangeTblRef *) jlnode)->rtindex;
 466
 467                         if (varno == relid)
 468                                 (*nremoved)++;
 469                         else
 470                                 result = lappend(result, jlnode);
 471                 }
 472                 else if (IsA(jlnode, List))
 473                 {
 474                         /* Recurse to handle subproblem */
 475                         List       *sublist;
 476
 477                         sublist = remove_rel_from_joinlist((List *) jlnode,
 478                                                                                            relid, nremoved);
 479                         /* Avoid including empty sub-lists in the result */
 480                         if (sublist)
 481                                 result = lappend(result, sublist);
 482                 }
 483                 else
 484                 {
 485                         elog(ERROR, "unrecognized joinlist node type: %d",
 486                                  (int) nodeTag(jlnode));
 487                 }
 488         }
 489
 490         return result;
 491 }
 492
 493
 494 /*
 495  * reduce_unique_semijoins
 496  *              Check for semijoins that can be simplified to plain inner joins
 497  *              because the inner relation is provably unique for the join clauses.
 498  *
 499  * Ideally this would happen during reduce_outer_joins, but we don't have
 500  * enough information at that point.
 501  *
 502  * To perform the strength reduction when applicable, we need only delete
 503  * the semijoin's SpecialJoinInfo from root->join_info_list.  (We don't
 504  * bother fixing the join type attributed to it in the query jointree,
 505  * since that won't be consulted again.)
 506  */
 507 void
 508 reduce_unique_semijoins(PlannerInfo *root)
 509 {
 510         ListCell   *lc;
 511
 512         /*
 513          * Scan the join_info_list to find semijoins.
 514          */
 515         foreach(lc, root->join_info_list)
 516         {
 517                 SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
 518                 int                     innerrelid;
 519                 RelOptInfo *innerrel;
 520                 Relids          joinrelids;
 521                 List       *restrictlist;
 522
 523                 /*
 524                  * Must be a non-delaying semijoin to a single baserel, else we aren't
 525                  * going to be able to do anything with it.  (It's probably not
 526                  * possible for delay_upper_joins to be set on a semijoin, but we
 527                  * might as well check.)
 528                  */
 529                 if (sjinfo->jointype != JOIN_SEMI ||
 530                         sjinfo->delay_upper_joins)
 531                         continue;
 532
 533                 if (!bms_get_singleton_member(sjinfo->min_righthand, &innerrelid))
 534                         continue;
 535
 536                 innerrel = find_base_rel(root, innerrelid);
 537
 538                 /*
 539                  * Before we trouble to run generate_join_implied_equalities, make a
 540                  * quick check to eliminate cases in which we will surely be unable to
 541                  * prove uniqueness of the innerrel.
 542                  */
 543                 if (!rel_supports_distinctness(root, innerrel))
 544                         continue;
 545
 546                 /* Compute the relid set for the join we are considering */
 547                 joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
 548
 549                 /*
 550                  * Since we're only considering a single-rel RHS, any join clauses it
 551                  * has must be clauses linking it to the semijoin's min_lefthand.  We
 552                  * can also consider EC-derived join clauses.
 553                  */
 554                 restrictlist =
 555                         list_concat(generate_join_implied_equalities(root,
 556                                                                                                                  joinrelids,
 557                                                                                                                  sjinfo->min_lefthand,
 558                                                                                                                  innerrel),
 559                                                 innerrel->joininfo);
 560
 561                 /* Test whether the innerrel is unique for those clauses. */
 562                 if (!innerrel_is_unique(root,
 563                                                                 joinrelids, sjinfo->min_lefthand, innerrel,
 564                                                                 JOIN_SEMI, restrictlist, true))
 565                         continue;
 566
 567                 /* OK, remove the SpecialJoinInfo from the list. */
 568                 root->join_info_list = foreach_delete_current(root->join_info_list, lc);
 569         }
 570 }
 571
 572
 573 /*
 574  * rel_supports_distinctness
 575  *              Could the relation possibly be proven distinct on some set of columns?
 576  *
 577  * This is effectively a pre-checking function for rel_is_distinct_for().
 578  * It must return true if rel_is_distinct_for() could possibly return true
 579  * with this rel, but it should not expend a lot of cycles.  The idea is
 580  * that callers can avoid doing possibly-expensive processing to compute
 581  * rel_is_distinct_for()'s argument lists if the call could not possibly
 582  * succeed.
 583  */
 584 static bool
 585 rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 586 {
 587         /* We only know about baserels ... */
 588         if (rel->reloptkind != RELOPT_BASEREL)
 589                 return false;
 590         if (rel->rtekind == RTE_RELATION)
 591         {
 592                 /*
 593                  * For a plain relation, we only know how to prove uniqueness by
 594                  * reference to unique indexes.  Make sure there's at least one
 595                  * suitable unique index.  It must be immediately enforced, and if
 596                  * it's a partial index, it must match the query.  (Keep these
 597                  * conditions in sync with relation_has_unique_index_for!)
 598                  */
 599                 ListCell   *lc;
 600
 601                 foreach(lc, rel->indexlist)
 602                 {
 603                         IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 604
 605                         if (ind->unique && ind->immediate &&
 606                                 (ind->indpred == NIL || ind->predOK))
 607                                 return true;
 608                 }
 609         }
 610         else if (rel->rtekind == RTE_SUBQUERY)
 611         {
 612                 Query      *subquery = root->simple_rte_array[rel->relid]->subquery;
 613
 614                 /* Check if the subquery has any qualities that support distinctness */
 615                 if (query_supports_distinctness(subquery))
 616                         return true;
 617         }
 618         /* We have no proof rules for any other rtekinds. */
 619         return false;
 620 }
 621
 622 /*
 623  * rel_is_distinct_for
 624  *              Does the relation return only distinct rows according to clause_list?
 625  *
 626  * clause_list is a list of join restriction clauses involving this rel and
 627  * some other one.  Return true if no two rows emitted by this rel could
 628  * possibly join to the same row of the other rel.
 629  *
 630  * The caller must have already determined that each condition is a
 631  * mergejoinable equality with an expression in this relation on one side, and
 632  * an expression not involving this relation on the other.  The transient
 633  * outer_is_left flag is used to identify which side references this relation:
 634  * left side if outer_is_left is false, right side if it is true.
 635  *
 636  * Note that the passed-in clause_list may be destructively modified!  This
 637  * is OK for current uses, because the clause_list is built by the caller for
 638  * the sole purpose of passing to this function.
 639  */
 640 static bool
 641 rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list)
 642 {
 643         /*
 644          * We could skip a couple of tests here if we assume all callers checked
 645          * rel_supports_distinctness first, but it doesn't seem worth taking any
 646          * risk for.
 647          */
 648         if (rel->reloptkind != RELOPT_BASEREL)
 649                 return false;
 650         if (rel->rtekind == RTE_RELATION)
 651         {
 652                 /*
 653                  * Examine the indexes to see if we have a matching unique index.
 654                  * relation_has_unique_index_for automatically adds any usable
 655                  * restriction clauses for the rel, so we needn't do that here.
 656                  */
 657                 if (relation_has_unique_index_for(root, rel, clause_list, NIL, NIL))
 658                         return true;
 659         }
 660         else if (rel->rtekind == RTE_SUBQUERY)
 661         {
 662                 Index           relid = rel->relid;
 663                 Query      *subquery = root->simple_rte_array[relid]->subquery;
 664                 List       *colnos = NIL;
 665                 List       *opids = NIL;
 666                 ListCell   *l;
 667
 668                 /*
 669                  * Build the argument lists for query_is_distinct_for: a list of
 670                  * output column numbers that the query needs to be distinct over, and
 671                  * a list of equality operators that the output columns need to be
 672                  * distinct according to.
 673                  *
 674                  * (XXX we are not considering restriction clauses attached to the
 675                  * subquery; is that worth doing?)
 676                  */
 677                 foreach(l, clause_list)
 678                 {
 679                         RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
 680                         Oid                     op;
 681                         Var                *var;
 682
 683                         /*
 684                          * Get the equality operator we need uniqueness according to.
 685                          * (This might be a cross-type operator and thus not exactly the
 686                          * same operator the subquery would consider; that's all right
 687                          * since query_is_distinct_for can resolve such cases.)  The
 688                          * caller's mergejoinability test should have selected only
 689                          * OpExprs.
 690                          */
 691                         op = castNode(OpExpr, rinfo->clause)->opno;
 692
 693                         /* caller identified the inner side for us */
 694                         if (rinfo->outer_is_left)
 695                                 var = (Var *) get_rightop(rinfo->clause);
 696                         else
 697                                 var = (Var *) get_leftop(rinfo->clause);
 698
 699                         /*
 700                          * We may ignore any RelabelType node above the operand.  (There
 701                          * won't be more than one, since eval_const_expressions() has been
 702                          * applied already.)
 703                          */
 704                         if (var && IsA(var, RelabelType))
 705                                 var = (Var *) ((RelabelType *) var)->arg;
 706
 707                         /*
 708                          * If inner side isn't a Var referencing a subquery output column,
 709                          * this clause doesn't help us.
 710                          */
 711                         if (!var || !IsA(var, Var) ||
 712                                 var->varno != relid || var->varlevelsup != 0)
 713                                 continue;
 714
 715                         colnos = lappend_int(colnos, var->varattno);
 716                         opids = lappend_oid(opids, op);
 717                 }
 718
 719                 if (query_is_distinct_for(subquery, colnos, opids))
 720                         return true;
 721         }
 722         return false;
 723 }
 724
 725
 726 /*
 727  * query_supports_distinctness - could the query possibly be proven distinct
 728  *              on some set of output columns?
 729  *
 730  * This is effectively a pre-checking function for query_is_distinct_for().
 731  * It must return true if query_is_distinct_for() could possibly return true
 732  * with this query, but it should not expend a lot of cycles.  The idea is
 733  * that callers can avoid doing possibly-expensive processing to compute
 734  * query_is_distinct_for()'s argument lists if the call could not possibly
 735  * succeed.
 736  */
 737 bool
 738 query_supports_distinctness(Query *query)
 739 {
 740         /* SRFs break distinctness except with DISTINCT, see below */
 741         if (query->hasTargetSRFs && query->distinctClause == NIL)
 742                 return false;
 743
 744         /* check for features we can prove distinctness with */
 745         if (query->distinctClause != NIL ||
 746                 query->groupClause != NIL ||
 747                 query->groupingSets != NIL ||
 748                 query->hasAggs ||
 749                 query->havingQual ||
 750                 query->setOperations)
 751                 return true;
 752
 753         return false;
 754 }
 755
 756 /*
 757  * query_is_distinct_for - does query never return duplicates of the
 758  *              specified columns?
 759  *
 760  * query is a not-yet-planned subquery (in current usage, it's always from
 761  * a subquery RTE, which the planner avoids scribbling on).
 762  *
 763  * colnos is an integer list of output column numbers (resno's).  We are
 764  * interested in whether rows consisting of just these columns are certain
 765  * to be distinct.  "Distinctness" is defined according to whether the
 766  * corresponding upper-level equality operators listed in opids would think
 767  * the values are distinct.  (Note: the opids entries could be cross-type
 768  * operators, and thus not exactly the equality operators that the subquery
 769  * would use itself.  We use equality_ops_are_compatible() to check
 770  * compatibility.  That looks at btree or hash opfamily membership, and so
 771  * should give trustworthy answers for all operators that we might need
 772  * to deal with here.)
 773  */
 774 bool
 775 query_is_distinct_for(Query *query, List *colnos, List *opids)
 776 {
 777         ListCell   *l;
 778         Oid                     opid;
 779
 780         Assert(list_length(colnos) == list_length(opids));
 781
 782         /*
 783          * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
 784          * columns in the DISTINCT clause appear in colnos and operator semantics
 785          * match.  This is true even if there are SRFs in the DISTINCT columns or
 786          * elsewhere in the tlist.
 787          */
 788         if (query->distinctClause)
 789         {
 790                 foreach(l, query->distinctClause)
 791                 {
 792                         SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
 793                         TargetEntry *tle = get_sortgroupclause_tle(sgc,
 794                                                                                                            query->targetList);
 795
 796                         opid = distinct_col_search(tle->resno, colnos, opids);
 797                         if (!OidIsValid(opid) ||
 798                                 !equality_ops_are_compatible(opid, sgc->eqop))
 799                                 break;                  /* exit early if no match */
 800                 }
 801                 if (l == NULL)                  /* had matches for all? */
 802                         return true;
 803         }
 804
 805         /*
 806          * Otherwise, a set-returning function in the query's targetlist can
 807          * result in returning duplicate rows, despite any grouping that might
 808          * occur before tlist evaluation.  (If all tlist SRFs are within GROUP BY
 809          * columns, it would be safe because they'd be expanded before grouping.
 810          * But it doesn't currently seem worth the effort to check for that.)
 811          */
 812         if (query->hasTargetSRFs)
 813                 return false;
 814
 815         /*
 816          * Similarly, GROUP BY without GROUPING SETS guarantees uniqueness if all
 817          * the grouped columns appear in colnos and operator semantics match.
 818          */
 819         if (query->groupClause && !query->groupingSets)
 820         {
 821                 foreach(l, query->groupClause)
 822                 {
 823                         SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
 824                         TargetEntry *tle = get_sortgroupclause_tle(sgc,
 825                                                                                                            query->targetList);
 826
 827                         opid = distinct_col_search(tle->resno, colnos, opids);
 828                         if (!OidIsValid(opid) ||
 829                                 !equality_ops_are_compatible(opid, sgc->eqop))
 830                                 break;                  /* exit early if no match */
 831                 }
 832                 if (l == NULL)                  /* had matches for all? */
 833                         return true;
 834         }
 835         else if (query->groupingSets)
 836         {
 837                 /*
 838                  * If we have grouping sets with expressions, we probably don't have
 839                  * uniqueness and analysis would be hard. Punt.
 840                  */
 841                 if (query->groupClause)
 842                         return false;
 843
 844                 /*
 845                  * If we have no groupClause (therefore no grouping expressions), we
 846                  * might have one or many empty grouping sets. If there's just one,
 847                  * then we're returning only one row and are certainly unique. But
 848                  * otherwise, we know we're certainly not unique.
 849                  */
 850                 if (list_length(query->groupingSets) == 1 &&
 851                         ((GroupingSet *) linitial(query->groupingSets))->kind == GROUPING_SET_EMPTY)
 852                         return true;
 853                 else
 854                         return false;
 855         }
 856         else
 857         {
 858                 /*
 859                  * If we have no GROUP BY, but do have aggregates or HAVING, then the
 860                  * result is at most one row so it's surely unique, for any operators.
 861                  */
 862                 if (query->hasAggs || query->havingQual)
 863                         return true;
 864         }
 865
 866         /*
 867          * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
 868          * except with ALL.
 869          */
 870         if (query->setOperations)
 871         {
 872                 SetOperationStmt *topop = castNode(SetOperationStmt, query->setOperations);
 873
 874                 Assert(topop->op != SETOP_NONE);
 875
 876                 if (!topop->all)
 877                 {
 878                         ListCell   *lg;
 879
 880                         /* We're good if all the nonjunk output columns are in colnos */
 881                         lg = list_head(topop->groupClauses);
 882                         foreach(l, query->targetList)
 883                         {
 884                                 TargetEntry *tle = (TargetEntry *) lfirst(l);
 885                                 SortGroupClause *sgc;
 886
 887                                 if (tle->resjunk)
 888                                         continue;       /* ignore resjunk columns */
 889
 890                                 /* non-resjunk columns should have grouping clauses */
 891                                 Assert(lg != NULL);
 892                                 sgc = (SortGroupClause *) lfirst(lg);
 893                                 lg = lnext(topop->groupClauses, lg);
 894
 895                                 opid = distinct_col_search(tle->resno, colnos, opids);
 896                                 if (!OidIsValid(opid) ||
 897                                         !equality_ops_are_compatible(opid, sgc->eqop))
 898                                         break;          /* exit early if no match */
 899                         }
 900                         if (l == NULL)          /* had matches for all? */
 901                                 return true;
 902                 }
 903         }
 904
 905         /*
 906          * XXX Are there any other cases in which we can easily see the result
 907          * must be distinct?
 908          *
 909          * If you do add more smarts to this function, be sure to update
 910          * query_supports_distinctness() to match.
 911          */
 912
 913         return false;
 914 }
 915
 916 /*
 917  * distinct_col_search - subroutine for query_is_distinct_for
 918  *
 919  * If colno is in colnos, return the corresponding element of opids,
 920  * else return InvalidOid.  (Ordinarily colnos would not contain duplicates,
 921  * but if it does, we arbitrarily select the first match.)
 922  */
 923 static Oid
 924 distinct_col_search(int colno, List *colnos, List *opids)
 925 {
 926         ListCell   *lc1,
 927                            *lc2;
 928
 929         forboth(lc1, colnos, lc2, opids)
 930         {
 931                 if (colno == lfirst_int(lc1))
 932                         return lfirst_oid(lc2);
 933         }
 934         return InvalidOid;
 935 }
 936
 937
 938 /*
 939  * innerrel_is_unique
 940  *        Check if the innerrel provably contains at most one tuple matching any
 941  *        tuple from the outerrel, based on join clauses in the 'restrictlist'.
 942  *
 943  * We need an actual RelOptInfo for the innerrel, but it's sufficient to
 944  * identify the outerrel by its Relids.  This asymmetry supports use of this
 945  * function before joinrels have been built.  (The caller is expected to
 946  * also supply the joinrelids, just to save recalculating that.)
 947  *
 948  * The proof must be made based only on clauses that will be "joinquals"
 949  * rather than "otherquals" at execution.  For an inner join there's no
 950  * difference; but if the join is outer, we must ignore pushed-down quals,
 951  * as those will become "otherquals".  Note that this means the answer might
 952  * vary depending on whether IS_OUTER_JOIN(jointype); since we cache the
 953  * answer without regard to that, callers must take care not to call this
 954  * with jointypes that would be classified differently by IS_OUTER_JOIN().
 955  *
 956  * The actual proof is undertaken by is_innerrel_unique_for(); this function
 957  * is a frontend that is mainly concerned with caching the answers.
 958  * In particular, the force_cache argument allows overriding the internal
 959  * heuristic about whether to cache negative answers; it should be "true"
 960  * if making an inquiry that is not part of the normal bottom-up join search
 961  * sequence.
 962  */
 963 bool
 964 innerrel_is_unique(PlannerInfo *root,
 965                                    Relids joinrelids,
 966                                    Relids outerrelids,
 967                                    RelOptInfo *innerrel,
 968                                    JoinType jointype,
 969                                    List *restrictlist,
 970                                    bool force_cache)
 971 {
 972         MemoryContext old_context;
 973         ListCell   *lc;
 974
 975         /* Certainly can't prove uniqueness when there are no joinclauses */
 976         if (restrictlist == NIL)
 977                 return false;
 978
 979         /*
 980          * Make a quick check to eliminate cases in which we will surely be unable
 981          * to prove uniqueness of the innerrel.
 982          */
 983         if (!rel_supports_distinctness(root, innerrel))
 984                 return false;
 985
 986         /*
 987          * Query the cache to see if we've managed to prove that innerrel is
 988          * unique for any subset of this outerrel.  We don't need an exact match,
 989          * as extra outerrels can't make the innerrel any less unique (or more
 990          * formally, the restrictlist for a join to a superset outerrel must be a
 991          * superset of the conditions we successfully used before).
 992          */
 993         foreach(lc, innerrel->unique_for_rels)
 994         {
 995                 Relids          unique_for_rels = (Relids) lfirst(lc);
 996
 997                 if (bms_is_subset(unique_for_rels, outerrelids))
 998                         return true;            /* Success! */
 999         }
1000
1001         /*
1002          * Conversely, we may have already determined that this outerrel, or some
1003          * superset thereof, cannot prove this innerrel to be unique.
1004          */
1005         foreach(lc, innerrel->non_unique_for_rels)
1006         {
1007                 Relids          unique_for_rels = (Relids) lfirst(lc);
1008
1009                 if (bms_is_subset(outerrelids, unique_for_rels))
1010                         return false;
1011         }
1012
1013         /* No cached information, so try to make the proof. */
1014         if (is_innerrel_unique_for(root, joinrelids, outerrelids, innerrel,
1015                                                            jointype, restrictlist))
1016         {
1017                 /*
1018                  * Cache the positive result for future probes, being sure to keep it
1019                  * in the planner_cxt even if we are working in GEQO.
1020                  *
1021                  * Note: one might consider trying to isolate the minimal subset of
1022                  * the outerrels that proved the innerrel unique.  But it's not worth
1023                  * the trouble, because the planner builds up joinrels incrementally
1024                  * and so we'll see the minimally sufficient outerrels before any
1025                  * supersets of them anyway.
1026                  */
1027                 old_context = MemoryContextSwitchTo(root->planner_cxt);
1028                 innerrel->unique_for_rels = lappend(innerrel->unique_for_rels,
1029                                                                                         bms_copy(outerrelids));
1030                 MemoryContextSwitchTo(old_context);
1031
1032                 return true;                    /* Success! */
1033         }
1034         else
1035         {
1036                 /*
1037                  * None of the join conditions for outerrel proved innerrel unique, so
1038                  * we can safely reject this outerrel or any subset of it in future
1039                  * checks.
1040                  *
1041                  * However, in normal planning mode, caching this knowledge is totally
1042                  * pointless; it won't be queried again, because we build up joinrels
1043                  * from smaller to larger.  It is useful in GEQO mode, where the
1044                  * knowledge can be carried across successive planning attempts; and
1045                  * it's likely to be useful when using join-search plugins, too. Hence
1046                  * cache when join_search_private is non-NULL.  (Yeah, that's a hack,
1047                  * but it seems reasonable.)
1048                  *
1049                  * Also, allow callers to override that heuristic and force caching;
1050                  * that's useful for reduce_unique_semijoins, which calls here before
1051                  * the normal join search starts.
1052                  */
1053                 if (force_cache || root->join_search_private)
1054                 {
1055                         old_context = MemoryContextSwitchTo(root->planner_cxt);
1056                         innerrel->non_unique_for_rels =
1057                                 lappend(innerrel->non_unique_for_rels,
1058                                                 bms_copy(outerrelids));
1059                         MemoryContextSwitchTo(old_context);
1060                 }
1061
1062                 return false;
1063         }
1064 }
1065
1066 /*
1067  * is_innerrel_unique_for
1068  *        Check if the innerrel provably contains at most one tuple matching any
1069  *        tuple from the outerrel, based on join clauses in the 'restrictlist'.
1070  */
1071 static bool
1072 is_innerrel_unique_for(PlannerInfo *root,
1073                                            Relids joinrelids,
1074                                            Relids outerrelids,
1075                                            RelOptInfo *innerrel,
1076                                            JoinType jointype,
1077                                            List *restrictlist)
1078 {
1079         List       *clause_list = NIL;
1080         ListCell   *lc;
1081
1082         /*
1083          * Search for mergejoinable clauses that constrain the inner rel against
1084          * the outer rel.  If an operator is mergejoinable then it behaves like
1085          * equality for some btree opclass, so it's what we want.  The
1086          * mergejoinability test also eliminates clauses containing volatile
1087          * functions, which we couldn't depend on.
1088          */
1089         foreach(lc, restrictlist)
1090         {
1091                 RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(lc);
1092
1093                 /*
1094                  * As noted above, if it's a pushed-down clause and we're at an outer
1095                  * join, we can't use it.
1096                  */
1097                 if (IS_OUTER_JOIN(jointype) &&
1098                         RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids))
1099                         continue;
1100
1101                 /* Ignore if it's not a mergejoinable clause */
1102                 if (!restrictinfo->can_join ||
1103                         restrictinfo->mergeopfamilies == NIL)
1104                         continue;                       /* not mergejoinable */
1105
1106                 /*
1107                  * Check if clause has the form "outer op inner" or "inner op outer",
1108                  * and if so mark which side is inner.
1109                  */
1110                 if (!clause_sides_match_join(restrictinfo, outerrelids,
1111                                                                          innerrel->relids))
1112                         continue;                       /* no good for these input relations */
1113
1114                 /* OK, add to list */
1115                 clause_list = lappend(clause_list, restrictinfo);
1116         }
1117
1118         /* Let rel_is_distinct_for() do the hard work */
1119         return rel_is_distinct_for(root, innerrel, clause_list);
1120 }