src/backend/commands/cluster.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * cluster.c
   4  *        CLUSTER a table on an index.
   5  *
   6  * There is hardly anything left of Paul Brown's original implementation...
   7  *
   8  *
   9  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  10  * Portions Copyright (c) 1994-5, Regents of the University of California
  11  *
  12  *
  13  * IDENTIFICATION
  14  *        $PostgreSQL$
  15  *
  16  *-------------------------------------------------------------------------
  17  */
  18 #include "postgres.h"
  19
  20 #include "access/genam.h"
  21 #include "access/heapam.h"
  22 #include "access/relscan.h"
  23 #include "access/rewriteheap.h"
  24 #include "access/transam.h"
  25 #include "access/xact.h"
  26 #include "catalog/catalog.h"
  27 #include "catalog/dependency.h"
  28 #include "catalog/heap.h"
  29 #include "catalog/index.h"
  30 #include "catalog/indexing.h"
  31 #include "catalog/namespace.h"
  32 #include "catalog/pg_namespace.h"
  33 #include "catalog/toasting.h"
  34 #include "commands/cluster.h"
  35 #include "commands/tablecmds.h"
  36 #include "commands/trigger.h"
  37 #include "commands/vacuum.h"
  38 #include "miscadmin.h"
  39 #include "storage/bufmgr.h"
  40 #include "storage/procarray.h"
  41 #include "utils/acl.h"
  42 #include "utils/fmgroids.h"
  43 #include "utils/inval.h"
  44 #include "utils/lsyscache.h"
  45 #include "utils/memutils.h"
  46 #include "utils/relcache.h"
  47 #include "utils/snapmgr.h"
  48 #include "utils/syscache.h"
  49 #include "utils/tqual.h"
  50
  51
  52 /*
  53  * This struct is used to pass around the information on tables to be
  54  * clustered. We need this so we can make a list of them when invoked without
  55  * a specific table/index pair.
  56  */
  57 typedef struct
  58 {
  59         Oid                     tableOid;
  60         Oid                     indexOid;
  61 } RelToCluster;
  62
  63
  64 static void cluster_rel(RelToCluster *rv, bool recheck, bool verbose);
  65 static void rebuild_relation(Relation OldHeap, Oid indexOid);
  66 static TransactionId copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
  67 static List *get_tables_to_cluster(MemoryContext cluster_context);
  68
  69
  70
  71 /*---------------------------------------------------------------------------
  72  * This cluster code allows for clustering multiple tables at once. Because
  73  * of this, we cannot just run everything on a single transaction, or we
  74  * would be forced to acquire exclusive locks on all the tables being
  75  * clustered, simultaneously --- very likely leading to deadlock.
  76  *
  77  * To solve this we follow a similar strategy to VACUUM code,
  78  * clustering each relation in a separate transaction. For this to work,
  79  * we need to:
  80  *      - provide a separate memory context so that we can pass information in
  81  *        a way that survives across transactions
  82  *      - start a new transaction every time a new relation is clustered
  83  *      - check for validity of the information on to-be-clustered relations,
  84  *        as someone might have deleted a relation behind our back, or
  85  *        clustered one on a different index
  86  *      - end the transaction
  87  *
  88  * The single-relation case does not have any such overhead.
  89  *
  90  * We also allow a relation to be specified without index.      In that case,
  91  * the indisclustered bit will be looked up, and an ERROR will be thrown
  92  * if there is no index with the bit set.
  93  *---------------------------------------------------------------------------
  94  */
  95 void
  96 cluster(ClusterStmt *stmt, bool isTopLevel)
  97 {
  98         if (stmt->relation != NULL)
  99         {
 100                 /* This is the single-relation case. */
 101                 Oid                     tableOid,
 102                                         indexOid = InvalidOid;
 103                 Relation        rel;
 104                 RelToCluster rvtc;
 105
 106                 /* Find and lock the table */
 107                 rel = heap_openrv(stmt->relation, AccessExclusiveLock);
 108
 109                 tableOid = RelationGetRelid(rel);
 110
 111                 /* Check permissions */
 112                 if (!pg_class_ownercheck(tableOid, GetUserId()))
 113                         aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
 114                                                    RelationGetRelationName(rel));
 115
 116                 /*
 117                  * Reject clustering a remote temp table ... their local buffer
 118                  * manager is not going to cope.
 119                  */
 120                 if (isOtherTempNamespace(RelationGetNamespace(rel)))
 121                         ereport(ERROR,
 122                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 123                            errmsg("cannot cluster temporary tables of other sessions")));
 124
 125                 if (stmt->indexname == NULL)
 126                 {
 127                         ListCell   *index;
 128
 129                         /* We need to find the index that has indisclustered set. */
 130                         foreach(index, RelationGetIndexList(rel))
 131                         {
 132                                 HeapTuple       idxtuple;
 133                                 Form_pg_index indexForm;
 134
 135                                 indexOid = lfirst_oid(index);
 136                                 idxtuple = SearchSysCache(INDEXRELID,
 137                                                                                   ObjectIdGetDatum(indexOid),
 138                                                                                   0, 0, 0);
 139                                 if (!HeapTupleIsValid(idxtuple))
 140                                         elog(ERROR, "cache lookup failed for index %u", indexOid);
 141                                 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
 142                                 if (indexForm->indisclustered)
 143                                 {
 144                                         ReleaseSysCache(idxtuple);
 145                                         break;
 146                                 }
 147                                 ReleaseSysCache(idxtuple);
 148                                 indexOid = InvalidOid;
 149                         }
 150
 151                         if (!OidIsValid(indexOid))
 152                                 ereport(ERROR,
 153                                                 (errcode(ERRCODE_UNDEFINED_OBJECT),
 154                                                  errmsg("there is no previously clustered index for table \"%s\"",
 155                                                                 stmt->relation->relname)));
 156                 }
 157                 else
 158                 {
 159                         /*
 160                          * The index is expected to be in the same namespace as the
 161                          * relation.
 162                          */
 163                         indexOid = get_relname_relid(stmt->indexname,
 164                                                                                  rel->rd_rel->relnamespace);
 165                         if (!OidIsValid(indexOid))
 166                                 ereport(ERROR,
 167                                                 (errcode(ERRCODE_UNDEFINED_OBJECT),
 168                                            errmsg("index \"%s\" for table \"%s\" does not exist",
 169                                                           stmt->indexname, stmt->relation->relname)));
 170                 }
 171
 172                 /* All other checks are done in cluster_rel() */
 173                 rvtc.tableOid = tableOid;
 174                 rvtc.indexOid = indexOid;
 175
 176                 /* close relation, keep lock till commit */
 177                 heap_close(rel, NoLock);
 178
 179                 /* Do the job */
 180                 cluster_rel(&rvtc, false, stmt->verbose);
 181         }
 182         else
 183         {
 184                 /*
 185                  * This is the "multi relation" case. We need to cluster all tables
 186                  * that have some index with indisclustered set.
 187                  */
 188                 MemoryContext cluster_context;
 189                 List       *rvs;
 190                 ListCell   *rv;
 191
 192                 /*
 193                  * We cannot run this form of CLUSTER inside a user transaction block;
 194                  * we'd be holding locks way too long.
 195                  */
 196                 PreventTransactionChain(isTopLevel, "CLUSTER");
 197
 198                 /*
 199                  * Create special memory context for cross-transaction storage.
 200                  *
 201                  * Since it is a child of PortalContext, it will go away even in case
 202                  * of error.
 203                  */
 204                 cluster_context = AllocSetContextCreate(PortalContext,
 205                                                                                                 "Cluster",
 206                                                                                                 ALLOCSET_DEFAULT_MINSIZE,
 207                                                                                                 ALLOCSET_DEFAULT_INITSIZE,
 208                                                                                                 ALLOCSET_DEFAULT_MAXSIZE);
 209
 210                 /*
 211                  * Build the list of relations to cluster.      Note that this lives in
 212                  * cluster_context.
 213                  */
 214                 rvs = get_tables_to_cluster(cluster_context);
 215
 216                 /* Commit to get out of starting transaction */
 217                 PopActiveSnapshot();
 218                 CommitTransactionCommand();
 219
 220                 /* Ok, now that we've got them all, cluster them one by one */
 221                 foreach(rv, rvs)
 222                 {
 223                         RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
 224
 225                         /* Start a new transaction for each relation. */
 226                         StartTransactionCommand();
 227                         /* functions in indexes may want a snapshot set */
 228                         PushActiveSnapshot(GetTransactionSnapshot());
 229                         cluster_rel(rvtc, true, stmt->verbose);
 230                         PopActiveSnapshot();
 231                         CommitTransactionCommand();
 232                 }
 233
 234                 /* Start a new transaction for the cleanup work. */
 235                 StartTransactionCommand();
 236
 237                 /* Clean up working storage */
 238                 MemoryContextDelete(cluster_context);
 239         }
 240 }
 241
 242 /*
 243  * cluster_rel
 244  *
 245  * This clusters the table by creating a new, clustered table and
 246  * swapping the relfilenodes of the new table and the old table, so
 247  * the OID of the original table is preserved.  Thus we do not lose
 248  * GRANT, inheritance nor references to this table (this was a bug
 249  * in releases thru 7.3).
 250  *
 251  * Also create new indexes and swap the filenodes with the old indexes the
 252  * same way we do for the relation.  Since we are effectively bulk-loading
 253  * the new table, it's better to create the indexes afterwards than to fill
 254  * them incrementally while we load the table.
 255  */
 256 static void
 257 cluster_rel(RelToCluster *rvtc, bool recheck, bool verbose)
 258 {
 259         Relation        OldHeap;
 260
 261         /* Check for user-requested abort. */
 262         CHECK_FOR_INTERRUPTS();
 263
 264         /*
 265          * We grab exclusive access to the target rel and index for the duration
 266          * of the transaction.  (This is redundant for the single-transaction
 267          * case, since cluster() already did it.)  The index lock is taken inside
 268          * check_index_is_clusterable.
 269          */
 270         OldHeap = try_relation_open(rvtc->tableOid, AccessExclusiveLock);
 271
 272         /* If the table has gone away, we can skip processing it */
 273         if (!OldHeap)
 274                 return;
 275
 276         /*
 277          * Since we may open a new transaction for each relation, we have to check
 278          * that the relation still is what we think it is.
 279          *
 280          * If this is a single-transaction CLUSTER, we can skip these tests. We
 281          * *must* skip the one on indisclustered since it would reject an attempt
 282          * to cluster a not-previously-clustered index.
 283          */
 284         if (recheck)
 285         {
 286                 HeapTuple       tuple;
 287                 Form_pg_index indexForm;
 288
 289                 /* Check that the user still owns the relation */
 290                 if (!pg_class_ownercheck(rvtc->tableOid, GetUserId()))
 291                 {
 292                         relation_close(OldHeap, AccessExclusiveLock);
 293                         return;
 294                 }
 295
 296                 /*
 297                  * Silently skip a temp table for a remote session.  Only doing this
 298                  * check in the "recheck" case is appropriate (which currently means
 299                  * somebody is executing a database-wide CLUSTER), because there is
 300                  * another check in cluster() which will stop any attempt to cluster
 301                  * remote temp tables by name.  There is another check in
 302                  * check_index_is_clusterable which is redundant, but we leave it for
 303                  * extra safety.
 304                  */
 305                 if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
 306                 {
 307                         relation_close(OldHeap, AccessExclusiveLock);
 308                         return;
 309                 }
 310
 311                 /*
 312                  * Check that the index still exists
 313                  */
 314                 if (!SearchSysCacheExists(RELOID,
 315                                                                   ObjectIdGetDatum(rvtc->indexOid),
 316                                                                   0, 0, 0))
 317                 {
 318                         relation_close(OldHeap, AccessExclusiveLock);
 319                         return;
 320                 }
 321
 322                 /*
 323                  * Check that the index is still the one with indisclustered set.
 324                  */
 325                 tuple = SearchSysCache(INDEXRELID,
 326                                                            ObjectIdGetDatum(rvtc->indexOid),
 327                                                            0, 0, 0);
 328                 if (!HeapTupleIsValid(tuple))   /* probably can't happen */
 329                 {
 330                         relation_close(OldHeap, AccessExclusiveLock);
 331                         return;
 332                 }
 333                 indexForm = (Form_pg_index) GETSTRUCT(tuple);
 334                 if (!indexForm->indisclustered)
 335                 {
 336                         ReleaseSysCache(tuple);
 337                         relation_close(OldHeap, AccessExclusiveLock);
 338                         return;
 339                 }
 340                 ReleaseSysCache(tuple);
 341         }
 342
 343         /* Check index is valid to cluster on */
 344         check_index_is_clusterable(OldHeap, rvtc->indexOid, recheck);
 345
 346         /* rebuild_relation does all the dirty work */
 347         ereport(verbose ? INFO : DEBUG2,
 348                         (errmsg("clustering \"%s.%s\"",
 349                                         get_namespace_name(RelationGetNamespace(OldHeap)),
 350                                         RelationGetRelationName(OldHeap))));
 351         rebuild_relation(OldHeap, rvtc->indexOid);
 352
 353         /* NB: rebuild_relation does heap_close() on OldHeap */
 354 }
 355
 356 /*
 357  * Verify that the specified index is a legitimate index to cluster on
 358  *
 359  * Side effect: obtains exclusive lock on the index.  The caller should
 360  * already have exclusive lock on the table, so the index lock is likely
 361  * redundant, but it seems best to grab it anyway to ensure the index
 362  * definition can't change under us.
 363  */
 364 void
 365 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
 366 {
 367         Relation        OldIndex;
 368
 369         OldIndex = index_open(indexOid, AccessExclusiveLock);
 370
 371         /*
 372          * Check that index is in fact an index on the given relation
 373          */
 374         if (OldIndex->rd_index == NULL ||
 375                 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
 376                 ereport(ERROR,
 377                                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
 378                                  errmsg("\"%s\" is not an index for table \"%s\"",
 379                                                 RelationGetRelationName(OldIndex),
 380                                                 RelationGetRelationName(OldHeap))));
 381
 382         /*
 383          * Disallow clustering on incomplete indexes (those that might not index
 384          * every row of the relation).  We could relax this by making a separate
 385          * seqscan pass over the table to copy the missing rows, but that seems
 386          * expensive and tedious.
 387          */
 388         if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
 389                 ereport(ERROR,
 390                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 391                                  errmsg("cannot cluster on partial index \"%s\"",
 392                                                 RelationGetRelationName(OldIndex))));
 393
 394         if (!OldIndex->rd_am->amclusterable)
 395                 ereport(ERROR,
 396                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 397                                  errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
 398                                                 RelationGetRelationName(OldIndex))));
 399
 400         if (!OldIndex->rd_am->amindexnulls)
 401         {
 402                 AttrNumber      colno;
 403
 404                 /*
 405                  * If the AM doesn't index nulls, then it's a partial index unless we
 406                  * can prove all the rows are non-null.  Note we only need look at the
 407                  * first column; multicolumn-capable AMs are *required* to index nulls
 408                  * in columns after the first.
 409                  */
 410                 colno = OldIndex->rd_index->indkey.values[0];
 411                 if (colno > 0)
 412                 {
 413                         /* ordinary user attribute */
 414                         if (!OldHeap->rd_att->attrs[colno - 1]->attnotnull)
 415                                 ereport(ERROR,
 416                                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 417                                                  errmsg("cannot cluster on index \"%s\" because access method does not handle null values",
 418                                                                 RelationGetRelationName(OldIndex)),
 419                                                  recheck
 420                                                  ? errhint("You might be able to work around this by marking column \"%s\" NOT NULL, or use ALTER TABLE ... SET WITHOUT CLUSTER to remove the cluster specification from the table.",
 421                                                  NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))
 422                                                  : errhint("You might be able to work around this by marking column \"%s\" NOT NULL.",
 423                                           NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))));
 424                 }
 425                 else if (colno < 0)
 426                 {
 427                         /* system column --- okay, always non-null */
 428                 }
 429                 else
 430                         /* index expression, lose... */
 431                         ereport(ERROR,
 432                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 433                                          errmsg("cannot cluster on expressional index \"%s\" because its index access method does not handle null values",
 434                                                         RelationGetRelationName(OldIndex))));
 435         }
 436
 437         /*
 438          * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
 439          * it might well not contain entries for every heap row, or might not even
 440          * be internally consistent.  (But note that we don't check indcheckxmin;
 441          * the worst consequence of following broken HOT chains would be that we
 442          * might put recently-dead tuples out-of-order in the new table, and there
 443          * is little harm in that.)
 444          */
 445         if (!OldIndex->rd_index->indisvalid)
 446                 ereport(ERROR,
 447                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 448                                  errmsg("cannot cluster on invalid index \"%s\"",
 449                                                 RelationGetRelationName(OldIndex))));
 450
 451         /*
 452          * Disallow clustering system relations.  This will definitely NOT work
 453          * for shared relations (we have no way to update pg_class rows in other
 454          * databases), nor for nailed-in-cache relations (the relfilenode values
 455          * for those are hardwired, see relcache.c).  It might work for other
 456          * system relations, but I ain't gonna risk it.
 457          */
 458         if (IsSystemRelation(OldHeap))
 459                 ereport(ERROR,
 460                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 461                                  errmsg("\"%s\" is a system catalog",
 462                                                 RelationGetRelationName(OldHeap))));
 463
 464         /*
 465          * Don't allow cluster on temp tables of other backends ... their local
 466          * buffer manager is not going to cope.
 467          */
 468         if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
 469                 ereport(ERROR,
 470                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 471                            errmsg("cannot cluster temporary tables of other sessions")));
 472
 473         /*
 474          * Also check for active uses of the relation in the current transaction,
 475          * including open scans and pending AFTER trigger events.
 476          */
 477         CheckTableNotInUse(OldHeap, "CLUSTER");
 478
 479         /* Drop relcache refcnt on OldIndex, but keep lock */
 480         index_close(OldIndex, NoLock);
 481 }
 482
 483 /*
 484  * mark_index_clustered: mark the specified index as the one clustered on
 485  *
 486  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
 487  */
 488 void
 489 mark_index_clustered(Relation rel, Oid indexOid)
 490 {
 491         HeapTuple       indexTuple;
 492         Form_pg_index indexForm;
 493         Relation        pg_index;
 494         ListCell   *index;
 495
 496         /*
 497          * If the index is already marked clustered, no need to do anything.
 498          */
 499         if (OidIsValid(indexOid))
 500         {
 501                 indexTuple = SearchSysCache(INDEXRELID,
 502                                                                         ObjectIdGetDatum(indexOid),
 503                                                                         0, 0, 0);
 504                 if (!HeapTupleIsValid(indexTuple))
 505                         elog(ERROR, "cache lookup failed for index %u", indexOid);
 506                 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 507
 508                 if (indexForm->indisclustered)
 509                 {
 510                         ReleaseSysCache(indexTuple);
 511                         return;
 512                 }
 513
 514                 ReleaseSysCache(indexTuple);
 515         }
 516
 517         /*
 518          * Check each index of the relation and set/clear the bit as needed.
 519          */
 520         pg_index = heap_open(IndexRelationId, RowExclusiveLock);
 521
 522         foreach(index, RelationGetIndexList(rel))
 523         {
 524                 Oid                     thisIndexOid = lfirst_oid(index);
 525
 526                 indexTuple = SearchSysCacheCopy(INDEXRELID,
 527                                                                                 ObjectIdGetDatum(thisIndexOid),
 528                                                                                 0, 0, 0);
 529                 if (!HeapTupleIsValid(indexTuple))
 530                         elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
 531                 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 532
 533                 /*
 534                  * Unset the bit if set.  We know it's wrong because we checked this
 535                  * earlier.
 536                  */
 537                 if (indexForm->indisclustered)
 538                 {
 539                         indexForm->indisclustered = false;
 540                         simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
 541                         CatalogUpdateIndexes(pg_index, indexTuple);
 542                         /* Ensure we see the update in the index's relcache entry */
 543                         CacheInvalidateRelcacheByRelid(thisIndexOid);
 544                 }
 545                 else if (thisIndexOid == indexOid)
 546                 {
 547                         indexForm->indisclustered = true;
 548                         simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
 549                         CatalogUpdateIndexes(pg_index, indexTuple);
 550                         /* Ensure we see the update in the index's relcache entry */
 551                         CacheInvalidateRelcacheByRelid(thisIndexOid);
 552                 }
 553                 heap_freetuple(indexTuple);
 554         }
 555
 556         heap_close(pg_index, RowExclusiveLock);
 557 }
 558
 559 /*
 560  * rebuild_relation: rebuild an existing relation in index order
 561  *
 562  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
 563  * indexOid: index to cluster by
 564  *
 565  * NB: this routine closes OldHeap at the right time; caller should not.
 566  */
 567 static void
 568 rebuild_relation(Relation OldHeap, Oid indexOid)
 569 {
 570         Oid                     tableOid = RelationGetRelid(OldHeap);
 571         Oid                     tableSpace = OldHeap->rd_rel->reltablespace;
 572         Oid                     OIDNewHeap;
 573         char            NewHeapName[NAMEDATALEN];
 574         TransactionId frozenXid;
 575         ObjectAddress object;
 576         Relation        newrel;
 577
 578         /* Mark the correct index as clustered */
 579         mark_index_clustered(OldHeap, indexOid);
 580
 581         /* Close relcache entry, but keep lock until transaction commit */
 582         heap_close(OldHeap, NoLock);
 583
 584         /*
 585          * Create the new heap, using a temporary name in the same namespace as
 586          * the existing table.  NOTE: there is some risk of collision with user
 587          * relnames.  Working around this seems more trouble than it's worth; in
 588          * particular, we can't create the new heap in a different namespace from
 589          * the old, or we will have problems with the TEMP status of temp tables.
 590          */
 591         snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", tableOid);
 592
 593         OIDNewHeap = make_new_heap(tableOid, NewHeapName, tableSpace);
 594
 595         /*
 596          * We don't need CommandCounterIncrement() because make_new_heap did it.
 597          */
 598
 599         /*
 600          * Copy the heap data into the new table in the desired order.
 601          */
 602         frozenXid = copy_heap_data(OIDNewHeap, tableOid, indexOid);
 603
 604         /* To make the new heap's data visible (probably not needed?). */
 605         CommandCounterIncrement();
 606
 607         /* Swap the physical files of the old and new heaps. */
 608         swap_relation_files(tableOid, OIDNewHeap, frozenXid);
 609
 610         CommandCounterIncrement();
 611
 612         /* Destroy new heap with old filenode */
 613         object.classId = RelationRelationId;
 614         object.objectId = OIDNewHeap;
 615         object.objectSubId = 0;
 616
 617         /*
 618          * The new relation is local to our transaction and we know nothing
 619          * depends on it, so DROP_RESTRICT should be OK.
 620          */
 621         performDeletion(&object, DROP_RESTRICT);
 622
 623         /* performDeletion does CommandCounterIncrement at end */
 624
 625         /*
 626          * Rebuild each index on the relation (but not the toast table, which is
 627          * all-new at this point).      We do not need CommandCounterIncrement()
 628          * because reindex_relation does it.
 629          */
 630         reindex_relation(tableOid, false);
 631
 632         /*
 633          * At this point, everything is kosher except that the toast table's name
 634          * corresponds to the temporary table.  The name is irrelevant to
 635          * the backend because it's referenced by OID, but users looking at the
 636          * catalogs could be confused.  Rename it to prevent this problem.
 637          *
 638          * Note no lock required on the relation, because we already hold an
 639          * exclusive lock on it.
 640          */
 641         newrel = heap_open(tableOid, NoLock);
 642         if (OidIsValid(newrel->rd_rel->reltoastrelid))
 643         {
 644                 char            NewToastName[NAMEDATALEN];
 645                 Relation        toastrel;
 646
 647                 /* rename the toast table ... */
 648                 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u", tableOid);
 649                 RenameRelationInternal(newrel->rd_rel->reltoastrelid, NewToastName,
 650                                                            PG_TOAST_NAMESPACE);
 651
 652                 /* ... and its index too */
 653                 toastrel = relation_open(newrel->rd_rel->reltoastrelid, AccessShareLock);
 654                 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index", tableOid);
 655                 RenameRelationInternal(toastrel->rd_rel->reltoastidxid, NewToastName,
 656                                                            PG_TOAST_NAMESPACE);
 657                 relation_close(toastrel, AccessShareLock);
 658         }
 659         relation_close(newrel, NoLock);
 660 }
 661
 662 /*
 663  * Create the new table that we will fill with correctly-ordered data.
 664  */
 665 Oid
 666 make_new_heap(Oid OIDOldHeap, const char *NewName, Oid NewTableSpace)
 667 {
 668         TupleDesc       OldHeapDesc,
 669                                 tupdesc;
 670         Oid                     OIDNewHeap;
 671         Relation        OldHeap;
 672         HeapTuple       tuple;
 673         Datum           reloptions;
 674         bool            isNull;
 675
 676         OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
 677         OldHeapDesc = RelationGetDescr(OldHeap);
 678
 679         /*
 680          * Need to make a copy of the tuple descriptor, since
 681          * heap_create_with_catalog modifies it.  Note that the NewHeap will
 682          * not receive any of the defaults or constraints associated with the
 683          * OldHeap; we don't need 'em, and there's no reason to spend cycles
 684          * inserting them into the catalogs only to delete them.
 685          */
 686         tupdesc = CreateTupleDescCopy(OldHeapDesc);
 687
 688         /*
 689          * Use options of the old heap for new heap.
 690          */
 691         tuple = SearchSysCache(RELOID,
 692                                                    ObjectIdGetDatum(OIDOldHeap),
 693                                                    0, 0, 0);
 694         if (!HeapTupleIsValid(tuple))
 695                 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
 696         reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
 697                                                                  &isNull);
 698         if (isNull)
 699                 reloptions = (Datum) 0;
 700
 701         OIDNewHeap = heap_create_with_catalog(NewName,
 702                                                                                   RelationGetNamespace(OldHeap),
 703                                                                                   NewTableSpace,
 704                                                                                   InvalidOid,
 705                                                                                   OldHeap->rd_rel->relowner,
 706                                                                                   tupdesc,
 707                                                                                   NIL,
 708                                                                                   OldHeap->rd_rel->relkind,
 709                                                                                   OldHeap->rd_rel->relisshared,
 710                                                                                   true,
 711                                                                                   0,
 712                                                                                   ONCOMMIT_NOOP,
 713                                                                                   reloptions,
 714                                                                                   allowSystemTableMods);
 715
 716         ReleaseSysCache(tuple);
 717
 718         /*
 719          * Advance command counter so that the newly-created relation's catalog
 720          * tuples will be visible to heap_open.
 721          */
 722         CommandCounterIncrement();
 723
 724         /*
 725          * If necessary, create a TOAST table for the new relation. Note that
 726          * AlterTableCreateToastTable ends with CommandCounterIncrement(), so that
 727          * the TOAST table will be visible for insertion.
 728          */
 729         AlterTableCreateToastTable(OIDNewHeap);
 730
 731         heap_close(OldHeap, NoLock);
 732
 733         return OIDNewHeap;
 734 }
 735
 736 /*
 737  * Do the physical copying of heap data.  Returns the TransactionId used as
 738  * freeze cutoff point for the tuples.
 739  */
 740 static TransactionId
 741 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
 742 {
 743         Relation        NewHeap,
 744                                 OldHeap,
 745                                 OldIndex;
 746         TupleDesc       oldTupDesc;
 747         TupleDesc       newTupDesc;
 748         int                     natts;
 749         Datum      *values;
 750         bool       *isnull;
 751         IndexScanDesc scan;
 752         HeapTuple       tuple;
 753         bool            use_wal;
 754         TransactionId OldestXmin;
 755         TransactionId FreezeXid;
 756         RewriteState rwstate;
 757
 758         /*
 759          * Open the relations we need.
 760          */
 761         NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
 762         OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
 763         OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
 764
 765         /*
 766          * Their tuple descriptors should be exactly alike, but here we only need
 767          * assume that they have the same number of columns.
 768          */
 769         oldTupDesc = RelationGetDescr(OldHeap);
 770         newTupDesc = RelationGetDescr(NewHeap);
 771         Assert(newTupDesc->natts == oldTupDesc->natts);
 772
 773         /* Preallocate values/isnull arrays */
 774         natts = newTupDesc->natts;
 775         values = (Datum *) palloc(natts * sizeof(Datum));
 776         isnull = (bool *) palloc(natts * sizeof(bool));
 777
 778         /*
 779          * We need to log the copied data in WAL iff WAL archiving is enabled AND
 780          * it's not a temp rel.
 781          */
 782         use_wal = XLogArchivingActive() && !NewHeap->rd_istemp;
 783
 784         /* use_wal off requires rd_targblock be initially invalid */
 785         Assert(NewHeap->rd_targblock == InvalidBlockNumber);
 786
 787         /*
 788          * compute xids used to freeze and weed out dead tuples.  We use -1
 789          * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
 790          * plain VACUUM would.
 791          */
 792         vacuum_set_xid_limits(-1, -1, OldHeap->rd_rel->relisshared,
 793                                                   &OldestXmin, &FreezeXid, NULL);
 794
 795         /*
 796          * FreezeXid will become the table's new relfrozenxid, and that mustn't
 797          * go backwards, so take the max.
 798          */
 799         if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
 800                 FreezeXid = OldHeap->rd_rel->relfrozenxid;
 801
 802         /* Initialize the rewrite operation */
 803         rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
 804
 805         /*
 806          * Scan through the OldHeap in OldIndex order and copy each tuple into the
 807          * NewHeap.  To ensure we see recently-dead tuples that still need to be
 808          * copied, we scan with SnapshotAny and use HeapTupleSatisfiesVacuum for
 809          * the visibility test.
 810          */
 811         scan = index_beginscan(OldHeap, OldIndex,
 812                                                    SnapshotAny, 0, (ScanKey) NULL);
 813
 814         while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
 815         {
 816                 HeapTuple       copiedTuple;
 817                 bool            isdead;
 818                 int                     i;
 819
 820                 CHECK_FOR_INTERRUPTS();
 821
 822                 /* Since we used no scan keys, should never need to recheck */
 823                 if (scan->xs_recheck)
 824                         elog(ERROR, "CLUSTER does not support lossy index conditions");
 825
 826                 LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
 827
 828                 switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin,
 829                                                                                  scan->xs_cbuf))
 830                 {
 831                         case HEAPTUPLE_DEAD:
 832                                 /* Definitely dead */
 833                                 isdead = true;
 834                                 break;
 835                         case HEAPTUPLE_LIVE:
 836                         case HEAPTUPLE_RECENTLY_DEAD:
 837                                 /* Live or recently dead, must copy it */
 838                                 isdead = false;
 839                                 break;
 840                         case HEAPTUPLE_INSERT_IN_PROGRESS:
 841
 842                                 /*
 843                                  * We should not see this unless it's been inserted earlier in
 844                                  * our own transaction.
 845                                  */
 846                                 if (!TransactionIdIsCurrentTransactionId(
 847                                                                           HeapTupleHeaderGetXmin(tuple->t_data)))
 848                                         elog(ERROR, "concurrent insert in progress");
 849                                 /* treat as live */
 850                                 isdead = false;
 851                                 break;
 852                         case HEAPTUPLE_DELETE_IN_PROGRESS:
 853
 854                                 /*
 855                                  * We should not see this unless it's been deleted earlier in
 856                                  * our own transaction.
 857                                  */
 858                                 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
 859                                 if (!TransactionIdIsCurrentTransactionId(
 860                                                                           HeapTupleHeaderGetXmax(tuple->t_data)))
 861                                         elog(ERROR, "concurrent delete in progress");
 862                                 /* treat as recently dead */
 863                                 isdead = false;
 864                                 break;
 865                         default:
 866                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
 867                                 isdead = false; /* keep compiler quiet */
 868                                 break;
 869                 }
 870
 871                 LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
 872
 873                 if (isdead)
 874                 {
 875                         /* heap rewrite module still needs to see it... */
 876                         rewrite_heap_dead_tuple(rwstate, tuple);
 877                         continue;
 878                 }
 879
 880                 /*
 881                  * We cannot simply copy the tuple as-is, for several reasons:
 882                  *
 883                  * 1. We'd like to squeeze out the values of any dropped columns, both
 884                  * to save space and to ensure we have no corner-case failures. (It's
 885                  * possible for example that the new table hasn't got a TOAST table
 886                  * and so is unable to store any large values of dropped cols.)
 887                  *
 888                  * 2. The tuple might not even be legal for the new table; this is
 889                  * currently only known to happen as an after-effect of ALTER TABLE
 890                  * SET WITHOUT OIDS.
 891                  *
 892                  * So, we must reconstruct the tuple from component Datums.
 893                  */
 894                 heap_deform_tuple(tuple, oldTupDesc, values, isnull);
 895
 896                 /* Be sure to null out any dropped columns */
 897                 for (i = 0; i < natts; i++)
 898                 {
 899                         if (newTupDesc->attrs[i]->attisdropped)
 900                                 isnull[i] = true;
 901                 }
 902
 903                 copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
 904
 905                 /* Preserve OID, if any */
 906                 if (NewHeap->rd_rel->relhasoids)
 907                         HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
 908
 909                 /* The heap rewrite module does the rest */
 910                 rewrite_heap_tuple(rwstate, tuple, copiedTuple);
 911
 912                 heap_freetuple(copiedTuple);
 913         }
 914
 915         index_endscan(scan);
 916
 917         /* Write out any remaining tuples, and fsync if needed */
 918         end_heap_rewrite(rwstate);
 919
 920         pfree(values);
 921         pfree(isnull);
 922
 923         index_close(OldIndex, NoLock);
 924         heap_close(OldHeap, NoLock);
 925         heap_close(NewHeap, NoLock);
 926
 927         return FreezeXid;
 928 }
 929
 930 /*
 931  * Swap the physical files of two given relations.
 932  *
 933  * We swap the physical identity (reltablespace and relfilenode) while
 934  * keeping the same logical identities of the two relations.
 935  *
 936  * Also swap any TOAST links, so that the toast data moves along with
 937  * the main-table data.
 938  *
 939  * Additionally, the first relation is marked with relfrozenxid set to
 940  * frozenXid.  It seems a bit ugly to have this here, but all callers would
 941  * have to do it anyway, so having it here saves a heap_update.  Note: the
 942  * TOAST table needs no special handling, because since we swapped the links,
 943  * the entry for the TOAST table will now contain RecentXmin in relfrozenxid,
 944  * which is the correct value.
 945  */
 946 void
 947 swap_relation_files(Oid r1, Oid r2, TransactionId frozenXid)
 948 {
 949         Relation        relRelation;
 950         HeapTuple       reltup1,
 951                                 reltup2;
 952         Form_pg_class relform1,
 953                                 relform2;
 954         Oid                     swaptemp;
 955         CatalogIndexState indstate;
 956
 957         /* We need writable copies of both pg_class tuples. */
 958         relRelation = heap_open(RelationRelationId, RowExclusiveLock);
 959
 960         reltup1 = SearchSysCacheCopy(RELOID,
 961                                                                  ObjectIdGetDatum(r1),
 962                                                                  0, 0, 0);
 963         if (!HeapTupleIsValid(reltup1))
 964                 elog(ERROR, "cache lookup failed for relation %u", r1);
 965         relform1 = (Form_pg_class) GETSTRUCT(reltup1);
 966
 967         reltup2 = SearchSysCacheCopy(RELOID,
 968                                                                  ObjectIdGetDatum(r2),
 969                                                                  0, 0, 0);
 970         if (!HeapTupleIsValid(reltup2))
 971                 elog(ERROR, "cache lookup failed for relation %u", r2);
 972         relform2 = (Form_pg_class) GETSTRUCT(reltup2);
 973
 974         /*
 975          * Actually swap the fields in the two tuples
 976          */
 977         swaptemp = relform1->relfilenode;
 978         relform1->relfilenode = relform2->relfilenode;
 979         relform2->relfilenode = swaptemp;
 980
 981         swaptemp = relform1->reltablespace;
 982         relform1->reltablespace = relform2->reltablespace;
 983         relform2->reltablespace = swaptemp;
 984
 985         swaptemp = relform1->reltoastrelid;
 986         relform1->reltoastrelid = relform2->reltoastrelid;
 987         relform2->reltoastrelid = swaptemp;
 988
 989         /* we should not swap reltoastidxid */
 990
 991         /* set rel1's frozen Xid */
 992         Assert(TransactionIdIsNormal(frozenXid));
 993         relform1->relfrozenxid = frozenXid;
 994
 995         /* swap size statistics too, since new rel has freshly-updated stats */
 996         {
 997                 int4            swap_pages;
 998                 float4          swap_tuples;
 999
1000                 swap_pages = relform1->relpages;
1001                 relform1->relpages = relform2->relpages;
1002                 relform2->relpages = swap_pages;
1003
1004                 swap_tuples = relform1->reltuples;
1005                 relform1->reltuples = relform2->reltuples;
1006                 relform2->reltuples = swap_tuples;
1007         }
1008
1009         /* Update the tuples in pg_class */
1010         simple_heap_update(relRelation, &reltup1->t_self, reltup1);
1011         simple_heap_update(relRelation, &reltup2->t_self, reltup2);
1012
1013         /* Keep system catalogs current */
1014         indstate = CatalogOpenIndexes(relRelation);
1015         CatalogIndexInsert(indstate, reltup1);
1016         CatalogIndexInsert(indstate, reltup2);
1017         CatalogCloseIndexes(indstate);
1018
1019         /*
1020          * If we have toast tables associated with the relations being swapped,
1021          * change their dependency links to re-associate them with their new
1022          * owning relations.  Otherwise the wrong one will get dropped ...
1023          *
1024          * NOTE: it is possible that only one table has a toast table; this can
1025          * happen in CLUSTER if there were dropped columns in the old table, and
1026          * in ALTER TABLE when adding or changing type of columns.
1027          *
1028          * NOTE: at present, a TOAST table's only dependency is the one on its
1029          * owning table.  If more are ever created, we'd need to use something
1030          * more selective than deleteDependencyRecordsFor() to get rid of only the
1031          * link we want.
1032          */
1033         if (relform1->reltoastrelid || relform2->reltoastrelid)
1034         {
1035                 ObjectAddress baseobject,
1036                                         toastobject;
1037                 long            count;
1038
1039                 /* Delete old dependencies */
1040                 if (relform1->reltoastrelid)
1041                 {
1042                         count = deleteDependencyRecordsFor(RelationRelationId,
1043                                                                                            relform1->reltoastrelid);
1044                         if (count != 1)
1045                                 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1046                                          count);
1047                 }
1048                 if (relform2->reltoastrelid)
1049                 {
1050                         count = deleteDependencyRecordsFor(RelationRelationId,
1051                                                                                            relform2->reltoastrelid);
1052                         if (count != 1)
1053                                 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1054                                          count);
1055                 }
1056
1057                 /* Register new dependencies */
1058                 baseobject.classId = RelationRelationId;
1059                 baseobject.objectSubId = 0;
1060                 toastobject.classId = RelationRelationId;
1061                 toastobject.objectSubId = 0;
1062
1063                 if (relform1->reltoastrelid)
1064                 {
1065                         baseobject.objectId = r1;
1066                         toastobject.objectId = relform1->reltoastrelid;
1067                         recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1068                 }
1069
1070                 if (relform2->reltoastrelid)
1071                 {
1072                         baseobject.objectId = r2;
1073                         toastobject.objectId = relform2->reltoastrelid;
1074                         recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1075                 }
1076         }
1077
1078         /*
1079          * Blow away the old relcache entries now.      We need this kluge because
1080          * relcache.c keeps a link to the smgr relation for the physical file, and
1081          * that will be out of date as soon as we do CommandCounterIncrement.
1082          * Whichever of the rels is the second to be cleared during cache
1083          * invalidation will have a dangling reference to an already-deleted smgr
1084          * relation.  Rather than trying to avoid this by ordering operations just
1085          * so, it's easiest to not have the relcache entries there at all.
1086          * (Fortunately, since one of the entries is local in our transaction,
1087          * it's sufficient to clear out our own relcache this way; the problem
1088          * cannot arise for other backends when they see our update on the
1089          * non-local relation.)
1090          */
1091         RelationForgetRelation(r1);
1092         RelationForgetRelation(r2);
1093
1094         /* Clean up. */
1095         heap_freetuple(reltup1);
1096         heap_freetuple(reltup2);
1097
1098         heap_close(relRelation, RowExclusiveLock);
1099 }
1100
1101 /*
1102  * Get a list of tables that the current user owns and
1103  * have indisclustered set.  Return the list in a List * of rvsToCluster
1104  * with the tableOid and the indexOid on which the table is already
1105  * clustered.
1106  */
1107 static List *
1108 get_tables_to_cluster(MemoryContext cluster_context)
1109 {
1110         Relation        indRelation;
1111         HeapScanDesc scan;
1112         ScanKeyData entry;
1113         HeapTuple       indexTuple;
1114         Form_pg_index index;
1115         MemoryContext old_context;
1116         RelToCluster *rvtc;
1117         List       *rvs = NIL;
1118
1119         /*
1120          * Get all indexes that have indisclustered set and are owned by
1121          * appropriate user. System relations or nailed-in relations cannot ever
1122          * have indisclustered set, because CLUSTER will refuse to set it when
1123          * called with one of them as argument.
1124          */
1125         indRelation = heap_open(IndexRelationId, AccessShareLock);
1126         ScanKeyInit(&entry,
1127                                 Anum_pg_index_indisclustered,
1128                                 BTEqualStrategyNumber, F_BOOLEQ,
1129                                 BoolGetDatum(true));
1130         scan = heap_beginscan(indRelation, SnapshotNow, 1, &entry);
1131         while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1132         {
1133                 index = (Form_pg_index) GETSTRUCT(indexTuple);
1134
1135                 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1136                         continue;
1137
1138                 /*
1139                  * We have to build the list in a different memory context so it will
1140                  * survive the cross-transaction processing
1141                  */
1142                 old_context = MemoryContextSwitchTo(cluster_context);
1143
1144                 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1145                 rvtc->tableOid = index->indrelid;
1146                 rvtc->indexOid = index->indexrelid;
1147                 rvs = lcons(rvtc, rvs);
1148
1149                 MemoryContextSwitchTo(old_context);
1150         }
1151         heap_endscan(scan);
1152
1153         relation_close(indRelation, AccessShareLock);
1154
1155         return rvs;
1156 }