Add vacuum_freeze_table_age GUC option, to control when VACUUM should
[PostgreSQL.git] / src / backend / commands / cluster.c
blobc7f464900c7a1c50e4d65e5ef55a17b439b9a1e3
/*-------------------------------------------------------------------------
 *
 * cluster.c
 *    CLUSTER a table on an index.
 *
 * There is hardly anything left of Paul Brown's original implementation...
 *
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
18 #include "postgres.h"
20 #include "access/genam.h"
21 #include "access/heapam.h"
22 #include "access/relscan.h"
23 #include "access/rewriteheap.h"
24 #include "access/transam.h"
25 #include "access/xact.h"
26 #include "catalog/catalog.h"
27 #include "catalog/dependency.h"
28 #include "catalog/heap.h"
29 #include "catalog/index.h"
30 #include "catalog/indexing.h"
31 #include "catalog/namespace.h"
32 #include "catalog/pg_namespace.h"
33 #include "catalog/toasting.h"
34 #include "commands/cluster.h"
35 #include "commands/tablecmds.h"
36 #include "commands/trigger.h"
37 #include "commands/vacuum.h"
38 #include "miscadmin.h"
39 #include "storage/bufmgr.h"
40 #include "storage/procarray.h"
41 #include "utils/acl.h"
42 #include "utils/fmgroids.h"
43 #include "utils/inval.h"
44 #include "utils/lsyscache.h"
45 #include "utils/memutils.h"
46 #include "utils/relcache.h"
47 #include "utils/snapmgr.h"
48 #include "utils/syscache.h"
49 #include "utils/tqual.h"
53 * This struct is used to pass around the information on tables to be
54 * clustered. We need this so we can make a list of them when invoked without
55 * a specific table/index pair.
57 typedef struct
59 Oid tableOid;
60 Oid indexOid;
61 } RelToCluster;
64 static void cluster_rel(RelToCluster *rv, bool recheck, bool verbose);
65 static void rebuild_relation(Relation OldHeap, Oid indexOid);
66 static TransactionId copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
67 static List *get_tables_to_cluster(MemoryContext cluster_context);
71 /*---------------------------------------------------------------------------
72 * This cluster code allows for clustering multiple tables at once. Because
73 * of this, we cannot just run everything on a single transaction, or we
74 * would be forced to acquire exclusive locks on all the tables being
75 * clustered, simultaneously --- very likely leading to deadlock.
77 * To solve this we follow a similar strategy to VACUUM code,
78 * clustering each relation in a separate transaction. For this to work,
79 * we need to:
80 * - provide a separate memory context so that we can pass information in
81 * a way that survives across transactions
82 * - start a new transaction every time a new relation is clustered
83 * - check for validity of the information on to-be-clustered relations,
84 * as someone might have deleted a relation behind our back, or
85 * clustered one on a different index
86 * - end the transaction
88 * The single-relation case does not have any such overhead.
90 * We also allow a relation to be specified without index. In that case,
91 * the indisclustered bit will be looked up, and an ERROR will be thrown
92 * if there is no index with the bit set.
93 *---------------------------------------------------------------------------
95 void
96 cluster(ClusterStmt *stmt, bool isTopLevel)
98 if (stmt->relation != NULL)
100 /* This is the single-relation case. */
101 Oid tableOid,
102 indexOid = InvalidOid;
103 Relation rel;
104 RelToCluster rvtc;
106 /* Find and lock the table */
107 rel = heap_openrv(stmt->relation, AccessExclusiveLock);
109 tableOid = RelationGetRelid(rel);
111 /* Check permissions */
112 if (!pg_class_ownercheck(tableOid, GetUserId()))
113 aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
114 RelationGetRelationName(rel));
117 * Reject clustering a remote temp table ... their local buffer
118 * manager is not going to cope.
120 if (isOtherTempNamespace(RelationGetNamespace(rel)))
121 ereport(ERROR,
122 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
123 errmsg("cannot cluster temporary tables of other sessions")));
125 if (stmt->indexname == NULL)
127 ListCell *index;
129 /* We need to find the index that has indisclustered set. */
130 foreach(index, RelationGetIndexList(rel))
132 HeapTuple idxtuple;
133 Form_pg_index indexForm;
135 indexOid = lfirst_oid(index);
136 idxtuple = SearchSysCache(INDEXRELID,
137 ObjectIdGetDatum(indexOid),
138 0, 0, 0);
139 if (!HeapTupleIsValid(idxtuple))
140 elog(ERROR, "cache lookup failed for index %u", indexOid);
141 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
142 if (indexForm->indisclustered)
144 ReleaseSysCache(idxtuple);
145 break;
147 ReleaseSysCache(idxtuple);
148 indexOid = InvalidOid;
151 if (!OidIsValid(indexOid))
152 ereport(ERROR,
153 (errcode(ERRCODE_UNDEFINED_OBJECT),
154 errmsg("there is no previously clustered index for table \"%s\"",
155 stmt->relation->relname)));
157 else
160 * The index is expected to be in the same namespace as the
161 * relation.
163 indexOid = get_relname_relid(stmt->indexname,
164 rel->rd_rel->relnamespace);
165 if (!OidIsValid(indexOid))
166 ereport(ERROR,
167 (errcode(ERRCODE_UNDEFINED_OBJECT),
168 errmsg("index \"%s\" for table \"%s\" does not exist",
169 stmt->indexname, stmt->relation->relname)));
172 /* All other checks are done in cluster_rel() */
173 rvtc.tableOid = tableOid;
174 rvtc.indexOid = indexOid;
176 /* close relation, keep lock till commit */
177 heap_close(rel, NoLock);
179 /* Do the job */
180 cluster_rel(&rvtc, false, stmt->verbose);
182 else
185 * This is the "multi relation" case. We need to cluster all tables
186 * that have some index with indisclustered set.
188 MemoryContext cluster_context;
189 List *rvs;
190 ListCell *rv;
193 * We cannot run this form of CLUSTER inside a user transaction block;
194 * we'd be holding locks way too long.
196 PreventTransactionChain(isTopLevel, "CLUSTER");
199 * Create special memory context for cross-transaction storage.
201 * Since it is a child of PortalContext, it will go away even in case
202 * of error.
204 cluster_context = AllocSetContextCreate(PortalContext,
205 "Cluster",
206 ALLOCSET_DEFAULT_MINSIZE,
207 ALLOCSET_DEFAULT_INITSIZE,
208 ALLOCSET_DEFAULT_MAXSIZE);
211 * Build the list of relations to cluster. Note that this lives in
212 * cluster_context.
214 rvs = get_tables_to_cluster(cluster_context);
216 /* Commit to get out of starting transaction */
217 PopActiveSnapshot();
218 CommitTransactionCommand();
220 /* Ok, now that we've got them all, cluster them one by one */
221 foreach(rv, rvs)
223 RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
225 /* Start a new transaction for each relation. */
226 StartTransactionCommand();
227 /* functions in indexes may want a snapshot set */
228 PushActiveSnapshot(GetTransactionSnapshot());
229 cluster_rel(rvtc, true, stmt->verbose);
230 PopActiveSnapshot();
231 CommitTransactionCommand();
234 /* Start a new transaction for the cleanup work. */
235 StartTransactionCommand();
237 /* Clean up working storage */
238 MemoryContextDelete(cluster_context);
243 * cluster_rel
245 * This clusters the table by creating a new, clustered table and
246 * swapping the relfilenodes of the new table and the old table, so
247 * the OID of the original table is preserved. Thus we do not lose
248 * GRANT, inheritance nor references to this table (this was a bug
249 * in releases thru 7.3).
251 * Also create new indexes and swap the filenodes with the old indexes the
252 * same way we do for the relation. Since we are effectively bulk-loading
253 * the new table, it's better to create the indexes afterwards than to fill
254 * them incrementally while we load the table.
256 static void
257 cluster_rel(RelToCluster *rvtc, bool recheck, bool verbose)
259 Relation OldHeap;
261 /* Check for user-requested abort. */
262 CHECK_FOR_INTERRUPTS();
265 * We grab exclusive access to the target rel and index for the duration
266 * of the transaction. (This is redundant for the single-transaction
267 * case, since cluster() already did it.) The index lock is taken inside
268 * check_index_is_clusterable.
270 OldHeap = try_relation_open(rvtc->tableOid, AccessExclusiveLock);
272 /* If the table has gone away, we can skip processing it */
273 if (!OldHeap)
274 return;
277 * Since we may open a new transaction for each relation, we have to check
278 * that the relation still is what we think it is.
280 * If this is a single-transaction CLUSTER, we can skip these tests. We
281 * *must* skip the one on indisclustered since it would reject an attempt
282 * to cluster a not-previously-clustered index.
284 if (recheck)
286 HeapTuple tuple;
287 Form_pg_index indexForm;
289 /* Check that the user still owns the relation */
290 if (!pg_class_ownercheck(rvtc->tableOid, GetUserId()))
292 relation_close(OldHeap, AccessExclusiveLock);
293 return;
297 * Silently skip a temp table for a remote session. Only doing this
298 * check in the "recheck" case is appropriate (which currently means
299 * somebody is executing a database-wide CLUSTER), because there is
300 * another check in cluster() which will stop any attempt to cluster
301 * remote temp tables by name. There is another check in
302 * check_index_is_clusterable which is redundant, but we leave it for
303 * extra safety.
305 if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
307 relation_close(OldHeap, AccessExclusiveLock);
308 return;
312 * Check that the index still exists
314 if (!SearchSysCacheExists(RELOID,
315 ObjectIdGetDatum(rvtc->indexOid),
316 0, 0, 0))
318 relation_close(OldHeap, AccessExclusiveLock);
319 return;
323 * Check that the index is still the one with indisclustered set.
325 tuple = SearchSysCache(INDEXRELID,
326 ObjectIdGetDatum(rvtc->indexOid),
327 0, 0, 0);
328 if (!HeapTupleIsValid(tuple)) /* probably can't happen */
330 relation_close(OldHeap, AccessExclusiveLock);
331 return;
333 indexForm = (Form_pg_index) GETSTRUCT(tuple);
334 if (!indexForm->indisclustered)
336 ReleaseSysCache(tuple);
337 relation_close(OldHeap, AccessExclusiveLock);
338 return;
340 ReleaseSysCache(tuple);
343 /* Check index is valid to cluster on */
344 check_index_is_clusterable(OldHeap, rvtc->indexOid, recheck);
346 /* rebuild_relation does all the dirty work */
347 ereport(verbose ? INFO : DEBUG2,
348 (errmsg("clustering \"%s.%s\"",
349 get_namespace_name(RelationGetNamespace(OldHeap)),
350 RelationGetRelationName(OldHeap))));
351 rebuild_relation(OldHeap, rvtc->indexOid);
353 /* NB: rebuild_relation does heap_close() on OldHeap */
357 * Verify that the specified index is a legitimate index to cluster on
359 * Side effect: obtains exclusive lock on the index. The caller should
360 * already have exclusive lock on the table, so the index lock is likely
361 * redundant, but it seems best to grab it anyway to ensure the index
362 * definition can't change under us.
364 void
365 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
367 Relation OldIndex;
369 OldIndex = index_open(indexOid, AccessExclusiveLock);
372 * Check that index is in fact an index on the given relation
374 if (OldIndex->rd_index == NULL ||
375 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
376 ereport(ERROR,
377 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
378 errmsg("\"%s\" is not an index for table \"%s\"",
379 RelationGetRelationName(OldIndex),
380 RelationGetRelationName(OldHeap))));
383 * Disallow clustering on incomplete indexes (those that might not index
384 * every row of the relation). We could relax this by making a separate
385 * seqscan pass over the table to copy the missing rows, but that seems
386 * expensive and tedious.
388 if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
389 ereport(ERROR,
390 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
391 errmsg("cannot cluster on partial index \"%s\"",
392 RelationGetRelationName(OldIndex))));
394 if (!OldIndex->rd_am->amclusterable)
395 ereport(ERROR,
396 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
397 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
398 RelationGetRelationName(OldIndex))));
400 if (!OldIndex->rd_am->amindexnulls)
402 AttrNumber colno;
405 * If the AM doesn't index nulls, then it's a partial index unless we
406 * can prove all the rows are non-null. Note we only need look at the
407 * first column; multicolumn-capable AMs are *required* to index nulls
408 * in columns after the first.
410 colno = OldIndex->rd_index->indkey.values[0];
411 if (colno > 0)
413 /* ordinary user attribute */
414 if (!OldHeap->rd_att->attrs[colno - 1]->attnotnull)
415 ereport(ERROR,
416 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
417 errmsg("cannot cluster on index \"%s\" because access method does not handle null values",
418 RelationGetRelationName(OldIndex)),
419 recheck
420 ? errhint("You might be able to work around this by marking column \"%s\" NOT NULL, or use ALTER TABLE ... SET WITHOUT CLUSTER to remove the cluster specification from the table.",
421 NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))
422 : errhint("You might be able to work around this by marking column \"%s\" NOT NULL.",
423 NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))));
425 else if (colno < 0)
427 /* system column --- okay, always non-null */
429 else
430 /* index expression, lose... */
431 ereport(ERROR,
432 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
433 errmsg("cannot cluster on expressional index \"%s\" because its index access method does not handle null values",
434 RelationGetRelationName(OldIndex))));
438 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
439 * it might well not contain entries for every heap row, or might not even
440 * be internally consistent. (But note that we don't check indcheckxmin;
441 * the worst consequence of following broken HOT chains would be that we
442 * might put recently-dead tuples out-of-order in the new table, and there
443 * is little harm in that.)
445 if (!OldIndex->rd_index->indisvalid)
446 ereport(ERROR,
447 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
448 errmsg("cannot cluster on invalid index \"%s\"",
449 RelationGetRelationName(OldIndex))));
452 * Disallow clustering system relations. This will definitely NOT work
453 * for shared relations (we have no way to update pg_class rows in other
454 * databases), nor for nailed-in-cache relations (the relfilenode values
455 * for those are hardwired, see relcache.c). It might work for other
456 * system relations, but I ain't gonna risk it.
458 if (IsSystemRelation(OldHeap))
459 ereport(ERROR,
460 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
461 errmsg("\"%s\" is a system catalog",
462 RelationGetRelationName(OldHeap))));
465 * Don't allow cluster on temp tables of other backends ... their local
466 * buffer manager is not going to cope.
468 if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
469 ereport(ERROR,
470 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
471 errmsg("cannot cluster temporary tables of other sessions")));
474 * Also check for active uses of the relation in the current transaction,
475 * including open scans and pending AFTER trigger events.
477 CheckTableNotInUse(OldHeap, "CLUSTER");
479 /* Drop relcache refcnt on OldIndex, but keep lock */
480 index_close(OldIndex, NoLock);
484 * mark_index_clustered: mark the specified index as the one clustered on
486 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
488 void
489 mark_index_clustered(Relation rel, Oid indexOid)
491 HeapTuple indexTuple;
492 Form_pg_index indexForm;
493 Relation pg_index;
494 ListCell *index;
497 * If the index is already marked clustered, no need to do anything.
499 if (OidIsValid(indexOid))
501 indexTuple = SearchSysCache(INDEXRELID,
502 ObjectIdGetDatum(indexOid),
503 0, 0, 0);
504 if (!HeapTupleIsValid(indexTuple))
505 elog(ERROR, "cache lookup failed for index %u", indexOid);
506 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
508 if (indexForm->indisclustered)
510 ReleaseSysCache(indexTuple);
511 return;
514 ReleaseSysCache(indexTuple);
518 * Check each index of the relation and set/clear the bit as needed.
520 pg_index = heap_open(IndexRelationId, RowExclusiveLock);
522 foreach(index, RelationGetIndexList(rel))
524 Oid thisIndexOid = lfirst_oid(index);
526 indexTuple = SearchSysCacheCopy(INDEXRELID,
527 ObjectIdGetDatum(thisIndexOid),
528 0, 0, 0);
529 if (!HeapTupleIsValid(indexTuple))
530 elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
531 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
534 * Unset the bit if set. We know it's wrong because we checked this
535 * earlier.
537 if (indexForm->indisclustered)
539 indexForm->indisclustered = false;
540 simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
541 CatalogUpdateIndexes(pg_index, indexTuple);
542 /* Ensure we see the update in the index's relcache entry */
543 CacheInvalidateRelcacheByRelid(thisIndexOid);
545 else if (thisIndexOid == indexOid)
547 indexForm->indisclustered = true;
548 simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
549 CatalogUpdateIndexes(pg_index, indexTuple);
550 /* Ensure we see the update in the index's relcache entry */
551 CacheInvalidateRelcacheByRelid(thisIndexOid);
553 heap_freetuple(indexTuple);
556 heap_close(pg_index, RowExclusiveLock);
560 * rebuild_relation: rebuild an existing relation in index order
562 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
563 * indexOid: index to cluster by
565 * NB: this routine closes OldHeap at the right time; caller should not.
567 static void
568 rebuild_relation(Relation OldHeap, Oid indexOid)
570 Oid tableOid = RelationGetRelid(OldHeap);
571 Oid tableSpace = OldHeap->rd_rel->reltablespace;
572 Oid OIDNewHeap;
573 char NewHeapName[NAMEDATALEN];
574 TransactionId frozenXid;
575 ObjectAddress object;
576 Relation newrel;
578 /* Mark the correct index as clustered */
579 mark_index_clustered(OldHeap, indexOid);
581 /* Close relcache entry, but keep lock until transaction commit */
582 heap_close(OldHeap, NoLock);
585 * Create the new heap, using a temporary name in the same namespace as
586 * the existing table. NOTE: there is some risk of collision with user
587 * relnames. Working around this seems more trouble than it's worth; in
588 * particular, we can't create the new heap in a different namespace from
589 * the old, or we will have problems with the TEMP status of temp tables.
591 snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", tableOid);
593 OIDNewHeap = make_new_heap(tableOid, NewHeapName, tableSpace);
596 * We don't need CommandCounterIncrement() because make_new_heap did it.
600 * Copy the heap data into the new table in the desired order.
602 frozenXid = copy_heap_data(OIDNewHeap, tableOid, indexOid);
604 /* To make the new heap's data visible (probably not needed?). */
605 CommandCounterIncrement();
607 /* Swap the physical files of the old and new heaps. */
608 swap_relation_files(tableOid, OIDNewHeap, frozenXid);
610 CommandCounterIncrement();
612 /* Destroy new heap with old filenode */
613 object.classId = RelationRelationId;
614 object.objectId = OIDNewHeap;
615 object.objectSubId = 0;
618 * The new relation is local to our transaction and we know nothing
619 * depends on it, so DROP_RESTRICT should be OK.
621 performDeletion(&object, DROP_RESTRICT);
623 /* performDeletion does CommandCounterIncrement at end */
626 * Rebuild each index on the relation (but not the toast table, which is
627 * all-new at this point). We do not need CommandCounterIncrement()
628 * because reindex_relation does it.
630 reindex_relation(tableOid, false);
633 * At this point, everything is kosher except that the toast table's name
634 * corresponds to the temporary table. The name is irrelevant to
635 * the backend because it's referenced by OID, but users looking at the
636 * catalogs could be confused. Rename it to prevent this problem.
638 * Note no lock required on the relation, because we already hold an
639 * exclusive lock on it.
641 newrel = heap_open(tableOid, NoLock);
642 if (OidIsValid(newrel->rd_rel->reltoastrelid))
644 char NewToastName[NAMEDATALEN];
645 Relation toastrel;
647 /* rename the toast table ... */
648 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u", tableOid);
649 RenameRelationInternal(newrel->rd_rel->reltoastrelid, NewToastName,
650 PG_TOAST_NAMESPACE);
652 /* ... and its index too */
653 toastrel = relation_open(newrel->rd_rel->reltoastrelid, AccessShareLock);
654 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index", tableOid);
655 RenameRelationInternal(toastrel->rd_rel->reltoastidxid, NewToastName,
656 PG_TOAST_NAMESPACE);
657 relation_close(toastrel, AccessShareLock);
659 relation_close(newrel, NoLock);
663 * Create the new table that we will fill with correctly-ordered data.
666 make_new_heap(Oid OIDOldHeap, const char *NewName, Oid NewTableSpace)
668 TupleDesc OldHeapDesc,
669 tupdesc;
670 Oid OIDNewHeap;
671 Relation OldHeap;
672 HeapTuple tuple;
673 Datum reloptions;
674 bool isNull;
676 OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
677 OldHeapDesc = RelationGetDescr(OldHeap);
680 * Need to make a copy of the tuple descriptor, since
681 * heap_create_with_catalog modifies it. Note that the NewHeap will
682 * not receive any of the defaults or constraints associated with the
683 * OldHeap; we don't need 'em, and there's no reason to spend cycles
684 * inserting them into the catalogs only to delete them.
686 tupdesc = CreateTupleDescCopy(OldHeapDesc);
689 * Use options of the old heap for new heap.
691 tuple = SearchSysCache(RELOID,
692 ObjectIdGetDatum(OIDOldHeap),
693 0, 0, 0);
694 if (!HeapTupleIsValid(tuple))
695 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
696 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
697 &isNull);
698 if (isNull)
699 reloptions = (Datum) 0;
701 OIDNewHeap = heap_create_with_catalog(NewName,
702 RelationGetNamespace(OldHeap),
703 NewTableSpace,
704 InvalidOid,
705 OldHeap->rd_rel->relowner,
706 tupdesc,
707 NIL,
708 OldHeap->rd_rel->relkind,
709 OldHeap->rd_rel->relisshared,
710 true,
712 ONCOMMIT_NOOP,
713 reloptions,
714 allowSystemTableMods);
716 ReleaseSysCache(tuple);
719 * Advance command counter so that the newly-created relation's catalog
720 * tuples will be visible to heap_open.
722 CommandCounterIncrement();
725 * If necessary, create a TOAST table for the new relation. Note that
726 * AlterTableCreateToastTable ends with CommandCounterIncrement(), so that
727 * the TOAST table will be visible for insertion.
729 AlterTableCreateToastTable(OIDNewHeap);
731 heap_close(OldHeap, NoLock);
733 return OIDNewHeap;
737 * Do the physical copying of heap data. Returns the TransactionId used as
738 * freeze cutoff point for the tuples.
740 static TransactionId
741 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
743 Relation NewHeap,
744 OldHeap,
745 OldIndex;
746 TupleDesc oldTupDesc;
747 TupleDesc newTupDesc;
748 int natts;
749 Datum *values;
750 bool *isnull;
751 IndexScanDesc scan;
752 HeapTuple tuple;
753 bool use_wal;
754 TransactionId OldestXmin;
755 TransactionId FreezeXid;
756 RewriteState rwstate;
759 * Open the relations we need.
761 NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
762 OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
763 OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
766 * Their tuple descriptors should be exactly alike, but here we only need
767 * assume that they have the same number of columns.
769 oldTupDesc = RelationGetDescr(OldHeap);
770 newTupDesc = RelationGetDescr(NewHeap);
771 Assert(newTupDesc->natts == oldTupDesc->natts);
773 /* Preallocate values/isnull arrays */
774 natts = newTupDesc->natts;
775 values = (Datum *) palloc(natts * sizeof(Datum));
776 isnull = (bool *) palloc(natts * sizeof(bool));
779 * We need to log the copied data in WAL iff WAL archiving is enabled AND
780 * it's not a temp rel.
782 use_wal = XLogArchivingActive() && !NewHeap->rd_istemp;
784 /* use_wal off requires rd_targblock be initially invalid */
785 Assert(NewHeap->rd_targblock == InvalidBlockNumber);
788 * compute xids used to freeze and weed out dead tuples. We use -1
789 * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
790 * plain VACUUM would.
792 vacuum_set_xid_limits(-1, -1, OldHeap->rd_rel->relisshared,
793 &OldestXmin, &FreezeXid, NULL);
796 * FreezeXid will become the table's new relfrozenxid, and that mustn't
797 * go backwards, so take the max.
799 if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
800 FreezeXid = OldHeap->rd_rel->relfrozenxid;
802 /* Initialize the rewrite operation */
803 rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
806 * Scan through the OldHeap in OldIndex order and copy each tuple into the
807 * NewHeap. To ensure we see recently-dead tuples that still need to be
808 * copied, we scan with SnapshotAny and use HeapTupleSatisfiesVacuum for
809 * the visibility test.
811 scan = index_beginscan(OldHeap, OldIndex,
812 SnapshotAny, 0, (ScanKey) NULL);
814 while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
816 HeapTuple copiedTuple;
817 bool isdead;
818 int i;
820 CHECK_FOR_INTERRUPTS();
822 /* Since we used no scan keys, should never need to recheck */
823 if (scan->xs_recheck)
824 elog(ERROR, "CLUSTER does not support lossy index conditions");
826 LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
828 switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin,
829 scan->xs_cbuf))
831 case HEAPTUPLE_DEAD:
832 /* Definitely dead */
833 isdead = true;
834 break;
835 case HEAPTUPLE_LIVE:
836 case HEAPTUPLE_RECENTLY_DEAD:
837 /* Live or recently dead, must copy it */
838 isdead = false;
839 break;
840 case HEAPTUPLE_INSERT_IN_PROGRESS:
843 * We should not see this unless it's been inserted earlier in
844 * our own transaction.
846 if (!TransactionIdIsCurrentTransactionId(
847 HeapTupleHeaderGetXmin(tuple->t_data)))
848 elog(ERROR, "concurrent insert in progress");
849 /* treat as live */
850 isdead = false;
851 break;
852 case HEAPTUPLE_DELETE_IN_PROGRESS:
855 * We should not see this unless it's been deleted earlier in
856 * our own transaction.
858 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
859 if (!TransactionIdIsCurrentTransactionId(
860 HeapTupleHeaderGetXmax(tuple->t_data)))
861 elog(ERROR, "concurrent delete in progress");
862 /* treat as recently dead */
863 isdead = false;
864 break;
865 default:
866 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
867 isdead = false; /* keep compiler quiet */
868 break;
871 LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
873 if (isdead)
875 /* heap rewrite module still needs to see it... */
876 rewrite_heap_dead_tuple(rwstate, tuple);
877 continue;
881 * We cannot simply copy the tuple as-is, for several reasons:
883 * 1. We'd like to squeeze out the values of any dropped columns, both
884 * to save space and to ensure we have no corner-case failures. (It's
885 * possible for example that the new table hasn't got a TOAST table
886 * and so is unable to store any large values of dropped cols.)
888 * 2. The tuple might not even be legal for the new table; this is
889 * currently only known to happen as an after-effect of ALTER TABLE
890 * SET WITHOUT OIDS.
892 * So, we must reconstruct the tuple from component Datums.
894 heap_deform_tuple(tuple, oldTupDesc, values, isnull);
896 /* Be sure to null out any dropped columns */
897 for (i = 0; i < natts; i++)
899 if (newTupDesc->attrs[i]->attisdropped)
900 isnull[i] = true;
903 copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
905 /* Preserve OID, if any */
906 if (NewHeap->rd_rel->relhasoids)
907 HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
909 /* The heap rewrite module does the rest */
910 rewrite_heap_tuple(rwstate, tuple, copiedTuple);
912 heap_freetuple(copiedTuple);
915 index_endscan(scan);
917 /* Write out any remaining tuples, and fsync if needed */
918 end_heap_rewrite(rwstate);
920 pfree(values);
921 pfree(isnull);
923 index_close(OldIndex, NoLock);
924 heap_close(OldHeap, NoLock);
925 heap_close(NewHeap, NoLock);
927 return FreezeXid;
931 * Swap the physical files of two given relations.
933 * We swap the physical identity (reltablespace and relfilenode) while
934 * keeping the same logical identities of the two relations.
936 * Also swap any TOAST links, so that the toast data moves along with
937 * the main-table data.
939 * Additionally, the first relation is marked with relfrozenxid set to
940 * frozenXid. It seems a bit ugly to have this here, but all callers would
941 * have to do it anyway, so having it here saves a heap_update. Note: the
942 * TOAST table needs no special handling, because since we swapped the links,
943 * the entry for the TOAST table will now contain RecentXmin in relfrozenxid,
944 * which is the correct value.
946 void
947 swap_relation_files(Oid r1, Oid r2, TransactionId frozenXid)
949 Relation relRelation;
950 HeapTuple reltup1,
951 reltup2;
952 Form_pg_class relform1,
953 relform2;
954 Oid swaptemp;
955 CatalogIndexState indstate;
957 /* We need writable copies of both pg_class tuples. */
958 relRelation = heap_open(RelationRelationId, RowExclusiveLock);
960 reltup1 = SearchSysCacheCopy(RELOID,
961 ObjectIdGetDatum(r1),
962 0, 0, 0);
963 if (!HeapTupleIsValid(reltup1))
964 elog(ERROR, "cache lookup failed for relation %u", r1);
965 relform1 = (Form_pg_class) GETSTRUCT(reltup1);
967 reltup2 = SearchSysCacheCopy(RELOID,
968 ObjectIdGetDatum(r2),
969 0, 0, 0);
970 if (!HeapTupleIsValid(reltup2))
971 elog(ERROR, "cache lookup failed for relation %u", r2);
972 relform2 = (Form_pg_class) GETSTRUCT(reltup2);
975 * Actually swap the fields in the two tuples
977 swaptemp = relform1->relfilenode;
978 relform1->relfilenode = relform2->relfilenode;
979 relform2->relfilenode = swaptemp;
981 swaptemp = relform1->reltablespace;
982 relform1->reltablespace = relform2->reltablespace;
983 relform2->reltablespace = swaptemp;
985 swaptemp = relform1->reltoastrelid;
986 relform1->reltoastrelid = relform2->reltoastrelid;
987 relform2->reltoastrelid = swaptemp;
989 /* we should not swap reltoastidxid */
991 /* set rel1's frozen Xid */
992 Assert(TransactionIdIsNormal(frozenXid));
993 relform1->relfrozenxid = frozenXid;
995 /* swap size statistics too, since new rel has freshly-updated stats */
997 int4 swap_pages;
998 float4 swap_tuples;
1000 swap_pages = relform1->relpages;
1001 relform1->relpages = relform2->relpages;
1002 relform2->relpages = swap_pages;
1004 swap_tuples = relform1->reltuples;
1005 relform1->reltuples = relform2->reltuples;
1006 relform2->reltuples = swap_tuples;
1009 /* Update the tuples in pg_class */
1010 simple_heap_update(relRelation, &reltup1->t_self, reltup1);
1011 simple_heap_update(relRelation, &reltup2->t_self, reltup2);
1013 /* Keep system catalogs current */
1014 indstate = CatalogOpenIndexes(relRelation);
1015 CatalogIndexInsert(indstate, reltup1);
1016 CatalogIndexInsert(indstate, reltup2);
1017 CatalogCloseIndexes(indstate);
1020 * If we have toast tables associated with the relations being swapped,
1021 * change their dependency links to re-associate them with their new
1022 * owning relations. Otherwise the wrong one will get dropped ...
1024 * NOTE: it is possible that only one table has a toast table; this can
1025 * happen in CLUSTER if there were dropped columns in the old table, and
1026 * in ALTER TABLE when adding or changing type of columns.
1028 * NOTE: at present, a TOAST table's only dependency is the one on its
1029 * owning table. If more are ever created, we'd need to use something
1030 * more selective than deleteDependencyRecordsFor() to get rid of only the
1031 * link we want.
1033 if (relform1->reltoastrelid || relform2->reltoastrelid)
1035 ObjectAddress baseobject,
1036 toastobject;
1037 long count;
1039 /* Delete old dependencies */
1040 if (relform1->reltoastrelid)
1042 count = deleteDependencyRecordsFor(RelationRelationId,
1043 relform1->reltoastrelid);
1044 if (count != 1)
1045 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1046 count);
1048 if (relform2->reltoastrelid)
1050 count = deleteDependencyRecordsFor(RelationRelationId,
1051 relform2->reltoastrelid);
1052 if (count != 1)
1053 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1054 count);
1057 /* Register new dependencies */
1058 baseobject.classId = RelationRelationId;
1059 baseobject.objectSubId = 0;
1060 toastobject.classId = RelationRelationId;
1061 toastobject.objectSubId = 0;
1063 if (relform1->reltoastrelid)
1065 baseobject.objectId = r1;
1066 toastobject.objectId = relform1->reltoastrelid;
1067 recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1070 if (relform2->reltoastrelid)
1072 baseobject.objectId = r2;
1073 toastobject.objectId = relform2->reltoastrelid;
1074 recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1079 * Blow away the old relcache entries now. We need this kluge because
1080 * relcache.c keeps a link to the smgr relation for the physical file, and
1081 * that will be out of date as soon as we do CommandCounterIncrement.
1082 * Whichever of the rels is the second to be cleared during cache
1083 * invalidation will have a dangling reference to an already-deleted smgr
1084 * relation. Rather than trying to avoid this by ordering operations just
1085 * so, it's easiest to not have the relcache entries there at all.
1086 * (Fortunately, since one of the entries is local in our transaction,
1087 * it's sufficient to clear out our own relcache this way; the problem
1088 * cannot arise for other backends when they see our update on the
1089 * non-local relation.)
1091 RelationForgetRelation(r1);
1092 RelationForgetRelation(r2);
1094 /* Clean up. */
1095 heap_freetuple(reltup1);
1096 heap_freetuple(reltup2);
1098 heap_close(relRelation, RowExclusiveLock);
1102 * Get a list of tables that the current user owns and
1103 * have indisclustered set. Return the list in a List * of rvsToCluster
1104 * with the tableOid and the indexOid on which the table is already
1105 * clustered.
1107 static List *
1108 get_tables_to_cluster(MemoryContext cluster_context)
1110 Relation indRelation;
1111 HeapScanDesc scan;
1112 ScanKeyData entry;
1113 HeapTuple indexTuple;
1114 Form_pg_index index;
1115 MemoryContext old_context;
1116 RelToCluster *rvtc;
1117 List *rvs = NIL;
1120 * Get all indexes that have indisclustered set and are owned by
1121 * appropriate user. System relations or nailed-in relations cannot ever
1122 * have indisclustered set, because CLUSTER will refuse to set it when
1123 * called with one of them as argument.
1125 indRelation = heap_open(IndexRelationId, AccessShareLock);
1126 ScanKeyInit(&entry,
1127 Anum_pg_index_indisclustered,
1128 BTEqualStrategyNumber, F_BOOLEQ,
1129 BoolGetDatum(true));
1130 scan = heap_beginscan(indRelation, SnapshotNow, 1, &entry);
1131 while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1133 index = (Form_pg_index) GETSTRUCT(indexTuple);
1135 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1136 continue;
1139 * We have to build the list in a different memory context so it will
1140 * survive the cross-transaction processing
1142 old_context = MemoryContextSwitchTo(cluster_context);
1144 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1145 rvtc->tableOid = index->indrelid;
1146 rvtc->indexOid = index->indexrelid;
1147 rvs = lcons(rvtc, rvs);
1149 MemoryContextSwitchTo(old_context);
1151 heap_endscan(scan);
1153 relation_close(indRelation, AccessShareLock);
1155 return rvs;