Update copyright for 2022
[pgsql.git] / src / backend / storage / smgr / smgr.c
blobeb701dce57671bae9bfddf9fd81fea7bee5a552c
/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/smgr/smgr.c
 *
 *-------------------------------------------------------------------------
 */
18 #include "postgres.h"
20 #include "access/xlogutils.h"
21 #include "lib/ilist.h"
22 #include "storage/bufmgr.h"
23 #include "storage/ipc.h"
24 #include "storage/md.h"
25 #include "storage/smgr.h"
26 #include "utils/hsearch.h"
27 #include "utils/inval.h"
31 * This struct of function pointers defines the API between smgr.c and
32 * any individual storage manager module. Note that smgr subfunctions are
33 * generally expected to report problems via elog(ERROR). An exception is
34 * that smgr_unlink should use elog(WARNING), rather than erroring out,
35 * because we normally unlink relations during post-commit/abort cleanup,
36 * and so it's too late to raise an error. Also, various conditions that
37 * would normally be errors should be allowed during bootstrap and/or WAL
38 * recovery --- see comments in md.c for details.
40 typedef struct f_smgr
42 void (*smgr_init) (void); /* may be NULL */
43 void (*smgr_shutdown) (void); /* may be NULL */
44 void (*smgr_open) (SMgrRelation reln);
45 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
46 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
47 bool isRedo);
48 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
49 void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum,
50 bool isRedo);
51 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
52 BlockNumber blocknum, char *buffer, bool skipFsync);
53 bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
54 BlockNumber blocknum);
55 void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
56 BlockNumber blocknum, char *buffer);
57 void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
58 BlockNumber blocknum, char *buffer, bool skipFsync);
59 void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
60 BlockNumber blocknum, BlockNumber nblocks);
61 BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
62 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
63 BlockNumber nblocks);
64 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
65 } f_smgr;
67 static const f_smgr smgrsw[] = {
68 /* magnetic disk */
70 .smgr_init = mdinit,
71 .smgr_shutdown = NULL,
72 .smgr_open = mdopen,
73 .smgr_close = mdclose,
74 .smgr_create = mdcreate,
75 .smgr_exists = mdexists,
76 .smgr_unlink = mdunlink,
77 .smgr_extend = mdextend,
78 .smgr_prefetch = mdprefetch,
79 .smgr_read = mdread,
80 .smgr_write = mdwrite,
81 .smgr_writeback = mdwriteback,
82 .smgr_nblocks = mdnblocks,
83 .smgr_truncate = mdtruncate,
84 .smgr_immedsync = mdimmedsync,
88 static const int NSmgr = lengthof(smgrsw);
91 * Each backend has a hashtable that stores all extant SMgrRelation objects.
92 * In addition, "unowned" SMgrRelation objects are chained together in a list.
94 static HTAB *SMgrRelationHash = NULL;
96 static dlist_head unowned_relns;
98 /* local function prototypes */
99 static void smgrshutdown(int code, Datum arg);
103 * smgrinit(), smgrshutdown() -- Initialize or shut down storage
104 * managers.
106 * Note: smgrinit is called during backend startup (normal or standalone
107 * case), *not* during postmaster start. Therefore, any resources created
108 * here or destroyed in smgrshutdown are backend-local.
110 void
111 smgrinit(void)
113 int i;
115 for (i = 0; i < NSmgr; i++)
117 if (smgrsw[i].smgr_init)
118 smgrsw[i].smgr_init();
121 /* register the shutdown proc */
122 on_proc_exit(smgrshutdown, 0);
126 * on_proc_exit hook for smgr cleanup during backend shutdown
128 static void
129 smgrshutdown(int code, Datum arg)
131 int i;
133 for (i = 0; i < NSmgr; i++)
135 if (smgrsw[i].smgr_shutdown)
136 smgrsw[i].smgr_shutdown();
141 * smgropen() -- Return an SMgrRelation object, creating it if need be.
143 * This does not attempt to actually open the underlying file.
145 SMgrRelation
146 smgropen(RelFileNode rnode, BackendId backend)
148 RelFileNodeBackend brnode;
149 SMgrRelation reln;
150 bool found;
152 if (SMgrRelationHash == NULL)
154 /* First time through: initialize the hash table */
155 HASHCTL ctl;
157 ctl.keysize = sizeof(RelFileNodeBackend);
158 ctl.entrysize = sizeof(SMgrRelationData);
159 SMgrRelationHash = hash_create("smgr relation table", 400,
160 &ctl, HASH_ELEM | HASH_BLOBS);
161 dlist_init(&unowned_relns);
164 /* Look up or create an entry */
165 brnode.node = rnode;
166 brnode.backend = backend;
167 reln = (SMgrRelation) hash_search(SMgrRelationHash,
168 (void *) &brnode,
169 HASH_ENTER, &found);
171 /* Initialize it if not present before */
172 if (!found)
174 /* hash_search already filled in the lookup key */
175 reln->smgr_owner = NULL;
176 reln->smgr_targblock = InvalidBlockNumber;
177 for (int i = 0; i <= MAX_FORKNUM; ++i)
178 reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
179 reln->smgr_which = 0; /* we only have md.c at present */
181 /* implementation-specific initialization */
182 smgrsw[reln->smgr_which].smgr_open(reln);
184 /* it has no owner yet */
185 dlist_push_tail(&unowned_relns, &reln->node);
188 return reln;
192 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
194 * There can be only one owner at a time; this is sufficient since currently
195 * the only such owners exist in the relcache.
197 void
198 smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
200 /* We don't support "disowning" an SMgrRelation here, use smgrclearowner */
201 Assert(owner != NULL);
204 * First, unhook any old owner. (Normally there shouldn't be any, but it
205 * seems possible that this can happen during swap_relation_files()
206 * depending on the order of processing. It's ok to close the old
207 * relcache entry early in that case.)
209 * If there isn't an old owner, then the reln should be in the unowned
210 * list, and we need to remove it.
212 if (reln->smgr_owner)
213 *(reln->smgr_owner) = NULL;
214 else
215 dlist_delete(&reln->node);
217 /* Now establish the ownership relationship. */
218 reln->smgr_owner = owner;
219 *owner = reln;
223 * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object
224 * if one exists
226 void
227 smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
229 /* Do nothing if the SMgrRelation object is not owned by the owner */
230 if (reln->smgr_owner != owner)
231 return;
233 /* unset the owner's reference */
234 *owner = NULL;
236 /* unset our reference to the owner */
237 reln->smgr_owner = NULL;
239 /* add to list of unowned relations */
240 dlist_push_tail(&unowned_relns, &reln->node);
244 * smgrexists() -- Does the underlying file for a fork exist?
246 bool
247 smgrexists(SMgrRelation reln, ForkNumber forknum)
249 return smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
253 * smgrclose() -- Close and delete an SMgrRelation object.
255 void
256 smgrclose(SMgrRelation reln)
258 SMgrRelation *owner;
259 ForkNumber forknum;
261 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
262 smgrsw[reln->smgr_which].smgr_close(reln, forknum);
264 owner = reln->smgr_owner;
266 if (!owner)
267 dlist_delete(&reln->node);
269 if (hash_search(SMgrRelationHash,
270 (void *) &(reln->smgr_rnode),
271 HASH_REMOVE, NULL) == NULL)
272 elog(ERROR, "SMgrRelation hashtable corrupted");
275 * Unhook the owner pointer, if any. We do this last since in the remote
276 * possibility of failure above, the SMgrRelation object will still exist.
278 if (owner)
279 *owner = NULL;
283 * smgrcloseall() -- Close all existing SMgrRelation objects.
285 void
286 smgrcloseall(void)
288 HASH_SEQ_STATUS status;
289 SMgrRelation reln;
291 /* Nothing to do if hashtable not set up */
292 if (SMgrRelationHash == NULL)
293 return;
295 hash_seq_init(&status, SMgrRelationHash);
297 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
298 smgrclose(reln);
302 * smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
303 * if one exists.
305 * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
306 * uselessly creating a hashtable entry only to drop it again when no
307 * such entry exists already.
309 void
310 smgrclosenode(RelFileNodeBackend rnode)
312 SMgrRelation reln;
314 /* Nothing to do if hashtable not set up */
315 if (SMgrRelationHash == NULL)
316 return;
318 reln = (SMgrRelation) hash_search(SMgrRelationHash,
319 (void *) &rnode,
320 HASH_FIND, NULL);
321 if (reln != NULL)
322 smgrclose(reln);
326 * smgrcreate() -- Create a new relation.
328 * Given an already-created (but presumably unused) SMgrRelation,
329 * cause the underlying disk file or other storage for the fork
330 * to be created.
332 void
333 smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
335 smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
339 * smgrdosyncall() -- Immediately sync all forks of all given relations
341 * All forks of all given relations are synced out to the store.
343 * This is equivalent to FlushRelationBuffers() for each smgr relation,
344 * then calling smgrimmedsync() for all forks of each relation, but it's
345 * significantly quicker so should be preferred when possible.
347 void
348 smgrdosyncall(SMgrRelation *rels, int nrels)
350 int i = 0;
351 ForkNumber forknum;
353 if (nrels == 0)
354 return;
356 FlushRelationsAllBuffers(rels, nrels);
359 * Sync the physical file(s).
361 for (i = 0; i < nrels; i++)
363 int which = rels[i]->smgr_which;
365 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
367 if (smgrsw[which].smgr_exists(rels[i], forknum))
368 smgrsw[which].smgr_immedsync(rels[i], forknum);
374 * smgrdounlinkall() -- Immediately unlink all forks of all given relations
376 * All forks of all given relations are removed from the store. This
377 * should not be used during transactional operations, since it can't be
378 * undone.
380 * If isRedo is true, it is okay for the underlying file(s) to be gone
381 * already.
383 void
384 smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
386 int i = 0;
387 RelFileNodeBackend *rnodes;
388 ForkNumber forknum;
390 if (nrels == 0)
391 return;
394 * Get rid of any remaining buffers for the relations. bufmgr will just
395 * drop them without bothering to write the contents.
397 DropRelFileNodesAllBuffers(rels, nrels);
400 * create an array which contains all relations to be dropped, and close
401 * each relation's forks at the smgr level while at it
403 rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
404 for (i = 0; i < nrels; i++)
406 RelFileNodeBackend rnode = rels[i]->smgr_rnode;
407 int which = rels[i]->smgr_which;
409 rnodes[i] = rnode;
411 /* Close the forks at smgr level */
412 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
413 smgrsw[which].smgr_close(rels[i], forknum);
417 * It'd be nice to tell the stats collector to forget them immediately,
418 * too. But we can't because we don't know the OIDs.
422 * Send a shared-inval message to force other backends to close any
423 * dangling smgr references they may have for these rels. We should do
424 * this before starting the actual unlinking, in case we fail partway
425 * through that step. Note that the sinval messages will eventually come
426 * back to this backend, too, and thereby provide a backstop that we
427 * closed our own smgr rel.
429 for (i = 0; i < nrels; i++)
430 CacheInvalidateSmgr(rnodes[i]);
433 * Delete the physical file(s).
435 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
436 * ERROR, because we've already decided to commit or abort the current
437 * xact.
440 for (i = 0; i < nrels; i++)
442 int which = rels[i]->smgr_which;
444 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
445 smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo);
448 pfree(rnodes);
453 * smgrextend() -- Add a new block to a file.
455 * The semantics are nearly the same as smgrwrite(): write at the
456 * specified position. However, this is to be used for the case of
457 * extending a relation (i.e., blocknum is at or beyond the current
458 * EOF). Note that we assume writing a block beyond current EOF
459 * causes intervening file space to become filled with zeroes.
461 void
462 smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
463 char *buffer, bool skipFsync)
465 smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
466 buffer, skipFsync);
469 * Normally we expect this to increase nblocks by one, but if the cached
470 * value isn't as expected, just invalidate it so the next call asks the
471 * kernel.
473 if (reln->smgr_cached_nblocks[forknum] == blocknum)
474 reln->smgr_cached_nblocks[forknum] = blocknum + 1;
475 else
476 reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
480 * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
482 * In recovery only, this can return false to indicate that a file
483 * doesn't exist (presumably it has been dropped by a later WAL
484 * record).
486 bool
487 smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
489 return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
493 * smgrread() -- read a particular block from a relation into the supplied
494 * buffer.
496 * This routine is called from the buffer manager in order to
497 * instantiate pages in the shared buffer cache. All storage managers
498 * return pages in the format that POSTGRES expects.
500 void
501 smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
502 char *buffer)
504 smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
508 * smgrwrite() -- Write the supplied buffer out.
510 * This is to be used only for updating already-existing blocks of a
511 * relation (ie, those before the current EOF). To extend a relation,
512 * use smgrextend().
514 * This is not a synchronous write -- the block is not necessarily
515 * on disk at return, only dumped out to the kernel. However,
516 * provisions will be made to fsync the write before the next checkpoint.
518 * skipFsync indicates that the caller will make other provisions to
519 * fsync the relation, so we needn't bother. Temporary relations also
520 * do not require fsync.
522 void
523 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
524 char *buffer, bool skipFsync)
526 smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
527 buffer, skipFsync);
532 * smgrwriteback() -- Trigger kernel writeback for the supplied range of
533 * blocks.
535 void
536 smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
537 BlockNumber nblocks)
539 smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
540 nblocks);
544 * smgrnblocks() -- Calculate the number of blocks in the
545 * supplied relation.
547 BlockNumber
548 smgrnblocks(SMgrRelation reln, ForkNumber forknum)
550 BlockNumber result;
552 /* Check and return if we get the cached value for the number of blocks. */
553 result = smgrnblocks_cached(reln, forknum);
554 if (result != InvalidBlockNumber)
555 return result;
557 result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
559 reln->smgr_cached_nblocks[forknum] = result;
561 return result;
565 * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
566 * relation.
568 * Returns an InvalidBlockNumber when not in recovery and when the relation
569 * fork size is not cached.
571 BlockNumber
572 smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
575 * For now, we only use cached values in recovery due to lack of a shared
576 * invalidation mechanism for changes in file size.
578 if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
579 return reln->smgr_cached_nblocks[forknum];
581 return InvalidBlockNumber;
585 * smgrtruncate() -- Truncate the given forks of supplied relation to
586 * each specified numbers of blocks
588 * The truncation is done immediately, so this can't be rolled back.
590 * The caller must hold AccessExclusiveLock on the relation, to ensure that
591 * other backends receive the smgr invalidation event that this function sends
592 * before they access any forks of the relation again.
594 void
595 smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
597 int i;
600 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
601 * just drop them without bothering to write the contents.
603 DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
606 * Send a shared-inval message to force other backends to close any smgr
607 * references they may have for this rel. This is useful because they
608 * might have open file pointers to segments that got removed, and/or
609 * smgr_targblock variables pointing past the new rel end. (The inval
610 * message will come back to our backend, too, causing a
611 * probably-unnecessary local smgr flush. But we don't expect that this
612 * is a performance-critical path.) As in the unlink code, we want to be
613 * sure the message is sent before we start changing things on-disk.
615 CacheInvalidateSmgr(reln->smgr_rnode);
617 /* Do the truncation */
618 for (i = 0; i < nforks; i++)
620 /* Make the cached size is invalid if we encounter an error. */
621 reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
623 smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
626 * We might as well update the local smgr_cached_nblocks values. The
627 * smgr cache inval message that this function sent will cause other
628 * backends to invalidate their copies of smgr_fsm_nblocks and
629 * smgr_vm_nblocks, and these ones too at the next command boundary.
630 * But these ensure they aren't outright wrong until then.
632 reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
637 * smgrimmedsync() -- Force the specified relation to stable storage.
639 * Synchronously force all previous writes to the specified relation
640 * down to disk.
642 * This is useful for building completely new relations (eg, new
643 * indexes). Instead of incrementally WAL-logging the index build
644 * steps, we can just write completed index pages to disk with smgrwrite
645 * or smgrextend, and then fsync the completed index file before
646 * committing the transaction. (This is sufficient for purposes of
647 * crash recovery, since it effectively duplicates forcing a checkpoint
648 * for the completed index. But it is *not* sufficient if one wishes
649 * to use the WAL log for PITR or replication purposes: in that case
650 * we have to make WAL entries as well.)
652 * The preceding writes should specify skipFsync = true to avoid
653 * duplicative fsyncs.
655 * Note that you need to do FlushRelationBuffers() first if there is
656 * any possibility that there are dirty buffers for the relation;
657 * otherwise the sync is not very meaningful.
659 void
660 smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
662 smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
666 * AtEOXact_SMgr
668 * This routine is called during transaction commit or abort (it doesn't
669 * particularly care which). All transient SMgrRelation objects are closed.
671 * We do this as a compromise between wanting transient SMgrRelations to
672 * live awhile (to amortize the costs of blind writes of multiple blocks)
673 * and needing them to not live forever (since we're probably holding open
674 * a kernel file descriptor for the underlying file, and we need to ensure
675 * that gets closed reasonably soon if the file gets deleted).
677 void
678 AtEOXact_SMgr(void)
680 dlist_mutable_iter iter;
683 * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each
684 * one from the list.
686 dlist_foreach_modify(iter, &unowned_relns)
688 SMgrRelation rel = dlist_container(SMgrRelationData, node,
689 iter.cur);
691 Assert(rel->smgr_owner == NULL);
693 smgrclose(rel);