/*-------------------------------------------------------------------------
 *
 * md.c
 *    This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"
/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB       10

/* special values for the segno arg to RememberFsyncRequest */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)

/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
#endif
/*
 *  The magnetic disk storage manager keeps track of open file
 *  descriptors in its own descriptor pool.  This is done to make it
 *  easier to support relations that are larger than the operating
 *  system's file size limit (often 2GBytes).  In order to do that,
 *  we break relations up into "segment" files that are each shorter than
 *  the OS file size limit.  The segment size is set by the RELSEG_SIZE
 *  configuration constant in pg_config.h.
 *
 *  On disk, a relation must consist of consecutively numbered segment
 *  files in the pattern
 *      -- Zero or more full segments of exactly RELSEG_SIZE blocks each
 *      -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
 *      -- Optionally, any number of inactive segments of size 0 blocks.
 *  The full and partial segments are collectively the "active" segments.
 *  Inactive segments are those that once contained data but are currently
 *  not needed because of an mdtruncate() operation.  The reason for leaving
 *  them present at size zero, rather than unlinking them, is that other
 *  backends and/or the bgwriter might be holding open file references to
 *  such segments.  If the relation expands again after mdtruncate(), such
 *  that a deactivated segment becomes active again, it is important that
 *  such file references still be valid --- else data might get written
 *  out to an unlinked old copy of a segment file that will eventually
 *  disappear.
 *
 *  The file descriptor pointer (md_fd field) stored in the SMgrRelation
 *  cache is, therefore, just the head of a list of MdfdVec objects, one
 *  per segment.  But note the md_fd pointer can be NULL, indicating
 *  relation not open.
 *
 *  Also note that mdfd_chain == NULL does not necessarily mean the relation
 *  doesn't have another segment after this one; we may just not have
 *  opened the next segment yet.  (We could not have "all segments are
 *  in the chain" as an invariant anyway, since another backend could
 *  extend the relation when we weren't looking.)  We do not make chain
 *  entries for inactive segments, however; as soon as we find a partial
 *  segment, we assume that any subsequent segments are inactive.
 *
 *  All MdfdVec objects are palloc'd in the MdCxt memory context.
 */
typedef struct _MdfdVec
{
    File        mdfd_vfd;       /* fd number in fd.c's pool */
    BlockNumber mdfd_segno;     /* segment number, from 0 */
    struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
} MdfdVec;
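/*
 * Worked example of the segment arithmetic used throughout this file (an
 * illustrative sketch assuming the stock build defaults BLCKSZ = 8192 and
 * RELSEG_SIZE = 131072, i.e. 1GB segment files; a nondefault build changes
 * the numbers but not the logic):
 *
 *      blkno  = 300000
 *      segno  = blkno / RELSEG_SIZE             = 2        ("<path>.2")
 *      offset = (blkno % RELSEG_SIZE) * BLCKSZ  = 37856 * 8192 = 310116352
 *
 * i.e. _mdfd_getseg() walks the chain to the MdfdVec for segment 2, and
 * mdread()/mdwrite() then seek to byte offset 310116352 within that file.
 */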
static MemoryContext MdCxt;     /* context for all md.c allocations */


/*
 * In some contexts (currently, standalone backends and the bgwriter process)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of eliminating duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
 */
typedef struct
{
    RelFileNode rnode;          /* the targeted relation */
    ForkNumber  forknum;
    BlockNumber segno;          /* which segment */
} PendingOperationTag;

typedef uint16 CycleCtr;        /* can be any convenient integer size */

typedef struct
{
    PendingOperationTag tag;    /* hash table key (must be first!) */
    bool        canceled;       /* T => request canceled, not yet removed */
    CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;

typedef struct
{
    RelFileNode rnode;          /* the dead relation to delete */
    CycleCtr    cycle_ctr;      /* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;

static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
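/*
 * Request flow, in brief (a descriptive summary of the routines below, not
 * part of the original interface documentation): a process that dirties a
 * segment calls register_dirty_segment(), which either records the request
 * in its local pendingOpsTable (standalone backend, startup process or
 * bgwriter) or hands it to the bgwriter via ForwardFsyncRequest().  The
 * bgwriter absorbs forwarded requests into its own table through
 * RememberFsyncRequest(), and mdsync() drains the table at checkpoint time.
 * Unlink requests travel the same path but land in pendingUnlinks and are
 * processed by mdpostckpt().
 */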
typedef enum                    /* behavior for mdopen & _mdfd_getseg */
{
    EXTENSION_FAIL,             /* ereport if segment not present */
    EXTENSION_RETURN_NULL,      /* return NULL if not present */
    EXTENSION_CREATE            /* create new segments as needed */
} ExtensionBehavior;

/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
       ExtensionBehavior behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
                       MdfdVec *seg);
static void register_unlink(RelFileNode rnode);
static MdfdVec *_fdvec_alloc(void);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
              BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
             BlockNumber blkno, bool isTemp, ExtensionBehavior behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
           MdfdVec *seg);
/*
 *  mdinit() -- Initialize private state for magnetic disk storage manager.
 */
void
mdinit(void)
{
    MdCxt = AllocSetContextCreate(TopMemoryContext,
                                  "MdSmgr",
                                  ALLOCSET_DEFAULT_MINSIZE,
                                  ALLOCSET_DEFAULT_INITSIZE,
                                  ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * Create pending-operations hashtable if we need it.  Currently, we need
     * it if we are standalone (not under a postmaster) OR if we are a
     * bootstrap-mode subprocess of a postmaster (that is, a startup or
     * bgwriter process).
     */
    if (!IsUnderPostmaster || IsBootstrapProcessingMode())
    {
        HASHCTL     hash_ctl;

        MemSet(&hash_ctl, 0, sizeof(hash_ctl));
        hash_ctl.keysize = sizeof(PendingOperationTag);
        hash_ctl.entrysize = sizeof(PendingOperationEntry);
        hash_ctl.hash = tag_hash;
        hash_ctl.hcxt = MdCxt;
        pendingOpsTable = hash_create("Pending Ops Table",
                                      100L,
                                      &hash_ctl,
                                      HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
        pendingUnlinks = NIL;
    }
}
/*
 * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
 * already created the pendingOpsTable during initialization of the startup
 * process.  Calling this function drops the local pendingOpsTable so that
 * subsequent requests will be forwarded to bgwriter.
 */
void
SetForwardFsyncRequests(void)
{
    /* Perform any pending ops we may have queued up */
    if (pendingOpsTable)
        mdsync();
    pendingOpsTable = NULL;
}
/*
 *  mdexists() -- Does the physical file exist?
 *
 * Note: this will return true for lingering files, with pending deletions
 */
bool
mdexists(SMgrRelation reln, ForkNumber forkNum)
{
    /*
     * Close it first, to ensure that we notice if the fork has been unlinked
     * since we opened it.
     */
    mdclose(reln, forkNum);

    return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
}
/*
 *  mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
    char       *path;
    File        fd;

    if (isRedo && reln->md_fd[forkNum] != NULL)
        return;                 /* created and opened already... */

    Assert(reln->md_fd[forkNum] == NULL);

    path = relpath(reln->smgr_rnode, forkNum);

    fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

    if (fd < 0)
    {
        int         save_errno = errno;

        /*
         * During bootstrap, there are cases where a system relation will be
         * accessed (by internal backend processes) before the bootstrap
         * script nominally creates it.  Therefore, allow the file to exist
         * already, even if isRedo is not set.  (See also mdopen)
         */
        if (isRedo || IsBootstrapProcessingMode())
            fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
        if (fd < 0)
        {
            /* be sure to report the error reported by create, not open */
            errno = save_errno;
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not create relation %s: %m", path)));
        }
    }

    pfree(path);

    reln->md_fd[forkNum] = _fdvec_alloc();

    reln->md_fd[forkNum]->mdfd_vfd = fd;
    reln->md_fd[forkNum]->mdfd_segno = 0;
    reln->md_fd[forkNum]->mdfd_chain = NULL;
}
/*
 *  mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *    the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNode rnode, ForkNumber forkNum, bool isRedo)
{
    char       *path;
    int         ret;

    /*
     * We have to clean out any pending fsync requests for the doomed
     * relation, else the next mdsync() will fail.
     */
    ForgetRelationFsyncRequests(rnode, forkNum);

    path = relpath(rnode, forkNum);

    /*
     * Delete or truncate the first segment.
     */
    if (isRedo || forkNum != MAIN_FORKNUM)
        ret = unlink(path);
    else
    {
        /* truncate(2) would be easier here, but Windows hasn't got it */
        int         fd;

        fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
        if (fd >= 0)
        {
            int         save_errno;

            ret = ftruncate(fd, 0);
            save_errno = errno;
            close(fd);
            errno = save_errno;
        }
        else
            ret = -1;
    }
    if (ret < 0)
    {
        if (!isRedo || errno != ENOENT)
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not remove relation %s: %m", path)));
    }

    /*
     * Delete any additional segments.
     */
    else
    {
        char       *segpath = (char *) palloc(strlen(path) + 12);
        BlockNumber segno;

        /*
         * Note that because we loop until getting ENOENT, we will correctly
         * remove all inactive segments as well as active ones.
         */
        for (segno = 1;; segno++)
        {
            sprintf(segpath, "%s.%u", path, segno);
            if (unlink(segpath) < 0)
            {
                /* ENOENT is expected after the last segment... */
                if (errno != ENOENT)
                    ereport(WARNING,
                            (errcode_for_file_access(),
                     errmsg("could not remove segment %u of relation %s: %m",
                            segno, path)));
                break;
            }
        }
        pfree(segpath);
    }

    pfree(path);

    /* Register request to unlink first segment later */
    if (!isRedo && forkNum == MAIN_FORKNUM)
        register_unlink(rnode);
}
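/*
 * For illustration (a sketch, not part of the original comments): dropping a
 * relation whose main fork spans three segment files, say "base/1/16384",
 * "base/1/16384.1" and "base/1/16384.2", proceeds as: cancel any pending
 * fsyncs, ftruncate "base/1/16384" to zero length, unlink the ".1" and ".2"
 * files, and queue an UNLINK_RELATION_REQUEST so that mdpostckpt() removes
 * the empty first file only after the next checkpoint.
 */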
/*
 *  mdextend() -- Add a block to the specified relation.
 *
 *      The semantics are nearly the same as mdwrite(): write at the
 *      specified position.  However, this is to be used for the case of
 *      extending a relation (i.e., blocknum is at or beyond the current
 *      EOF).  Note that we assume writing a block beyond current EOF
 *      causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
         char *buffer, bool isTemp)
{
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;

    /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum >= mdnblocks(reln, forknum));
#endif

    /*
     * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
     * more --- we mustn't create a block whose number actually is
     * InvalidBlockNumber.
     */
    if (blocknum == InvalidBlockNumber)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("cannot extend relation %s beyond %u blocks",
                        relpath(reln->smgr_rnode, forknum),
                        InvalidBlockNumber)));

    v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_CREATE);

    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    /*
     * Note: because caller usually obtained blocknum by calling mdnblocks,
     * which did a seek(SEEK_END), this seek is often redundant and will be
     * optimized away by fd.c.  It's not redundant, however, if there is a
     * partial page at the end of the file.  In that case we want to try to
     * overwrite the partial page with a full page.  It's also not redundant
     * if bufmgr.c had to dump another buffer of the same file to make room
     * for the new page's buffer.
     */
    if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to block %u of relation %s: %m",
                        blocknum,
                        relpath(reln->smgr_rnode, forknum))));

    if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
    {
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not extend relation %s: %m",
                            relpath(reln->smgr_rnode, forknum)),
                     errhint("Check free disk space.")));
        /* short write: complain appropriately */
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not extend relation %s: wrote only %d of %d bytes at block %u",
                        relpath(reln->smgr_rnode, forknum),
                        nbytes, BLCKSZ, blocknum),
                 errhint("Check free disk space.")));
    }

    if (!isTemp)
        register_dirty_segment(reln, forknum, v);

    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
/*
 *  mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
{
    MdfdVec    *mdfd;
    char       *path;
    File        fd;

    /* No work if already open */
    if (reln->md_fd[forknum])
        return reln->md_fd[forknum];

    path = relpath(reln->smgr_rnode, forknum);

    fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

    if (fd < 0)
    {
        /*
         * During bootstrap, there are cases where a system relation will be
         * accessed (by internal backend processes) before the bootstrap
         * script nominally creates it.  Therefore, accept mdopen() as a
         * substitute for mdcreate() in bootstrap mode only.  (See mdcreate)
         */
        if (IsBootstrapProcessingMode())
            fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
        if (fd < 0)
        {
            if (behavior == EXTENSION_RETURN_NULL &&
                FILE_POSSIBLY_DELETED(errno))
            {
                pfree(path);
                return NULL;
            }
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open relation %s: %m", path)));
        }
    }

    pfree(path);

    reln->md_fd[forknum] = mdfd = _fdvec_alloc();

    mdfd->mdfd_vfd = fd;
    mdfd->mdfd_segno = 0;
    mdfd->mdfd_chain = NULL;
    Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

    return mdfd;
}
/*
 *  mdclose() -- Close the specified relation, if it isn't closed already.
 */
void
mdclose(SMgrRelation reln, ForkNumber forknum)
{
    MdfdVec    *v = reln->md_fd[forknum];

    /* No work if already closed */
    if (v == NULL)
        return;

    reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */

    while (v != NULL)
    {
        MdfdVec    *ov = v;

        /* if not closed already */
        if (v->mdfd_vfd >= 0)
            FileClose(v->mdfd_vfd);
        /* Now free vector */
        v = v->mdfd_chain;
        pfree(ov);
    }
}
/*
 *  mdprefetch() -- Initiate asynchronous read of the specified block of a relation
 */
void
mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
#ifdef USE_PREFETCH
    off_t       seekpos;
    MdfdVec    *v;

    v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
#endif   /* USE_PREFETCH */
}
/*
 *  mdread() -- Read the specified block from a relation.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
       char *buffer)
{
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;

    TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
                                        reln->smgr_rnode.spcNode,
                                        reln->smgr_rnode.dbNode,
                                        reln->smgr_rnode.relNode);

    v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to block %u of relation %s: %m",
                        blocknum, relpath(reln->smgr_rnode, forknum))));

    nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);

    TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                       reln->smgr_rnode.spcNode,
                                       reln->smgr_rnode.dbNode,
                                       reln->smgr_rnode.relNode,
                                       nbytes,
                                       BLCKSZ);

    if (nbytes != BLCKSZ)
    {
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read block %u of relation %s: %m",
                            blocknum, relpath(reln->smgr_rnode, forknum))));

        /*
         * Short read: we are at or past EOF, or we read a partial block at
         * EOF.  Normally this is an error; upper levels should never try to
         * read a nonexistent block.  However, if zero_damaged_pages is ON or
         * we are InRecovery, we should instead return zeroes without
         * complaining.  This allows, for example, the case of trying to
         * update a block that was later truncated away.
         */
        if (zero_damaged_pages || InRecovery)
            MemSet(buffer, 0, BLCKSZ);
        else
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("could not read block %u of relation %s: read only %d of %d bytes",
                            blocknum, relpath(reln->smgr_rnode, forknum),
                            nbytes, BLCKSZ)));
    }
}
/*
 *  mdwrite() -- Write the supplied block at the appropriate location.
 *
 *      This is to be used only for updating already-existing blocks of a
 *      relation (ie, those before the current EOF).  To extend a relation,
 *      use mdextend().
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        char *buffer, bool isTemp)
{
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;

    /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum < mdnblocks(reln, forknum));
#endif

    TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
                                         reln->smgr_rnode.spcNode,
                                         reln->smgr_rnode.dbNode,
                                         reln->smgr_rnode.relNode);

    v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_FAIL);

    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to block %u of relation %s: %m",
                        blocknum, relpath(reln->smgr_rnode, forknum))));

    nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);

    TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                                        reln->smgr_rnode.spcNode,
                                        reln->smgr_rnode.dbNode,
                                        reln->smgr_rnode.relNode,
                                        nbytes,
                                        BLCKSZ);

    if (nbytes != BLCKSZ)
    {
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write block %u of relation %s: %m",
                            blocknum, relpath(reln->smgr_rnode, forknum))));
        /* short write: complain appropriately */
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not write block %u of relation %s: wrote only %d of %d bytes",
                        blocknum,
                        relpath(reln->smgr_rnode, forknum),
                        nbytes, BLCKSZ),
                 errhint("Check free disk space.")));
    }

    if (!isTemp)
        register_dirty_segment(reln, forknum, v);
}
/*
 *  mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *      Important side effect: all active segments of the relation are opened
 *      and added to the mdfd_chain list.  If this routine has not been
 *      called, then only segments up to the last one actually touched
 *      are present in the chain.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
    MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
    BlockNumber nblocks;
    BlockNumber segno = 0;

    /*
     * Skip through any segments that aren't the last one, to avoid redundant
     * seeks on them.  We have previously verified that these segments are
     * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
     *
     * NOTE: this assumption could only be wrong if another backend has
     * truncated the relation.  We rely on higher code levels to handle that
     * scenario by closing and re-opening the md fd, which is handled via
     * relcache flush.  (Since the bgwriter doesn't participate in relcache
     * flush, it could have segment chain entries for inactive segments;
     * that's OK because the bgwriter never needs to compute relation size.)
     */
    while (v->mdfd_chain != NULL)
    {
        segno++;
        v = v->mdfd_chain;
    }

    for (;;)
    {
        nblocks = _mdnblocks(reln, forknum, v);
        if (nblocks > ((BlockNumber) RELSEG_SIZE))
            elog(FATAL, "segment too big");
        if (nblocks < ((BlockNumber) RELSEG_SIZE))
            return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

        /*
         * If segment is exactly RELSEG_SIZE, advance to next one.
         */
        segno++;

        if (v->mdfd_chain == NULL)
        {
            /*
             * Because we pass O_CREAT, we will create the next segment (with
             * zero length) immediately, if the last segment is of length
             * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
             * the logic simple.
             */
            v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
            if (v->mdfd_chain == NULL)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open segment %u of relation %s: %m",
                                segno,
                                relpath(reln->smgr_rnode, forknum))));
        }

        v = v->mdfd_chain;
    }
}
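/*
 * Example of the result computed above (an illustrative sketch assuming the
 * default RELSEG_SIZE of 131072 blocks): for a relation with two full
 * segments plus a partial third segment of 100 blocks, the loop stops at
 * segno = 2 and returns 2 * 131072 + 100 = 262244 blocks.
 */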
/*
 *  mdtruncate() -- Truncate relation to specified number of blocks.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,
           bool isTemp)
{
    MdfdVec    *v;
    BlockNumber curnblk;
    BlockNumber priorblocks;

    /*
     * NOTE: mdnblocks makes sure we have opened all active segments, so that
     * truncation loop will get them all!
     */
    curnblk = mdnblocks(reln, forknum);
    if (nblocks > curnblk)
    {
        /* Bogus request ... but no complaint if InRecovery */
        if (InRecovery)
            return;
        ereport(ERROR,
                (errmsg("could not truncate relation %s to %u blocks: it's only %u blocks now",
                        relpath(reln->smgr_rnode, forknum),
                        nblocks, curnblk)));
    }
    if (nblocks == curnblk)
        return;                 /* no work */

    v = mdopen(reln, forknum, EXTENSION_FAIL);

    priorblocks = 0;
    while (v != NULL)
    {
        MdfdVec    *ov = v;

        if (priorblocks > nblocks)
        {
            /*
             * This segment is no longer active (and has already been unlinked
             * from the mdfd_chain).  We truncate the file, but do not delete
             * it, for reasons explained in the header comments.
             */
            if (FileTruncate(v->mdfd_vfd, 0) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not truncate relation %s to %u blocks: %m",
                                relpath(reln->smgr_rnode, forknum),
                                nblocks)));
            if (!isTemp)
                register_dirty_segment(reln, forknum, v);
            v = v->mdfd_chain;
            Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
                                                 * segment */
            pfree(ov);
        }
        else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
        {
            /*
             * This is the last segment we want to keep.  Truncate the file to
             * the right length, and clear chain link that points to any
             * remaining segments (which we shall zap).  NOTE: if nblocks is
             * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
             * segment to 0 length but keep it.  This adheres to the invariant
             * given in the header comments.
             */
            BlockNumber lastsegblocks = nblocks - priorblocks;

            if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not truncate relation %s to %u blocks: %m",
                                relpath(reln->smgr_rnode, forknum),
                                nblocks)));
            if (!isTemp)
                register_dirty_segment(reln, forknum, v);
            v = v->mdfd_chain;
            ov->mdfd_chain = NULL;
        }
        else
        {
            /*
             * We still need this segment and 0 or more blocks beyond it, so
             * nothing to do here.
             */
            v = v->mdfd_chain;
        }
        priorblocks += RELSEG_SIZE;
    }
}
/*
 *  mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 *      Note that only writes already issued are synced; this routine knows
 *      nothing of dirty buffers that may exist inside the buffer manager.
 */
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
    MdfdVec    *v;
    BlockNumber curnblk;

    /*
     * NOTE: mdnblocks makes sure we have opened all active segments, so that
     * fsync loop will get them all!
     */
    curnblk = mdnblocks(reln, forknum);

    v = mdopen(reln, forknum, EXTENSION_FAIL);

    while (v != NULL)
    {
        if (FileSync(v->mdfd_vfd) < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync segment %u of relation %s: %m",
                            v->mdfd_segno,
                            relpath(reln->smgr_rnode, forknum))));
        v = v->mdfd_chain;
    }
}
/*
 *  mdsync() -- Sync previous writes to stable storage.
 */
void
mdsync(void)
{
    static bool mdsync_in_progress = false;

    HASH_SEQ_STATUS hstat;
    PendingOperationEntry *entry;
    int         absorb_counter;

    /*
     * This is only called during checkpoints, and checkpoints should only
     * occur in processes that have created a pendingOpsTable.
     */
    if (!pendingOpsTable)
        elog(ERROR, "cannot sync without a pendingOpsTable");

    /*
     * If we are in the bgwriter, the sync had better include all fsync
     * requests that were queued by backends up to this point.  The tightest
     * race condition that could occur is that a buffer that must be written
     * and fsync'd for the checkpoint could have been dumped by a backend just
     * before it was visited by BufferSync().  We know the backend will have
     * queued an fsync request before clearing the buffer's dirtybit, so we
     * are safe as long as we do an Absorb after completing BufferSync().
     */
    AbsorbFsyncRequests();

    /*
     * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
     * checkpoint), we want to ignore fsync requests that are entered into the
     * hashtable after this point --- they should be processed next time,
     * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
     * ones: new ones will have cycle_ctr equal to the incremented value of
     * mdsync_cycle_ctr.
     *
     * In normal circumstances, all entries present in the table at this point
     * will have cycle_ctr exactly equal to the current (about to be old)
     * value of mdsync_cycle_ctr.  However, if we fail partway through the
     * fsync'ing loop, then older values of cycle_ctr might remain when we
     * come back here to try again.  Repeated checkpoint failures would
     * eventually wrap the counter around to the point where an old entry
     * might appear new, causing us to skip it, possibly allowing a checkpoint
     * to succeed that should not have.  To forestall wraparound, any time the
     * previous mdsync() failed to complete, run through the table and
     * forcibly set cycle_ctr = mdsync_cycle_ctr.
     *
     * Think not to merge this loop with the main loop, as the problem is
     * exactly that that loop may fail before having visited all the entries.
     * From a performance point of view it doesn't matter anyway, as this path
     * will never be taken in a system that's functioning normally.
     */
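    /*
     * For instance (an illustrative sketch of the wraparound hazard above):
     * CycleCtr is only 16 bits wide, so if mdsync() failed 65536 times in a
     * row, an entry left over from the first attempt would again compare
     * equal to the incremented counter and be skipped as "new".  The reset
     * below rules that out, and the modular comparison used further down,
     * (CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr, stays valid
     * across the 65535 -> 0 wrap.
     */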
    if (mdsync_in_progress)
    {
        /* prior try failed, so update any stale cycle_ctr values */
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            entry->cycle_ctr = mdsync_cycle_ctr;
        }
    }

    /* Advance counter so that new hashtable entries are distinguishable */
    mdsync_cycle_ctr++;

    /* Set flag to detect failure if we don't reach the end of the loop */
    mdsync_in_progress = true;

    /* Now scan the hashtable for fsync requests to process */
    absorb_counter = FSYNCS_PER_ABSORB;
    hash_seq_init(&hstat, pendingOpsTable);
    while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
    {
        /*
         * If the entry is new then don't process it this time.  Note that
         * "continue" bypasses the hash-remove call at the bottom of the loop.
         */
        if (entry->cycle_ctr == mdsync_cycle_ctr)
            continue;

        /* Else assert we haven't missed it */
        Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

        /*
         * If fsync is off then we don't have to bother opening the file at
         * all.  (We delay checking until this point so that changing fsync on
         * the fly behaves sensibly.)  Also, if the entry is marked canceled,
         * fall through to delete it.
         */
        if (enableFsync && !entry->canceled)
        {
            int         failures;

            /*
             * If in bgwriter, we want to absorb pending requests every so
             * often to prevent overflow of the fsync request queue.  It is
             * unspecified whether newly-added entries will be visited by
             * hash_seq_search, but we don't care since we don't need to
             * process them anyway.
             */
            if (--absorb_counter <= 0)
            {
                AbsorbFsyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;
            }

            /*
             * The fsync table could contain requests to fsync segments that
             * have been deleted (unlinked) by the time we get to them.  Rather
             * than just hoping an ENOENT (or EACCES on Windows) error can be
             * ignored, what we do on error is absorb pending requests and
             * then retry.  Since mdunlink() queues a "revoke" message before
             * actually unlinking, the fsync request is guaranteed to be
             * marked canceled after the absorb if it really was this case.
             * DROP DATABASE likewise has to tell us to forget fsync requests
             * before it starts deletions.
             */
            for (failures = 0;; failures++)     /* loop exits at "break" */
            {
                SMgrRelation reln;
                MdfdVec    *seg;
                char       *path;

                /*
                 * Find or create an smgr hash entry for this relation.  This
                 * may seem a bit unclean -- md calling smgr?  But it's really
                 * the best solution.  It ensures that the open file reference
                 * isn't permanently leaked if we get an error here.  (You may
                 * say "but an unreferenced SMgrRelation is still a leak!"  Not
                 * really, because the only case in which a checkpoint is done
                 * by a process that isn't about to shut down is in the
                 * bgwriter, and it will periodically do smgrcloseall().  This
                 * fact justifies our not closing the reln in the success path
                 * either, which is a good thing since in non-bgwriter cases
                 * we couldn't safely do that.)  Furthermore, in many cases
                 * the relation will have been dirtied through this same smgr
                 * relation, and so we can save a file open/close cycle.
                 */
                reln = smgropen(entry->tag.rnode);

                /*
                 * It is possible that the relation has been dropped or
                 * truncated since the fsync request was entered.  Therefore,
                 * allow ENOENT, but only if we didn't fail already on this
                 * file.  This applies both during _mdfd_getseg() and during
                 * FileSync, since fd.c might have closed the file behind our
                 * back.
                 */
                seg = _mdfd_getseg(reln, entry->tag.forknum,
                              entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
                                   false, EXTENSION_RETURN_NULL);
                if (seg != NULL &&
                    FileSync(seg->mdfd_vfd) >= 0)
                    break;      /* success; break out of retry loop */

                /*
                 * XXX is there any point in allowing more than one retry?
                 * Don't see one at the moment, but easy to change the test
                 * here if so.
                 */
                path = relpath(entry->tag.rnode, entry->tag.forknum);
                if (!FILE_POSSIBLY_DELETED(errno) ||
                    failures > 0)
                    ereport(ERROR,
                            (errcode_for_file_access(),
                             errmsg("could not fsync segment %u of relation %s: %m",
                                    entry->tag.segno, path)));
                else
                    ereport(DEBUG1,
                            (errcode_for_file_access(),
                             errmsg("could not fsync segment %u of relation %s but retrying: %m",
                                    entry->tag.segno, path)));
                pfree(path);

                /*
                 * Absorb incoming requests and check to see if canceled.
                 */
                AbsorbFsyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;     /* might as well... */

                if (entry->canceled)
                    break;
            }                   /* end retry loop */
        }

        /*
         * If we get here, either we fsync'd successfully, or we don't have to
         * because enableFsync is off, or the entry is (now) marked canceled.
         * Okay to delete it.
         */
        if (hash_search(pendingOpsTable, &entry->tag,
                        HASH_REMOVE, NULL) == NULL)
            elog(ERROR, "pendingOpsTable corrupted");
    }                           /* end loop over hashtable entries */

    /* Flag successful completion of mdsync */
    mdsync_in_progress = false;
}
/*
 * mdpreckpt() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests.  That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
mdpreckpt(void)
{
    ListCell   *cell;

    /*
     * In case the prior checkpoint wasn't completed, stamp all entries in the
     * list with the current cycle counter.  Anything that's in the list at
     * the start of checkpoint can surely be deleted after the checkpoint is
     * finished, regardless of when the request was made.
     */
    foreach(cell, pendingUnlinks)
    {
        PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

        entry->cycle_ctr = mdckpt_cycle_ctr;
    }

    /*
     * Any unlink requests arriving after this point will be assigned the next
     * cycle counter, and won't be unlinked until next checkpoint.
     */
    mdckpt_cycle_ctr++;
}
/*
 * mdpostckpt() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
mdpostckpt(void)
{
    while (pendingUnlinks != NIL)
    {
        PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
        char       *path;

        /*
         * New entries are appended to the end, so if the entry is new we've
         * reached the end of old entries.
         */
        if (entry->cycle_ctr == mdckpt_cycle_ctr)
            break;

        /* Else assert we haven't missed it */
        Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);

        /* Unlink the file */
        path = relpath(entry->rnode, MAIN_FORKNUM);
        if (unlink(path) < 0)
        {
            /*
             * There's a race condition, when the database is dropped at the
             * same time that we process the pending unlink requests.  If the
             * DROP DATABASE deletes the file before we do, we will get ENOENT
             * here.  rmtree() also has to ignore ENOENT errors, to deal with
             * the possibility that we delete the file first.
             */
            if (errno != ENOENT)
                ereport(WARNING,
                        (errcode_for_file_access(),
                         errmsg("could not remove relation %s: %m", path)));
        }
        pfree(path);

        pendingUnlinks = list_delete_first(pendingUnlinks);
        pfree(entry);
    }
}
/*
 * register_dirty_segment() -- Mark a relation segment as needing fsync
 *
 * If there is a local pending-ops table, just make an entry in it for
 * mdsync to process later.  Otherwise, try to pass off the fsync request
 * to the background writer process.  If that fails, just do the fsync
 * locally before returning (we expect this will not happen often enough
 * to be a performance problem).
 */
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
    if (pendingOpsTable)
    {
        /* push it into local pending-ops table */
        RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
    }
    else
    {
        if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
            return;             /* passed it off successfully */

        if (FileSync(seg->mdfd_vfd) < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync segment %u of relation %s: %m",
                            seg->mdfd_segno,
                            relpath(reln->smgr_rnode, forknum))));
    }
}
/*
 * register_unlink() -- Schedule a file to be deleted after next checkpoint
 *
 * As with register_dirty_segment, this could involve either a local or
 * a remote pending-ops table.
 */
static void
register_unlink(RelFileNode rnode)
{
    if (pendingOpsTable)
    {
        /* push it into local pending-ops table */
        RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
    }
    else
    {
        /*
         * Notify the bgwriter about it.  If we fail to queue the request
         * message, we have to sleep and try again, because we can't simply
         * delete the file now.  Ugly, but hopefully won't happen often.
         *
         * XXX should we just leave the file orphaned instead?
         */
        Assert(IsUnderPostmaster);
        while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
                                    UNLINK_RELATION_REQUEST))
            pg_usleep(10000L);  /* 10 msec seems a good number */
    }
}
/*
 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
 *
 * We stuff most fsync requests into the local hash table for execution
 * during the bgwriter's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *   checkpoint.
 *
 * (Handling the FORGET_* requests is a tad slow because the hash table has
 * to be searched linearly, but it doesn't seem worth rethinking the table
 * structure for them.)
 */
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
    Assert(pendingOpsTable);

    if (segno == FORGET_RELATION_FSYNC)
    {
        /* Remove any pending requests for the entire relation */
        HASH_SEQ_STATUS hstat;
        PendingOperationEntry *entry;

        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
                entry->tag.forknum == forknum)
            {
                /* Okay, cancel this entry */
                entry->canceled = true;
            }
        }
    }
    else if (segno == FORGET_DATABASE_FSYNC)
    {
        /* Remove any pending requests for the entire database */
        HASH_SEQ_STATUS hstat;
        PendingOperationEntry *entry;
        ListCell   *cell,
                   *prev,
                   *next;

        /* Remove fsync requests */
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            if (entry->tag.rnode.dbNode == rnode.dbNode)
            {
                /* Okay, cancel this entry */
                entry->canceled = true;
            }
        }

        /* Remove unlink requests */
        prev = NULL;
        for (cell = list_head(pendingUnlinks); cell; cell = next)
        {
            PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

            next = lnext(cell);
            if (entry->rnode.dbNode == rnode.dbNode)
            {
                pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
                pfree(entry);
            }
            else
                prev = cell;
        }
    }
    else if (segno == UNLINK_RELATION_REQUEST)
    {
        /* Unlink request: put it in the linked list */
        MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
        PendingUnlinkEntry *entry;

        entry = palloc(sizeof(PendingUnlinkEntry));
        entry->rnode = rnode;
        entry->cycle_ctr = mdckpt_cycle_ctr;

        pendingUnlinks = lappend(pendingUnlinks, entry);

        MemoryContextSwitchTo(oldcxt);
    }
    else
    {
        /* Normal case: enter a request to fsync this segment */
        PendingOperationTag key;
        PendingOperationEntry *entry;
        bool        found;

        /* ensure any pad bytes in the hash key are zeroed */
        MemSet(&key, 0, sizeof(key));
        key.rnode = rnode;
        key.forknum = forknum;
        key.segno = segno;

        entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
                                                      &key,
                                                      HASH_ENTER,
                                                      &found);
        /* if new or previously canceled entry, initialize it */
        if (!found || entry->canceled)
        {
            entry->canceled = false;
            entry->cycle_ctr = mdsync_cycle_ctr;
        }

        /*
         * NB: it's intentional that we don't change cycle_ctr if the entry
         * already exists.  The fsync request must be treated as old, even
         * though the new request will be satisfied too by any subsequent
         * fsync.
         *
         * However, if the entry is present but is marked canceled, we should
         * act just as though it wasn't there.  The only case where this could
         * happen would be if a file had been deleted, we received but did not
         * yet act on the cancel request, and the same relfilenode was then
         * assigned to a new file.  We mustn't lose the new request, but it
         * should be considered new not old.
         */
    }
}
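/*
 * To make the reserved values concrete (an illustrative sketch; BlockNumber
 * is a uint32 and InvalidBlockNumber is 0xFFFFFFFF): FORGET_RELATION_FSYNC
 * is 0xFFFFFFFF, FORGET_DATABASE_FSYNC is 0xFFFFFFFE and
 * UNLINK_RELATION_REQUEST is 0xFFFFFFFD.  Real segment numbers stay far
 * below that range: even a maximum-size relation of 2^32 - 1 blocks spans
 * only about 32768 segments at the default RELSEG_SIZE of 131072 blocks.
 */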
/*
 * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
 */
void
ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
    if (pendingOpsTable)
    {
        /* standalone backend or startup process: fsync state is local */
        RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
    }
    else if (IsUnderPostmaster)
    {
        /*
         * Notify the bgwriter about it.  If we fail to queue the revoke
         * message, we have to sleep and try again ... ugly, but hopefully
         * won't happen often.
         *
         * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
         * error would leave the no-longer-used file still present on disk,
         * which would be bad, so I'm inclined to assume that the bgwriter
         * will always empty the queue soon.
         */
        while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
            pg_usleep(10000L);  /* 10 msec seems a good number */

        /*
         * Note we don't wait for the bgwriter to actually absorb the revoke
         * message; see mdsync() for the implications.
         */
    }
}
/*
 * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
 */
void
ForgetDatabaseFsyncRequests(Oid dbid)
{
    RelFileNode rnode;

    rnode.dbNode = dbid;
    rnode.spcNode = 0;
    rnode.relNode = 0;

    if (pendingOpsTable)
    {
        /* standalone backend or startup process: fsync state is local */
        RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
    }
    else if (IsUnderPostmaster)
    {
        /* see notes in ForgetRelationFsyncRequests */
        while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
                                    FORGET_DATABASE_FSYNC))
            pg_usleep(10000L);  /* 10 msec seems a good number */
    }
}
/*
 *  _fdvec_alloc() -- Make a MdfdVec object.
 */
static MdfdVec *
_fdvec_alloc(void)
{
    return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
}
/*
 * Open the specified segment of the relation,
 * and make a MdfdVec object for it.  Returns NULL on failure.
 */
static MdfdVec *
_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
              int oflags)
{
    MdfdVec    *v;
    int         fd;
    char       *path,
               *fullpath;

    path = relpath(reln->smgr_rnode, forknum);

    if (segno > 0)
    {
        /* be sure we have enough space for the '.segno' */
        fullpath = (char *) palloc(strlen(path) + 12);
        sprintf(fullpath, "%s.%u", path, segno);
        pfree(path);
    }
    else
        fullpath = path;

    /* open the file */
    fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);

    pfree(fullpath);

    if (fd < 0)
        return NULL;

    /* allocate an mdfdvec entry for it */
    v = _fdvec_alloc();

    /* fill the entry */
    v->mdfd_vfd = fd;
    v->mdfd_segno = segno;
    v->mdfd_chain = NULL;
    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));

    /* all done */
    return v;
}
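/*
 * Path naming example (a sketch using typical relpath() output for a
 * relation in the default tablespace): segment 0 is opened under the bare
 * path, e.g. "base/16384/16385", while a call with segno = 3 appends the
 * suffix and opens "base/16384/16385.3".
 */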
/*
 *  _mdfd_getseg() -- Find the segment of the relation holding the
 *      specified block.
 *
 * If the segment doesn't exist, we ereport, return NULL, or create the
 * segment, according to "behavior".  Note: isTemp need only be correct
 * in the EXTENSION_CREATE case.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
             bool isTemp, ExtensionBehavior behavior)
{
    MdfdVec    *v = mdopen(reln, forknum, behavior);
    BlockNumber targetseg;
    BlockNumber nextsegno;

    if (!v)
        return NULL;            /* only possible if EXTENSION_RETURN_NULL */

    targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
    for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
    {
        Assert(nextsegno == v->mdfd_segno + 1);

        if (v->mdfd_chain == NULL)
        {
            /*
             * Normally we will create new segments only if authorized by the
             * caller (i.e., we are doing mdextend()).  But when doing WAL
             * recovery, create segments anyway; this allows cases such as
             * replaying WAL data that has a write into a high-numbered
             * segment of a relation that was later deleted.  We want to go
             * ahead and create the segments so we can finish out the replay.
             *
             * We have to maintain the invariant that segments before the last
             * active segment are of size RELSEG_SIZE; therefore, pad them out
             * with zeroes if needed.  (This only matters if caller is
             * extending the relation discontiguously, but that can happen in
             * hash indexes.)
             */
            if (behavior == EXTENSION_CREATE || InRecovery)
            {
                if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
                {
                    char       *zerobuf = palloc0(BLCKSZ);

                    mdextend(reln, forknum,
                             nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
                             zerobuf, isTemp);
                    pfree(zerobuf);
                }
                v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, O_CREAT);
            }
            else
            {
                /* We won't create segment if not existent */
                v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
            }
            if (v->mdfd_chain == NULL)
            {
                if (behavior == EXTENSION_RETURN_NULL &&
                    FILE_POSSIBLY_DELETED(errno))
                    return NULL;
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open segment %u of relation %s (target block %u): %m",
                                nextsegno,
                                relpath(reln->smgr_rnode, forknum),
                                blkno)));
            }
        }
        v = v->mdfd_chain;
    }
    return v;
}
/*
 * Get number of blocks present in a single disk file
 */
static BlockNumber
_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
    off_t       len;

    len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
    if (len < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to end of segment %u of relation %s: %m",
                        seg->mdfd_segno, relpath(reln->smgr_rnode, forknum))));
    /* note that this calculation will ignore any partial block at EOF */
    return (BlockNumber) (len / BLCKSZ);
}