Provenance: pgsql.git, src/common/blkreftable.c, blob bfa6f7ab5d8d7809b7806f55a1a6ddc10e798f46
(commit: Suppress "variable may be used uninitialized" warning.)
1 /*-------------------------------------------------------------------------
3 * blkreftable.c
4 * Block reference tables.
6 * A block reference table is used to keep track of which blocks have
7 * been modified by WAL records within a certain LSN range.
9 * For each relation fork, we keep track of all blocks that have appeared
10 * in block reference in the WAL. We also keep track of the "limit block",
11 * which is the smallest relation length in blocks known to have occurred
12 * during that range of WAL records. This should be set to 0 if the relation
13 * fork is created or destroyed, and to the post-truncation length if
14 * truncated.
16 * Whenever we set the limit block, we also forget about any modified blocks
17 * beyond that point. Those blocks don't exist any more. Such blocks can
18 * later be marked as modified again; if that happens, it means the relation
19 * was re-extended.
21 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
23 * src/common/blkreftable.c
25 *-------------------------------------------------------------------------
29 #ifndef FRONTEND
30 #include "postgres.h"
31 #else
32 #include "postgres_fe.h"
33 #endif
35 #ifdef FRONTEND
36 #include "common/logging.h"
37 #endif
39 #include "common/blkreftable.h"
40 #include "common/hashfn.h"
41 #include "port/pg_crc32c.h"
44 * A block reference table keeps track of the status of each relation
45 * fork individually.
47 typedef struct BlockRefTableKey
49 RelFileLocator rlocator;
50 ForkNumber forknum;
51 } BlockRefTableKey;
54 * We could need to store data either for a relation in which only a
55 * tiny fraction of the blocks have been modified or for a relation in
56 * which nearly every block has been modified, and we want a
57 * space-efficient representation in both cases. To accomplish this,
58 * we divide the relation into chunks of 2^16 blocks and choose between
59 * an array representation and a bitmap representation for each chunk.
61 * When the number of modified blocks in a given chunk is small, we
62 * essentially store an array of block numbers, but we need not store the
63 * entire block number: instead, we store each block number as a 2-byte
64 * offset from the start of the chunk.
66 * When the number of modified blocks in a given chunk is large, we switch
67 * to a bitmap representation.
69 * These same basic representational choices are used both when a block
70 * reference table is stored in memory and when it is serialized to disk.
72 * In the in-memory representation, we initially allocate each chunk with
73 * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
74 * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
75 * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
76 * to a bitmap, and thus never needs to grow further.
78 #define BLOCKS_PER_CHUNK (1 << 16)
79 #define BLOCKS_PER_ENTRY (BITS_PER_BYTE * sizeof(uint16))
80 #define MAX_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
81 #define INITIAL_ENTRIES_PER_CHUNK 16
82 typedef uint16 *BlockRefTableChunk;
85 * State for one relation fork.
87 * 'rlocator' and 'forknum' identify the relation fork to which this entry
88 * pertains.
90 * 'limit_block' is the shortest known length of the relation in blocks
91 * within the LSN range covered by a particular block reference table.
92 * It should be set to 0 if the relation fork is created or dropped. If the
93 * relation fork is truncated, it should be set to the number of blocks that
94 * remain after truncation.
96 * 'nchunks' is the allocated length of each of the three arrays that follow.
97 * We can only represent the status of block numbers less than nchunks *
98 * BLOCKS_PER_CHUNK.
100 * 'chunk_size' is an array storing the allocated size of each chunk.
102 * 'chunk_usage' is an array storing the number of elements used in each
103 * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
104 * chunk is used as an array; else the corresponding chunk is used as a bitmap.
105 * When used as a bitmap, the least significant bit of the first array element
106 * is the status of the lowest-numbered block covered by this chunk.
108 * 'chunk_data' is the array of chunks.
110 struct BlockRefTableEntry
112 BlockRefTableKey key;
113 BlockNumber limit_block;
114 char status;
115 uint32 nchunks;
116 uint16 *chunk_size;
117 uint16 *chunk_usage;
118 BlockRefTableChunk *chunk_data;
121 /* Declare and define a hash table over type BlockRefTableEntry. */
122 #define SH_PREFIX blockreftable
123 #define SH_ELEMENT_TYPE BlockRefTableEntry
124 #define SH_KEY_TYPE BlockRefTableKey
125 #define SH_KEY key
126 #define SH_HASH_KEY(tb, key) \
127 hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
128 #define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
129 #define SH_SCOPE static inline
130 #ifdef FRONTEND
131 #define SH_RAW_ALLOCATOR pg_malloc0
132 #endif
133 #define SH_DEFINE
134 #define SH_DECLARE
135 #include "lib/simplehash.h"
138 * A block reference table is basically just the hash table, but we don't
139 * want to expose that to outside callers.
141 * We keep track of the memory context in use explicitly too, so that it's
142 * easy to place all of our allocations in the same context.
144 struct BlockRefTable
146 blockreftable_hash *hash;
147 #ifndef FRONTEND
148 MemoryContext mcxt;
149 #endif
153 * On-disk serialization format for block reference table entries.
155 typedef struct BlockRefTableSerializedEntry
157 RelFileLocator rlocator;
158 ForkNumber forknum;
159 BlockNumber limit_block;
160 uint32 nchunks;
161 } BlockRefTableSerializedEntry;
164 * Buffer size, so that we avoid doing many small I/Os.
166 #define BUFSIZE 65536
169 * Ad-hoc buffer for file I/O.
171 typedef struct BlockRefTableBuffer
173 io_callback_fn io_callback;
174 void *io_callback_arg;
175 char data[BUFSIZE];
176 int used;
177 int cursor;
178 pg_crc32c crc;
179 } BlockRefTableBuffer;
182 * State for keeping track of progress while incrementally reading a block
183 * table reference file from disk.
185 * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
186 * combination that is currently being read, and consumed_chunks is the number
187 * of those that have been read. (We always read all the information for
188 * a single chunk at one time, so we don't need to be able to represent the
189 * state where a chunk has been partially read.)
191 * chunk_size is the array of chunk sizes. The length is given by total_chunks.
193 * chunk_data holds the current chunk.
195 * chunk_position helps us figure out how much progress we've made in returning
196 * the block numbers for the current chunk to the caller. If the chunk is a
197 * bitmap, it's the number of bits we've scanned; otherwise, it's the number
198 * of chunk entries we've scanned.
200 struct BlockRefTableReader
202 BlockRefTableBuffer buffer;
203 char *error_filename;
204 report_error_fn error_callback;
205 void *error_callback_arg;
206 uint32 total_chunks;
207 uint32 consumed_chunks;
208 uint16 *chunk_size;
209 uint16 chunk_data[MAX_ENTRIES_PER_CHUNK];
210 uint32 chunk_position;
214 * State for keeping track of progress while incrementally writing a block
215 * reference table file to disk.
217 struct BlockRefTableWriter
219 BlockRefTableBuffer buffer;
222 /* Function prototypes. */
223 static int BlockRefTableComparator(const void *a, const void *b);
224 static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
225 static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
226 int length);
227 static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
228 int length);
229 static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
232 * Create an empty block reference table.
234 BlockRefTable *
235 CreateEmptyBlockRefTable(void)
237 BlockRefTable *brtab = palloc(sizeof(BlockRefTable));
240 * Even completely empty database has a few hundred relation forks, so it
241 * seems best to size the hash on the assumption that we're going to have
242 * at least a few thousand entries.
244 #ifdef FRONTEND
245 brtab->hash = blockreftable_create(4096, NULL);
246 #else
247 brtab->mcxt = CurrentMemoryContext;
248 brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
249 #endif
251 return brtab;
255 * Set the "limit block" for a relation fork and forget any modified blocks
256 * with equal or higher block numbers.
258 * The "limit block" is the shortest known length of the relation within the
259 * range of WAL records covered by this block reference table.
261 void
262 BlockRefTableSetLimitBlock(BlockRefTable *brtab,
263 const RelFileLocator *rlocator,
264 ForkNumber forknum,
265 BlockNumber limit_block)
267 BlockRefTableEntry *brtentry;
268 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
269 bool found;
271 memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
272 key.forknum = forknum;
273 brtentry = blockreftable_insert(brtab->hash, key, &found);
275 if (!found)
278 * We have no existing data about this relation fork, so just record
279 * the limit_block value supplied by the caller, and make sure other
280 * parts of the entry are properly initialized.
282 brtentry->limit_block = limit_block;
283 brtentry->nchunks = 0;
284 brtentry->chunk_size = NULL;
285 brtentry->chunk_usage = NULL;
286 brtentry->chunk_data = NULL;
287 return;
290 BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
294 * Mark a block in a given relation fork as known to have been modified.
296 void
297 BlockRefTableMarkBlockModified(BlockRefTable *brtab,
298 const RelFileLocator *rlocator,
299 ForkNumber forknum,
300 BlockNumber blknum)
302 BlockRefTableEntry *brtentry;
303 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
304 bool found;
305 #ifndef FRONTEND
306 MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
307 #endif
309 memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
310 key.forknum = forknum;
311 brtentry = blockreftable_insert(brtab->hash, key, &found);
313 if (!found)
316 * We want to set the initial limit block value to something higher
317 * than any legal block number. InvalidBlockNumber fits the bill.
319 brtentry->limit_block = InvalidBlockNumber;
320 brtentry->nchunks = 0;
321 brtentry->chunk_size = NULL;
322 brtentry->chunk_usage = NULL;
323 brtentry->chunk_data = NULL;
326 BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);
328 #ifndef FRONTEND
329 MemoryContextSwitchTo(oldcontext);
330 #endif
334 * Get an entry from a block reference table.
336 * If the entry does not exist, this function returns NULL. Otherwise, it
337 * returns the entry and sets *limit_block to the value from the entry.
339 BlockRefTableEntry *
340 BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
341 ForkNumber forknum, BlockNumber *limit_block)
343 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
344 BlockRefTableEntry *entry;
346 Assert(limit_block != NULL);
348 memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
349 key.forknum = forknum;
350 entry = blockreftable_lookup(brtab->hash, key);
352 if (entry != NULL)
353 *limit_block = entry->limit_block;
355 return entry;
359 * Get block numbers from a table entry.
361 * 'blocks' must point to enough space to hold at least 'nblocks' block
362 * numbers, and any block numbers we manage to get will be written there.
363 * The return value is the number of block numbers actually written.
365 * We do not return block numbers unless they are greater than or equal to
366 * start_blkno and strictly less than stop_blkno.
369 BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
370 BlockNumber start_blkno,
371 BlockNumber stop_blkno,
372 BlockNumber *blocks,
373 int nblocks)
375 uint32 start_chunkno;
376 uint32 stop_chunkno;
377 uint32 chunkno;
378 int nresults = 0;
380 Assert(entry != NULL);
383 * Figure out which chunks could potentially contain blocks of interest.
385 * We need to be careful about overflow here, because stop_blkno could be
386 * InvalidBlockNumber or something very close to it.
388 start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
389 stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
390 if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
391 ++stop_chunkno;
392 if (stop_chunkno > entry->nchunks)
393 stop_chunkno = entry->nchunks;
396 * Loop over chunks.
398 for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
400 uint16 chunk_usage = entry->chunk_usage[chunkno];
401 BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
402 unsigned start_offset = 0;
403 unsigned stop_offset = BLOCKS_PER_CHUNK;
406 * If the start and/or stop block number falls within this chunk, the
407 * whole chunk may not be of interest. Figure out which portion we
408 * care about, if it's not the whole thing.
410 if (chunkno == start_chunkno)
411 start_offset = start_blkno % BLOCKS_PER_CHUNK;
412 if (chunkno == stop_chunkno - 1)
413 stop_offset = stop_blkno % BLOCKS_PER_CHUNK;
416 * Handling differs depending on whether this is an array of offsets
417 * or a bitmap.
419 if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
421 unsigned i;
423 /* It's a bitmap, so test every relevant bit. */
424 for (i = start_offset; i < stop_offset; ++i)
426 uint16 w = chunk_data[i / BLOCKS_PER_ENTRY];
428 if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
430 BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;
432 blocks[nresults++] = blkno;
434 /* Early exit if we run out of output space. */
435 if (nresults == nblocks)
436 return nresults;
440 else
442 unsigned i;
444 /* It's an array of offsets, so check each one. */
445 for (i = 0; i < chunk_usage; ++i)
447 uint16 offset = chunk_data[i];
449 if (offset >= start_offset && offset < stop_offset)
451 BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;
453 blocks[nresults++] = blkno;
455 /* Early exit if we run out of output space. */
456 if (nresults == nblocks)
457 return nresults;
463 return nresults;
467 * Serialize a block reference table to a file.
469 void
470 WriteBlockRefTable(BlockRefTable *brtab,
471 io_callback_fn write_callback,
472 void *write_callback_arg)
474 BlockRefTableSerializedEntry *sdata = NULL;
475 BlockRefTableBuffer buffer;
476 uint32 magic = BLOCKREFTABLE_MAGIC;
478 /* Prepare buffer. */
479 memset(&buffer, 0, sizeof(BlockRefTableBuffer));
480 buffer.io_callback = write_callback;
481 buffer.io_callback_arg = write_callback_arg;
482 INIT_CRC32C(buffer.crc);
484 /* Write magic number. */
485 BlockRefTableWrite(&buffer, &magic, sizeof(uint32));
487 /* Write the entries, assuming there are some. */
488 if (brtab->hash->members > 0)
490 unsigned i = 0;
491 blockreftable_iterator it;
492 BlockRefTableEntry *brtentry;
494 /* Extract entries into serializable format and sort them. */
495 sdata =
496 palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
497 blockreftable_start_iterate(brtab->hash, &it);
498 while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
500 BlockRefTableSerializedEntry *sentry = &sdata[i++];
502 sentry->rlocator = brtentry->key.rlocator;
503 sentry->forknum = brtentry->key.forknum;
504 sentry->limit_block = brtentry->limit_block;
505 sentry->nchunks = brtentry->nchunks;
507 /* trim trailing zero entries */
508 while (sentry->nchunks > 0 &&
509 brtentry->chunk_usage[sentry->nchunks - 1] == 0)
510 sentry->nchunks--;
512 Assert(i == brtab->hash->members);
513 qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
514 BlockRefTableComparator);
516 /* Loop over entries in sorted order and serialize each one. */
517 for (i = 0; i < brtab->hash->members; ++i)
519 BlockRefTableSerializedEntry *sentry = &sdata[i];
520 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
521 unsigned j;
523 /* Write the serialized entry itself. */
524 BlockRefTableWrite(&buffer, sentry,
525 sizeof(BlockRefTableSerializedEntry));
527 /* Look up the original entry so we can access the chunks. */
528 memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
529 key.forknum = sentry->forknum;
530 brtentry = blockreftable_lookup(brtab->hash, key);
531 Assert(brtentry != NULL);
533 /* Write the untruncated portion of the chunk length array. */
534 if (sentry->nchunks != 0)
535 BlockRefTableWrite(&buffer, brtentry->chunk_usage,
536 sentry->nchunks * sizeof(uint16));
538 /* Write the contents of each chunk. */
539 for (j = 0; j < brtentry->nchunks; ++j)
541 if (brtentry->chunk_usage[j] == 0)
542 continue;
543 BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
544 brtentry->chunk_usage[j] * sizeof(uint16));
549 /* Write out appropriate terminator and CRC and flush buffer. */
550 BlockRefTableFileTerminate(&buffer);
554 * Prepare to incrementally read a block reference table file.
556 * 'read_callback' is a function that can be called to read data from the
557 * underlying file (or other data source) into our internal buffer.
559 * 'read_callback_arg' is an opaque argument to be passed to read_callback.
561 * 'error_filename' is the filename that should be included in error messages
562 * if the file is found to be malformed. The value is not copied, so the
563 * caller should ensure that it remains valid until done with this
564 * BlockRefTableReader.
566 * 'error_callback' is a function to be called if the file is found to be
567 * malformed. This is not used for I/O errors, which must be handled internally
568 * by read_callback.
570 * 'error_callback_arg' is an opaque argument to be passed to error_callback.
572 BlockRefTableReader *
573 CreateBlockRefTableReader(io_callback_fn read_callback,
574 void *read_callback_arg,
575 char *error_filename,
576 report_error_fn error_callback,
577 void *error_callback_arg)
579 BlockRefTableReader *reader;
580 uint32 magic;
582 /* Initialize data structure. */
583 reader = palloc0(sizeof(BlockRefTableReader));
584 reader->buffer.io_callback = read_callback;
585 reader->buffer.io_callback_arg = read_callback_arg;
586 reader->error_filename = error_filename;
587 reader->error_callback = error_callback;
588 reader->error_callback_arg = error_callback_arg;
589 INIT_CRC32C(reader->buffer.crc);
591 /* Verify magic number. */
592 BlockRefTableRead(reader, &magic, sizeof(uint32));
593 if (magic != BLOCKREFTABLE_MAGIC)
594 error_callback(error_callback_arg,
595 "file \"%s\" has wrong magic number: expected %u, found %u",
596 error_filename,
597 BLOCKREFTABLE_MAGIC, magic);
599 return reader;
603 * Read next relation fork covered by this block reference table file.
605 * After calling this function, you must call BlockRefTableReaderGetBlocks
606 * until it returns 0 before calling it again.
608 bool
609 BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
610 RelFileLocator *rlocator,
611 ForkNumber *forknum,
612 BlockNumber *limit_block)
614 BlockRefTableSerializedEntry sentry;
615 BlockRefTableSerializedEntry zentry = {{0}};
618 * Sanity check: caller must read all blocks from all chunks before moving
619 * on to the next relation.
621 Assert(reader->total_chunks == reader->consumed_chunks);
623 /* Read serialized entry. */
624 BlockRefTableRead(reader, &sentry,
625 sizeof(BlockRefTableSerializedEntry));
628 * If we just read the sentinel entry indicating that we've reached the
629 * end, read and check the CRC.
631 if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
633 pg_crc32c expected_crc;
634 pg_crc32c actual_crc;
637 * We want to know the CRC of the file excluding the 4-byte CRC
638 * itself, so copy the current value of the CRC accumulator before
639 * reading those bytes, and use the copy to finalize the calculation.
641 expected_crc = reader->buffer.crc;
642 FIN_CRC32C(expected_crc);
644 /* Now we can read the actual value. */
645 BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));
647 /* Throw an error if there is a mismatch. */
648 if (!EQ_CRC32C(expected_crc, actual_crc))
649 reader->error_callback(reader->error_callback_arg,
650 "file \"%s\" has wrong checksum: expected %08X, found %08X",
651 reader->error_filename, expected_crc, actual_crc);
653 return false;
656 /* Read chunk size array. */
657 if (reader->chunk_size != NULL)
658 pfree(reader->chunk_size);
659 reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
660 BlockRefTableRead(reader, reader->chunk_size,
661 sentry.nchunks * sizeof(uint16));
663 /* Set up for chunk scan. */
664 reader->total_chunks = sentry.nchunks;
665 reader->consumed_chunks = 0;
667 /* Return data to caller. */
668 memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
669 *forknum = sentry.forknum;
670 *limit_block = sentry.limit_block;
671 return true;
675 * Get modified blocks associated with the relation fork returned by
676 * the most recent call to BlockRefTableReaderNextRelation.
678 * On return, block numbers will be written into the 'blocks' array, whose
679 * length should be passed via 'nblocks'. The return value is the number of
680 * entries actually written into the 'blocks' array, which may be less than
681 * 'nblocks' if we run out of modified blocks in the relation fork before
682 * we run out of room in the array.
684 unsigned
685 BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
686 BlockNumber *blocks,
687 int nblocks)
689 unsigned blocks_found = 0;
691 /* Must provide space for at least one block number to be returned. */
692 Assert(nblocks > 0);
694 /* Loop collecting blocks to return to caller. */
695 for (;;)
697 uint16 next_chunk_size;
700 * If we've read at least one chunk, maybe it contains some block
701 * numbers that could satisfy caller's request.
703 if (reader->consumed_chunks > 0)
705 uint32 chunkno = reader->consumed_chunks - 1;
706 uint16 chunk_size = reader->chunk_size[chunkno];
708 if (chunk_size == MAX_ENTRIES_PER_CHUNK)
710 /* Bitmap format, so search for bits that are set. */
711 while (reader->chunk_position < BLOCKS_PER_CHUNK &&
712 blocks_found < nblocks)
714 uint16 chunkoffset = reader->chunk_position;
715 uint16 w;
717 w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
718 if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
719 blocks[blocks_found++] =
720 chunkno * BLOCKS_PER_CHUNK + chunkoffset;
721 ++reader->chunk_position;
724 else
726 /* Not in bitmap format, so each entry is a 2-byte offset. */
727 while (reader->chunk_position < chunk_size &&
728 blocks_found < nblocks)
730 blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
731 + reader->chunk_data[reader->chunk_position];
732 ++reader->chunk_position;
737 /* We found enough blocks, so we're done. */
738 if (blocks_found >= nblocks)
739 break;
742 * We didn't find enough blocks, so we must need the next chunk. If
743 * there are none left, though, then we're done anyway.
745 if (reader->consumed_chunks == reader->total_chunks)
746 break;
749 * Read data for next chunk and reset scan position to beginning of
750 * chunk. Note that the next chunk might be empty, in which case we
751 * consume the chunk without actually consuming any bytes from the
752 * underlying file.
754 next_chunk_size = reader->chunk_size[reader->consumed_chunks];
755 if (next_chunk_size > 0)
756 BlockRefTableRead(reader, reader->chunk_data,
757 next_chunk_size * sizeof(uint16));
758 ++reader->consumed_chunks;
759 reader->chunk_position = 0;
762 return blocks_found;
766 * Release memory used while reading a block reference table from a file.
768 void
769 DestroyBlockRefTableReader(BlockRefTableReader *reader)
771 if (reader->chunk_size != NULL)
773 pfree(reader->chunk_size);
774 reader->chunk_size = NULL;
776 pfree(reader);
780 * Prepare to write a block reference table file incrementally.
782 * Caller must be able to supply BlockRefTableEntry objects sorted in the
783 * appropriate order.
785 BlockRefTableWriter *
786 CreateBlockRefTableWriter(io_callback_fn write_callback,
787 void *write_callback_arg)
789 BlockRefTableWriter *writer;
790 uint32 magic = BLOCKREFTABLE_MAGIC;
792 /* Prepare buffer and CRC check and save callbacks. */
793 writer = palloc0(sizeof(BlockRefTableWriter));
794 writer->buffer.io_callback = write_callback;
795 writer->buffer.io_callback_arg = write_callback_arg;
796 INIT_CRC32C(writer->buffer.crc);
798 /* Write magic number. */
799 BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));
801 return writer;
805 * Append one entry to a block reference table file.
807 * Note that entries must be written in the proper order, that is, sorted by
808 * tablespace, then database, then relfilenumber, then fork number. Caller
809 * is responsible for supplying data in the correct order. If that seems hard,
810 * use an in-memory BlockRefTable instead.
812 void
813 BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
815 BlockRefTableSerializedEntry sentry;
816 unsigned j;
818 /* Convert to serialized entry format. */
819 sentry.rlocator = entry->key.rlocator;
820 sentry.forknum = entry->key.forknum;
821 sentry.limit_block = entry->limit_block;
822 sentry.nchunks = entry->nchunks;
824 /* Trim trailing zero entries. */
825 while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
826 sentry.nchunks--;
828 /* Write the serialized entry itself. */
829 BlockRefTableWrite(&writer->buffer, &sentry,
830 sizeof(BlockRefTableSerializedEntry));
832 /* Write the untruncated portion of the chunk length array. */
833 if (sentry.nchunks != 0)
834 BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
835 sentry.nchunks * sizeof(uint16));
837 /* Write the contents of each chunk. */
838 for (j = 0; j < entry->nchunks; ++j)
840 if (entry->chunk_usage[j] == 0)
841 continue;
842 BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
843 entry->chunk_usage[j] * sizeof(uint16));
848 * Finalize an incremental write of a block reference table file.
850 void
851 DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
853 BlockRefTableFileTerminate(&writer->buffer);
854 pfree(writer);
858 * Allocate a standalone BlockRefTableEntry.
860 * When we're manipulating a full in-memory BlockRefTable, the entries are
861 * part of the hash table and are allocated by simplehash. This routine is
862 * used by callers that want to write out a BlockRefTable to a file without
863 * needing to store the whole thing in memory at once.
865 * Entries allocated by this function can be manipulated using the functions
866 * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
867 * and then written using BlockRefTableWriteEntry and freed using
868 * BlockRefTableFreeEntry.
870 BlockRefTableEntry *
871 CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
873 BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));
875 memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
876 entry->key.forknum = forknum;
877 entry->limit_block = InvalidBlockNumber;
879 return entry;
883 * Update a BlockRefTableEntry with a new value for the "limit block" and
884 * forget any equal-or-higher-numbered modified blocks.
886 * The "limit block" is the shortest known length of the relation within the
887 * range of WAL records covered by this block reference table.
889 void
890 BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
891 BlockNumber limit_block)
893 unsigned chunkno;
894 unsigned limit_chunkno;
895 unsigned limit_chunkoffset;
896 BlockRefTableChunk limit_chunk;
898 /* If we already have an equal or lower limit block, do nothing. */
899 if (limit_block >= entry->limit_block)
900 return;
902 /* Record the new limit block value. */
903 entry->limit_block = limit_block;
906 * Figure out which chunk would store the state of the new limit block,
907 * and which offset within that chunk.
909 limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
910 limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;
913 * If the number of chunks is not large enough for any blocks with equal
914 * or higher block numbers to exist, then there is nothing further to do.
916 if (limit_chunkno >= entry->nchunks)
917 return;
919 /* Discard entire contents of any higher-numbered chunks. */
920 for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
921 entry->chunk_usage[chunkno] = 0;
924 * Next, we need to discard any offsets within the chunk that would
925 * contain the limit_block. We must handle this differently depending on
926 * whether the chunk that would contain limit_block is a bitmap or an
927 * array of offsets.
929 limit_chunk = entry->chunk_data[limit_chunkno];
930 if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
932 unsigned chunkoffset;
934 /* It's a bitmap. Unset bits. */
935 for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
936 ++chunkoffset)
937 limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
938 ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
940 else
942 unsigned i,
943 j = 0;
945 /* It's an offset array. Filter out large offsets. */
946 for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
948 Assert(j <= i);
949 if (limit_chunk[i] < limit_chunkoffset)
950 limit_chunk[j++] = limit_chunk[i];
952 Assert(j <= entry->chunk_usage[limit_chunkno]);
953 entry->chunk_usage[limit_chunkno] = j;
958 * Mark a block in a given BlockRefTableEntry as known to have been modified.
960 void
961 BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
962 ForkNumber forknum,
963 BlockNumber blknum)
965 unsigned chunkno;
966 unsigned chunkoffset;
967 unsigned i;
970 * Which chunk should store the state of this block? And what is the
971 * offset of this block relative to the start of that chunk?
973 chunkno = blknum / BLOCKS_PER_CHUNK;
974 chunkoffset = blknum % BLOCKS_PER_CHUNK;
977 * If 'nchunks' isn't big enough for us to be able to represent the state
978 * of this block, we need to enlarge our arrays.
980 if (chunkno >= entry->nchunks)
982 unsigned max_chunks;
983 unsigned extra_chunks;
986 * New array size is a power of 2, at least 16, big enough so that
987 * chunkno will be a valid array index.
989 max_chunks = Max(16, entry->nchunks);
990 while (max_chunks < chunkno + 1)
991 max_chunks *= 2;
992 extra_chunks = max_chunks - entry->nchunks;
994 if (entry->nchunks == 0)
996 entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
997 entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
998 entry->chunk_data =
999 palloc0(sizeof(BlockRefTableChunk) * max_chunks);
1001 else
1003 entry->chunk_size = repalloc(entry->chunk_size,
1004 sizeof(uint16) * max_chunks);
1005 memset(&entry->chunk_size[entry->nchunks], 0,
1006 extra_chunks * sizeof(uint16));
1007 entry->chunk_usage = repalloc(entry->chunk_usage,
1008 sizeof(uint16) * max_chunks);
1009 memset(&entry->chunk_usage[entry->nchunks], 0,
1010 extra_chunks * sizeof(uint16));
1011 entry->chunk_data = repalloc(entry->chunk_data,
1012 sizeof(BlockRefTableChunk) * max_chunks);
1013 memset(&entry->chunk_data[entry->nchunks], 0,
1014 extra_chunks * sizeof(BlockRefTableChunk));
1016 entry->nchunks = max_chunks;
1020 * If the chunk that covers this block number doesn't exist yet, create it
1021 * as an array and add the appropriate offset to it. We make it pretty
1022 * small initially, because there might only be 1 or a few block
1023 * references in this chunk and we don't want to use up too much memory.
1025 if (entry->chunk_size[chunkno] == 0)
1027 entry->chunk_data[chunkno] =
1028 palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
1029 entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
1030 entry->chunk_data[chunkno][0] = chunkoffset;
1031 entry->chunk_usage[chunkno] = 1;
1032 return;
1036 * If the number of entries in this chunk is already maximum, it must be a
1037 * bitmap. Just set the appropriate bit.
1039 if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
1041 BlockRefTableChunk chunk = entry->chunk_data[chunkno];
1043 chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1044 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1045 return;
1049 * There is an existing chunk and it's in array format. Let's find out
1050 * whether it already has an entry for this block. If so, we do not need
1051 * to do anything.
1053 for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
1055 if (entry->chunk_data[chunkno][i] == chunkoffset)
1056 return;
1060 * If the number of entries currently used is one less than the maximum,
1061 * it's time to convert to bitmap format.
1063 if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
1065 BlockRefTableChunk newchunk;
1066 unsigned j;
1068 /* Allocate a new chunk. */
1069 newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));
1071 /* Set the bit for each existing entry. */
1072 for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
1074 unsigned coff = entry->chunk_data[chunkno][j];
1076 newchunk[coff / BLOCKS_PER_ENTRY] |=
1077 1 << (coff % BLOCKS_PER_ENTRY);
1080 /* Set the bit for the new entry. */
1081 newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1082 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1084 /* Swap the new chunk into place and update metadata. */
1085 pfree(entry->chunk_data[chunkno]);
1086 entry->chunk_data[chunkno] = newchunk;
1087 entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
1088 entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
1089 return;
1093 * OK, we currently have an array, and we don't need to convert to a
1094 * bitmap, but we do need to add a new element. If there's not enough
1095 * room, we'll have to expand the array.
1097 if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
1099 unsigned newsize = entry->chunk_size[chunkno] * 2;
1101 Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
1102 entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
1103 newsize * sizeof(uint16));
1104 entry->chunk_size[chunkno] = newsize;
1107 /* Now we can add the new entry. */
1108 entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
1109 chunkoffset;
1110 entry->chunk_usage[chunkno]++;
1114 * Release memory for a BlockRefTableEntry that was created by
1115 * CreateBlockRefTableEntry.
1117 void
1118 BlockRefTableFreeEntry(BlockRefTableEntry *entry)
1120 if (entry->chunk_size != NULL)
1122 pfree(entry->chunk_size);
1123 entry->chunk_size = NULL;
1126 if (entry->chunk_usage != NULL)
1128 pfree(entry->chunk_usage);
1129 entry->chunk_usage = NULL;
1132 if (entry->chunk_data != NULL)
1134 pfree(entry->chunk_data);
1135 entry->chunk_data = NULL;
1138 pfree(entry);
1142 * Comparator for BlockRefTableSerializedEntry objects.
1144 * We make the tablespace OID the first column of the sort key to match
1145 * the on-disk tree structure.
1147 static int
1148 BlockRefTableComparator(const void *a, const void *b)
1150 const BlockRefTableSerializedEntry *sa = a;
1151 const BlockRefTableSerializedEntry *sb = b;
1153 if (sa->rlocator.spcOid > sb->rlocator.spcOid)
1154 return 1;
1155 if (sa->rlocator.spcOid < sb->rlocator.spcOid)
1156 return -1;
1158 if (sa->rlocator.dbOid > sb->rlocator.dbOid)
1159 return 1;
1160 if (sa->rlocator.dbOid < sb->rlocator.dbOid)
1161 return -1;
1163 if (sa->rlocator.relNumber > sb->rlocator.relNumber)
1164 return 1;
1165 if (sa->rlocator.relNumber < sb->rlocator.relNumber)
1166 return -1;
1168 if (sa->forknum > sb->forknum)
1169 return 1;
1170 if (sa->forknum < sb->forknum)
1171 return -1;
1173 return 0;
1177 * Flush any buffered data out of a BlockRefTableBuffer.
1179 static void
1180 BlockRefTableFlush(BlockRefTableBuffer *buffer)
1182 buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
1183 buffer->used = 0;
/*
 * Read data from a BlockRefTableBuffer, and update the running CRC
 * calculation for the returned data (but not any data that we may have
 * buffered but not yet actually returned).
 *
 * NOTE(review): loop termination on a truncated file relies on
 * reader->error_callback not returning; if it did return after a
 * zero-length read, the "refill" branch could loop forever. Confirm the
 * callback's contract at the call sites that install it.
 */
static void
BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
{
	BlockRefTableBuffer *buffer = &reader->buffer;

	/* Loop until read is fully satisfied. */
	while (length > 0)
	{
		if (buffer->cursor < buffer->used)
		{
			/*
			 * If any buffered data is available, use that to satisfy as much
			 * of the request as possible.
			 */
			int			bytes_to_copy = Min(length, buffer->used - buffer->cursor);

			memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
			/* Only bytes actually handed back to the caller enter the CRC. */
			COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
						bytes_to_copy);
			buffer->cursor += bytes_to_copy;
			data = ((char *) data) + bytes_to_copy;
			length -= bytes_to_copy;
		}
		else if (length >= BUFSIZE)
		{
			/*
			 * If the request length is long, read directly into caller's
			 * buffer.
			 */
			int			bytes_read;

			/* A short read is fine here; the outer loop will issue more reads. */
			bytes_read = buffer->io_callback(buffer->io_callback_arg,
											 data, length);
			COMP_CRC32C(buffer->crc, data, bytes_read);
			data = ((char *) data) + bytes_read;
			length -= bytes_read;

			/* If we didn't get anything, that's bad. */
			if (bytes_read == 0)
				reader->error_callback(reader->error_callback_arg,
									   "file \"%s\" ends unexpectedly",
									   reader->error_filename);
		}
		else
		{
			/*
			 * Refill our buffer.
			 */
			buffer->used = buffer->io_callback(buffer->io_callback_arg,
											   buffer->data, BUFSIZE);
			buffer->cursor = 0;

			/* If we didn't get anything, that's bad. */
			if (buffer->used == 0)
				reader->error_callback(reader->error_callback_arg,
									   "file \"%s\" ends unexpectedly",
									   reader->error_filename);
		}
	}
}
1253 * Supply data to a BlockRefTableBuffer for write to the underlying File,
1254 * and update the running CRC calculation for that data.
1256 static void
1257 BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
1259 /* Update running CRC calculation. */
1260 COMP_CRC32C(buffer->crc, data, length);
1262 /* If the new data can't fit into the buffer, flush the buffer. */
1263 if (buffer->used + length > BUFSIZE)
1265 buffer->io_callback(buffer->io_callback_arg, buffer->data,
1266 buffer->used);
1267 buffer->used = 0;
1270 /* If the new data would fill the buffer, or more, write it directly. */
1271 if (length >= BUFSIZE)
1273 buffer->io_callback(buffer->io_callback_arg, data, length);
1274 return;
1277 /* Otherwise, copy the new data into the buffer. */
1278 memcpy(&buffer->data[buffer->used], data, length);
1279 buffer->used += length;
1280 Assert(buffer->used <= BUFSIZE);
/*
 * Generate the sentinel and CRC required at the end of a block reference
 * table file and flush them out of our internal buffer.
 */
static void
BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
{
	/* All-zeroes serialized entry; presumably recognized by the reader as
	 * the end-of-entries marker — confirm against the read path. */
	BlockRefTableSerializedEntry zentry = {{0}};
	pg_crc32c	crc;

	/* Write a sentinel indicating that there are no more entries. */
	BlockRefTableWrite(buffer, &zentry,
					   sizeof(BlockRefTableSerializedEntry));

	/*
	 * Writing the checksum will perturb the ongoing checksum calculation, so
	 * copy the state first and finalize the computation using the copy.
	 */
	/* Note: the snapshot is taken after the sentinel write, so the stored
	 * CRC covers everything up to and including the sentinel, but not the
	 * CRC field itself. */
	crc = buffer->crc;
	FIN_CRC32C(crc);
	BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));

	/* Flush any leftover data out of our buffer. */
	BlockRefTableFlush(buffer);
}