/*-------------------------------------------------------------------------
 *
 * freepage.c
 *	  Management of free memory pages.
 *
 * The intention of this code is to provide infrastructure for memory
 * allocators written specifically for PostgreSQL.  At least in the case
 * of dynamic shared memory, we can't simply use malloc() or even
 * relatively thin wrappers like palloc() which sit on top of it, because
 * no allocator built into the operating system will deal with relative
 * pointers.  In the future, we may find other cases in which greater
 * control over our own memory management seems desirable.
 *
 * A FreePageManager keeps track of which 4kB pages of memory are currently
 * unused from the point of view of some higher-level memory allocator.
 * Unlike a user-facing allocator such as palloc(), a FreePageManager can
 * only allocate and free in units of whole pages, and freeing an
 * allocation can only be done given knowledge of its length in pages.
 *
 * Since a free page manager has only a fixed amount of dedicated memory,
 * and since there is no underlying allocator, it uses the free pages
 * it is given to manage to store its bookkeeping data.  It keeps multiple
 * freelists of runs of pages, sorted by the size of the run; the head of
 * each freelist is stored in the FreePageManager itself, and the first
 * page of each run contains a relative pointer to the next run.  See
 * FreePageManagerGetInternal for more details on how the freelists are
 * managed.
 *
 * To avoid memory fragmentation, it's important to consolidate adjacent
 * spans of pages whenever possible; otherwise, large allocation requests
 * might not be satisfied even when sufficient contiguous space is
 * available.  Therefore, in addition to the freelists, we maintain an
 * in-memory btree of free page ranges ordered by page number.  If a
 * range being freed precedes or follows a range that is already free,
 * the existing range is extended; if it exactly bridges the gap between
 * free ranges, then the two existing ranges are consolidated with the
 * newly-freed range to form one great big range of free pages.
 *
 * When there is only one range of free pages, the btree is trivial and
 * is stored within the FreePageManager proper; otherwise, pages are
 * allocated from the area under management as needed.  Even in cases
 * where memory fragmentation is very severe, only a tiny fraction of
 * the pages under management are consumed by this btree.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/freepage.c
 *
 *-------------------------------------------------------------------------
 */
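
/*
 * A minimal usage sketch (illustrative only; the region layout, the page
 * counts, and the use_memory() callee below are assumptions, not anything
 * defined in this file):
 *
 *		char	   *base = <start of the managed region>;
 *		FreePageManager *fpm = <caller-provided FreePageManager storage>;
 *		Size		first_page;
 *
 *		FreePageManagerInitialize(fpm, base);
 *		FreePageManagerPut(fpm, 1, 99);		(donate pages 1..99; page 0
 *											 here is reserved for metadata)
 *
 *		if (FreePageManagerGet(fpm, 4, &first_page))
 *			use_memory(fpm_page_to_pointer(base, first_page));
 *
 *		(the caller must remember the run's length in order to free it)
 *		FreePageManagerPut(fpm, first_page, 4);
 */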

#include "postgres.h"

#include "lib/stringinfo.h"
#include "miscadmin.h"

#include "utils/freepage.h"
#include "utils/relptr.h"
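
/*
 * All linkage in this file uses relative pointers (see utils/relptr.h):
 * relptr_store(base, rp, ptr) saves ptr as an offset from base,
 * relptr_access(base, rp) converts it back into an ordinary pointer,
 * relptr_is_null(rp) tests for the null offset, and relptr_copy(dst, src)
 * copies one relative pointer to another.  This is what allows the same
 * structures to be mapped at different addresses in different processes.
 */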

/* Magic numbers to identify various page types */
#define FREE_PAGE_SPAN_LEADER_MAGIC		0xea4020f0
#define FREE_PAGE_LEAF_MAGIC			0x98eae728
#define FREE_PAGE_INTERNAL_MAGIC		0x19aa32c9

/* Doubly linked list of spans of free pages; stored in first page of span. */
struct FreePageSpanLeader
{
	int			magic;			/* always FREE_PAGE_SPAN_LEADER_MAGIC */
	Size		npages;			/* number of pages in span */
	RelptrFreePageSpanLeader prev;
	RelptrFreePageSpanLeader next;
};

/* Common header for btree leaf and internal pages. */
typedef struct FreePageBtreeHeader
{
	int			magic;			/* FREE_PAGE_LEAF_MAGIC or
								 * FREE_PAGE_INTERNAL_MAGIC */
	Size		nused;			/* number of items used */
	RelptrFreePageBtree parent; /* uplink */
} FreePageBtreeHeader;

/* Internal key; points to next level of btree. */
typedef struct FreePageBtreeInternalKey
{
	Size		first_page;		/* low bound for keys on child page */
	RelptrFreePageBtree child;	/* downlink */
} FreePageBtreeInternalKey;

/* Leaf key; no payload data. */
typedef struct FreePageBtreeLeafKey
{
	Size		first_page;		/* first page in span */
	Size		npages;			/* number of pages in span */
} FreePageBtreeLeafKey;

/* Work out how many keys will fit on a page. */
#define FPM_ITEMS_PER_INTERNAL_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
	 sizeof(FreePageBtreeInternalKey))
#define FPM_ITEMS_PER_LEAF_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
	 sizeof(FreePageBtreeLeafKey))
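
/*
 * For a sense of scale (illustrative; assumes a typical 64-bit build with
 * FPM_PAGE_SIZE of 4kB): the header occupies 24 bytes and each key type
 * occupies 16 bytes, so a btree page holds (4096 - 24) / 16 = 254 keys of
 * either kind.
 */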

/* A btree page of either sort */
struct FreePageBtree
{
	FreePageBtreeHeader hdr;
	union
	{
		FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
		FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
	}			u;
};

/* Results of a btree search */
typedef struct FreePageBtreeSearchResult
{
	FreePageBtree *page;
	Size		index;
	bool		found;
	unsigned	split_pages;
} FreePageBtreeSearchResult;

/* Helper functions */
static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
											FreePageBtree *btp);
static Size FreePageBtreeCleanup(FreePageManager *fpm);
static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
												   FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
													FreePageBtree *btp);
static Size FreePageBtreeFirstKey(FreePageBtree *btp);
static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
										Size index, Size first_page, FreePageBtree *child);
static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
									Size first_page, Size npages);
static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
								Size index);
static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
								FreePageBtreeSearchResult *result);
static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
											 FreePageBtree *btp);
static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
									 FreePageBtree *parent, int level, StringInfo buf);
static void FreePageManagerDumpSpans(FreePageManager *fpm,
									 FreePageSpanLeader *span, Size expected_pages,
									 StringInfo buf);
static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
									   Size *first_page);
static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
									   Size npages, bool soft);
static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
								   Size npages);
static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
static void FreePageManagerUpdateLargest(FreePageManager *fpm);

#ifdef FPM_EXTRA_ASSERTS
static Size sum_free_pages(FreePageManager *fpm);
#endif

/*
 * Initialize a new, empty free page manager.
 *
 * 'fpm' should reference caller-provided memory large enough to contain a
 * FreePageManager.  We'll initialize it here.
 *
 * 'base' is the address to which all pointers are relative.  When managing
 * a dynamic shared memory segment, it should normally be the base of the
 * segment.  When managing backend-private memory, it can be either NULL or,
 * if managing a single contiguous extent of memory, the start of that extent.
 */
void
FreePageManagerInitialize(FreePageManager *fpm, char *base)
{
	Size		f;

	relptr_store(base, fpm->self, fpm);
	relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
	relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
	fpm->btree_depth = 0;
	fpm->btree_recycle_count = 0;
	fpm->singleton_first_page = 0;
	fpm->singleton_npages = 0;
	fpm->contiguous_pages = 0;
	fpm->contiguous_pages_dirty = true;
#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages = 0;
#endif

	for (f = 0; f < FPM_NUM_FREELISTS; f++)
		relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
}

/*
 * Allocate a run of pages of the given length from the free page manager.
 * The return value indicates whether we were able to satisfy the request;
 * if true, the first page of the allocation is stored in *first_page.
 */
bool
FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
{
	bool		result;
	Size		contiguous_pages;

	result = FreePageManagerGetInternal(fpm, npages, first_page);

	/*
	 * It's a bit counterintuitive, but allocating pages can actually create
	 * opportunities for cleanup that create larger ranges.  We might pull a
	 * key out of the btree that enables the item at the head of the btree
	 * recycle list to be inserted; and then if there are more items behind
	 * it one of those might cause two currently-separated ranges to merge,
	 * creating a single range of contiguous pages larger than any that
	 * existed previously.  It might be worth trying to improve the cleanup
	 * algorithm to avoid such corner cases, but for now we just notice the
	 * condition and do the appropriate reporting.
	 */
	contiguous_pages = FreePageBtreeCleanup(fpm);
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * FreePageManagerGetInternal may have set contiguous_pages_dirty.
	 * Recompute contiguous_pages if so.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	if (result)
	{
		Assert(fpm->free_pages >= npages);
		fpm->free_pages -= npages;
	}
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
	return result;
}

#ifdef FPM_EXTRA_ASSERTS
static void
sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
{
	char	   *base = fpm_segment_base(fpm);

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
		   btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	++*sum;
	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		Size		index;

		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			sum_free_pages_recurse(fpm, child, sum);
		}
	}
}

static Size
sum_free_pages(FreePageManager *fpm)
{
	FreePageSpanLeader *recycle;
	char	   *base = fpm_segment_base(fpm);
	Size		sum = 0;
	int			list;

	/* Count the spans by scanning the freelists. */
	for (list = 0; list < FPM_NUM_FREELISTS; ++list)
	{
		if (!relptr_is_null(fpm->freelist[list]))
		{
			FreePageSpanLeader *candidate =
			relptr_access(base, fpm->freelist[list]);

			do
			{
				sum += candidate->npages;
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
	}

	/* Count btree internal pages. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		sum_free_pages_recurse(fpm, root, &sum);
	}

	/* Count the recycle list. */
	for (recycle = relptr_access(base, fpm->btree_recycle);
		 recycle != NULL;
		 recycle = relptr_access(base, recycle->next))
	{
		Assert(recycle->npages == 1);
		++sum;
	}

	return sum;
}
#endif

/*
 * Compute the size of the largest run of pages that the user could
 * successfully get.
 */
static Size
FreePageManagerLargestContiguous(FreePageManager *fpm)
{
	char	   *base;
	Size		largest;

	base = fpm_segment_base(fpm);
	largest = 0;
	if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
	{
		FreePageSpanLeader *candidate;

		candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
		do
		{
			if (candidate->npages > largest)
				largest = candidate->npages;
			candidate = relptr_access(base, candidate->next);
		} while (candidate != NULL);
	}
	else
	{
		Size		f = FPM_NUM_FREELISTS - 1;

		do
		{
			--f;
			if (!relptr_is_null(fpm->freelist[f]))
			{
				largest = f + 1;
				break;
			}
		} while (f > 0);
	}

	return largest;
}

/*
 * Recompute the size of the largest run of pages that the user could
 * successfully get, if it has been marked dirty.
 */
static void
FreePageManagerUpdateLargest(FreePageManager *fpm)
{
	if (!fpm->contiguous_pages_dirty)
		return;

	fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
	fpm->contiguous_pages_dirty = false;
}

/*
 * Transfer a run of pages to the free page manager.
 */
void
FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
{
	Size		contiguous_pages;

	Assert(npages > 0);

	/* Record the new pages. */
	contiguous_pages =
		FreePageManagerPutInternal(fpm, first_page, npages, false);

	/*
	 * If the new range we inserted into the page manager was contiguous with
	 * an existing range, it may have opened up cleanup opportunities.
	 */
	if (contiguous_pages > npages)
	{
		Size		cleanup_contiguous_pages;

		cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
		if (cleanup_contiguous_pages > contiguous_pages)
			contiguous_pages = cleanup_contiguous_pages;
	}

	/* See if we now have a new largest chunk. */
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * The earlier call to FreePageManagerPutInternal may have set
	 * contiguous_pages_dirty if it needed to allocate internal pages, so
	 * recompute contiguous_pages if necessary.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages += npages;
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
}
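
/*
 * For example (page numbers illustrative): after
 *
 *		FreePageManagerPut(fpm, 10, 10);	(pages 10..19)
 *		FreePageManagerPut(fpm, 20, 10);	(pages 20..29)
 *
 * the two ranges are consolidated into a single 20-page span starting at
 * page 10, so a subsequent FreePageManagerGet(fpm, 20, &first_page) can
 * succeed.
 */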

/*
 * Produce a debugging dump of the state of a free page manager.
 */
char *
FreePageManagerDump(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	StringInfoData buf;
	FreePageSpanLeader *recycle;
	bool		dumped_any_freelist = false;
	Size		f;

	/* Initialize output buffer. */
	initStringInfo(&buf);

	/* Dump general stuff. */
	appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
					 fpm->self.relptr_off, fpm->contiguous_pages);

	/* Dump btree. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root;

		appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
		root = relptr_access(base, fpm->btree_root);
		FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
	}
	else if (fpm->singleton_npages > 0)
	{
		appendStringInfo(&buf, "singleton: %zu(%zu)\n",
						 fpm->singleton_first_page, fpm->singleton_npages);
	}

	/* Dump btree recycle list. */
	recycle = relptr_access(base, fpm->btree_recycle);
	if (recycle != NULL)
	{
		appendStringInfoString(&buf, "btree recycle:");
		FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
	}

	/* Dump free lists. */
	for (f = 0; f < FPM_NUM_FREELISTS; ++f)
	{
		FreePageSpanLeader *span;

		if (relptr_is_null(fpm->freelist[f]))
			continue;
		if (!dumped_any_freelist)
		{
			appendStringInfoString(&buf, "freelists:\n");
			dumped_any_freelist = true;
		}
		appendStringInfo(&buf, " %zu:", f + 1);
		span = relptr_access(base, fpm->freelist[f]);
		FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
	}

	/* And return result to caller. */
	return buf.data;
}
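
/*
 * For a manager tracking a single free span, the dump is shaped like this
 * (all values illustrative, derived from the format strings above):
 *
 *		metadata: self 64 max contiguous pages = 99
 *		singleton: 1(99)
 *		freelists:
 *		 99: 1
 */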

/*
 * The first_page value stored at index zero in any non-root page must match
 * the first_page value stored in its parent at the index which points to that
 * page.  So when the value stored at index zero in a btree page changes, we've
 * got to walk up the tree adjusting ancestor keys until we reach an ancestor
 * where that key isn't index zero.  This function should be called after
 * updating the first key on the target page; it will propagate the change
 * upward as far as needed.
 *
 * We assume here that the first key on the page has not changed enough to
 * require changes in the ordering of keys on its ancestor pages.  Thus,
 * if we search the parent page for the first key greater than or equal to
 * the first key on the current page, the downlink to this page will be either
 * the exact index returned by the search (if the first key decreased)
 * or one less (if the first key increased).
 */
static void
FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	Size		first_page;
	FreePageBtree *parent;
	FreePageBtree *child;

	/* This might be either a leaf or an internal page. */
	Assert(btp->hdr.nused > 0);
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
		first_page = btp->u.leaf_key[0].first_page;
	}
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
		first_page = btp->u.internal_key[0].first_page;
	}
	child = btp;

	/* Loop until we find an ancestor that does not require adjustment. */
	for (;;)
	{
		Size		s;

		parent = relptr_access(base, child->hdr.parent);
		if (parent == NULL)
			break;
		s = FreePageBtreeSearchInternal(parent, first_page);

		/* Key is either at index s or index s-1; figure out which. */
		if (s >= parent->hdr.nused)
		{
			Assert(s == parent->hdr.nused);
			--s;
		}
		else
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			if (check != child)
			{
				Assert(s > 0);
				--s;
			}
		}

#ifdef USE_ASSERT_CHECKING
		/* Debugging double-check. */
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			Assert(s < parent->hdr.nused);
			Assert(child == check);
		}
#endif

		/* Update the parent key. */
		parent->u.internal_key[s].first_page = first_page;

		/*
		 * If this is the first key in the parent, go up another level; else
		 * done.
		 */
		if (s > 0)
			break;
		child = parent;
	}
}

/*
 * Attempt to reclaim space from the free-page btree.  The return value is
 * the largest range of contiguous pages created by the cleanup operation.
 */
static Size
FreePageBtreeCleanup(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	Size		max_contiguous_pages = 0;

	/* Attempt to shrink the depth of the btree. */
	while (!relptr_is_null(fpm->btree_root))
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		/* If the root contains only one key, reduce depth by one. */
		if (root->hdr.nused == 1)
		{
			/* Shrink depth of tree by one. */
			Assert(fpm->btree_depth > 0);
			--fpm->btree_depth;
			if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
			{
				/* If root is a leaf, convert only entry to singleton range. */
				relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
				fpm->singleton_first_page = root->u.leaf_key[0].first_page;
				fpm->singleton_npages = root->u.leaf_key[0].npages;
			}
			else
			{
				FreePageBtree *newroot;

				/* If root is an internal page, make only child the root. */
				Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
				relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
				newroot = relptr_access(base, fpm->btree_root);
				relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
			}
			FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
		}
		else if (root->hdr.nused == 2 &&
				 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			Size		end_of_first;
			Size		start_of_second;

			end_of_first = root->u.leaf_key[0].first_page +
				root->u.leaf_key[0].npages;
			start_of_second = root->u.leaf_key[1].first_page;

			if (end_of_first + 1 == start_of_second)
			{
				Size		root_page = fpm_pointer_to_page(base, root);

				if (end_of_first == root_page)
				{
					FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
					FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
					fpm->singleton_first_page = root->u.leaf_key[0].first_page;
					fpm->singleton_npages = root->u.leaf_key[0].npages +
						root->u.leaf_key[1].npages + 1;
					fpm->btree_depth = 0;
					relptr_store(base, fpm->btree_root,
								 (FreePageBtree *) NULL);
					FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
										   fpm->singleton_npages);
					Assert(max_contiguous_pages == 0);
					max_contiguous_pages = fpm->singleton_npages;
				}
			}

			/* Whether it worked or not, it's time to stop. */
			break;
		}
		else
		{
			/* Nothing more to do.  Stop. */
			break;
		}
	}

	/*
	 * Attempt to free recycled btree pages.  We skip this if releasing the
	 * recycled page would require a btree page split, because the page we're
	 * trying to recycle would be consumed by the split, which would be
	 * counterproductive.
	 *
	 * We also currently only ever attempt to recycle the first page on the
	 * list; that could be made more aggressive, but it's not clear that the
	 * complexity would be worthwhile.
	 */
	while (fpm->btree_recycle_count > 0)
	{
		FreePageBtree *btp;
		Size		first_page;
		Size		contiguous_pages;

		btp = FreePageBtreeGetRecycled(fpm);
		first_page = fpm_pointer_to_page(base, btp);
		contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
		if (contiguous_pages == 0)
		{
			FreePageBtreeRecycle(fpm, first_page);
			break;
		}
		else
		{
			if (contiguous_pages > max_contiguous_pages)
				max_contiguous_pages = contiguous_pages;
		}
	}

	return max_contiguous_pages;
}

/*
 * Consider consolidating the given page with its left or right sibling,
 * if it's fairly empty.
 */
static void
FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *np;
	Size		max;

	/*
	 * We only try to consolidate pages that are less than a third full.  We
	 * could be more aggressive about this, but that might risk performing
	 * consolidation only to end up splitting again shortly thereafter.  Since
	 * the btree should be very small compared to the space under management,
	 * our goal isn't so much to ensure that it always occupies the absolutely
	 * smallest possible number of pages as to reclaim pages before things get
	 * too egregiously out of hand.
	 */
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		max = FPM_ITEMS_PER_LEAF_PAGE;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		max = FPM_ITEMS_PER_INTERNAL_PAGE;
	}
	if (btp->hdr.nused >= max / 3)
		return;

	/*
	 * If we can fit our right sibling's keys onto this page, consolidate.
	 */
	np = FreePageBtreeFindRightSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
		}
		else
		{
			memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, btp);
		}
		FreePageBtreeRemovePage(fpm, np);
		return;
	}

	/*
	 * If we can fit our keys onto our left sibling's page, consolidate.  In
	 * this case, we move our keys onto the other page rather than vice versa,
	 * to avoid having to adjust ancestor keys.
	 */
	np = FreePageBtreeFindLeftSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
		}
		else
		{
			memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, np);
		}
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}
}

/*
 * Find the passed page's left sibling; that is, the page at the same level
 * of the tree whose keyspace immediately precedes ours.
 */
static FreePageBtree *
FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move left. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the leftmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index > 0)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index - 1].child);
			break;
		}
		Assert(index == 0);
		++levels;
	}

	/* Descend right. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Find the passed page's right sibling; that is, the page at the same level
 * of the tree whose keyspace immediately follows ours.
 */
static FreePageBtree *
FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move right. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the rightmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index < p->hdr.nused - 1)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index + 1].child);
			break;
		}
		Assert(index == p->hdr.nused - 1);
		++levels;
	}

	/* Descend left. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[0].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Get the first key on a btree page.
 */
static Size
FreePageBtreeFirstKey(FreePageBtree *btp)
{
	Assert(btp->hdr.nused > 0);

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		return btp->u.leaf_key[0].first_page;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		return btp->u.internal_key[0].first_page;
	}
}

/*
 * Get a page from the btree recycle list for use as a btree page.
 */
static FreePageBtree *
FreePageBtreeGetRecycled(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *newhead;

	Assert(victim != NULL);
	newhead = relptr_access(base, victim->next);
	if (newhead != NULL)
		relptr_copy(newhead->prev, victim->prev);
	relptr_store(base, fpm->btree_recycle, newhead);
	Assert(fpm_pointer_is_page_aligned(base, victim));
	fpm->btree_recycle_count--;
	return (FreePageBtree *) victim;
}

/*
 * Insert an item into an internal page.
 */
static void
FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
							Size first_page, FreePageBtree *child)
{
	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
			sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
	btp->u.internal_key[index].first_page = first_page;
	relptr_store(base, btp->u.internal_key[index].child, child);
	++btp->hdr.nused;
}

/*
 * Insert an item into a leaf page.
 */
static void
FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
						Size npages)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
			sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
	btp->u.leaf_key[index].first_page = first_page;
	btp->u.leaf_key[index].npages = npages;
	++btp->hdr.nused;
}

/*
 * Put a page on the btree recycle list.
 */
static void
FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *span;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
	span->npages = 1;
	relptr_store(base, span->next, head);
	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
	if (head != NULL)
		relptr_store(base, head->prev, span);
	relptr_store(base, fpm->btree_recycle, span);
	fpm->btree_recycle_count++;
}

/*
 * Remove an item from the btree at the given position on the given page.
 */
static void
FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(index < btp->hdr.nused);

	/* When last item is removed, extirpate entire page from btree. */
	if (btp->hdr.nused == 1)
	{
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}

	/* Physically remove the key from the page. */
	--btp->hdr.nused;
	if (index < btp->hdr.nused)
		memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
				sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));

	/* If we just removed the first key, adjust ancestor keys. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, btp);

	/* Consider whether to consolidate this page with a sibling. */
	FreePageBtreeConsolidate(fpm, btp);
}

/*
 * Remove a page from the btree.  Caller is responsible for having relocated
 * any keys from this page that are still wanted.  The page is placed on the
 * recycled list.
 */
static void
FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *parent;
	Size		index;
	Size		first_page;

	for (;;)
	{
		/* Find parent page. */
		parent = relptr_access(base, btp->hdr.parent);
		if (parent == NULL)
		{
			/* We are removing the root page. */
			relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
			fpm->btree_depth = 0;
			Assert(fpm->singleton_first_page == 0);
			Assert(fpm->singleton_npages == 0);
			return;
		}

		/*
		 * If the parent contains only one item, we need to remove it as well.
		 */
		if (parent->hdr.nused > 1)
			break;
		FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
		btp = parent;
	}

	/* Find and remove the downlink. */
	first_page = FreePageBtreeFirstKey(btp);
	if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		index = FreePageBtreeSearchLeaf(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.leaf_key[index],
					&parent->u.leaf_key[index + 1],
					sizeof(FreePageBtreeLeafKey)
					* (parent->hdr.nused - index - 1));
	}
	else
	{
		index = FreePageBtreeSearchInternal(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.internal_key[index],
					&parent->u.internal_key[index + 1],
					sizeof(FreePageBtreeInternalKey)
					* (parent->hdr.nused - index - 1));
	}
	parent->hdr.nused--;
	Assert(parent->hdr.nused > 0);

	/* Recycle the page. */
	FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));

	/* Adjust ancestor keys if needed. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, parent);

	/* Consider whether to consolidate the parent with a sibling. */
	FreePageBtreeConsolidate(fpm, parent);
}

/*
 * Search the btree for an entry for the given first page and initialize
 * *result with the results of the search.  result->page and result->index
 * indicate either the position of an exact match or the position at which
 * the new key should be inserted.  result->found is true for an exact match,
 * otherwise false.  result->split_pages will contain the number of additional
 * btree pages that will be needed when performing a split to insert a key.
 * Except as described above, the contents of fields in the result object are
 * undefined on return.
 */
static void
FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
					FreePageBtreeSearchResult *result)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *btp = relptr_access(base, fpm->btree_root);
	Size		index;

	result->split_pages = 1;

	/* If the btree is empty, there's nothing to find. */
	if (btp == NULL)
	{
		result->page = NULL;
		result->found = false;
		return;
	}

	/* Descend until we hit a leaf. */
	while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		FreePageBtree *child;
		bool		found_exact;

		index = FreePageBtreeSearchInternal(btp, first_page);
		found_exact = index < btp->hdr.nused &&
			btp->u.internal_key[index].first_page == first_page;

		/*
		 * If we found an exact match we descend directly.  Otherwise, we
		 * descend into the child to the left if possible so that we can find
		 * the insertion point at that child's high end.
		 */
		if (!found_exact && index > 0)
			--index;

		/* Track required split depth for leaf insert. */
		if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
		{
			Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
			result->split_pages++;
		}
		else
			result->split_pages = 0;

		/* Descend to appropriate child page. */
		Assert(index < btp->hdr.nused);
		child = relptr_access(base, btp->u.internal_key[index].child);
		Assert(relptr_access(base, child->hdr.parent) == btp);
		btp = child;
	}

	/* Track required split depth for leaf insert. */
	if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
	{
		Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
		result->split_pages++;
	}
	else
		result->split_pages = 0;

	/* Search leaf page. */
	index = FreePageBtreeSearchLeaf(btp, first_page);

	/* Assemble results. */
	result->page = btp;
	result->index = index;
	result->found = index < btp->hdr.nused &&
		first_page == btp->u.leaf_key[index].first_page;
}

/*
 * Search an internal page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.internal_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}

/*
 * Search a leaf page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.leaf_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}

/*
 * Allocate a new btree page and move half the keys from the provided page
 * to the new page.  Caller is responsible for making sure that there's a
 * page available from fpm->btree_recycle.  Returns a pointer to the new page,
 * to which caller must add a downlink.
 */
static FreePageBtree *
FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
{
	FreePageBtree *newsibling;

	newsibling = FreePageBtreeGetRecycled(fpm);
	newsibling->hdr.magic = btp->hdr.magic;
	newsibling->hdr.nused = btp->hdr.nused / 2;
	relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
	btp->hdr.nused -= newsibling->hdr.nused;

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		memcpy(&newsibling->u.leaf_key,
			   &btp->u.leaf_key[btp->hdr.nused],
			   sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		memcpy(&newsibling->u.internal_key,
			   &btp->u.internal_key[btp->hdr.nused],
			   sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
		FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
	}

	return newsibling;
}

/*
 * When internal pages are split or merged, the parent pointers of their
 * children must be updated.
 */
static void
FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
{
	Size		i;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	for (i = 0; i < btp->hdr.nused; ++i)
	{
		FreePageBtree *child;

		child = relptr_access(base, btp->u.internal_key[i].child);
		relptr_store(base, child->hdr.parent, btp);
	}
}

/*
 * Debugging dump of btree data.
 */
static void
FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
						 FreePageBtree *parent, int level, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);
	Size		pageno = fpm_pointer_to_page(base, btp);
	Size		index;
	FreePageBtree *check_parent;

	check_stack_depth();
	check_parent = relptr_access(base, btp->hdr.parent);
	appendStringInfo(buf, " %zu@%d %c", pageno, level,
					 btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
	if (parent != check_parent)
		appendStringInfo(buf, " [actual parent %zu, expected %zu]",
						 fpm_pointer_to_page(base, check_parent),
						 fpm_pointer_to_page(base, parent));
	appendStringInfoChar(buf, ':');
	for (index = 0; index < btp->hdr.nused; ++index)
	{
		if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
			appendStringInfo(buf, " %zu->%zu",
							 btp->u.internal_key[index].first_page,
							 btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE);
		else
			appendStringInfo(buf, " %zu(%zu)",
							 btp->u.leaf_key[index].first_page,
							 btp->u.leaf_key[index].npages);
	}
	appendStringInfoChar(buf, '\n');

	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
		}
	}
}

/*
 * Debugging dump of free-span data.
 */
static void
FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
						 Size expected_pages, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);

	while (span != NULL)
	{
		if (span->npages != expected_pages)
			appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
							 span->npages);
		else
			appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
		span = relptr_access(base, span->next);
	}

	appendStringInfoChar(buf, '\n');
}

/*
 * This function allocates a run of pages of the given length from the free
 * page manager.
 */
static bool
FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = NULL;
	FreePageSpanLeader *prev;
	FreePageSpanLeader *next;
	FreePageBtreeSearchResult result;
	Size		victim_page = 0;	/* placate compiler */
	Size		f;

	/*
	 * Search for a free span.
	 *
	 * Right now, we use a simple best-fit policy here, but it's possible for
	 * this to result in memory fragmentation if we're repeatedly asked to
	 * allocate chunks just a little smaller than what we have available.
	 * Hopefully, this is unlikely, because we expect most requests to be
	 * single pages or superblock-sized chunks -- but no policy can be optimal
	 * under all circumstances unless it has knowledge of future allocation
	 * patterns.
	 */
	for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
	{
		/* Skip empty freelists. */
		if (relptr_is_null(fpm->freelist[f]))
			continue;

		/*
		 * All of the freelists except the last one contain only items of a
		 * single size, so we just take the first one.  But the final free
		 * list contains everything too big for any of the other lists, so we
		 * need to search the list.
		 */
		if (f < FPM_NUM_FREELISTS - 1)
			victim = relptr_access(base, fpm->freelist[f]);
		else
		{
			FreePageSpanLeader *candidate;

			candidate = relptr_access(base, fpm->freelist[f]);
			do
			{
				if (candidate->npages >= npages && (victim == NULL ||
													victim->npages > candidate->npages))
				{
					victim = candidate;
					if (victim->npages == npages)
						break;
				}
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
		break;
	}

	/* If we didn't find an allocatable span, return failure. */
	if (victim == NULL)
		return false;

	/* Remove span from free list. */
	Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
	prev = relptr_access(base, victim->prev);
	next = relptr_access(base, victim->next);
	if (prev != NULL)
		relptr_copy(prev->next, victim->next);
	else
		relptr_copy(fpm->freelist[f], victim->next);
	if (next != NULL)
		relptr_copy(next->prev, victim->prev);
	victim_page = fpm_pointer_to_page(base, victim);

	/* Decide whether we might be invalidating contiguous_pages. */
	if (f == FPM_NUM_FREELISTS - 1 &&
		victim->npages == fpm->contiguous_pages)
	{
		/*
		 * The victim span came from the oversized freelist, and had the same
		 * size as the longest span.  There may or may not be another one of
		 * the same size, so contiguous_pages must be recomputed just to be
		 * safe.
		 */
		fpm->contiguous_pages_dirty = true;
	}
	else if (f + 1 == fpm->contiguous_pages &&
			 relptr_is_null(fpm->freelist[f]))
	{
		/*
		 * The victim span came from a fixed sized freelist, and it was the
		 * list for spans of the same size as the current longest span, and
		 * the list is now empty after removing the victim.  So
		 * contiguous_pages must be recomputed without a doubt.
		 */
		fpm->contiguous_pages_dirty = true;
	}

	/*
	 * If we haven't initialized the btree yet, the victim must be the single
	 * span stored within the FreePageManager itself.  Otherwise, we need to
	 * update the btree.
	 */
	if (relptr_is_null(fpm->btree_root))
	{
		Assert(victim_page == fpm->singleton_first_page);
		Assert(victim->npages == fpm->singleton_npages);
		Assert(victim->npages >= npages);
		fpm->singleton_first_page += npages;
		fpm->singleton_npages -= npages;
		if (fpm->singleton_npages > 0)
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
	}
	else
	{
		/*
		 * If the span we found is exactly the right size, remove it from the
		 * btree completely.  Otherwise, adjust the btree entry to reflect the
		 * still-unallocated portion of the span, and put that portion on the
		 * appropriate free list.
		 */
		FreePageBtreeSearch(fpm, victim_page, &result);
		Assert(result.found);
		if (victim->npages == npages)
			FreePageBtreeRemove(fpm, result.page, result.index);
		else
		{
			FreePageBtreeLeafKey *key;

			/* Adjust btree to reflect remaining pages. */
			Assert(victim->npages > npages);
			key = &result.page->u.leaf_key[result.index];
			Assert(key->npages == victim->npages);
			key->first_page += npages;
			key->npages -= npages;
			if (result.index == 0)
				FreePageBtreeAdjustAncestorKeys(fpm, result.page);

			/* Put the unallocated pages back on the appropriate free list. */
			FreePagePushSpanLeader(fpm, victim_page + npages,
								   victim->npages - npages);
		}
	}

	/* Return results to caller. */
	*first_page = fpm_pointer_to_page(base, victim);
	return true;
}

/*
 * Put a range of pages into the btree and freelists, consolidating it with
 * existing free spans just before and/or after it.  If 'soft' is true,
 * only perform the insertion if it can be done without allocating new btree
 * pages; if false, do it always.  Returns 0 if the soft flag caused the
 * insertion to be skipped, or otherwise the size of the contiguous span
 * created by the insertion.  This may be larger than npages if we're able
 * to consolidate with an adjacent range.
 */
static Size
FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
						   bool soft)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtreeSearchResult result;
	FreePageBtreeLeafKey *prevkey = NULL;
	FreePageBtreeLeafKey *nextkey = NULL;
	FreePageBtree *np;
	Size		nindex;

	Assert(npages > 0);

	/* We can store a single free span without initializing the btree. */
	if (fpm->btree_depth == 0)
	{
		if (fpm->singleton_npages == 0)
		{
			/* Don't have a span yet; store this one. */
			fpm->singleton_first_page = first_page;
			fpm->singleton_npages = npages;
			FreePagePushSpanLeader(fpm, first_page, npages);
			return fpm->singleton_npages;
		}
		else if (fpm->singleton_first_page + fpm->singleton_npages ==
				 first_page)
		{
			/* New span immediately follows sole existing span. */
			fpm->singleton_npages += npages;
			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
			return fpm->singleton_npages;
		}
		else if (first_page + npages == fpm->singleton_first_page)
		{
			/* New span immediately precedes sole existing span. */
			FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
			fpm->singleton_first_page = first_page;
			fpm->singleton_npages += npages;
			FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
								   fpm->singleton_npages);
			return fpm->singleton_npages;
		}
		else
		{
			/* Not contiguous; we need to initialize the btree. */
			Size		root_page;
			FreePageBtree *root;

			if (!relptr_is_null(fpm->btree_recycle))
				root = FreePageBtreeGetRecycled(fpm);
			else if (soft)
				return 0;		/* Should not allocate if soft. */
			else if (FreePageManagerGetInternal(fpm, 1, &root_page))
				root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
			else
			{
				/* We'd better be able to get a page from the existing range. */
				elog(FATAL, "free page manager btree is corrupt");
			}

			/* Create the btree and move the preexisting range into it. */
			root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
			root->hdr.nused = 1;
			relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
			root->u.leaf_key[0].first_page = fpm->singleton_first_page;
			root->u.leaf_key[0].npages = fpm->singleton_npages;
			relptr_store(base, fpm->btree_root, root);
			fpm->singleton_first_page = 0;
			fpm->singleton_npages = 0;
			fpm->btree_depth = 1;

			/*
			 * Corner case: it may be that the btree root took the very last
			 * free page.  In that case, the sole btree entry covers a zero
			 * page run, which is invalid.  Overwrite it with the entry we're
			 * trying to insert and get out.
			 */
			if (root->u.leaf_key[0].npages == 0)
			{
				root->u.leaf_key[0].first_page = first_page;
				root->u.leaf_key[0].npages = npages;
				FreePagePushSpanLeader(fpm, first_page, npages);
				return npages;
			}

			/* Fall through to insert the new key. */
		}
	}

	/* Search the btree. */
	FreePageBtreeSearch(fpm, first_page, &result);
	Assert(!result.found);
	if (result.index > 0)
		prevkey = &result.page->u.leaf_key[result.index - 1];
	if (result.index < result.page->hdr.nused)
	{
		np = result.page;
		nindex = result.index;
		nextkey = &result.page->u.leaf_key[result.index];
	}
	else
	{
		np = FreePageBtreeFindRightSibling(base, result.page);
		nindex = 0;
		if (np != NULL)
			nextkey = &np->u.leaf_key[0];
	}

	/* Consolidate with the previous entry if possible. */
	if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
	{
		bool		remove_next = false;
		Size		result;

		Assert(prevkey->first_page + prevkey->npages == first_page);
		prevkey->npages = (first_page - prevkey->first_page) + npages;

		/* Check whether we can *also* consolidate with the following entry. */
		if (nextkey != NULL &&
			prevkey->first_page + prevkey->npages >= nextkey->first_page)
		{
			Assert(prevkey->first_page + prevkey->npages ==
				   nextkey->first_page);
			prevkey->npages = (nextkey->first_page - prevkey->first_page)
				+ nextkey->npages;
			FreePagePopSpanLeader(fpm, nextkey->first_page);
			remove_next = true;
		}

		/* Put the span on the correct freelist and save size. */
		FreePagePopSpanLeader(fpm, prevkey->first_page);
		FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
		result = prevkey->npages;

		/*
		 * If we consolidated with both the preceding and following entries,
		 * we must remove the following entry.  We do this last, because
		 * removing an element from the btree may invalidate pointers we hold
		 * into the current data structure.
		 *
		 * NB: The btree is technically in an invalid state at this point
		 * because we've already updated prevkey to cover the same key space
		 * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
		 */
		if (remove_next)
			FreePageBtreeRemove(fpm, np, nindex);

		return result;
	}

	/* Consolidate with the next entry if possible. */
	if (nextkey != NULL && first_page + npages >= nextkey->first_page)
	{
		Size		newpages;

		/* Compute new size for span. */
		Assert(first_page + npages == nextkey->first_page);
		newpages = (nextkey->first_page - first_page) + nextkey->npages;

		/* Put span on correct free list. */
		FreePagePopSpanLeader(fpm, nextkey->first_page);
		FreePagePushSpanLeader(fpm, first_page, newpages);

		/* Update key in place. */
		nextkey->first_page = first_page;
		nextkey->npages = newpages;

		/* If reducing first key on page, ancestors might need adjustment. */
		if (nindex == 0)
			FreePageBtreeAdjustAncestorKeys(fpm, np);

		return nextkey->npages;
	}

	/* Split leaf page and as many of its ancestors as necessary. */
	if (result.split_pages > 0)
	{
		/*
		 * NB: We could consider various coping strategies here to avoid a
		 * split; most obviously, if np != result.page, we could target that
		 * page instead.  More complicated shuffling strategies could be
		 * possible as well; basically, unless every single leaf page is 100%
		 * full, we can jam this key in there if we try hard enough.  It's
		 * unlikely that trying that hard is worthwhile, but it's possible we
		 * might need to make more than no effort.  For now, we just do the
		 * easy thing, which is nothing.
		 */

		/* If this is a soft insert, it's time to give up. */
		if (soft)
			return 0;

		/* Check whether we need to allocate more btree pages to split. */
		if (result.split_pages > fpm->btree_recycle_count)
		{
			Size		pages_needed;
			Size		recycle_page;
			Size		i;

			/*
			 * Allocate the required number of pages and split each one in
			 * turn.  This should never fail, because if we've got enough
			 * spans of free pages kicking around that we need additional
			 * storage space just to remember them all, then we should
			 * certainly have enough to expand the btree, which should only
			 * ever use a tiny number of pages compared to the number under
			 * management.  If it does, something's badly screwed up.
			 */
			pages_needed = result.split_pages - fpm->btree_recycle_count;
			for (i = 0; i < pages_needed; ++i)
			{
				if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
					elog(FATAL, "free page manager btree is corrupt");
				FreePageBtreeRecycle(fpm, recycle_page);
			}

			/*
			 * The act of allocating pages to recycle may have invalidated the
			 * results of our previous btree research, so repeat it.  (We
			 * could recheck whether any of our split-avoidance strategies
			 * that were not viable before now are, but it hardly seems
			 * worthwhile, so we don't bother.  Consolidation can't be
			 * possible now if it wasn't previously.)
			 */
			FreePageBtreeSearch(fpm, first_page, &result);

			/*
			 * The act of allocating pages for use in constructing our btree
			 * should never cause any page to become more full, so the new
			 * split depth should be no greater than the old one, and perhaps
			 * less if we fortuitously allocated a chunk that freed up a slot
			 * on the page we need to update.
			 */
			Assert(result.split_pages <= fpm->btree_recycle_count);
		}

		/* If we still need to perform a split, do it. */
		if (result.split_pages > 0)
		{
			FreePageBtree *split_target = result.page;
			FreePageBtree *child = NULL;
			Size		key = first_page;

			for (;;)
			{
				FreePageBtree *newsibling;
				FreePageBtree *parent;

				/* Identify parent page, which must receive downlink. */
				parent = relptr_access(base, split_target->hdr.parent);

				/* Split the page - downlink not added yet. */
				newsibling = FreePageBtreeSplitPage(fpm, split_target);

				/*
				 * At this point in the loop, we're always carrying a pending
				 * insertion.  On the first pass, it's the actual key we're
				 * trying to insert; on subsequent passes, it's the downlink
				 * that needs to be added as a result of the split performed
				 * during the previous loop iteration.  Since we've just split
				 * the page, there's definitely room on one of the two
				 * resulting pages.
				 */
				if (child == NULL)
				{
					Size		index;
					FreePageBtree *insert_into;

					insert_into = key < newsibling->u.leaf_key[0].first_page ?
						split_target : newsibling;
					index = FreePageBtreeSearchLeaf(insert_into, key);
					FreePageBtreeInsertLeaf(insert_into, index, key, npages);
					if (index == 0 && insert_into == split_target)
						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
				}
				else
				{
					Size		index;
					FreePageBtree *insert_into;

					insert_into =
						key < newsibling->u.internal_key[0].first_page ?
						split_target : newsibling;
					index = FreePageBtreeSearchInternal(insert_into, key);
					FreePageBtreeInsertInternal(base, insert_into, index,
												key, child);
					relptr_store(base, child->hdr.parent, insert_into);
					if (index == 0 && insert_into == split_target)
						FreePageBtreeAdjustAncestorKeys(fpm, split_target);
				}

				/* If the page we just split has no parent, split the root. */
				if (parent == NULL)
				{
					FreePageBtree *newroot;

					newroot = FreePageBtreeGetRecycled(fpm);
					newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
					newroot->hdr.nused = 2;
					relptr_store(base, newroot->hdr.parent,
								 (FreePageBtree *) NULL);
					newroot->u.internal_key[0].first_page =
						FreePageBtreeFirstKey(split_target);
					relptr_store(base, newroot->u.internal_key[0].child,
								 split_target);
					relptr_store(base, split_target->hdr.parent, newroot);
					newroot->u.internal_key[1].first_page =
						FreePageBtreeFirstKey(newsibling);
					relptr_store(base, newroot->u.internal_key[1].child,
								 newsibling);
					relptr_store(base, newsibling->hdr.parent, newroot);
					relptr_store(base, fpm->btree_root, newroot);
					fpm->btree_depth++;

					break;
				}

				/* If the parent page isn't full, insert the downlink. */
				key = newsibling->u.internal_key[0].first_page;
				if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
				{
					Size		index;

					index = FreePageBtreeSearchInternal(parent, key);
					FreePageBtreeInsertInternal(base, parent, index,
												key, newsibling);
					relptr_store(base, newsibling->hdr.parent, parent);
					if (index == 0)
						FreePageBtreeAdjustAncestorKeys(fpm, parent);
					break;
				}

				/* The parent also needs to be split, so loop around. */
				child = newsibling;
				split_target = parent;
			}

			/*
			 * The loop above did the insert, so just need to update the free
			 * list, and we're done.
			 */
			FreePagePushSpanLeader(fpm, first_page, npages);

			return npages;
		}
	}

	/* Physically add the key to the page. */
	Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
	FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);

	/* If new first key on page, ancestors might need adjustment. */
	if (result.index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, result.page);

	/* Put it on the free list. */
	FreePagePushSpanLeader(fpm, first_page, npages);

	return npages;
}

/*
 * Remove a FreePageSpanLeader from the linked-list that contains it, either
 * because we're changing the size of the span, or because we're allocating it.
 */
static void
FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *span;
	FreePageSpanLeader *next;
	FreePageSpanLeader *prev;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);

	next = relptr_access(base, span->next);
	prev = relptr_access(base, span->prev);
	if (next != NULL)
		relptr_copy(next->prev, span->prev);
	if (prev != NULL)
		relptr_copy(prev->next, span->next);
	else
	{
		Size		f = Min(span->npages, FPM_NUM_FREELISTS) - 1;

		Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
		relptr_copy(fpm->freelist[f], span->next);
	}
}
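
/*
 * Note on freelist bucketing, implied by the Min() computation used above
 * and in FreePagePushSpanLeader below: a span of npages lives on
 * freelist[npages - 1] when npages < FPM_NUM_FREELISTS, while all larger
 * spans share the final list, freelist[FPM_NUM_FREELISTS - 1].  So, for
 * example, a 3-page span is pushed onto freelist[2].
 */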

/*
 * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
 */
static void
FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
{
	char	   *base = fpm_segment_base(fpm);
	Size		f = Min(npages, FPM_NUM_FREELISTS) - 1;
	FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
	FreePageSpanLeader *span;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
	span->npages = npages;
	relptr_store(base, span->next, head);
	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
	if (head != NULL)
		relptr_store(base, head->prev, span);
	relptr_store(base, fpm->freelist[f], span);
}