/*-------------------------------------------------------------------------
 *
 * freepage.c
 *	  Management of free memory pages.
 *
 * The intention of this code is to provide infrastructure for memory
 * allocators written specifically for PostgreSQL.  At least in the case
 * of dynamic shared memory, we can't simply use malloc() or even
 * relatively thin wrappers like palloc() which sit on top of it, because
 * no allocator built into the operating system will deal with relative
 * pointers.  In the future, we may find other cases in which greater
 * control over our own memory management seems desirable.
 *
 * A FreePageManager keeps track of which 4kB pages of memory are currently
 * unused from the point of view of some higher-level memory allocator.
 * Unlike a user-facing allocator such as palloc(), a FreePageManager can
 * only allocate and free in units of whole pages, and freeing an
 * allocation can only be done given knowledge of its length in pages.
 *
 * Since a free page manager has only a fixed amount of dedicated memory,
 * and since there is no underlying allocator, it uses the free pages
 * it is given to manage to store its bookkeeping data.  It keeps multiple
 * freelists of runs of pages, sorted by the size of the run; the head of
 * each freelist is stored in the FreePageManager itself, and the first
 * page of each run contains a relative pointer to the next run.  See
 * FreePageManagerGetInternal for more details on how the freelists are
 * managed.
 *
 * To avoid memory fragmentation, it's important to consolidate adjacent
 * spans of pages whenever possible; otherwise, large allocation requests
 * might not be satisfied even when sufficient contiguous space is
 * available.  Therefore, in addition to the freelists, we maintain an
 * in-memory btree of free page ranges ordered by page number.  If a
 * range being freed precedes or follows a range that is already free,
 * the existing range is extended; if it exactly bridges the gap between
 * free ranges, then the two existing ranges are consolidated with the
 * newly-freed range to form one great big range of free pages.
 *
 * When there is only one range of free pages, the btree is trivial and
 * is stored within the FreePageManager proper; otherwise, pages are
 * allocated from the area under management as needed.  Even in cases
 * where memory fragmentation is very severe, only a tiny fraction of
 * the pages under management are consumed by this btree.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/freepage.c
 *
 *-------------------------------------------------------------------------
 */
55 #include "lib/stringinfo.h"
56 #include "miscadmin.h"
58 #include "utils/freepage.h"
59 #include "utils/relptr.h"
/*
 * Magic numbers used to identify the various page types at runtime
 * (span leaders, btree leaf pages, and btree internal pages).
 */
#define FREE_PAGE_SPAN_LEADER_MAGIC	0xea4020f0
#define FREE_PAGE_LEAF_MAGIC		0x98eae728
#define FREE_PAGE_INTERNAL_MAGIC	0x19aa32c9
67 /* Doubly linked list of spans of free pages; stored in first page of span. */
68 struct FreePageSpanLeader
70 int magic
; /* always FREE_PAGE_SPAN_LEADER_MAGIC */
71 Size npages
; /* number of pages in span */
72 RelptrFreePageSpanLeader prev
;
73 RelptrFreePageSpanLeader next
;
76 /* Common header for btree leaf and internal pages. */
77 typedef struct FreePageBtreeHeader
79 int magic
; /* FREE_PAGE_LEAF_MAGIC or
80 * FREE_PAGE_INTERNAL_MAGIC */
81 Size nused
; /* number of items used */
82 RelptrFreePageBtree parent
; /* uplink */
83 } FreePageBtreeHeader
;
85 /* Internal key; points to next level of btree. */
86 typedef struct FreePageBtreeInternalKey
88 Size first_page
; /* low bound for keys on child page */
89 RelptrFreePageBtree child
; /* downlink */
90 } FreePageBtreeInternalKey
;
92 /* Leaf key; no payload data. */
93 typedef struct FreePageBtreeLeafKey
95 Size first_page
; /* first page in span */
96 Size npages
; /* number of pages in span */
97 } FreePageBtreeLeafKey
;
/*
 * Work out how many keys will fit on one btree page: whatever is left
 * after the common header, divided by the per-item key size.
 */
#define FPM_ITEMS_PER_INTERNAL_PAGE \
    ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
     sizeof(FreePageBtreeInternalKey))
#define FPM_ITEMS_PER_LEAF_PAGE \
    ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
     sizeof(FreePageBtreeLeafKey))
107 /* A btree page of either sort */
110 FreePageBtreeHeader hdr
;
113 FreePageBtreeInternalKey internal_key
[FPM_ITEMS_PER_INTERNAL_PAGE
];
114 FreePageBtreeLeafKey leaf_key
[FPM_ITEMS_PER_LEAF_PAGE
];
118 /* Results of a btree search */
119 typedef struct FreePageBtreeSearchResult
124 unsigned split_pages
;
125 } FreePageBtreeSearchResult
;
127 /* Helper functions */
128 static void FreePageBtreeAdjustAncestorKeys(FreePageManager
*fpm
,
130 static Size
FreePageBtreeCleanup(FreePageManager
*fpm
);
131 static FreePageBtree
*FreePageBtreeFindLeftSibling(char *base
,
133 static FreePageBtree
*FreePageBtreeFindRightSibling(char *base
,
135 static Size
FreePageBtreeFirstKey(FreePageBtree
*btp
);
136 static FreePageBtree
*FreePageBtreeGetRecycled(FreePageManager
*fpm
);
137 static void FreePageBtreeInsertInternal(char *base
, FreePageBtree
*btp
,
138 Size index
, Size first_page
, FreePageBtree
*child
);
139 static void FreePageBtreeInsertLeaf(FreePageBtree
*btp
, Size index
,
140 Size first_page
, Size npages
);
141 static void FreePageBtreeRecycle(FreePageManager
*fpm
, Size pageno
);
142 static void FreePageBtreeRemove(FreePageManager
*fpm
, FreePageBtree
*btp
,
144 static void FreePageBtreeRemovePage(FreePageManager
*fpm
, FreePageBtree
*btp
);
145 static void FreePageBtreeSearch(FreePageManager
*fpm
, Size first_page
,
146 FreePageBtreeSearchResult
*result
);
147 static Size
FreePageBtreeSearchInternal(FreePageBtree
*btp
, Size first_page
);
148 static Size
FreePageBtreeSearchLeaf(FreePageBtree
*btp
, Size first_page
);
149 static FreePageBtree
*FreePageBtreeSplitPage(FreePageManager
*fpm
,
151 static void FreePageBtreeUpdateParentPointers(char *base
, FreePageBtree
*btp
);
152 static void FreePageManagerDumpBtree(FreePageManager
*fpm
, FreePageBtree
*btp
,
153 FreePageBtree
*parent
, int level
, StringInfo buf
);
154 static void FreePageManagerDumpSpans(FreePageManager
*fpm
,
155 FreePageSpanLeader
*span
, Size expected_pages
,
157 static bool FreePageManagerGetInternal(FreePageManager
*fpm
, Size npages
,
159 static Size
FreePageManagerPutInternal(FreePageManager
*fpm
, Size first_page
,
160 Size npages
, bool soft
);
161 static void FreePagePopSpanLeader(FreePageManager
*fpm
, Size pageno
);
162 static void FreePagePushSpanLeader(FreePageManager
*fpm
, Size first_page
,
164 static Size
FreePageManagerLargestContiguous(FreePageManager
*fpm
);
165 static void FreePageManagerUpdateLargest(FreePageManager
*fpm
);
167 #ifdef FPM_EXTRA_ASSERTS
168 static Size
sum_free_pages(FreePageManager
*fpm
);
172 * Initialize a new, empty free page manager.
174 * 'fpm' should reference caller-provided memory large enough to contain a
175 * FreePageManager. We'll initialize it here.
177 * 'base' is the address to which all pointers are relative. When managing
178 * a dynamic shared memory segment, it should normally be the base of the
179 * segment. When managing backend-private memory, it can be either NULL or,
180 * if managing a single contiguous extent of memory, the start of that extent.
183 FreePageManagerInitialize(FreePageManager
*fpm
, char *base
)
187 relptr_store(base
, fpm
->self
, fpm
);
188 relptr_store(base
, fpm
->btree_root
, (FreePageBtree
*) NULL
);
189 relptr_store(base
, fpm
->btree_recycle
, (FreePageSpanLeader
*) NULL
);
190 fpm
->btree_depth
= 0;
191 fpm
->btree_recycle_count
= 0;
192 fpm
->singleton_first_page
= 0;
193 fpm
->singleton_npages
= 0;
194 fpm
->contiguous_pages
= 0;
195 fpm
->contiguous_pages_dirty
= true;
196 #ifdef FPM_EXTRA_ASSERTS
200 for (f
= 0; f
< FPM_NUM_FREELISTS
; f
++)
201 relptr_store(base
, fpm
->freelist
[f
], (FreePageSpanLeader
*) NULL
);
205 * Allocate a run of pages of the given length from the free page manager.
206 * The return value indicates whether we were able to satisfy the request;
207 * if true, the first page of the allocation is stored in *first_page.
210 FreePageManagerGet(FreePageManager
*fpm
, Size npages
, Size
*first_page
)
213 Size contiguous_pages
;
215 result
= FreePageManagerGetInternal(fpm
, npages
, first_page
);
218 * It's a bit counterintuitive, but allocating pages can actually create
219 * opportunities for cleanup that create larger ranges. We might pull a
220 * key out of the btree that enables the item at the head of the btree
221 * recycle list to be inserted; and then if there are more items behind it
222 * one of those might cause two currently-separated ranges to merge,
223 * creating a single range of contiguous pages larger than any that
224 * existed previously. It might be worth trying to improve the cleanup
225 * algorithm to avoid such corner cases, but for now we just notice the
226 * condition and do the appropriate reporting.
228 contiguous_pages
= FreePageBtreeCleanup(fpm
);
229 if (fpm
->contiguous_pages
< contiguous_pages
)
230 fpm
->contiguous_pages
= contiguous_pages
;
233 * FreePageManagerGetInternal may have set contiguous_pages_dirty.
234 * Recompute contiguous_pages if so.
236 FreePageManagerUpdateLargest(fpm
);
238 #ifdef FPM_EXTRA_ASSERTS
241 Assert(fpm
->free_pages
>= npages
);
242 fpm
->free_pages
-= npages
;
244 Assert(fpm
->free_pages
== sum_free_pages(fpm
));
245 Assert(fpm
->contiguous_pages
== FreePageManagerLargestContiguous(fpm
));
#ifdef FPM_EXTRA_ASSERTS
/*
 * Recursively count the pages occupied by the btree itself: this page,
 * plus (for internal pages) everything reachable through its downlinks.
 */
static void
sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
{
    char       *base = fpm_segment_base(fpm);

    Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
           btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
    /* The btree page itself occupies one page. */
    ++*sum;
    if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
    {
        Size        index;

        for (index = 0; index < btp->hdr.nused; ++index)
        {
            FreePageBtree *child;

            child = relptr_access(base, btp->u.internal_key[index].child);
            sum_free_pages_recurse(fpm, child, sum);
        }
    }
}

/*
 * Debugging cross-check: total up all pages tracked by this manager —
 * free spans on the freelists, pages consumed by the btree itself, and
 * pages parked on the btree recycle list.
 */
static Size
sum_free_pages(FreePageManager *fpm)
{
    FreePageSpanLeader *recycle;
    char       *base = fpm_segment_base(fpm);
    Size        sum = 0;
    int         list;

    /* Count the spans by scanning the freelists. */
    for (list = 0; list < FPM_NUM_FREELISTS; ++list)
    {
        if (!relptr_is_null(fpm->freelist[list]))
        {
            FreePageSpanLeader *candidate =
                relptr_access(base, fpm->freelist[list]);

            do
            {
                sum += candidate->npages;
                candidate = relptr_access(base, candidate->next);
            } while (candidate != NULL);
        }
    }

    /* Count btree internal pages. */
    if (fpm->btree_depth > 0)
    {
        FreePageBtree *root = relptr_access(base, fpm->btree_root);

        sum_free_pages_recurse(fpm, root, &sum);
    }

    /* Count the recycle list; each entry is exactly one page. */
    for (recycle = relptr_access(base, fpm->btree_recycle);
         recycle != NULL;
         recycle = relptr_access(base, recycle->next))
    {
        Assert(recycle->npages == 1);
        sum += 1;
    }

    return sum;
}
#endif
320 * Compute the size of the largest run of pages that the user could
324 FreePageManagerLargestContiguous(FreePageManager
*fpm
)
329 base
= fpm_segment_base(fpm
);
331 if (!relptr_is_null(fpm
->freelist
[FPM_NUM_FREELISTS
- 1]))
333 FreePageSpanLeader
*candidate
;
335 candidate
= relptr_access(base
, fpm
->freelist
[FPM_NUM_FREELISTS
- 1]);
338 if (candidate
->npages
> largest
)
339 largest
= candidate
->npages
;
340 candidate
= relptr_access(base
, candidate
->next
);
341 } while (candidate
!= NULL
);
345 Size f
= FPM_NUM_FREELISTS
- 1;
350 if (!relptr_is_null(fpm
->freelist
[f
]))
362 * Recompute the size of the largest run of pages that the user could
363 * successfully get, if it has been marked dirty.
366 FreePageManagerUpdateLargest(FreePageManager
*fpm
)
368 if (!fpm
->contiguous_pages_dirty
)
371 fpm
->contiguous_pages
= FreePageManagerLargestContiguous(fpm
);
372 fpm
->contiguous_pages_dirty
= false;
376 * Transfer a run of pages to the free page manager.
379 FreePageManagerPut(FreePageManager
*fpm
, Size first_page
, Size npages
)
381 Size contiguous_pages
;
385 /* Record the new pages. */
387 FreePageManagerPutInternal(fpm
, first_page
, npages
, false);
390 * If the new range we inserted into the page manager was contiguous with
391 * an existing range, it may have opened up cleanup opportunities.
393 if (contiguous_pages
> npages
)
395 Size cleanup_contiguous_pages
;
397 cleanup_contiguous_pages
= FreePageBtreeCleanup(fpm
);
398 if (cleanup_contiguous_pages
> contiguous_pages
)
399 contiguous_pages
= cleanup_contiguous_pages
;
402 /* See if we now have a new largest chunk. */
403 if (fpm
->contiguous_pages
< contiguous_pages
)
404 fpm
->contiguous_pages
= contiguous_pages
;
407 * The earlier call to FreePageManagerPutInternal may have set
408 * contiguous_pages_dirty if it needed to allocate internal pages, so
409 * recompute contiguous_pages if necessary.
411 FreePageManagerUpdateLargest(fpm
);
413 #ifdef FPM_EXTRA_ASSERTS
414 fpm
->free_pages
+= npages
;
415 Assert(fpm
->free_pages
== sum_free_pages(fpm
));
416 Assert(fpm
->contiguous_pages
== FreePageManagerLargestContiguous(fpm
));
421 * Produce a debugging dump of the state of a free page manager.
424 FreePageManagerDump(FreePageManager
*fpm
)
426 char *base
= fpm_segment_base(fpm
);
428 FreePageSpanLeader
*recycle
;
429 bool dumped_any_freelist
= false;
432 /* Initialize output buffer. */
433 initStringInfo(&buf
);
435 /* Dump general stuff. */
436 appendStringInfo(&buf
, "metadata: self %zu max contiguous pages = %zu\n",
437 fpm
->self
.relptr_off
, fpm
->contiguous_pages
);
440 if (fpm
->btree_depth
> 0)
444 appendStringInfo(&buf
, "btree depth %u:\n", fpm
->btree_depth
);
445 root
= relptr_access(base
, fpm
->btree_root
);
446 FreePageManagerDumpBtree(fpm
, root
, NULL
, 0, &buf
);
448 else if (fpm
->singleton_npages
> 0)
450 appendStringInfo(&buf
, "singleton: %zu(%zu)\n",
451 fpm
->singleton_first_page
, fpm
->singleton_npages
);
454 /* Dump btree recycle list. */
455 recycle
= relptr_access(base
, fpm
->btree_recycle
);
458 appendStringInfoString(&buf
, "btree recycle:");
459 FreePageManagerDumpSpans(fpm
, recycle
, 1, &buf
);
462 /* Dump free lists. */
463 for (f
= 0; f
< FPM_NUM_FREELISTS
; ++f
)
465 FreePageSpanLeader
*span
;
467 if (relptr_is_null(fpm
->freelist
[f
]))
469 if (!dumped_any_freelist
)
471 appendStringInfoString(&buf
, "freelists:\n");
472 dumped_any_freelist
= true;
474 appendStringInfo(&buf
, " %zu:", f
+ 1);
475 span
= relptr_access(base
, fpm
->freelist
[f
]);
476 FreePageManagerDumpSpans(fpm
, span
, f
+ 1, &buf
);
479 /* And return result to caller. */
485 * The first_page value stored at index zero in any non-root page must match
486 * the first_page value stored in its parent at the index which points to that
487 * page. So when the value stored at index zero in a btree page changes, we've
488 * got to walk up the tree adjusting ancestor keys until we reach an ancestor
489 * where that key isn't index zero. This function should be called after
490 * updating the first key on the target page; it will propagate the change
491 * upward as far as needed.
493 * We assume here that the first key on the page has not changed enough to
494 * require changes in the ordering of keys on its ancestor pages. Thus,
495 * if we search the parent page for the first key greater than or equal to
496 * the first key on the current page, the downlink to this page will be either
497 * the exact index returned by the search (if the first key decreased)
498 * or one less (if the first key increased).
501 FreePageBtreeAdjustAncestorKeys(FreePageManager
*fpm
, FreePageBtree
*btp
)
503 char *base
= fpm_segment_base(fpm
);
505 FreePageBtree
*parent
;
506 FreePageBtree
*child
;
508 /* This might be either a leaf or an internal page. */
509 Assert(btp
->hdr
.nused
> 0);
510 if (btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
512 Assert(btp
->hdr
.nused
<= FPM_ITEMS_PER_LEAF_PAGE
);
513 first_page
= btp
->u
.leaf_key
[0].first_page
;
517 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
518 Assert(btp
->hdr
.nused
<= FPM_ITEMS_PER_INTERNAL_PAGE
);
519 first_page
= btp
->u
.internal_key
[0].first_page
;
523 /* Loop until we find an ancestor that does not require adjustment. */
528 parent
= relptr_access(base
, child
->hdr
.parent
);
531 s
= FreePageBtreeSearchInternal(parent
, first_page
);
533 /* Key is either at index s or index s-1; figure out which. */
534 if (s
>= parent
->hdr
.nused
)
536 Assert(s
== parent
->hdr
.nused
);
541 FreePageBtree
*check
;
543 check
= relptr_access(base
, parent
->u
.internal_key
[s
].child
);
551 #ifdef USE_ASSERT_CHECKING
552 /* Debugging double-check. */
554 FreePageBtree
*check
;
556 check
= relptr_access(base
, parent
->u
.internal_key
[s
].child
);
557 Assert(s
< parent
->hdr
.nused
);
558 Assert(child
== check
);
562 /* Update the parent key. */
563 parent
->u
.internal_key
[s
].first_page
= first_page
;
566 * If this is the first key in the parent, go up another level; else
576 * Attempt to reclaim space from the free-page btree. The return value is
577 * the largest range of contiguous pages created by the cleanup operation.
580 FreePageBtreeCleanup(FreePageManager
*fpm
)
582 char *base
= fpm_segment_base(fpm
);
583 Size max_contiguous_pages
= 0;
585 /* Attempt to shrink the depth of the btree. */
586 while (!relptr_is_null(fpm
->btree_root
))
588 FreePageBtree
*root
= relptr_access(base
, fpm
->btree_root
);
590 /* If the root contains only one key, reduce depth by one. */
591 if (root
->hdr
.nused
== 1)
593 /* Shrink depth of tree by one. */
594 Assert(fpm
->btree_depth
> 0);
596 if (root
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
598 /* If root is a leaf, convert only entry to singleton range. */
599 relptr_store(base
, fpm
->btree_root
, (FreePageBtree
*) NULL
);
600 fpm
->singleton_first_page
= root
->u
.leaf_key
[0].first_page
;
601 fpm
->singleton_npages
= root
->u
.leaf_key
[0].npages
;
605 FreePageBtree
*newroot
;
607 /* If root is an internal page, make only child the root. */
608 Assert(root
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
609 relptr_copy(fpm
->btree_root
, root
->u
.internal_key
[0].child
);
610 newroot
= relptr_access(base
, fpm
->btree_root
);
611 relptr_store(base
, newroot
->hdr
.parent
, (FreePageBtree
*) NULL
);
613 FreePageBtreeRecycle(fpm
, fpm_pointer_to_page(base
, root
));
615 else if (root
->hdr
.nused
== 2 &&
616 root
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
619 Size start_of_second
;
621 end_of_first
= root
->u
.leaf_key
[0].first_page
+
622 root
->u
.leaf_key
[0].npages
;
623 start_of_second
= root
->u
.leaf_key
[1].first_page
;
625 if (end_of_first
+ 1 == start_of_second
)
627 Size root_page
= fpm_pointer_to_page(base
, root
);
629 if (end_of_first
== root_page
)
631 FreePagePopSpanLeader(fpm
, root
->u
.leaf_key
[0].first_page
);
632 FreePagePopSpanLeader(fpm
, root
->u
.leaf_key
[1].first_page
);
633 fpm
->singleton_first_page
= root
->u
.leaf_key
[0].first_page
;
634 fpm
->singleton_npages
= root
->u
.leaf_key
[0].npages
+
635 root
->u
.leaf_key
[1].npages
+ 1;
636 fpm
->btree_depth
= 0;
637 relptr_store(base
, fpm
->btree_root
,
638 (FreePageBtree
*) NULL
);
639 FreePagePushSpanLeader(fpm
, fpm
->singleton_first_page
,
640 fpm
->singleton_npages
);
641 Assert(max_contiguous_pages
== 0);
642 max_contiguous_pages
= fpm
->singleton_npages
;
646 /* Whether it worked or not, it's time to stop. */
651 /* Nothing more to do. Stop. */
657 * Attempt to free recycled btree pages. We skip this if releasing the
658 * recycled page would require a btree page split, because the page we're
659 * trying to recycle would be consumed by the split, which would be
662 * We also currently only ever attempt to recycle the first page on the
663 * list; that could be made more aggressive, but it's not clear that the
664 * complexity would be worthwhile.
666 while (fpm
->btree_recycle_count
> 0)
670 Size contiguous_pages
;
672 btp
= FreePageBtreeGetRecycled(fpm
);
673 first_page
= fpm_pointer_to_page(base
, btp
);
674 contiguous_pages
= FreePageManagerPutInternal(fpm
, first_page
, 1, true);
675 if (contiguous_pages
== 0)
677 FreePageBtreeRecycle(fpm
, first_page
);
682 if (contiguous_pages
> max_contiguous_pages
)
683 max_contiguous_pages
= contiguous_pages
;
687 return max_contiguous_pages
;
691 * Consider consolidating the given page with its left or right sibling,
692 * if it's fairly empty.
695 FreePageBtreeConsolidate(FreePageManager
*fpm
, FreePageBtree
*btp
)
697 char *base
= fpm_segment_base(fpm
);
702 * We only try to consolidate pages that are less than a third full. We
703 * could be more aggressive about this, but that might risk performing
704 * consolidation only to end up splitting again shortly thereafter. Since
705 * the btree should be very small compared to the space under management,
706 * our goal isn't so much to ensure that it always occupies the absolutely
707 * smallest possible number of pages as to reclaim pages before things get
708 * too egregiously out of hand.
710 if (btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
711 max
= FPM_ITEMS_PER_LEAF_PAGE
;
714 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
715 max
= FPM_ITEMS_PER_INTERNAL_PAGE
;
717 if (btp
->hdr
.nused
>= max
/ 3)
721 * If we can fit our right sibling's keys onto this page, consolidate.
723 np
= FreePageBtreeFindRightSibling(base
, btp
);
724 if (np
!= NULL
&& btp
->hdr
.nused
+ np
->hdr
.nused
<= max
)
726 if (btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
728 memcpy(&btp
->u
.leaf_key
[btp
->hdr
.nused
], &np
->u
.leaf_key
[0],
729 sizeof(FreePageBtreeLeafKey
) * np
->hdr
.nused
);
730 btp
->hdr
.nused
+= np
->hdr
.nused
;
734 memcpy(&btp
->u
.internal_key
[btp
->hdr
.nused
], &np
->u
.internal_key
[0],
735 sizeof(FreePageBtreeInternalKey
) * np
->hdr
.nused
);
736 btp
->hdr
.nused
+= np
->hdr
.nused
;
737 FreePageBtreeUpdateParentPointers(base
, btp
);
739 FreePageBtreeRemovePage(fpm
, np
);
744 * If we can fit our keys onto our left sibling's page, consolidate. In
745 * this case, we move our keys onto the other page rather than vice versa,
746 * to avoid having to adjust ancestor keys.
748 np
= FreePageBtreeFindLeftSibling(base
, btp
);
749 if (np
!= NULL
&& btp
->hdr
.nused
+ np
->hdr
.nused
<= max
)
751 if (btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
753 memcpy(&np
->u
.leaf_key
[np
->hdr
.nused
], &btp
->u
.leaf_key
[0],
754 sizeof(FreePageBtreeLeafKey
) * btp
->hdr
.nused
);
755 np
->hdr
.nused
+= btp
->hdr
.nused
;
759 memcpy(&np
->u
.internal_key
[np
->hdr
.nused
], &btp
->u
.internal_key
[0],
760 sizeof(FreePageBtreeInternalKey
) * btp
->hdr
.nused
);
761 np
->hdr
.nused
+= btp
->hdr
.nused
;
762 FreePageBtreeUpdateParentPointers(base
, np
);
764 FreePageBtreeRemovePage(fpm
, btp
);
770 * Find the passed page's left sibling; that is, the page at the same level
771 * of the tree whose keyspace immediately precedes ours.
773 static FreePageBtree
*
774 FreePageBtreeFindLeftSibling(char *base
, FreePageBtree
*btp
)
776 FreePageBtree
*p
= btp
;
779 /* Move up until we can move left. */
785 first_page
= FreePageBtreeFirstKey(p
);
786 p
= relptr_access(base
, p
->hdr
.parent
);
789 return NULL
; /* we were passed the rightmost page */
791 index
= FreePageBtreeSearchInternal(p
, first_page
);
794 Assert(p
->u
.internal_key
[index
].first_page
== first_page
);
795 p
= relptr_access(base
, p
->u
.internal_key
[index
- 1].child
);
805 Assert(p
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
806 p
= relptr_access(base
, p
->u
.internal_key
[p
->hdr
.nused
- 1].child
);
809 Assert(p
->hdr
.magic
== btp
->hdr
.magic
);
815 * Find the passed page's right sibling; that is, the page at the same level
816 * of the tree whose keyspace immediately follows ours.
818 static FreePageBtree
*
819 FreePageBtreeFindRightSibling(char *base
, FreePageBtree
*btp
)
821 FreePageBtree
*p
= btp
;
824 /* Move up until we can move right. */
830 first_page
= FreePageBtreeFirstKey(p
);
831 p
= relptr_access(base
, p
->hdr
.parent
);
834 return NULL
; /* we were passed the rightmost page */
836 index
= FreePageBtreeSearchInternal(p
, first_page
);
837 if (index
< p
->hdr
.nused
- 1)
839 Assert(p
->u
.internal_key
[index
].first_page
== first_page
);
840 p
= relptr_access(base
, p
->u
.internal_key
[index
+ 1].child
);
843 Assert(index
== p
->hdr
.nused
- 1);
850 Assert(p
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
851 p
= relptr_access(base
, p
->u
.internal_key
[0].child
);
854 Assert(p
->hdr
.magic
== btp
->hdr
.magic
);
860 * Get the first key on a btree page.
863 FreePageBtreeFirstKey(FreePageBtree
*btp
)
865 Assert(btp
->hdr
.nused
> 0);
867 if (btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
868 return btp
->u
.leaf_key
[0].first_page
;
871 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
872 return btp
->u
.internal_key
[0].first_page
;
877 * Get a page from the btree recycle list for use as a btree page.
879 static FreePageBtree
*
880 FreePageBtreeGetRecycled(FreePageManager
*fpm
)
882 char *base
= fpm_segment_base(fpm
);
883 FreePageSpanLeader
*victim
= relptr_access(base
, fpm
->btree_recycle
);
884 FreePageSpanLeader
*newhead
;
886 Assert(victim
!= NULL
);
887 newhead
= relptr_access(base
, victim
->next
);
889 relptr_copy(newhead
->prev
, victim
->prev
);
890 relptr_store(base
, fpm
->btree_recycle
, newhead
);
891 Assert(fpm_pointer_is_page_aligned(base
, victim
));
892 fpm
->btree_recycle_count
--;
893 return (FreePageBtree
*) victim
;
897 * Insert an item into an internal page.
900 FreePageBtreeInsertInternal(char *base
, FreePageBtree
*btp
, Size index
,
901 Size first_page
, FreePageBtree
*child
)
903 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
904 Assert(btp
->hdr
.nused
<= FPM_ITEMS_PER_INTERNAL_PAGE
);
905 Assert(index
<= btp
->hdr
.nused
);
906 memmove(&btp
->u
.internal_key
[index
+ 1], &btp
->u
.internal_key
[index
],
907 sizeof(FreePageBtreeInternalKey
) * (btp
->hdr
.nused
- index
));
908 btp
->u
.internal_key
[index
].first_page
= first_page
;
909 relptr_store(base
, btp
->u
.internal_key
[index
].child
, child
);
914 * Insert an item into a leaf page.
917 FreePageBtreeInsertLeaf(FreePageBtree
*btp
, Size index
, Size first_page
,
920 Assert(btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
);
921 Assert(btp
->hdr
.nused
<= FPM_ITEMS_PER_LEAF_PAGE
);
922 Assert(index
<= btp
->hdr
.nused
);
923 memmove(&btp
->u
.leaf_key
[index
+ 1], &btp
->u
.leaf_key
[index
],
924 sizeof(FreePageBtreeLeafKey
) * (btp
->hdr
.nused
- index
));
925 btp
->u
.leaf_key
[index
].first_page
= first_page
;
926 btp
->u
.leaf_key
[index
].npages
= npages
;
931 * Put a page on the btree recycle list.
934 FreePageBtreeRecycle(FreePageManager
*fpm
, Size pageno
)
936 char *base
= fpm_segment_base(fpm
);
937 FreePageSpanLeader
*head
= relptr_access(base
, fpm
->btree_recycle
);
938 FreePageSpanLeader
*span
;
940 span
= (FreePageSpanLeader
*) fpm_page_to_pointer(base
, pageno
);
941 span
->magic
= FREE_PAGE_SPAN_LEADER_MAGIC
;
943 relptr_store(base
, span
->next
, head
);
944 relptr_store(base
, span
->prev
, (FreePageSpanLeader
*) NULL
);
946 relptr_store(base
, head
->prev
, span
);
947 relptr_store(base
, fpm
->btree_recycle
, span
);
948 fpm
->btree_recycle_count
++;
952 * Remove an item from the btree at the given position on the given page.
955 FreePageBtreeRemove(FreePageManager
*fpm
, FreePageBtree
*btp
, Size index
)
957 Assert(btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
);
958 Assert(index
< btp
->hdr
.nused
);
960 /* When last item is removed, extirpate entire page from btree. */
961 if (btp
->hdr
.nused
== 1)
963 FreePageBtreeRemovePage(fpm
, btp
);
967 /* Physically remove the key from the page. */
969 if (index
< btp
->hdr
.nused
)
970 memmove(&btp
->u
.leaf_key
[index
], &btp
->u
.leaf_key
[index
+ 1],
971 sizeof(FreePageBtreeLeafKey
) * (btp
->hdr
.nused
- index
));
973 /* If we just removed the first key, adjust ancestor keys. */
975 FreePageBtreeAdjustAncestorKeys(fpm
, btp
);
977 /* Consider whether to consolidate this page with a sibling. */
978 FreePageBtreeConsolidate(fpm
, btp
);
982 * Remove a page from the btree. Caller is responsible for having relocated
983 * any keys from this page that are still wanted. The page is placed on the
987 FreePageBtreeRemovePage(FreePageManager
*fpm
, FreePageBtree
*btp
)
989 char *base
= fpm_segment_base(fpm
);
990 FreePageBtree
*parent
;
996 /* Find parent page. */
997 parent
= relptr_access(base
, btp
->hdr
.parent
);
1000 /* We are removing the root page. */
1001 relptr_store(base
, fpm
->btree_root
, (FreePageBtree
*) NULL
);
1002 fpm
->btree_depth
= 0;
1003 Assert(fpm
->singleton_first_page
== 0);
1004 Assert(fpm
->singleton_npages
== 0);
1009 * If the parent contains only one item, we need to remove it as well.
1011 if (parent
->hdr
.nused
> 1)
1013 FreePageBtreeRecycle(fpm
, fpm_pointer_to_page(base
, btp
));
1017 /* Find and remove the downlink. */
1018 first_page
= FreePageBtreeFirstKey(btp
);
1019 if (parent
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
1021 index
= FreePageBtreeSearchLeaf(parent
, first_page
);
1022 Assert(index
< parent
->hdr
.nused
);
1023 if (index
< parent
->hdr
.nused
- 1)
1024 memmove(&parent
->u
.leaf_key
[index
],
1025 &parent
->u
.leaf_key
[index
+ 1],
1026 sizeof(FreePageBtreeLeafKey
)
1027 * (parent
->hdr
.nused
- index
- 1));
1031 index
= FreePageBtreeSearchInternal(parent
, first_page
);
1032 Assert(index
< parent
->hdr
.nused
);
1033 if (index
< parent
->hdr
.nused
- 1)
1034 memmove(&parent
->u
.internal_key
[index
],
1035 &parent
->u
.internal_key
[index
+ 1],
1036 sizeof(FreePageBtreeInternalKey
)
1037 * (parent
->hdr
.nused
- index
- 1));
1039 parent
->hdr
.nused
--;
1040 Assert(parent
->hdr
.nused
> 0);
1042 /* Recycle the page. */
1043 FreePageBtreeRecycle(fpm
, fpm_pointer_to_page(base
, btp
));
1045 /* Adjust ancestor keys if needed. */
1047 FreePageBtreeAdjustAncestorKeys(fpm
, parent
);
1049 /* Consider whether to consolidate the parent with a sibling. */
1050 FreePageBtreeConsolidate(fpm
, parent
);
1054 * Search the btree for an entry for the given first page and initialize
1055 * *result with the results of the search. result->page and result->index
1056 * indicate either the position of an exact match or the position at which
1057 * the new key should be inserted. result->found is true for an exact match,
1058 * otherwise false. result->split_pages will contain the number of additional
1059 * btree pages that will be needed when performing a split to insert a key.
1060 * Except as described above, the contents of fields in the result object are
1061 * undefined on return.
1064 FreePageBtreeSearch(FreePageManager
*fpm
, Size first_page
,
1065 FreePageBtreeSearchResult
*result
)
1067 char *base
= fpm_segment_base(fpm
);
1068 FreePageBtree
*btp
= relptr_access(base
, fpm
->btree_root
);
1071 result
->split_pages
= 1;
1073 /* If the btree is empty, there's nothing to find. */
1076 result
->page
= NULL
;
1077 result
->found
= false;
1081 /* Descend until we hit a leaf. */
1082 while (btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
)
1084 FreePageBtree
*child
;
1087 index
= FreePageBtreeSearchInternal(btp
, first_page
);
1088 found_exact
= index
< btp
->hdr
.nused
&&
1089 btp
->u
.internal_key
[index
].first_page
== first_page
;
1092 * If we found an exact match we descend directly. Otherwise, we
1093 * descend into the child to the left if possible so that we can find
1094 * the insertion point at that child's high end.
1096 if (!found_exact
&& index
> 0)
1099 /* Track required split depth for leaf insert. */
1100 if (btp
->hdr
.nused
>= FPM_ITEMS_PER_INTERNAL_PAGE
)
1102 Assert(btp
->hdr
.nused
== FPM_ITEMS_PER_INTERNAL_PAGE
);
1103 result
->split_pages
++;
1106 result
->split_pages
= 0;
1108 /* Descend to appropriate child page. */
1109 Assert(index
< btp
->hdr
.nused
);
1110 child
= relptr_access(base
, btp
->u
.internal_key
[index
].child
);
1111 Assert(relptr_access(base
, child
->hdr
.parent
) == btp
);
1115 /* Track required split depth for leaf insert. */
1116 if (btp
->hdr
.nused
>= FPM_ITEMS_PER_LEAF_PAGE
)
1118 Assert(btp
->hdr
.nused
== FPM_ITEMS_PER_INTERNAL_PAGE
);
1119 result
->split_pages
++;
1122 result
->split_pages
= 0;
1124 /* Search leaf page. */
1125 index
= FreePageBtreeSearchLeaf(btp
, first_page
);
1127 /* Assemble results. */
1129 result
->index
= index
;
1130 result
->found
= index
< btp
->hdr
.nused
&&
1131 first_page
== btp
->u
.leaf_key
[index
].first_page
;
1135 * Search an internal page for the first key greater than or equal to a given
1136 * page number. Returns the index of that key, or one greater than the number
1137 * of keys on the page if none.
1140 FreePageBtreeSearchInternal(FreePageBtree
*btp
, Size first_page
)
1143 Size high
= btp
->hdr
.nused
;
1145 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
1146 Assert(high
> 0 && high
<= FPM_ITEMS_PER_INTERNAL_PAGE
);
1150 Size mid
= (low
+ high
) / 2;
1151 Size val
= btp
->u
.internal_key
[mid
].first_page
;
1153 if (first_page
== val
)
1155 else if (first_page
< val
)
1165 * Search a leaf page for the first key greater than or equal to a given
1166 * page number. Returns the index of that key, or one greater than the number
1167 * of keys on the page if none.
1170 FreePageBtreeSearchLeaf(FreePageBtree
*btp
, Size first_page
)
1173 Size high
= btp
->hdr
.nused
;
1175 Assert(btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
);
1176 Assert(high
> 0 && high
<= FPM_ITEMS_PER_LEAF_PAGE
);
1180 Size mid
= (low
+ high
) / 2;
1181 Size val
= btp
->u
.leaf_key
[mid
].first_page
;
1183 if (first_page
== val
)
1185 else if (first_page
< val
)
1195 * Allocate a new btree page and move half the keys from the provided page
1196 * to the new page. Caller is responsible for making sure that there's a
1197 * page available from fpm->btree_recycle. Returns a pointer to the new page,
1198 * to which caller must add a downlink.
1200 static FreePageBtree
*
1201 FreePageBtreeSplitPage(FreePageManager
*fpm
, FreePageBtree
*btp
)
1203 FreePageBtree
*newsibling
;
1205 newsibling
= FreePageBtreeGetRecycled(fpm
);
1206 newsibling
->hdr
.magic
= btp
->hdr
.magic
;
1207 newsibling
->hdr
.nused
= btp
->hdr
.nused
/ 2;
1208 relptr_copy(newsibling
->hdr
.parent
, btp
->hdr
.parent
);
1209 btp
->hdr
.nused
-= newsibling
->hdr
.nused
;
1211 if (btp
->hdr
.magic
== FREE_PAGE_LEAF_MAGIC
)
1212 memcpy(&newsibling
->u
.leaf_key
,
1213 &btp
->u
.leaf_key
[btp
->hdr
.nused
],
1214 sizeof(FreePageBtreeLeafKey
) * newsibling
->hdr
.nused
);
1217 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
1218 memcpy(&newsibling
->u
.internal_key
,
1219 &btp
->u
.internal_key
[btp
->hdr
.nused
],
1220 sizeof(FreePageBtreeInternalKey
) * newsibling
->hdr
.nused
);
1221 FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm
), newsibling
);
1228 * When internal pages are split or merged, the parent pointers of their
1229 * children must be updated.
1232 FreePageBtreeUpdateParentPointers(char *base
, FreePageBtree
*btp
)
1236 Assert(btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
);
1237 for (i
= 0; i
< btp
->hdr
.nused
; ++i
)
1239 FreePageBtree
*child
;
1241 child
= relptr_access(base
, btp
->u
.internal_key
[i
].child
);
1242 relptr_store(base
, child
->hdr
.parent
, btp
);
1247 * Debugging dump of btree data.
1250 FreePageManagerDumpBtree(FreePageManager
*fpm
, FreePageBtree
*btp
,
1251 FreePageBtree
*parent
, int level
, StringInfo buf
)
1253 char *base
= fpm_segment_base(fpm
);
1254 Size pageno
= fpm_pointer_to_page(base
, btp
);
1256 FreePageBtree
*check_parent
;
1258 check_stack_depth();
1259 check_parent
= relptr_access(base
, btp
->hdr
.parent
);
1260 appendStringInfo(buf
, " %zu@%d %c", pageno
, level
,
1261 btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
? 'i' : 'l');
1262 if (parent
!= check_parent
)
1263 appendStringInfo(buf
, " [actual parent %zu, expected %zu]",
1264 fpm_pointer_to_page(base
, check_parent
),
1265 fpm_pointer_to_page(base
, parent
));
1266 appendStringInfoChar(buf
, ':');
1267 for (index
= 0; index
< btp
->hdr
.nused
; ++index
)
1269 if (btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
)
1270 appendStringInfo(buf
, " %zu->%zu",
1271 btp
->u
.internal_key
[index
].first_page
,
1272 btp
->u
.internal_key
[index
].child
.relptr_off
/ FPM_PAGE_SIZE
);
1274 appendStringInfo(buf
, " %zu(%zu)",
1275 btp
->u
.leaf_key
[index
].first_page
,
1276 btp
->u
.leaf_key
[index
].npages
);
1278 appendStringInfoChar(buf
, '\n');
1280 if (btp
->hdr
.magic
== FREE_PAGE_INTERNAL_MAGIC
)
1282 for (index
= 0; index
< btp
->hdr
.nused
; ++index
)
1284 FreePageBtree
*child
;
1286 child
= relptr_access(base
, btp
->u
.internal_key
[index
].child
);
1287 FreePageManagerDumpBtree(fpm
, child
, btp
, level
+ 1, buf
);
1293 * Debugging dump of free-span data.
1296 FreePageManagerDumpSpans(FreePageManager
*fpm
, FreePageSpanLeader
*span
,
1297 Size expected_pages
, StringInfo buf
)
1299 char *base
= fpm_segment_base(fpm
);
1301 while (span
!= NULL
)
1303 if (span
->npages
!= expected_pages
)
1304 appendStringInfo(buf
, " %zu(%zu)", fpm_pointer_to_page(base
, span
),
1307 appendStringInfo(buf
, " %zu", fpm_pointer_to_page(base
, span
));
1308 span
= relptr_access(base
, span
->next
);
1311 appendStringInfoChar(buf
, '\n');
1315 * This function allocates a run of pages of the given length from the free
1319 FreePageManagerGetInternal(FreePageManager
*fpm
, Size npages
, Size
*first_page
)
1321 char *base
= fpm_segment_base(fpm
);
1322 FreePageSpanLeader
*victim
= NULL
;
1323 FreePageSpanLeader
*prev
;
1324 FreePageSpanLeader
*next
;
1325 FreePageBtreeSearchResult result
;
1326 Size victim_page
= 0; /* placate compiler */
1330 * Search for a free span.
1332 * Right now, we use a simple best-fit policy here, but it's possible for
1333 * this to result in memory fragmentation if we're repeatedly asked to
1334 * allocate chunks just a little smaller than what we have available.
1335 * Hopefully, this is unlikely, because we expect most requests to be
1336 * single pages or superblock-sized chunks -- but no policy can be optimal
1337 * under all circumstances unless it has knowledge of future allocation
1340 for (f
= Min(npages
, FPM_NUM_FREELISTS
) - 1; f
< FPM_NUM_FREELISTS
; ++f
)
1342 /* Skip empty freelists. */
1343 if (relptr_is_null(fpm
->freelist
[f
]))
1347 * All of the freelists except the last one contain only items of a
1348 * single size, so we just take the first one. But the final free
1349 * list contains everything too big for any of the other lists, so we
1350 * need to search the list.
1352 if (f
< FPM_NUM_FREELISTS
- 1)
1353 victim
= relptr_access(base
, fpm
->freelist
[f
]);
1356 FreePageSpanLeader
*candidate
;
1358 candidate
= relptr_access(base
, fpm
->freelist
[f
]);
1361 if (candidate
->npages
>= npages
&& (victim
== NULL
||
1362 victim
->npages
> candidate
->npages
))
1365 if (victim
->npages
== npages
)
1368 candidate
= relptr_access(base
, candidate
->next
);
1369 } while (candidate
!= NULL
);
1374 /* If we didn't find an allocatable span, return failure. */
1378 /* Remove span from free list. */
1379 Assert(victim
->magic
== FREE_PAGE_SPAN_LEADER_MAGIC
);
1380 prev
= relptr_access(base
, victim
->prev
);
1381 next
= relptr_access(base
, victim
->next
);
1383 relptr_copy(prev
->next
, victim
->next
);
1385 relptr_copy(fpm
->freelist
[f
], victim
->next
);
1387 relptr_copy(next
->prev
, victim
->prev
);
1388 victim_page
= fpm_pointer_to_page(base
, victim
);
1390 /* Decide whether we might be invalidating contiguous_pages. */
1391 if (f
== FPM_NUM_FREELISTS
- 1 &&
1392 victim
->npages
== fpm
->contiguous_pages
)
1395 * The victim span came from the oversized freelist, and had the same
1396 * size as the longest span. There may or may not be another one of
1397 * the same size, so contiguous_pages must be recomputed just to be
1400 fpm
->contiguous_pages_dirty
= true;
1402 else if (f
+ 1 == fpm
->contiguous_pages
&&
1403 relptr_is_null(fpm
->freelist
[f
]))
1406 * The victim span came from a fixed sized freelist, and it was the
1407 * list for spans of the same size as the current longest span, and
1408 * the list is now empty after removing the victim. So
1409 * contiguous_pages must be recomputed without a doubt.
1411 fpm
->contiguous_pages_dirty
= true;
1415 * If we haven't initialized the btree yet, the victim must be the single
1416 * span stored within the FreePageManager itself. Otherwise, we need to
1419 if (relptr_is_null(fpm
->btree_root
))
1421 Assert(victim_page
== fpm
->singleton_first_page
);
1422 Assert(victim
->npages
== fpm
->singleton_npages
);
1423 Assert(victim
->npages
>= npages
);
1424 fpm
->singleton_first_page
+= npages
;
1425 fpm
->singleton_npages
-= npages
;
1426 if (fpm
->singleton_npages
> 0)
1427 FreePagePushSpanLeader(fpm
, fpm
->singleton_first_page
,
1428 fpm
->singleton_npages
);
1433 * If the span we found is exactly the right size, remove it from the
1434 * btree completely. Otherwise, adjust the btree entry to reflect the
1435 * still-unallocated portion of the span, and put that portion on the
1436 * appropriate free list.
1438 FreePageBtreeSearch(fpm
, victim_page
, &result
);
1439 Assert(result
.found
);
1440 if (victim
->npages
== npages
)
1441 FreePageBtreeRemove(fpm
, result
.page
, result
.index
);
1444 FreePageBtreeLeafKey
*key
;
1446 /* Adjust btree to reflect remaining pages. */
1447 Assert(victim
->npages
> npages
);
1448 key
= &result
.page
->u
.leaf_key
[result
.index
];
1449 Assert(key
->npages
== victim
->npages
);
1450 key
->first_page
+= npages
;
1451 key
->npages
-= npages
;
1452 if (result
.index
== 0)
1453 FreePageBtreeAdjustAncestorKeys(fpm
, result
.page
);
1455 /* Put the unallocated pages back on the appropriate free list. */
1456 FreePagePushSpanLeader(fpm
, victim_page
+ npages
,
1457 victim
->npages
- npages
);
1461 /* Return results to caller. */
1462 *first_page
= fpm_pointer_to_page(base
, victim
);
1467 * Put a range of pages into the btree and freelists, consolidating it with
1468 * existing free spans just before and/or after it. If 'soft' is true,
1469 * only perform the insertion if it can be done without allocating new btree
1470 * pages; if false, do it always. Returns 0 if the soft flag caused the
1471 * insertion to be skipped, or otherwise the size of the contiguous span
1472 * created by the insertion. This may be larger than npages if we're able
1473 * to consolidate with an adjacent range.
1476 FreePageManagerPutInternal(FreePageManager
*fpm
, Size first_page
, Size npages
,
1479 char *base
= fpm_segment_base(fpm
);
1480 FreePageBtreeSearchResult result
;
1481 FreePageBtreeLeafKey
*prevkey
= NULL
;
1482 FreePageBtreeLeafKey
*nextkey
= NULL
;
1488 /* We can store a single free span without initializing the btree. */
1489 if (fpm
->btree_depth
== 0)
1491 if (fpm
->singleton_npages
== 0)
1493 /* Don't have a span yet; store this one. */
1494 fpm
->singleton_first_page
= first_page
;
1495 fpm
->singleton_npages
= npages
;
1496 FreePagePushSpanLeader(fpm
, first_page
, npages
);
1497 return fpm
->singleton_npages
;
1499 else if (fpm
->singleton_first_page
+ fpm
->singleton_npages
==
1502 /* New span immediately follows sole existing span. */
1503 fpm
->singleton_npages
+= npages
;
1504 FreePagePopSpanLeader(fpm
, fpm
->singleton_first_page
);
1505 FreePagePushSpanLeader(fpm
, fpm
->singleton_first_page
,
1506 fpm
->singleton_npages
);
1507 return fpm
->singleton_npages
;
1509 else if (first_page
+ npages
== fpm
->singleton_first_page
)
1511 /* New span immediately precedes sole existing span. */
1512 FreePagePopSpanLeader(fpm
, fpm
->singleton_first_page
);
1513 fpm
->singleton_first_page
= first_page
;
1514 fpm
->singleton_npages
+= npages
;
1515 FreePagePushSpanLeader(fpm
, fpm
->singleton_first_page
,
1516 fpm
->singleton_npages
);
1517 return fpm
->singleton_npages
;
1521 /* Not contiguous; we need to initialize the btree. */
1523 FreePageBtree
*root
;
1525 if (!relptr_is_null(fpm
->btree_recycle
))
1526 root
= FreePageBtreeGetRecycled(fpm
);
1528 return 0; /* Should not allocate if soft. */
1529 else if (FreePageManagerGetInternal(fpm
, 1, &root_page
))
1530 root
= (FreePageBtree
*) fpm_page_to_pointer(base
, root_page
);
1533 /* We'd better be able to get a page from the existing range. */
1534 elog(FATAL
, "free page manager btree is corrupt");
1537 /* Create the btree and move the preexisting range into it. */
1538 root
->hdr
.magic
= FREE_PAGE_LEAF_MAGIC
;
1539 root
->hdr
.nused
= 1;
1540 relptr_store(base
, root
->hdr
.parent
, (FreePageBtree
*) NULL
);
1541 root
->u
.leaf_key
[0].first_page
= fpm
->singleton_first_page
;
1542 root
->u
.leaf_key
[0].npages
= fpm
->singleton_npages
;
1543 relptr_store(base
, fpm
->btree_root
, root
);
1544 fpm
->singleton_first_page
= 0;
1545 fpm
->singleton_npages
= 0;
1546 fpm
->btree_depth
= 1;
1549 * Corner case: it may be that the btree root took the very last
1550 * free page. In that case, the sole btree entry covers a zero
1551 * page run, which is invalid. Overwrite it with the entry we're
1552 * trying to insert and get out.
1554 if (root
->u
.leaf_key
[0].npages
== 0)
1556 root
->u
.leaf_key
[0].first_page
= first_page
;
1557 root
->u
.leaf_key
[0].npages
= npages
;
1558 FreePagePushSpanLeader(fpm
, first_page
, npages
);
1562 /* Fall through to insert the new key. */
1566 /* Search the btree. */
1567 FreePageBtreeSearch(fpm
, first_page
, &result
);
1568 Assert(!result
.found
);
1569 if (result
.index
> 0)
1570 prevkey
= &result
.page
->u
.leaf_key
[result
.index
- 1];
1571 if (result
.index
< result
.page
->hdr
.nused
)
1574 nindex
= result
.index
;
1575 nextkey
= &result
.page
->u
.leaf_key
[result
.index
];
1579 np
= FreePageBtreeFindRightSibling(base
, result
.page
);
1582 nextkey
= &np
->u
.leaf_key
[0];
1585 /* Consolidate with the previous entry if possible. */
1586 if (prevkey
!= NULL
&& prevkey
->first_page
+ prevkey
->npages
>= first_page
)
1588 bool remove_next
= false;
1591 Assert(prevkey
->first_page
+ prevkey
->npages
== first_page
);
1592 prevkey
->npages
= (first_page
- prevkey
->first_page
) + npages
;
1594 /* Check whether we can *also* consolidate with the following entry. */
1595 if (nextkey
!= NULL
&&
1596 prevkey
->first_page
+ prevkey
->npages
>= nextkey
->first_page
)
1598 Assert(prevkey
->first_page
+ prevkey
->npages
==
1599 nextkey
->first_page
);
1600 prevkey
->npages
= (nextkey
->first_page
- prevkey
->first_page
)
1602 FreePagePopSpanLeader(fpm
, nextkey
->first_page
);
1606 /* Put the span on the correct freelist and save size. */
1607 FreePagePopSpanLeader(fpm
, prevkey
->first_page
);
1608 FreePagePushSpanLeader(fpm
, prevkey
->first_page
, prevkey
->npages
);
1609 result
= prevkey
->npages
;
1612 * If we consolidated with both the preceding and following entries,
1613 * we must remove the following entry. We do this last, because
1614 * removing an element from the btree may invalidate pointers we hold
1615 * into the current data structure.
1617 * NB: The btree is technically in an invalid state a this point
1618 * because we've already updated prevkey to cover the same key space
1619 * as nextkey. FreePageBtreeRemove() shouldn't notice that, though.
1622 FreePageBtreeRemove(fpm
, np
, nindex
);
1627 /* Consolidate with the next entry if possible. */
1628 if (nextkey
!= NULL
&& first_page
+ npages
>= nextkey
->first_page
)
1632 /* Compute new size for span. */
1633 Assert(first_page
+ npages
== nextkey
->first_page
);
1634 newpages
= (nextkey
->first_page
- first_page
) + nextkey
->npages
;
1636 /* Put span on correct free list. */
1637 FreePagePopSpanLeader(fpm
, nextkey
->first_page
);
1638 FreePagePushSpanLeader(fpm
, first_page
, newpages
);
1640 /* Update key in place. */
1641 nextkey
->first_page
= first_page
;
1642 nextkey
->npages
= newpages
;
1644 /* If reducing first key on page, ancestors might need adjustment. */
1646 FreePageBtreeAdjustAncestorKeys(fpm
, np
);
1648 return nextkey
->npages
;
1651 /* Split leaf page and as many of its ancestors as necessary. */
1652 if (result
.split_pages
> 0)
1655 * NB: We could consider various coping strategies here to avoid a
1656 * split; most obviously, if np != result.page, we could target that
1657 * page instead. More complicated shuffling strategies could be
1658 * possible as well; basically, unless every single leaf page is 100%
1659 * full, we can jam this key in there if we try hard enough. It's
1660 * unlikely that trying that hard is worthwhile, but it's possible we
1661 * might need to make more than no effort. For now, we just do the
1662 * easy thing, which is nothing.
1665 /* If this is a soft insert, it's time to give up. */
1669 /* Check whether we need to allocate more btree pages to split. */
1670 if (result
.split_pages
> fpm
->btree_recycle_count
)
1677 * Allocate the required number of pages and split each one in
1678 * turn. This should never fail, because if we've got enough
1679 * spans of free pages kicking around that we need additional
1680 * storage space just to remember them all, then we should
1681 * certainly have enough to expand the btree, which should only
1682 * ever use a tiny number of pages compared to the number under
1683 * management. If it does, something's badly screwed up.
1685 pages_needed
= result
.split_pages
- fpm
->btree_recycle_count
;
1686 for (i
= 0; i
< pages_needed
; ++i
)
1688 if (!FreePageManagerGetInternal(fpm
, 1, &recycle_page
))
1689 elog(FATAL
, "free page manager btree is corrupt");
1690 FreePageBtreeRecycle(fpm
, recycle_page
);
1694 * The act of allocating pages to recycle may have invalidated the
1695 * results of our previous btree research, so repeat it. (We could
1696 * recheck whether any of our split-avoidance strategies that were
1697 * not viable before now are, but it hardly seems worthwhile, so
1698 * we don't bother. Consolidation can't be possible now if it
1699 * wasn't previously.)
1701 FreePageBtreeSearch(fpm
, first_page
, &result
);
1704 * The act of allocating pages for use in constructing our btree
1705 * should never cause any page to become more full, so the new
1706 * split depth should be no greater than the old one, and perhaps
1707 * less if we fortuitously allocated a chunk that freed up a slot
1708 * on the page we need to update.
1710 Assert(result
.split_pages
<= fpm
->btree_recycle_count
);
1713 /* If we still need to perform a split, do it. */
1714 if (result
.split_pages
> 0)
1716 FreePageBtree
*split_target
= result
.page
;
1717 FreePageBtree
*child
= NULL
;
1718 Size key
= first_page
;
1722 FreePageBtree
*newsibling
;
1723 FreePageBtree
*parent
;
1725 /* Identify parent page, which must receive downlink. */
1726 parent
= relptr_access(base
, split_target
->hdr
.parent
);
1728 /* Split the page - downlink not added yet. */
1729 newsibling
= FreePageBtreeSplitPage(fpm
, split_target
);
1732 * At this point in the loop, we're always carrying a pending
1733 * insertion. On the first pass, it's the actual key we're
1734 * trying to insert; on subsequent passes, it's the downlink
1735 * that needs to be added as a result of the split performed
1736 * during the previous loop iteration. Since we've just split
1737 * the page, there's definitely room on one of the two
1743 FreePageBtree
*insert_into
;
1745 insert_into
= key
< newsibling
->u
.leaf_key
[0].first_page
?
1746 split_target
: newsibling
;
1747 index
= FreePageBtreeSearchLeaf(insert_into
, key
);
1748 FreePageBtreeInsertLeaf(insert_into
, index
, key
, npages
);
1749 if (index
== 0 && insert_into
== split_target
)
1750 FreePageBtreeAdjustAncestorKeys(fpm
, split_target
);
1755 FreePageBtree
*insert_into
;
1758 key
< newsibling
->u
.internal_key
[0].first_page
?
1759 split_target
: newsibling
;
1760 index
= FreePageBtreeSearchInternal(insert_into
, key
);
1761 FreePageBtreeInsertInternal(base
, insert_into
, index
,
1763 relptr_store(base
, child
->hdr
.parent
, insert_into
);
1764 if (index
== 0 && insert_into
== split_target
)
1765 FreePageBtreeAdjustAncestorKeys(fpm
, split_target
);
1768 /* If the page we just split has no parent, split the root. */
1771 FreePageBtree
*newroot
;
1773 newroot
= FreePageBtreeGetRecycled(fpm
);
1774 newroot
->hdr
.magic
= FREE_PAGE_INTERNAL_MAGIC
;
1775 newroot
->hdr
.nused
= 2;
1776 relptr_store(base
, newroot
->hdr
.parent
,
1777 (FreePageBtree
*) NULL
);
1778 newroot
->u
.internal_key
[0].first_page
=
1779 FreePageBtreeFirstKey(split_target
);
1780 relptr_store(base
, newroot
->u
.internal_key
[0].child
,
1782 relptr_store(base
, split_target
->hdr
.parent
, newroot
);
1783 newroot
->u
.internal_key
[1].first_page
=
1784 FreePageBtreeFirstKey(newsibling
);
1785 relptr_store(base
, newroot
->u
.internal_key
[1].child
,
1787 relptr_store(base
, newsibling
->hdr
.parent
, newroot
);
1788 relptr_store(base
, fpm
->btree_root
, newroot
);
1794 /* If the parent page isn't full, insert the downlink. */
1795 key
= newsibling
->u
.internal_key
[0].first_page
;
1796 if (parent
->hdr
.nused
< FPM_ITEMS_PER_INTERNAL_PAGE
)
1800 index
= FreePageBtreeSearchInternal(parent
, key
);
1801 FreePageBtreeInsertInternal(base
, parent
, index
,
1803 relptr_store(base
, newsibling
->hdr
.parent
, parent
);
1805 FreePageBtreeAdjustAncestorKeys(fpm
, parent
);
1809 /* The parent also needs to be split, so loop around. */
1811 split_target
= parent
;
1815 * The loop above did the insert, so just need to update the free
1816 * list, and we're done.
1818 FreePagePushSpanLeader(fpm
, first_page
, npages
);
1824 /* Physically add the key to the page. */
1825 Assert(result
.page
->hdr
.nused
< FPM_ITEMS_PER_LEAF_PAGE
);
1826 FreePageBtreeInsertLeaf(result
.page
, result
.index
, first_page
, npages
);
1828 /* If new first key on page, ancestors might need adjustment. */
1829 if (result
.index
== 0)
1830 FreePageBtreeAdjustAncestorKeys(fpm
, result
.page
);
1832 /* Put it on the free list. */
1833 FreePagePushSpanLeader(fpm
, first_page
, npages
);
1839 * Remove a FreePageSpanLeader from the linked-list that contains it, either
1840 * because we're changing the size of the span, or because we're allocating it.
1843 FreePagePopSpanLeader(FreePageManager
*fpm
, Size pageno
)
1845 char *base
= fpm_segment_base(fpm
);
1846 FreePageSpanLeader
*span
;
1847 FreePageSpanLeader
*next
;
1848 FreePageSpanLeader
*prev
;
1850 span
= (FreePageSpanLeader
*) fpm_page_to_pointer(base
, pageno
);
1852 next
= relptr_access(base
, span
->next
);
1853 prev
= relptr_access(base
, span
->prev
);
1855 relptr_copy(next
->prev
, span
->prev
);
1857 relptr_copy(prev
->next
, span
->next
);
1860 Size f
= Min(span
->npages
, FPM_NUM_FREELISTS
) - 1;
1862 Assert(fpm
->freelist
[f
].relptr_off
== pageno
* FPM_PAGE_SIZE
);
1863 relptr_copy(fpm
->freelist
[f
], span
->next
);
1868 * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
1871 FreePagePushSpanLeader(FreePageManager
*fpm
, Size first_page
, Size npages
)
1873 char *base
= fpm_segment_base(fpm
);
1874 Size f
= Min(npages
, FPM_NUM_FREELISTS
) - 1;
1875 FreePageSpanLeader
*head
= relptr_access(base
, fpm
->freelist
[f
]);
1876 FreePageSpanLeader
*span
;
1878 span
= (FreePageSpanLeader
*) fpm_page_to_pointer(base
, first_page
);
1879 span
->magic
= FREE_PAGE_SPAN_LEADER_MAGIC
;
1880 span
->npages
= npages
;
1881 relptr_store(base
, span
->next
, head
);
1882 relptr_store(base
, span
->prev
, (FreePageSpanLeader
*) NULL
);
1884 relptr_store(base
, head
->prev
, span
);
1885 relptr_store(base
, fpm
->freelist
[f
], span
);