1 /*-------------------------------------------------------------------------
3 * dsa.c
4 * Dynamic shared memory areas.
6 * This module provides dynamic shared memory areas which are built on top of
 7  * DSM segments. While dsm.c allows segments of shared memory to be
8 * created and shared between backends, it isn't designed to deal with small
9 * objects. A DSA area is a shared memory heap usually backed by one or more
10 * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11 * Alternatively, it can be created in pre-existing shared memory, including a
12 * DSM segment, and then create extra DSM segments as required. Unlike the
13 * regular system heap, it deals in pseudo-pointers which must be converted to
14 * backend-local pointers before they are dereferenced. These pseudo-pointers
15 * can however be shared with other backends, and can be used to construct
16 * shared data structures.
18 * Each DSA area manages a set of DSM segments, adding new segments as
19 * required and detaching them when they are no longer needed. Each segment
20 * contains a number of 4KB pages, a free page manager for tracking
21 * consecutive runs of free pages, and a page map for tracking the source of
22 * objects allocated on each page. Allocation requests above 8KB are handled
23 * by choosing a segment and finding consecutive free pages in its free page
24 * manager. Allocation requests for smaller sizes are handled using pools of
25 * objects of a selection of sizes. Each pool consists of a number of 16 page
26 * (64KB) superblocks allocated in the same way as large objects. Allocation
27 * of large objects and new superblocks is serialized by a single LWLock, but
28 * allocation of small objects from pre-existing superblocks uses one LWLock
29 * per pool. Currently there is one pool, and therefore one lock, per size
30 * class. Per-core pools to increase concurrency and strategies for reducing
31 * the resulting fragmentation are areas for future research. Each superblock
32 * is managed with a 'span', which tracks the superblock's freelist. Free
33 * requests are handled by looking in the page map to find which span an
34 * address was allocated from, so that small objects can be returned to the
35 * appropriate free list, and large object pages can be returned directly to
36 * the free page map. When allocating, simple heuristics for selecting
37 * segments and superblocks try to encourage occupied memory to be
38 * concentrated, increasing the likelihood that whole superblocks can become
39 * empty and be returned to the free page manager, and whole segments can
40 * become empty and be returned to the operating system.
42 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
43 * Portions Copyright (c) 1994, Regents of the University of California
45 * IDENTIFICATION
46 * src/backend/utils/mmgr/dsa.c
48 *-------------------------------------------------------------------------
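/*
 * A minimal usage sketch (the shared struct 'shared', 'my_tranche_id' and
 * the WorkItem type are assumptions made for this example, not part of
 * this module):
 *
 * In the backend that creates the area:
 *
 *		dsa_area   *area = dsa_create(my_tranche_id);
 *		dsa_pointer dp;
 *
 *		shared->area_handle = dsa_get_handle(area);
 *		dp = dsa_allocate(area, sizeof(WorkItem));
 *		((WorkItem *) dsa_get_address(area, dp))->id = 42;
 *		shared->first_item = dp;
 *
 * In any other backend:
 *
 *		dsa_area   *area = dsa_attach(shared->area_handle);
 *		WorkItem   *item;
 *
 *		item = (WorkItem *) dsa_get_address(area, shared->first_item);
 *		...
 *		dsa_free(area, shared->first_item);
 *		dsa_detach(area);
 */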
51 #include "postgres.h"
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/ipc.h"
57 #include "storage/lwlock.h"
58 #include "storage/shmem.h"
59 #include "utils/dsa.h"
60 #include "utils/freepage.h"
61 #include "utils/memutils.h"
62 #include "utils/resowner.h"
65 * The size of the initial DSM segment that backs a dsa_area created by
66 * dsa_create. After creating some number of segments of this size we'll
67 * double this size, and so on. Larger segments may be created if necessary
68 * to satisfy large requests.
70 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
73 * How many segments to create before we double the segment size. If this is
74 * low, then there is likely to be a lot of wasted space in the largest
75 * segment. If it is high, then we risk running out of segment slots (see
76 * dsm.c's limits on total number of segments), or limiting the total size
77 * an area can manage when using small pointers.
79 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
82 * The number of bits used to represent the offset part of a dsa_pointer.
83 * This controls the maximum size of a segment, the maximum possible
84 * allocation size and also the maximum number of segments per area.
86 #if SIZEOF_DSA_POINTER == 4
87 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
88 #else
89 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
90 #endif
93 * The maximum number of DSM segments that an area can own, determined by
94 * the number of bits remaining (but capped at 1024).
96 #define DSA_MAX_SEGMENTS \
97 Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
99 /* The bitmask for extracting the offset from a dsa_pointer. */
100 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
102 /* The maximum size of a DSM segment. */
103 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
105 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
106 #define DSA_PAGES_PER_SUPERBLOCK 16
109 * A magic number used as a sanity check for following DSM segments belonging
110 * to a DSA area (this number will be XORed with the area handle and
111 * the segment index).
113 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
115 /* Build a dsa_pointer given a segment number and offset. */
116 #define DSA_MAKE_POINTER(segment_number, offset) \
117 (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
119 /* Extract the segment number from a dsa_pointer. */
120 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
122 /* Extract the offset from a dsa_pointer. */
123 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
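/*
 * Example of the encoding performed by the three macros above: on a 64-bit
 * build (DSA_OFFSET_WIDTH == 40), a pointer to byte offset 0x1234 within
 * segment 3 looks like this:
 *
 *		DSA_MAKE_POINTER(3, 0x1234)						== 0x0000030000001234
 *		DSA_EXTRACT_SEGMENT_NUMBER(0x0000030000001234)	== 3
 *		DSA_EXTRACT_OFFSET(0x0000030000001234)			== 0x1234
 */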
 125 /* The type used for segment indexes (zero based). */
126 typedef size_t dsa_segment_index;
128 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
129 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
132 * How many bins of segments do we have? The bins are used to categorize
133 * segments by their largest contiguous run of free pages.
135 #define DSA_NUM_SEGMENT_BINS 16
138 * What is the lowest bin that holds segments that *might* have n contiguous
139 * free pages? There is no point in looking in segments in lower bins; they
140 * definitely can't service a request for n free pages.
142 static inline size_t
143 contiguous_pages_to_segment_bin(size_t n)
145 size_t bin;
147 if (n == 0)
148 bin = 0;
149 else
150 bin = pg_leftmost_one_pos_size_t(n) + 1;
152 return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
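/*
 * Some concrete values, as a worked example of the mapping above:
 *
 *		n = 0		-> bin 0	(no contiguous free pages at all)
 *		n = 1		-> bin 1
 *		n = 15		-> bin 4	(pg_leftmost_one_pos_size_t(15) == 3)
 *		n = 16		-> bin 5
 *		n >= 16384	-> bin 15	(capped at DSA_NUM_SEGMENT_BINS - 1)
 *
 * In general, bin i (for 0 < i < DSA_NUM_SEGMENT_BINS - 1) holds segments
 * whose largest run of contiguous free pages lies in [2^(i-1), 2^i).
 */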
155 /* Macros for access to locks. */
156 #define DSA_AREA_LOCK(area) (&area->control->lock)
157 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
160 * The header for an individual segment. This lives at the start of each DSM
161 * segment owned by a DSA area including the first segment (where it appears
162 * as part of the dsa_area_control struct).
164 typedef struct
166 /* Sanity check magic value. */
167 uint32 magic;
168 /* Total number of pages in this segment (excluding metadata area). */
169 size_t usable_pages;
170 /* Total size of this segment in bytes. */
171 size_t size;
174 * Index of the segment that precedes this one in the same segment bin, or
175 * DSA_SEGMENT_INDEX_NONE if this is the first one.
177 dsa_segment_index prev;
180 * Index of the segment that follows this one in the same segment bin, or
181 * DSA_SEGMENT_INDEX_NONE if this is the last one.
183 dsa_segment_index next;
184 /* The index of the bin that contains this segment. */
185 size_t bin;
188 * A flag raised to indicate that this segment is being returned to the
189 * operating system and has been unpinned.
191 bool freed;
192 } dsa_segment_header;
195 * Metadata for one superblock.
197 * For most blocks, span objects are stored out-of-line; that is, the span
198 * object is not stored within the block itself. But, as an exception, for a
199 * "span of spans", the span object is stored "inline". The allocation is
200 * always exactly one page, and the dsa_area_span object is located at
201 * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
202 * and the remaining fields are used just as they would be in an ordinary
203 * block. We can't allocate spans out of ordinary superblocks because
204 * creating an ordinary superblock requires us to be able to allocate a span
205 * *first*. Doing it this way avoids that circularity.
207 typedef struct
209 dsa_pointer pool; /* Containing pool. */
210 dsa_pointer prevspan; /* Previous span. */
211 dsa_pointer nextspan; /* Next span. */
212 dsa_pointer start; /* Starting address. */
213 size_t npages; /* Length of span in pages. */
214 uint16 size_class; /* Size class. */
215 uint16 ninitialized; /* Maximum number of objects ever allocated. */
216 uint16 nallocatable; /* Number of objects currently allocatable. */
217 uint16 firstfree; /* First object on free list. */
218 uint16 nmax; /* Maximum number of objects ever possible. */
219 uint16 fclass; /* Current fullness class. */
220 } dsa_area_span;
223 * Given a pointer to an object in a span, access the index of the next free
 224  * object in the same span (i.e. in the span's freelist) as an L-value.
226 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
229 * Small allocations are handled by dividing a single block of memory into
230 * many small objects of equal size. The possible allocation sizes are
231 * defined by the following array. Larger size classes are spaced more widely
232 * than smaller size classes. We fudge the spacing for size classes >1kB to
233 * avoid space wastage: based on the knowledge that we plan to allocate 64kB
234 * blocks, we bump the maximum object size up to the largest multiple of
235 * 8 bytes that still lets us fit the same number of objects into one block.
237 * NB: Because of this fudging, if we were ever to use differently-sized blocks
238 * for small allocations, these size classes would need to be reworked to be
239 * optimal for the new size.
241 * NB: The optimal spacing for size classes, as well as the size of the blocks
242 * out of which small objects are allocated, is not a question that has one
243 * right answer. Some allocators (such as tcmalloc) use more closely-spaced
244 * size classes than we do here, while others (like aset.c) use more
245 * widely-spaced classes. Spacing the classes more closely avoids wasting
246 * memory within individual chunks, but also means a larger number of
247 * potentially-unfilled blocks.
249 static const uint16 dsa_size_classes[] = {
250 sizeof(dsa_area_span), 0, /* special size classes */
251 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
252 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
253 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
254 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
255 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
256 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
257 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
258 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
260 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
262 /* Special size classes. */
263 #define DSA_SCLASS_BLOCK_OF_SPANS 0
264 #define DSA_SCLASS_SPAN_LARGE 1
267 * The following lookup table is used to map the size of small objects
268 * (less than 1kB) onto the corresponding size class. To use this table,
269 * round the size of the object up to the next multiple of 8 bytes, and then
270 * index into this array.
272 static const uint8 dsa_size_class_map[] = {
273 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
274 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
275 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
276 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
277 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
278 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
279 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
280 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
282 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
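/*
 * Worked example of the table lookup: a request for 50 bytes is rounded up
 * to the next multiple of DSA_SIZE_CLASS_MAP_QUANTUM, giving index
 * ((50 + 7) / 8) - 1 == 6; dsa_size_class_map[6] == 8 and
 * dsa_size_classes[8] == 56, so the request is served from the 56-byte size
 * class.  Requests of 1kB and above instead find their size class by binary
 * search over dsa_size_classes (see dsa_allocate_extended below).
 */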
285 * Superblocks are binned by how full they are. Generally, each fullness
286 * class corresponds to one quartile, but the block being used for
287 * allocations is always at the head of the list for fullness class 1,
288 * regardless of how full it really is.
290 #define DSA_FULLNESS_CLASSES 4
293 * A dsa_area_pool represents a set of objects of a given size class.
295 * Perhaps there should be multiple pools for the same size class for
296 * contention avoidance, but for now there is just one!
298 typedef struct
300 /* A lock protecting access to this pool. */
301 LWLock lock;
302 /* A set of linked lists of spans, arranged by fullness. */
303 dsa_pointer spans[DSA_FULLNESS_CLASSES];
304 /* Should we pad this out to a cacheline boundary? */
305 } dsa_area_pool;
308 * The control block for an area. This lives in shared memory, at the start of
309 * the first DSM segment controlled by this area.
311 typedef struct
313 /* The segment header for the first segment. */
314 dsa_segment_header segment_header;
315 /* The handle for this area. */
316 dsa_handle handle;
317 /* The handles of the segments owned by this area. */
318 dsm_handle segment_handles[DSA_MAX_SEGMENTS];
319 /* Lists of segments, binned by maximum contiguous run of free pages. */
320 dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
321 /* The object pools for each size class. */
322 dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
323 /* The total size of all active segments. */
324 size_t total_segment_size;
325 /* The maximum total size of backing storage we are allowed. */
326 size_t max_total_segment_size;
327 /* Highest used segment index in the history of this area. */
328 dsa_segment_index high_segment_index;
329 /* The reference count for this area. */
330 int refcnt;
331 /* A flag indicating that this area has been pinned. */
332 bool pinned;
333 /* The number of times that segments have been freed. */
334 size_t freed_segment_counter;
335 /* The LWLock tranche ID. */
336 int lwlock_tranche_id;
337 /* The general lock (protects everything except object pools). */
338 LWLock lock;
339 } dsa_area_control;
341 /* Given a pointer to a pool, find a dsa_pointer. */
342 #define DsaAreaPoolToDsaPointer(area, p) \
343 DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
346 * A dsa_segment_map is stored within the backend-private memory of each
347 * individual backend. It holds the base address of the segment within that
348 * backend, plus the addresses of key objects within the segment. Those
349 * could instead be derived from the base address but it's handy to have them
350 * around.
352 typedef struct
354 dsm_segment *segment; /* DSM segment */
355 char *mapped_address; /* Address at which segment is mapped */
356 dsa_segment_header *header; /* Header (same as mapped_address) */
357 FreePageManager *fpm; /* Free page manager within segment. */
358 dsa_pointer *pagemap; /* Page map within segment. */
359 } dsa_segment_map;
362 * Per-backend state for a storage area. Backends obtain one of these by
363 * creating an area or attaching to an existing one using a handle. Each
364 * process that needs to use an area uses its own object to track where the
365 * segments are mapped.
367 struct dsa_area
369 /* Pointer to the control object in shared memory. */
370 dsa_area_control *control;
373 * All the mappings are owned by this. The dsa_area itself is not
374 * directly tracked by the ResourceOwner, but the effect is the same. NULL
 375  * if the attachment has session lifespan, i.e. if dsa_pin_mapping() has
376 * been called.
378 ResourceOwner resowner;
381 * This backend's array of segment maps, ordered by segment index
382 * corresponding to control->segment_handles. Some of the area's segments
383 * may not be mapped in this backend yet, and some slots may have been
384 * freed and need to be detached; these operations happen on demand.
386 dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
388 /* The highest segment index this backend has ever mapped. */
389 dsa_segment_index high_segment_index;
391 /* The last observed freed_segment_counter. */
392 size_t freed_segment_counter;
395 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
396 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
398 /* Given a pointer to a segment_map, obtain a segment index number. */
399 #define get_segment_index(area, segment_map_ptr) \
400 (segment_map_ptr - &area->segment_maps[0])
402 static void init_span(dsa_area *area, dsa_pointer span_pointer,
403 dsa_area_pool *pool, dsa_pointer start, size_t npages,
404 uint16 size_class);
405 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
406 int fromclass, int toclass);
407 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
408 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
409 int size_class);
410 static dsa_segment_map *get_segment_by_index(dsa_area *area,
411 dsa_segment_index index);
412 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
413 static void unlink_span(dsa_area *area, dsa_area_span *span);
414 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
415 dsa_pointer span_pointer, int fclass);
416 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
417 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
418 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
419 static dsa_area *create_internal(void *place, size_t size,
420 int tranche_id,
421 dsm_handle control_handle,
422 dsm_segment *control_segment);
423 static dsa_area *attach_internal(void *place, dsm_segment *segment,
424 dsa_handle handle);
425 static void check_for_freed_segments(dsa_area *area);
426 static void check_for_freed_segments_locked(dsa_area *area);
427 static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
430 * Create a new shared area in a new DSM segment. Further DSM segments will
431 * be allocated as required to extend the available space.
433 * We can't allocate a LWLock tranche_id within this function, because tranche
434 * IDs are a scarce resource; there are only 64k available, using low numbers
435 * when possible matters, and we have no provision for recycling them. So,
436 * we require the caller to provide one.
438 dsa_area *
439 dsa_create(int tranche_id)
441 dsm_segment *segment;
442 dsa_area *area;
445 * Create the DSM segment that will hold the shared control object and the
446 * first segment of usable space.
448 segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
451 * All segments backing this area are pinned, so that DSA can explicitly
452 * control their lifetime (otherwise a newly created segment belonging to
453 * this area might be freed when the only backend that happens to have it
454 * mapped in ends, corrupting the area).
456 dsm_pin_segment(segment);
458 /* Create a new DSA area with the control object in this segment. */
459 area = create_internal(dsm_segment_address(segment),
460 DSA_INITIAL_SEGMENT_SIZE,
461 tranche_id,
462 dsm_segment_handle(segment), segment);
464 /* Clean up when the control segment detaches. */
465 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
466 PointerGetDatum(dsm_segment_address(segment)));
468 return area;
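/*
 * A sketch of how a caller might obtain the tranche ID required above; the
 * tranche name "my extension" is a placeholder:
 *
 *		int			tranche_id = LWLockNewTrancheId();
 *
 *		LWLockRegisterTranche(tranche_id, "my extension");
 *		area = dsa_create(tranche_id);
 *
 * Since every backend that attaches to the area needs to register the same
 * tranche ID, callers typically stash the ID in shared memory rather than
 * calling LWLockNewTrancheId() independently in each backend.
 */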
472 * Create a new shared area in an existing shared memory space, which may be
473 * either DSM or Postmaster-initialized memory. DSM segments will be
474 * allocated as required to extend the available space, though that can be
475 * prevented with dsa_set_size_limit(area, size) using the same size provided
476 * to dsa_create_in_place.
478 * Areas created in-place must eventually be released by the backend that
479 * created them and all backends that attach to them. This can be done
480 * explicitly with dsa_release_in_place, or, in the special case that 'place'
481 * happens to be in a pre-existing DSM segment, by passing in a pointer to the
482 * segment so that a detach hook can be registered with the containing DSM
483 * segment.
485 * See dsa_create() for a note about the tranche arguments.
487 dsa_area *
488 dsa_create_in_place(void *place, size_t size,
489 int tranche_id, dsm_segment *segment)
491 dsa_area *area;
493 area = create_internal(place, size, tranche_id,
494 DSM_HANDLE_INVALID, NULL);
497 * Clean up when the control segment detaches, if a containing DSM segment
498 * was provided.
500 if (segment != NULL)
501 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
502 PointerGetDatum(place));
504 return area;
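/*
 * For example, a hypothetical caller that has set aside 'region_size' bytes
 * of traditional shared memory at 'region' and never wants any extra DSM
 * segments to be created could do this (sketch):
 *
 *		Assert(region_size >= dsa_minimum_size());
 *		area = dsa_create_in_place(region, region_size, tranche_id, NULL);
 *		dsa_set_size_limit(area, region_size);
 *
 * With that size limit in place, allocations that don't fit in 'region'
 * fail (or return InvalidDsaPointer under DSA_ALLOC_NO_OOM) instead of
 * creating new segments.
 */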
508 * Obtain a handle that can be passed to other processes so that they can
509 * attach to the given area. Cannot be called for areas created with
510 * dsa_create_in_place.
512 dsa_handle
513 dsa_get_handle(dsa_area *area)
515 Assert(area->control->handle != DSA_HANDLE_INVALID);
516 return area->control->handle;
520 * Attach to an area given a handle generated (possibly in another process) by
521 * dsa_get_handle. The area must have been created with dsa_create (not
522 * dsa_create_in_place).
524 dsa_area *
525 dsa_attach(dsa_handle handle)
527 dsm_segment *segment;
528 dsa_area *area;
531 * An area handle is really a DSM segment handle for the first segment, so
532 * we go ahead and attach to that.
534 segment = dsm_attach(handle);
535 if (segment == NULL)
536 ereport(ERROR,
537 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
538 errmsg("could not attach to dynamic shared area")));
540 area = attach_internal(dsm_segment_address(segment), segment, handle);
542 /* Clean up when the control segment detaches. */
543 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
544 PointerGetDatum(dsm_segment_address(segment)));
546 return area;
550 * Attach to an area that was created with dsa_create_in_place. The caller
551 * must somehow know the location in memory that was used when the area was
552 * created, though it may be mapped at a different virtual address in this
553 * process.
555 * See dsa_create_in_place for note about releasing in-place areas, and the
556 * optional 'segment' argument which can be provided to allow automatic
557 * release if the containing memory happens to be a DSM segment.
559 dsa_area *
560 dsa_attach_in_place(void *place, dsm_segment *segment)
562 dsa_area *area;
564 area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
567 * Clean up when the control segment detaches, if a containing DSM segment
568 * was provided.
570 if (segment != NULL)
571 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
572 PointerGetDatum(place));
574 return area;
578 * Release a DSA area that was produced by dsa_create_in_place or
579 * dsa_attach_in_place. The 'segment' argument is ignored but provides an
580 * interface suitable for on_dsm_detach, for the convenience of users who want
581 * to create a DSA segment inside an existing DSM segment and have it
582 * automatically released when the containing DSM segment is detached.
583 * 'place' should be the address of the place where the area was created.
585 * This callback is automatically registered for the DSM segment containing
586 * the control object of in-place areas when a segment is provided to
587 * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
588 * with dsa_create.
590 void
591 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
593 dsa_release_in_place(DatumGetPointer(place));
597 * Release a DSA area that was produced by dsa_create_in_place or
598 * dsa_attach_in_place. The 'code' argument is ignored but provides an
599 * interface suitable for on_shmem_exit or before_shmem_exit, for the
600 * convenience of users who want to create a DSA segment inside shared memory
601 * other than a DSM segment and have it automatically release at backend exit.
602 * 'place' should be the address of the place where the area was created.
604 void
605 dsa_on_shmem_exit_release_in_place(int code, Datum place)
607 dsa_release_in_place(DatumGetPointer(place));
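/*
 * For example, a backend that keeps an in-place area in plain (non-DSM)
 * shared memory for its whole lifetime might register this callback as
 * follows (sketch; 'place' is wherever the area was created):
 *
 *		on_shmem_exit(dsa_on_shmem_exit_release_in_place,
 *					  PointerGetDatum(place));
 */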
611 * Release a DSA area that was produced by dsa_create_in_place or
612 * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
613 * callbacks so that this is managed automatically, because failure to release
614 * an area created in-place leaks its segments permanently.
616 * This is also called automatically for areas produced by dsa_create or
617 * dsa_attach as an implementation detail.
619 void
620 dsa_release_in_place(void *place)
622 dsa_area_control *control = (dsa_area_control *) place;
623 int i;
625 LWLockAcquire(&control->lock, LW_EXCLUSIVE);
626 Assert(control->segment_header.magic ==
627 (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
628 Assert(control->refcnt > 0);
629 if (--control->refcnt == 0)
631 for (i = 0; i <= control->high_segment_index; ++i)
633 dsm_handle handle;
635 handle = control->segment_handles[i];
636 if (handle != DSM_HANDLE_INVALID)
637 dsm_unpin_segment(handle);
640 LWLockRelease(&control->lock);
644 * Keep a DSA area attached until end of session or explicit detach.
646 * By default, areas are owned by the current resource owner, which means they
647 * are detached automatically when that scope ends.
649 void
650 dsa_pin_mapping(dsa_area *area)
652 int i;
654 if (area->resowner != NULL)
656 area->resowner = NULL;
658 for (i = 0; i <= area->high_segment_index; ++i)
659 if (area->segment_maps[i].segment != NULL)
660 dsm_pin_mapping(area->segment_maps[i].segment);
665 * Allocate memory in this storage area. The return value is a dsa_pointer
666 * that can be passed to other processes, and converted to a local pointer
667 * with dsa_get_address. 'flags' is a bitmap which should be constructed
668 * from the following values:
670 * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
671 * will result in an ERROR.
673 * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
674 * no memory is available or a size limit established by dsa_set_size_limit
675 * would be exceeded. Otherwise, such allocations will result in an ERROR.
677 * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
678 * contents of newly-allocated memory are indeterminate.
680 * These flags correspond to similarly named flags used by
681 * MemoryContextAllocExtended(). See also the macros dsa_allocate and
682 * dsa_allocate0 which expand to a call to this function with commonly used
683 * flags.
685 dsa_pointer
686 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
688 uint16 size_class;
689 dsa_pointer start_pointer;
690 dsa_segment_map *segment_map;
691 dsa_pointer result;
693 Assert(size > 0);
695 /* Sanity check on huge individual allocation size. */
696 if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
697 ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
698 elog(ERROR, "invalid DSA memory alloc request size %zu", size);
701 * If bigger than the largest size class, just grab a run of pages from
702 * the free page manager, instead of allocating an object from a pool.
703 * There will still be a span, but it's a special class of span that
704 * manages this whole allocation and simply gives all pages back to the
705 * free page manager when dsa_free is called.
707 if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
709 size_t npages = fpm_size_to_pages(size);
710 size_t first_page;
711 dsa_pointer span_pointer;
712 dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
714 /* Obtain a span object. */
715 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
716 if (!DsaPointerIsValid(span_pointer))
718 /* Raise error unless asked not to. */
719 if ((flags & DSA_ALLOC_NO_OOM) == 0)
720 ereport(ERROR,
721 (errcode(ERRCODE_OUT_OF_MEMORY),
722 errmsg("out of memory"),
723 errdetail("Failed on DSA request of size %zu.",
724 size)));
725 return InvalidDsaPointer;
728 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
730 /* Find a segment from which to allocate. */
731 segment_map = get_best_segment(area, npages);
732 if (segment_map == NULL)
733 segment_map = make_new_segment(area, npages);
734 if (segment_map == NULL)
736 /* Can't make any more segments: game over. */
737 LWLockRelease(DSA_AREA_LOCK(area));
738 dsa_free(area, span_pointer);
740 /* Raise error unless asked not to. */
741 if ((flags & DSA_ALLOC_NO_OOM) == 0)
742 ereport(ERROR,
743 (errcode(ERRCODE_OUT_OF_MEMORY),
744 errmsg("out of memory"),
745 errdetail("Failed on DSA request of size %zu.",
746 size)));
747 return InvalidDsaPointer;
751 * Ask the free page manager for a run of pages. This should always
752 * succeed, since both get_best_segment and make_new_segment should
753 * only return a non-NULL pointer if it actually contains enough
754 * contiguous freespace. If it does fail, something in our backend
755 * private state is out of whack, so use FATAL to kill the process.
757 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
758 elog(FATAL,
759 "dsa_allocate could not find %zu free pages", npages);
760 LWLockRelease(DSA_AREA_LOCK(area));
762 start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
763 first_page * FPM_PAGE_SIZE);
765 /* Initialize span and pagemap. */
766 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
767 LW_EXCLUSIVE);
768 init_span(area, span_pointer, pool, start_pointer, npages,
769 DSA_SCLASS_SPAN_LARGE);
770 segment_map->pagemap[first_page] = span_pointer;
771 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
773 /* Zero-initialize the memory if requested. */
774 if ((flags & DSA_ALLOC_ZERO) != 0)
775 memset(dsa_get_address(area, start_pointer), 0, size);
777 return start_pointer;
780 /* Map allocation to a size class. */
781 if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
783 int mapidx;
785 /* For smaller sizes we have a lookup table... */
786 mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
787 DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
788 size_class = dsa_size_class_map[mapidx];
790 else
792 uint16 min;
793 uint16 max;
795 /* ... and for the rest we search by binary chop. */
796 min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
797 max = lengthof(dsa_size_classes) - 1;
799 while (min < max)
801 uint16 mid = (min + max) / 2;
802 uint16 class_size = dsa_size_classes[mid];
804 if (class_size < size)
805 min = mid + 1;
806 else
807 max = mid;
810 size_class = min;
812 Assert(size <= dsa_size_classes[size_class]);
813 Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
815 /* Attempt to allocate an object from the appropriate pool. */
816 result = alloc_object(area, size_class);
818 /* Check for failure to allocate. */
819 if (!DsaPointerIsValid(result))
821 /* Raise error unless asked not to. */
822 if ((flags & DSA_ALLOC_NO_OOM) == 0)
823 ereport(ERROR,
824 (errcode(ERRCODE_OUT_OF_MEMORY),
825 errmsg("out of memory"),
826 errdetail("Failed on DSA request of size %zu.", size)));
827 return InvalidDsaPointer;
830 /* Zero-initialize the memory if requested. */
831 if ((flags & DSA_ALLOC_ZERO) != 0)
832 memset(dsa_get_address(area, result), 0, size);
834 return result;
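/*
 * Example (sketch): an allocation that prefers to degrade gracefully rather
 * than error out, and that wants zero-initialized memory:
 *
 *		dsa_pointer dp;
 *
 *		dp = dsa_allocate_extended(area, request_size,
 *								   DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
 *		if (!DsaPointerIsValid(dp))
 *			... fall back to a smaller request, or report lack of memory ...
 *
 * The common cases are covered by the dsa_allocate(area, size) and
 * dsa_allocate0(area, size) macros in utils/dsa.h, which call this function
 * with flags 0 and DSA_ALLOC_ZERO respectively.
 */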
838 * Free memory obtained with dsa_allocate.
840 void
841 dsa_free(dsa_area *area, dsa_pointer dp)
843 dsa_segment_map *segment_map;
844 int pageno;
845 dsa_pointer span_pointer;
846 dsa_area_span *span;
847 char *superblock;
848 char *object;
849 size_t size;
850 int size_class;
852 /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
853 check_for_freed_segments(area);
855 /* Locate the object, span and pool. */
856 segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
857 pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
858 span_pointer = segment_map->pagemap[pageno];
859 span = dsa_get_address(area, span_pointer);
860 superblock = dsa_get_address(area, span->start);
861 object = dsa_get_address(area, dp);
862 size_class = span->size_class;
863 size = dsa_size_classes[size_class];
866 * Special case for large objects that live in a special span: we return
867 * those pages directly to the free page manager and free the span.
869 if (span->size_class == DSA_SCLASS_SPAN_LARGE)
872 #ifdef CLOBBER_FREED_MEMORY
873 memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
874 #endif
876 /* Give pages back to free page manager. */
877 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
878 FreePageManagerPut(segment_map->fpm,
879 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
880 span->npages);
882 /* Move segment to appropriate bin if necessary. */
883 rebin_segment(area, segment_map);
884 LWLockRelease(DSA_AREA_LOCK(area));
886 /* Unlink span. */
887 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
888 LW_EXCLUSIVE);
889 unlink_span(area, span);
890 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
891 /* Free the span object so it can be reused. */
892 dsa_free(area, span_pointer);
893 return;
896 #ifdef CLOBBER_FREED_MEMORY
897 memset(object, 0x7f, size);
898 #endif
900 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
902 /* Put the object on the span's freelist. */
903 Assert(object >= superblock);
904 Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
905 Assert((object - superblock) % size == 0);
906 NextFreeObjectIndex(object) = span->firstfree;
907 span->firstfree = (object - superblock) / size;
908 ++span->nallocatable;
 911  * See if the span needs to be moved to a different fullness class, or be
912 * freed so its pages can be given back to the segment.
914 if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
917 * The block was completely full and is located in the
918 * highest-numbered fullness class, which is never scanned for free
919 * chunks. We must move it to the next-lower fullness class.
921 unlink_span(area, span);
922 add_span_to_fullness_class(area, span, span_pointer,
923 DSA_FULLNESS_CLASSES - 2);
926 * If this is the only span, and there is no active span, then we
927 * should probably move this span to fullness class 1. (Otherwise if
928 * you allocate exactly all the objects in the only span, it moves to
929 * class 3, then you free them all, it moves to 2, and then is given
930 * back, leaving no active span).
933 else if (span->nallocatable == span->nmax &&
934 (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
937 * This entire block is free, and it's not the active block for this
938 * size class. Return the memory to the free page manager. We don't
939 * do this for the active block to prevent hysteresis: if we
940 * repeatedly allocate and free the only chunk in the active block, it
941 * will be very inefficient if we deallocate and reallocate the block
942 * every time.
944 destroy_superblock(area, span_pointer);
947 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
951 * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
952 * memory allocated by the given area (possibly in another process) that
953 * hasn't yet been freed. This may cause a segment to be mapped into the
954 * current process if required, and may cause freed segments to be unmapped.
956 void *
957 dsa_get_address(dsa_area *area, dsa_pointer dp)
959 dsa_segment_index index;
960 size_t offset;
962 /* Convert InvalidDsaPointer to NULL. */
963 if (!DsaPointerIsValid(dp))
964 return NULL;
966 /* Process any requests to detach from freed segments. */
967 check_for_freed_segments(area);
969 /* Break the dsa_pointer into its components. */
970 index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
971 offset = DSA_EXTRACT_OFFSET(dp);
972 Assert(index < DSA_MAX_SEGMENTS);
974 /* Check if we need to cause this segment to be mapped in. */
975 if (unlikely(area->segment_maps[index].mapped_address == NULL))
977 /* Call for effect (we don't need the result). */
978 get_segment_by_index(area, index);
981 return area->segment_maps[index].mapped_address + offset;
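/*
 * Because a dsa_pointer is meaningful in every attached backend, linked
 * structures can be built by storing dsa_pointer values (never raw
 * pointers) in shared objects.  A sketch using a hypothetical node type:
 *
 *		typedef struct SharedNode
 *		{
 *			dsa_pointer next;
 *			int			payload;
 *		} SharedNode;
 *
 *		dsa_pointer node_dp = dsa_allocate(area, sizeof(SharedNode));
 *		SharedNode *node = (SharedNode *) dsa_get_address(area, node_dp);
 *
 *		node->next = list_head_dp;
 *		list_head_dp = node_dp;
 */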
985 * Pin this area, so that it will continue to exist even if all backends
986 * detach from it. In that case, the area can still be reattached to if a
987 * handle has been recorded somewhere.
989 void
990 dsa_pin(dsa_area *area)
992 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
993 if (area->control->pinned)
995 LWLockRelease(DSA_AREA_LOCK(area));
996 elog(ERROR, "dsa_area already pinned");
998 area->control->pinned = true;
999 ++area->control->refcnt;
1000 LWLockRelease(DSA_AREA_LOCK(area));
1004 * Undo the effects of dsa_pin, so that the given area can be freed when no
1005 * backends are attached to it. May be called only if dsa_pin has been
1006 * called.
1008 void
1009 dsa_unpin(dsa_area *area)
1011 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1012 Assert(area->control->refcnt > 1);
1013 if (!area->control->pinned)
1015 LWLockRelease(DSA_AREA_LOCK(area));
1016 elog(ERROR, "dsa_area not pinned");
1018 area->control->pinned = false;
1019 --area->control->refcnt;
1020 LWLockRelease(DSA_AREA_LOCK(area));
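/*
 * Sketch of the pattern dsa_pin/dsa_unpin support: a backend that wants an
 * area to outlive all current attachments pins it once and records the
 * handle somewhere durable (here a hypothetical shared struct), so that a
 * later backend can re-attach:
 *
 *		area = dsa_create(tranche_id);
 *		dsa_pin(area);
 *		shared_state->dsa_handle = dsa_get_handle(area);
 *		dsa_detach(area);
 *
 * The area survives the detach because it is pinned; later, possibly in
 * another backend:
 *
 *		area = dsa_attach(shared_state->dsa_handle);
 */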
1024 * Set the total size limit for this area. This limit is checked whenever new
1025 * segments need to be allocated from the operating system. If the new size
1026 * limit is already exceeded, this has no immediate effect.
1028 * Note that the total virtual memory usage may be temporarily larger than
1029 * this limit when segments have been freed, but not yet detached by all
1030 * backends that have attached to them.
1032 void
1033 dsa_set_size_limit(dsa_area *area, size_t limit)
1035 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1036 area->control->max_total_segment_size = limit;
1037 LWLockRelease(DSA_AREA_LOCK(area));
1041 * Aggressively free all spare memory in the hope of returning DSM segments to
1042 * the operating system.
1044 void
1045 dsa_trim(dsa_area *area)
1047 int size_class;
1050 * Trim in reverse pool order so we get to the spans-of-spans last, just
1051 * in case any become entirely free while processing all the other pools.
1053 for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1055 dsa_area_pool *pool = &area->control->pools[size_class];
1056 dsa_pointer span_pointer;
1058 if (size_class == DSA_SCLASS_SPAN_LARGE)
1060 /* Large object frees give back segments aggressively already. */
1061 continue;
1065 * Search fullness class 1 only. That is where we expect to find an
1066 * entirely empty superblock (entirely empty superblocks in other
1067 * fullness classes are returned to the free page map by dsa_free).
1069 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1070 span_pointer = pool->spans[1];
1071 while (DsaPointerIsValid(span_pointer))
1073 dsa_area_span *span = dsa_get_address(area, span_pointer);
1074 dsa_pointer next = span->nextspan;
1076 if (span->nallocatable == span->nmax)
1077 destroy_superblock(area, span_pointer);
1079 span_pointer = next;
1081 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1086 * Print out debugging information about the internal state of the shared
1087 * memory area.
1089 void
1090 dsa_dump(dsa_area *area)
1092 size_t i,
1096 * Note: This gives an inconsistent snapshot as it acquires and releases
1097 * individual locks as it goes...
1100 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1101 check_for_freed_segments_locked(area);
1102 fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1103 fprintf(stderr, " max_total_segment_size: %zu\n",
1104 area->control->max_total_segment_size);
1105 fprintf(stderr, " total_segment_size: %zu\n",
1106 area->control->total_segment_size);
1107 fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1108 fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1109 fprintf(stderr, " segment bins:\n");
1110 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1112 if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1114 dsa_segment_index segment_index;
1116 if (i == 0)
1117 fprintf(stderr,
1118 " segment bin %zu (no contiguous free pages):\n", i);
1119 else
1120 fprintf(stderr,
1121 " segment bin %zu (at least %d contiguous pages free):\n",
1122 i, 1 << (i - 1));
1123 segment_index = area->control->segment_bins[i];
1124 while (segment_index != DSA_SEGMENT_INDEX_NONE)
1126 dsa_segment_map *segment_map;
1128 segment_map =
1129 get_segment_by_index(area, segment_index);
1131 fprintf(stderr,
1132 " segment index %zu, usable_pages = %zu, "
1133 "contiguous_pages = %zu, mapped at %p\n",
1134 segment_index,
1135 segment_map->header->usable_pages,
1136 fpm_largest(segment_map->fpm),
1137 segment_map->mapped_address);
1138 segment_index = segment_map->header->next;
1142 LWLockRelease(DSA_AREA_LOCK(area));
1144 fprintf(stderr, " pools:\n");
1145 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1147 bool found = false;
1149 LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1150 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1151 if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1152 found = true;
1153 if (found)
1155 if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1156 fprintf(stderr, " pool for blocks of span objects:\n");
1157 else if (i == DSA_SCLASS_SPAN_LARGE)
1158 fprintf(stderr, " pool for large object spans:\n");
1159 else
1160 fprintf(stderr,
1161 " pool for size class %zu (object size %hu bytes):\n",
1162 i, dsa_size_classes[i]);
1163 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1165 if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1166 fprintf(stderr, " fullness class %zu is empty\n", j);
1167 else
1169 dsa_pointer span_pointer = area->control->pools[i].spans[j];
1171 fprintf(stderr, " fullness class %zu:\n", j);
1172 while (DsaPointerIsValid(span_pointer))
1174 dsa_area_span *span;
1176 span = dsa_get_address(area, span_pointer);
1177 fprintf(stderr,
1178 " span descriptor at "
1179 DSA_POINTER_FORMAT ", superblock at "
1180 DSA_POINTER_FORMAT
1181 ", pages = %zu, objects free = %hu/%hu\n",
1182 span_pointer, span->start, span->npages,
1183 span->nallocatable, span->nmax);
1184 span_pointer = span->nextspan;
1189 LWLockRelease(DSA_SCLASS_LOCK(area, i));
1194 * Return the smallest size that you can successfully provide to
1195 * dsa_create_in_place.
1197 size_t
1198 dsa_minimum_size(void)
1200 size_t size;
1201 int pages = 0;
1203 size = MAXALIGN(sizeof(dsa_area_control)) +
1204 MAXALIGN(sizeof(FreePageManager));
1206 /* Figure out how many pages we need, including the page map... */
1207 while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1209 ++pages;
1210 size += sizeof(dsa_pointer);
1213 return pages * FPM_PAGE_SIZE;
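/*
 * In other words, the loop above finds the smallest 'pages' satisfying
 *
 *		pages * FPM_PAGE_SIZE >= MAXALIGN(sizeof(dsa_area_control)) +
 *								 MAXALIGN(sizeof(FreePageManager)) +
 *								 pages * sizeof(dsa_pointer)
 *
 * i.e. just enough whole pages to hold the control object, the free page
 * manager and one page-map entry per page; a segment of exactly this size
 * can therefore end up with zero usable pages (see create_internal).
 */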
1217 * Workhorse function for dsa_create and dsa_create_in_place.
1219 static dsa_area *
1220 create_internal(void *place, size_t size,
1221 int tranche_id,
1222 dsm_handle control_handle,
1223 dsm_segment *control_segment)
1225 dsa_area_control *control;
1226 dsa_area *area;
1227 dsa_segment_map *segment_map;
1228 size_t usable_pages;
1229 size_t total_pages;
1230 size_t metadata_bytes;
1231 int i;
1233 /* Sanity check on the space we have to work in. */
1234 if (size < dsa_minimum_size())
1235 elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1236 dsa_minimum_size(), size);
1238 /* Now figure out how much space is usable */
1239 total_pages = size / FPM_PAGE_SIZE;
1240 metadata_bytes =
1241 MAXALIGN(sizeof(dsa_area_control)) +
1242 MAXALIGN(sizeof(FreePageManager)) +
1243 total_pages * sizeof(dsa_pointer);
1244 /* Add padding up to next page boundary. */
1245 if (metadata_bytes % FPM_PAGE_SIZE != 0)
1246 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1247 Assert(metadata_bytes <= size);
1248 usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1251 * Initialize the dsa_area_control object located at the start of the
1252 * space.
1254 control = (dsa_area_control *) place;
1255 memset(place, 0, sizeof(*control));
1256 control->segment_header.magic =
1257 DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1258 control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1259 control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1260 control->segment_header.usable_pages = usable_pages;
1261 control->segment_header.freed = false;
1262 control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
1263 control->handle = control_handle;
1264 control->max_total_segment_size = (size_t) -1;
1265 control->total_segment_size = size;
1266 control->segment_handles[0] = control_handle;
1267 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1268 control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1269 control->refcnt = 1;
1270 control->lwlock_tranche_id = tranche_id;
1273 * Create the dsa_area object that this backend will use to access the
1274 * area. Other backends will need to obtain their own dsa_area object by
1275 * attaching.
1277 area = palloc(sizeof(dsa_area));
1278 area->control = control;
1279 area->resowner = CurrentResourceOwner;
1280 memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1281 area->high_segment_index = 0;
1282 area->freed_segment_counter = 0;
1283 LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1284 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1285 LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1286 control->lwlock_tranche_id);
1288 /* Set up the segment map for this process's mapping. */
1289 segment_map = &area->segment_maps[0];
1290 segment_map->segment = control_segment;
1291 segment_map->mapped_address = place;
1292 segment_map->header = (dsa_segment_header *) place;
1293 segment_map->fpm = (FreePageManager *)
1294 (segment_map->mapped_address +
1295 MAXALIGN(sizeof(dsa_area_control)));
1296 segment_map->pagemap = (dsa_pointer *)
1297 (segment_map->mapped_address +
1298 MAXALIGN(sizeof(dsa_area_control)) +
1299 MAXALIGN(sizeof(FreePageManager)));
1301 /* Set up the free page map. */
1302 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1303 /* There can be 0 usable pages if size is dsa_minimum_size(). */
1305 if (usable_pages > 0)
1306 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1307 usable_pages);
1309 /* Put this segment into the appropriate bin. */
1310 control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1311 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1313 return area;
1317 * Workhorse function for dsa_attach and dsa_attach_in_place.
1319 static dsa_area *
1320 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1322 dsa_area_control *control;
1323 dsa_area *area;
1324 dsa_segment_map *segment_map;
1326 control = (dsa_area_control *) place;
1327 Assert(control->handle == handle);
1328 Assert(control->segment_handles[0] == handle);
1329 Assert(control->segment_header.magic ==
1330 (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1332 /* Build the backend-local area object. */
1333 area = palloc(sizeof(dsa_area));
1334 area->control = control;
1335 area->resowner = CurrentResourceOwner;
1336 memset(&area->segment_maps[0], 0,
1337 sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1338 area->high_segment_index = 0;
1340 /* Set up the segment map for this process's mapping. */
1341 segment_map = &area->segment_maps[0];
1342 segment_map->segment = segment; /* NULL for in-place */
1343 segment_map->mapped_address = place;
1344 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1345 segment_map->fpm = (FreePageManager *)
1346 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1347 segment_map->pagemap = (dsa_pointer *)
1348 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1349 MAXALIGN(sizeof(FreePageManager)));
1351 /* Bump the reference count. */
1352 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1353 if (control->refcnt == 0)
1355 /* We can't attach to a DSA area that has already been destroyed. */
1356 ereport(ERROR,
1357 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1358 errmsg("could not attach to dynamic shared area")));
1360 ++control->refcnt;
1361 area->freed_segment_counter = area->control->freed_segment_counter;
1362 LWLockRelease(DSA_AREA_LOCK(area));
1364 return area;
1368 * Add a new span to fullness class 1 of the indicated pool.
1370 static void
1371 init_span(dsa_area *area,
1372 dsa_pointer span_pointer,
1373 dsa_area_pool *pool, dsa_pointer start, size_t npages,
1374 uint16 size_class)
1376 dsa_area_span *span = dsa_get_address(area, span_pointer);
1377 size_t obsize = dsa_size_classes[size_class];
1380 * The per-pool lock must be held because we manipulate the span list for
1381 * this pool.
1383 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1385 /* Push this span onto the front of the span list for fullness class 1. */
1386 if (DsaPointerIsValid(pool->spans[1]))
1388 dsa_area_span *head = (dsa_area_span *)
1389 dsa_get_address(area, pool->spans[1]);
1391 head->prevspan = span_pointer;
1393 span->pool = DsaAreaPoolToDsaPointer(area, pool);
1394 span->nextspan = pool->spans[1];
1395 span->prevspan = InvalidDsaPointer;
1396 pool->spans[1] = span_pointer;
1398 span->start = start;
1399 span->npages = npages;
1400 span->size_class = size_class;
1401 span->ninitialized = 0;
1402 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1405 * A block-of-spans contains its own descriptor, so mark one object as
1406 * initialized and reduce the count of allocatable objects by one.
1407 * Doing this here has the side effect of also reducing nmax by one,
1408 * which is important to make sure we free this object at the correct
1409 * time.
1411 span->ninitialized = 1;
1412 span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1414 else if (size_class != DSA_SCLASS_SPAN_LARGE)
1415 span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1416 span->firstfree = DSA_SPAN_NOTHING_FREE;
1417 span->nmax = span->nallocatable;
1418 span->fclass = 1;
1422 * Transfer the first span in one fullness class to the head of another
1423 * fullness class.
1425 static bool
1426 transfer_first_span(dsa_area *area,
1427 dsa_area_pool *pool, int fromclass, int toclass)
1429 dsa_pointer span_pointer;
1430 dsa_area_span *span;
1431 dsa_area_span *nextspan;
1433 /* Can't do it if source list is empty. */
1434 span_pointer = pool->spans[fromclass];
1435 if (!DsaPointerIsValid(span_pointer))
1436 return false;
1438 /* Remove span from head of source list. */
1439 span = dsa_get_address(area, span_pointer);
1440 pool->spans[fromclass] = span->nextspan;
1441 if (DsaPointerIsValid(span->nextspan))
1443 nextspan = (dsa_area_span *)
1444 dsa_get_address(area, span->nextspan);
1445 nextspan->prevspan = InvalidDsaPointer;
1448 /* Add span to head of target list. */
1449 span->nextspan = pool->spans[toclass];
1450 pool->spans[toclass] = span_pointer;
1451 if (DsaPointerIsValid(span->nextspan))
1453 nextspan = (dsa_area_span *)
1454 dsa_get_address(area, span->nextspan);
1455 nextspan->prevspan = span_pointer;
1457 span->fclass = toclass;
1459 return true;
1463 * Allocate one object of the requested size class from the given area.
1465 static inline dsa_pointer
1466 alloc_object(dsa_area *area, int size_class)
1468 dsa_area_pool *pool = &area->control->pools[size_class];
1469 dsa_area_span *span;
1470 dsa_pointer block;
1471 dsa_pointer result;
1472 char *object;
1473 size_t size;
1476 * Even though ensure_active_superblock can in turn call alloc_object if
1477 * it needs to allocate a new span, that's always from a different pool,
1478 * and the order of lock acquisition is always the same, so it's OK that
1479 * we hold this lock for the duration of this function.
1481 Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1482 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1485 * If there's no active superblock, we must successfully obtain one or
1486 * fail the request.
1488 if (!DsaPointerIsValid(pool->spans[1]) &&
1489 !ensure_active_superblock(area, pool, size_class))
1491 result = InvalidDsaPointer;
1493 else
1496 * There should be a block in fullness class 1 at this point, and it
1497 * should never be completely full. Thus we can either pop an object
1498 * from the free list or, failing that, initialize a new object.
1500 Assert(DsaPointerIsValid(pool->spans[1]));
1501 span = (dsa_area_span *)
1502 dsa_get_address(area, pool->spans[1]);
1503 Assert(span->nallocatable > 0);
1504 block = span->start;
1505 Assert(size_class < DSA_NUM_SIZE_CLASSES);
1506 size = dsa_size_classes[size_class];
1507 if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1509 result = block + span->firstfree * size;
1510 object = dsa_get_address(area, result);
1511 span->firstfree = NextFreeObjectIndex(object);
1513 else
1515 result = block + span->ninitialized * size;
1516 ++span->ninitialized;
1518 --span->nallocatable;
1520 /* If it's now full, move it to the highest-numbered fullness class. */
1521 if (span->nallocatable == 0)
1522 transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1525 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1526 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1528 return result;
1532 * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1533 * superblocks are completely full and no more can be allocated.
1535 * Fullness classes K of 0..N are loosely intended to represent blocks whose
1536 * utilization percentage is at least K/N, but we only enforce this rigorously
1537 * for the highest-numbered fullness class, which always contains exactly
1538 * those blocks that are completely full. It's otherwise acceptable for a
1539 * block to be in a higher-numbered fullness class than the one to which it
1540 * logically belongs. In addition, the active block, which is always the
1541 * first block in fullness class 1, is permitted to have a higher allocation
1542 * percentage than would normally be allowable for that fullness class; we
1543 * don't move it until it's completely full, and then it goes to the
1544 * highest-numbered fullness class.
1546 * It might seem odd that the active block is the head of fullness class 1
1547 * rather than fullness class 0, but experience with other allocators has
1548 * shown that it's usually better to allocate from a block that's moderately
1549 * full rather than one that's nearly empty. Insofar as is reasonably
1550 * possible, we want to avoid performing new allocations in a block that would
1551 * otherwise become empty soon.
1553 static bool
1554 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1555 int size_class)
1557 dsa_pointer span_pointer;
1558 dsa_pointer start_pointer;
1559 size_t obsize = dsa_size_classes[size_class];
1560 size_t nmax;
1561 int fclass;
1562 size_t npages = 1;
1563 size_t first_page;
1564 size_t i;
1565 dsa_segment_map *segment_map;
1567 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1570 * Compute the number of objects that will fit in a block of this size
1571 * class. Span-of-spans blocks are just a single page, and the first
1572 * object isn't available for use because it describes the block-of-spans
1573 * itself.
1575 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1576 nmax = FPM_PAGE_SIZE / obsize - 1;
1577 else
1578 nmax = DSA_SUPERBLOCK_SIZE / obsize;
1581 * If fullness class 1 is empty, try to find a span to put in it by
1582 * scanning higher-numbered fullness classes (excluding the last one,
1583 * whose blocks are certain to all be completely full).
1585 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1587 span_pointer = pool->spans[fclass];
1589 while (DsaPointerIsValid(span_pointer))
1591 int tfclass;
1592 dsa_area_span *span;
1593 dsa_area_span *nextspan;
1594 dsa_area_span *prevspan;
1595 dsa_pointer next_span_pointer;
1597 span = (dsa_area_span *)
1598 dsa_get_address(area, span_pointer);
1599 next_span_pointer = span->nextspan;
1601 /* Figure out what fullness class should contain this span. */
1602 tfclass = (nmax - span->nallocatable)
1603 * (DSA_FULLNESS_CLASSES - 1) / nmax;
1605 /* Look up next span. */
1606 if (DsaPointerIsValid(span->nextspan))
1607 nextspan = (dsa_area_span *)
1608 dsa_get_address(area, span->nextspan);
1609 else
1610 nextspan = NULL;
1613 * If utilization has dropped enough that this now belongs in some
1614 * other fullness class, move it there.
1616 if (tfclass < fclass)
1618 /* Remove from the current fullness class list. */
1619 if (pool->spans[fclass] == span_pointer)
1621 /* It was the head; remove it. */
1622 Assert(!DsaPointerIsValid(span->prevspan));
1623 pool->spans[fclass] = span->nextspan;
1624 if (nextspan != NULL)
1625 nextspan->prevspan = InvalidDsaPointer;
1627 else
1629 /* It was not the head. */
1630 Assert(DsaPointerIsValid(span->prevspan));
1631 prevspan = (dsa_area_span *)
1632 dsa_get_address(area, span->prevspan);
1633 prevspan->nextspan = span->nextspan;
1635 if (nextspan != NULL)
1636 nextspan->prevspan = span->prevspan;
1638 /* Push onto the head of the new fullness class list. */
1639 span->nextspan = pool->spans[tfclass];
1640 pool->spans[tfclass] = span_pointer;
1641 span->prevspan = InvalidDsaPointer;
1642 if (DsaPointerIsValid(span->nextspan))
1644 nextspan = (dsa_area_span *)
1645 dsa_get_address(area, span->nextspan);
1646 nextspan->prevspan = span_pointer;
1648 span->fclass = tfclass;
1651 /* Advance to next span on list. */
1652 span_pointer = next_span_pointer;
1655 /* Stop now if we found a suitable block. */
1656 if (DsaPointerIsValid(pool->spans[1]))
1657 return true;
1661 * If there are no blocks that properly belong in fullness class 1, pick
1662 * one from some other fullness class and move it there anyway, so that we
1663 * have an allocation target. Our last choice is to transfer a block
1664 * that's almost empty (and might become completely empty soon if left
1665 * alone), but even that is better than failing, which is what we must do
1666 * if there are no blocks at all with free space.
1668 Assert(!DsaPointerIsValid(pool->spans[1]));
1669 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1670 if (transfer_first_span(area, pool, fclass, 1))
1671 return true;
1672 if (!DsaPointerIsValid(pool->spans[1]) &&
1673 transfer_first_span(area, pool, 0, 1))
1674 return true;
1677 * We failed to find an existing span with free objects, so we need to
1678 * allocate a new superblock and construct a new span to manage it.
1680 * First, get a dsa_area_span object to describe the new superblock
1681 * ... unless this allocation is for a dsa_area_span object, in which case
1682 * that's surely not going to work. We handle that case by storing the
1683 * span describing a block-of-spans inline.
1685 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1687 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1688 if (!DsaPointerIsValid(span_pointer))
1689 return false;
1690 npages = DSA_PAGES_PER_SUPERBLOCK;
1693 /* Find or create a segment and allocate the superblock. */
1694 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1695 segment_map = get_best_segment(area, npages);
1696 if (segment_map == NULL)
1698 segment_map = make_new_segment(area, npages);
1699 if (segment_map == NULL)
1701 LWLockRelease(DSA_AREA_LOCK(area));
1702 return false;
1707 * This shouldn't happen: get_best_segment() or make_new_segment()
1708 * promised that we can successfully allocate npages.
1710 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1711 elog(FATAL,
1712 "dsa_allocate could not find %zu free pages for superblock",
1713 npages);
1714 LWLockRelease(DSA_AREA_LOCK(area));
1716 /* Compute the start of the superblock. */
1717 start_pointer =
1718 DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1719 first_page * FPM_PAGE_SIZE);
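/*
 * The resulting dsa_pointer packs two things: which segment the
 * superblock lives in and the byte offset of its first page within that
 * segment.  DSA_EXTRACT_SEGMENT_NUMBER() and DSA_EXTRACT_OFFSET() (used
 * when the superblock is destroyed, below) recover the two halves.
 */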
1722 * If this is a block-of-spans, carve the descriptor right out of the
1723 * allocated space.
1725 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1728 * We have a pointer into the segment. We need to build a dsa_pointer
1729 * from the segment index and offset into the segment.
1731 span_pointer = start_pointer;
1734 /* Initialize span and pagemap. */
1735 init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1736 for (i = 0; i < npages; ++i)
1737 segment_map->pagemap[first_page + i] = span_pointer;
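/*
 * Every page of the new superblock now maps back to its span, so a later
 * dsa_free() of any object inside the superblock can find the governing
 * span via the page map, as described in the header comment of this file.
 */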
1739 return true;
1743 * Return the segment map corresponding to a given segment index, mapping the
1744 * segment in if necessary. For internal segment book-keeping, this is called
1745 * with the area lock held. It is also called by dsa_free and dsa_get_address
1746 * without any locking, relying on the fact that they have a known live segment
1747 * index and that they always call check_for_freed_segments to ensure that any
1748 * freed segment occupying the same slot is detached first.
1750 static dsa_segment_map *
1751 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1753 if (unlikely(area->segment_maps[index].mapped_address == NULL))
1755 dsm_handle handle;
1756 dsm_segment *segment;
1757 dsa_segment_map *segment_map;
1758 ResourceOwner oldowner;
1761 * If we are reached by dsa_free or dsa_get_address, there must be at
1762 * least one object allocated in the referenced segment. Otherwise,
1763 * their caller has a double-free or access-after-free bug, which we
1764 * have no hope of detecting. So we know it's safe to access this
1765 * array slot without holding a lock; it won't change underneath us.
1766 * Furthermore, we know that we can see the latest contents of the
1767 * slot, as explained in check_for_freed_segments, which those
1768 * functions call before arriving here.
1770 handle = area->control->segment_handles[index];
1772 /* It's an error to try to access an unused slot. */
1773 if (handle == DSM_HANDLE_INVALID)
1774 elog(ERROR,
1775 "dsa_area could not attach to a segment that has been freed");
1777 oldowner = CurrentResourceOwner;
1778 CurrentResourceOwner = area->resowner;
1779 segment = dsm_attach(handle);
1780 CurrentResourceOwner = oldowner;
1781 if (segment == NULL)
1782 elog(ERROR, "dsa_area could not attach to segment");
1783 segment_map = &area->segment_maps[index];
1784 segment_map->segment = segment;
1785 segment_map->mapped_address = dsm_segment_address(segment);
1786 segment_map->header =
1787 (dsa_segment_header *) segment_map->mapped_address;
1788 segment_map->fpm = (FreePageManager *)
1789 (segment_map->mapped_address +
1790 MAXALIGN(sizeof(dsa_segment_header)));
1791 segment_map->pagemap = (dsa_pointer *)
1792 (segment_map->mapped_address +
1793 MAXALIGN(sizeof(dsa_segment_header)) +
1794 MAXALIGN(sizeof(FreePageManager)));
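/*
 * The pointer arithmetic above mirrors the layout used by every segment:
 * a dsa_segment_header, then a FreePageManager (each MAXALIGN'd), then
 * the page map with one dsa_pointer per page, padded out to a page
 * boundary, and finally the usable pages themselves.  make_new_segment()
 * computes the same layout when a segment is created.
 */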
1796 /* Remember the highest index this backend has ever mapped. */
1797 if (area->high_segment_index < index)
1798 area->high_segment_index = index;
1800 Assert(segment_map->header->magic ==
1801 (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1805 * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1806 * but it's a bug in the calling code and undefined behavior if the
1807 * address is not live (i.e. if the segment might possibly have been freed,
1808 * they're trying to use a dangling pointer).
1810 * For dsa.c code that holds the area lock to manipulate segment_bins
1811 * lists, it would be a bug if we ever reach a freed segment here. After
1812 * it's marked as freed, the only thing any backend should do with it is
1813 * unmap it, and it should always have done that in
1814 * check_for_freed_segments_locked() before arriving here to resolve an
1815 * index to a segment_map.
1817 * Either way we can assert that we aren't returning a freed segment.
1819 Assert(!area->segment_maps[index].header->freed);
1821 return &area->segment_maps[index];
1825 * Return a superblock to the free page manager. If the underlying segment
1826 * has become entirely free, then return it to the operating system.
1828 * The appropriate pool lock must be held.
1830 static void
1831 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1833 dsa_area_span *span = dsa_get_address(area, span_pointer);
1834 int size_class = span->size_class;
1835 dsa_segment_map *segment_map;
1838 /* Remove it from its fullness class list. */
1839 unlink_span(area, span);
1842 * Note: Here we acquire the area lock while we already hold a per-pool
1843 * lock. We never hold the area lock and then take a pool lock, or we
1844 * could deadlock.
1846 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1847 check_for_freed_segments_locked(area);
1848 segment_map =
1849 get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1850 FreePageManagerPut(segment_map->fpm,
1851 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1852 span->npages);
1853 /* Check if the segment is now entirely free. */
1854 if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1856 dsa_segment_index index = get_segment_index(area, segment_map);
1858 /* If it's not the segment with extra control data, free it. */
1859 if (index != 0)
1862 * Give it back to the OS, and allow other backends to detect that
1863 * they need to detach.
1865 unlink_segment(area, segment_map);
1866 segment_map->header->freed = true;
1867 Assert(area->control->total_segment_size >=
1868 segment_map->header->size);
1869 area->control->total_segment_size -=
1870 segment_map->header->size;
1871 dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1872 dsm_detach(segment_map->segment);
1873 area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1874 ++area->control->freed_segment_counter;
1875 segment_map->segment = NULL;
1876 segment_map->header = NULL;
1877 segment_map->mapped_address = NULL;
1881 /* Move segment to appropriate bin if necessary. */
1882 if (segment_map->header != NULL)
1883 rebin_segment(area, segment_map);
1885 LWLockRelease(DSA_AREA_LOCK(area));
1888 * Span-of-spans blocks store the span which describes them within the
1889 * block itself, so freeing the storage implicitly frees the descriptor
1890 * as well. If this is a block of any other type, we need to free the span
1891 * object separately. This recursive call to dsa_free will acquire the
1892 * span pool's lock. We can't deadlock because the acquisition order is
1893 * always some other pool and then the span pool.
1895 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1896 dsa_free(area, span_pointer);
1899 static void
1900 unlink_span(dsa_area *area, dsa_area_span *span)
1902 if (DsaPointerIsValid(span->nextspan))
1904 dsa_area_span *next = dsa_get_address(area, span->nextspan);
1906 next->prevspan = span->prevspan;
1908 if (DsaPointerIsValid(span->prevspan))
1910 dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1912 prev->nextspan = span->nextspan;
1914 else
1916 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1918 pool->spans[span->fclass] = span->nextspan;
1922 static void
1923 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1924 dsa_pointer span_pointer,
1925 int fclass)
1927 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1929 if (DsaPointerIsValid(pool->spans[fclass]))
1931 dsa_area_span *head = dsa_get_address(area,
1932 pool->spans[fclass]);
1934 head->prevspan = span_pointer;
1936 span->prevspan = InvalidDsaPointer;
1937 span->nextspan = pool->spans[fclass];
1938 pool->spans[fclass] = span_pointer;
1939 span->fclass = fclass;
1943 * Detach from an area that was either created or attached to by this process.
1945 void
1946 dsa_detach(dsa_area *area)
1948 int i;
1950 /* Detach from all segments. */
1951 for (i = 0; i <= area->high_segment_index; ++i)
1952 if (area->segment_maps[i].segment != NULL)
1953 dsm_detach(area->segment_maps[i].segment);
1956 * Note that 'detaching' (= detaching from DSM segments) doesn't include
1957 * 'releasing' (= adjusting the reference count). It would be nice to
1958 * combine these operations, but client code might never get around to
1959 * calling dsa_detach because of an error path, and a detach hook on any
1960 * particular segment is too late to detach other segments in the area
1961 * without risking a 'leak' warning in the non-error path.
1964 /* Free the backend-local area object. */
1965 pfree(area);
1969 * Unlink a segment from the bin that contains it.
1971 static void
1972 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1974 if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1976 dsa_segment_map *prev;
1978 prev = get_segment_by_index(area, segment_map->header->prev);
1979 prev->header->next = segment_map->header->next;
1981 else
1983 Assert(area->control->segment_bins[segment_map->header->bin] ==
1984 get_segment_index(area, segment_map));
1985 area->control->segment_bins[segment_map->header->bin] =
1986 segment_map->header->next;
1988 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1990 dsa_segment_map *next;
1992 next = get_segment_by_index(area, segment_map->header->next);
1993 next->header->prev = segment_map->header->prev;
1998 * Find a segment that could satisfy a request for 'npages' of contiguous
1999 * memory, or return NULL if none can be found. This may involve attaching to
2000 * segments that weren't previously attached so that we can query their free
2001 * page map.
2003 static dsa_segment_map *
2004 get_best_segment(dsa_area *area, size_t npages)
2006 size_t bin;
2008 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2009 check_for_freed_segments_locked(area);
2012 * Start searching from the first bin that *might* have enough contiguous
2013 * pages.
2015 for (bin = contiguous_pages_to_segment_bin(npages);
2016 bin < DSA_NUM_SEGMENT_BINS;
2017 ++bin)
2020 * The minimum number of contiguous free pages that any segment in this
2021 * bin should have. We'll re-bin any segment we find with fewer.
2023 size_t threshold = (size_t) 1 << (bin - 1);
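/*
 * As the initializer above implies, a segment in bin 'bin' is expected
 * to have at least 2^(bin - 1) contiguous free pages; segments found
 * below that threshold are re-binned by the code below.
 */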
2024 dsa_segment_index segment_index;
2026 /* Search this bin for a segment with enough contiguous space. */
2027 segment_index = area->control->segment_bins[bin];
2028 while (segment_index != DSA_SEGMENT_INDEX_NONE)
2030 dsa_segment_map *segment_map;
2031 dsa_segment_index next_segment_index;
2032 size_t contiguous_pages;
2034 segment_map = get_segment_by_index(area, segment_index);
2035 next_segment_index = segment_map->header->next;
2036 contiguous_pages = fpm_largest(segment_map->fpm);
2038 /* Not enough for the request, still enough for this bin. */
2039 if (contiguous_pages >= threshold && contiguous_pages < npages)
2041 segment_index = next_segment_index;
2042 continue;
2045 /* Re-bin it if it's no longer in the appropriate bin. */
2046 if (contiguous_pages < threshold)
2048 rebin_segment(area, segment_map);
2051 * But fall through to see if it's enough to satisfy this
2052 * request anyway....
2056 /* Check if we are done. */
2057 if (contiguous_pages >= npages)
2058 return segment_map;
2060 /* Continue searching the same bin. */
2061 segment_index = next_segment_index;
2065 /* Not found. */
2066 return NULL;
2070 * Create a new segment that can handle at least requested_pages. Returns
2071 * NULL if the requested total size limit or maximum allowed number of
2072 * segments would be exceeded.
2074 static dsa_segment_map *
2075 make_new_segment(dsa_area *area, size_t requested_pages)
2077 dsa_segment_index new_index;
2078 size_t metadata_bytes;
2079 size_t total_size;
2080 size_t total_pages;
2081 size_t usable_pages;
2082 dsa_segment_map *segment_map;
2083 dsm_segment *segment;
2084 ResourceOwner oldowner;
2086 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2088 /* Find a segment slot that is not in use (linearly for now). */
2089 for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2091 if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2092 break;
2094 if (new_index == DSA_MAX_SEGMENTS)
2095 return NULL;
2098 * If the total size limit is already exceeded, then we exit early and
2099 * avoid arithmetic wraparound in the unsigned expressions below.
2101 if (area->control->total_segment_size >=
2102 area->control->max_total_segment_size)
2103 return NULL;
2106 * The size should be at least as big as requested, and at least big
2107 * enough to follow a geometric series that approximately doubles the
2108 * total storage each time we create a new segment. We use geometric
2109 * growth because the underlying DSM system isn't designed for large
2110 * numbers of segments (otherwise we might even consider just using one
2111 * DSM segment for each large allocation and for each superblock, and then
2112 * we wouldn't need to use FreePageManager).
2114 * We decide on a total segment size first, so that we produce tidy
2115 * power-of-two sized segments. This is a good property to have if we
2116 * move to huge pages in the future. Then we work back to the number of
2117 * pages we can fit.
2119 total_size = DSA_INITIAL_SEGMENT_SIZE *
2120 ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
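/*
 * Purely illustrative numbers (the real constants are defined earlier in
 * this file and may differ): with a 1MB initial segment size and
 * DSA_NUM_SEGMENTS_AT_EACH_SIZE = 2, successive slots would be sized
 * 1MB, 2MB, 2MB, 4MB, 4MB, 8MB, ..., so total backing storage roughly
 * doubles every two segments, subject to the caps applied just below.
 */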
2121 total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2122 total_size = Min(total_size,
2123 area->control->max_total_segment_size -
2124 area->control->total_segment_size);
2126 total_pages = total_size / FPM_PAGE_SIZE;
2127 metadata_bytes =
2128 MAXALIGN(sizeof(dsa_segment_header)) +
2129 MAXALIGN(sizeof(FreePageManager)) +
2130 sizeof(dsa_pointer) * total_pages;
2132 /* Add padding up to next page boundary. */
2133 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2134 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2135 if (total_size <= metadata_bytes)
2136 return NULL;
2137 usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2138 Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2140 /* See if that is enough... */
2141 if (requested_pages > usable_pages)
2144 * We'll make an odd-sized segment, working forward from the requested
2145 * number of pages.
2147 usable_pages = requested_pages;
2148 metadata_bytes =
2149 MAXALIGN(sizeof(dsa_segment_header)) +
2150 MAXALIGN(sizeof(FreePageManager)) +
2151 usable_pages * sizeof(dsa_pointer);
2153 /* Add padding up to next page boundary. */
2154 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2155 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2156 total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2158 /* Is that too large for dsa_pointer's addressing scheme? */
2159 if (total_size > DSA_MAX_SEGMENT_SIZE)
2160 return NULL;
2162 /* Would that exceed the limit? */
2163 if (total_size > area->control->max_total_segment_size -
2164 area->control->total_segment_size)
2165 return NULL;
2168 /* Create the segment. */
2169 oldowner = CurrentResourceOwner;
2170 CurrentResourceOwner = area->resowner;
2171 segment = dsm_create(total_size, 0);
2172 CurrentResourceOwner = oldowner;
2173 if (segment == NULL)
2174 return NULL;
2175 dsm_pin_segment(segment);
2177 /* Store the handle in shared memory to be found by index. */
2178 area->control->segment_handles[new_index] =
2179 dsm_segment_handle(segment);
2180 /* Track the highest segment index in the history of the area. */
2181 if (area->control->high_segment_index < new_index)
2182 area->control->high_segment_index = new_index;
2183 /* Track the highest segment index this backend has ever mapped. */
2184 if (area->high_segment_index < new_index)
2185 area->high_segment_index = new_index;
2186 /* Track total size of all segments. */
2187 area->control->total_segment_size += total_size;
2188 Assert(area->control->total_segment_size <=
2189 area->control->max_total_segment_size);
2191 /* Build a segment map for this segment in this backend. */
2192 segment_map = &area->segment_maps[new_index];
2193 segment_map->segment = segment;
2194 segment_map->mapped_address = dsm_segment_address(segment);
2195 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2196 segment_map->fpm = (FreePageManager *)
2197 (segment_map->mapped_address +
2198 MAXALIGN(sizeof(dsa_segment_header)));
2199 segment_map->pagemap = (dsa_pointer *)
2200 (segment_map->mapped_address +
2201 MAXALIGN(sizeof(dsa_segment_header)) +
2202 MAXALIGN(sizeof(FreePageManager)));
2204 /* Set up the free page map. */
2205 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2206 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2207 usable_pages);
2209 /* Set up the segment header and put it in the appropriate bin. */
2210 segment_map->header->magic =
2211 DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2212 segment_map->header->usable_pages = usable_pages;
2213 segment_map->header->size = total_size;
2214 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2215 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2216 segment_map->header->next =
2217 area->control->segment_bins[segment_map->header->bin];
2218 segment_map->header->freed = false;
2219 area->control->segment_bins[segment_map->header->bin] = new_index;
2220 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2222 dsa_segment_map *next =
2223 get_segment_by_index(area, segment_map->header->next);
2225 Assert(next->header->bin == segment_map->header->bin);
2226 next->header->prev = new_index;
2229 return segment_map;
2233 * Check if any segments have been freed by destroy_superblock, so we can
2234 * detach from them in this backend. This function is called by
2235 * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2236 * received can be resolved to the correct segment.
2238 * The danger we want to defend against is that there could be an old segment
2239 * mapped into a given slot in this backend, and the dsa_pointer they have
2240 * might refer to some new segment in the same slot. So, before calling
2241 * get_segment_by_index, those functions must be sure to process all of the
2242 * detach-from-freed-segment instructions that had been generated by the time
2243 * this process received the dsa_pointer.
2245 static void
2246 check_for_freed_segments(dsa_area *area)
2248 size_t freed_segment_counter;
2251 * Any other process that has freed a segment has incremented
2252 * freed_segment_counter while holding an LWLock, and that must precede
2253 * any backend creating a new segment in the same slot while holding an
2254 * LWLock, and that must precede the creation of any dsa_pointer pointing
2255 * into the new segment which might reach us here, and the caller must
2256 * have sent the dsa_pointer to this process using appropriate memory
2257 * synchronization (some kind of locking or atomic primitive or system
2258 * call). So all we need to do on the reading side is ask for the load of
2259 * freed_segment_counter to follow the caller's load of the dsa_pointer it
2260 * has, and we can be sure to detect any segments that had been freed as
2261 * of the time that the dsa_pointer reached this process.
2263 pg_read_barrier();
2264 freed_segment_counter = area->control->freed_segment_counter;
2265 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2267 /* Check all currently mapped segments to find what's been freed. */
2268 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2269 check_for_freed_segments_locked(area);
2270 LWLockRelease(DSA_AREA_LOCK(area));
2275 * Workhorse for check_for_freed_segments(), and also used directly in paths
2276 * where the area lock is already held. This should be called after acquiring
2277 * the lock but before looking up any segment by index number, to make sure we
2278 * unmap any stale segments that might have previously had the same index as a
2279 * current segment.
2281 static void
2282 check_for_freed_segments_locked(dsa_area *area)
2284 size_t freed_segment_counter;
2285 int i;
2287 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2288 freed_segment_counter = area->control->freed_segment_counter;
2289 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2291 for (i = 0; i <= area->high_segment_index; ++i)
2293 if (area->segment_maps[i].header != NULL &&
2294 area->segment_maps[i].header->freed)
2296 dsm_detach(area->segment_maps[i].segment);
2297 area->segment_maps[i].segment = NULL;
2298 area->segment_maps[i].header = NULL;
2299 area->segment_maps[i].mapped_address = NULL;
2302 area->freed_segment_counter = freed_segment_counter;
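/*
 * Now that this backend's counter matches the shared one, the unlocked
 * fast path in check_for_freed_segments() will skip this scan until some
 * backend frees another segment.
 */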
2307 * Re-bin a segment if it's no longer in the appropriate bin.
2309 static void
2310 rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2312 size_t new_bin;
2313 dsa_segment_index segment_index;
2315 new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2316 if (segment_map->header->bin == new_bin)
2317 return;
2319 /* Remove it from its current bin. */
2320 unlink_segment(area, segment_map);
2322 /* Push it onto the front of its new bin. */
2323 segment_index = get_segment_index(area, segment_map);
2324 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2325 segment_map->header->next = area->control->segment_bins[new_bin];
2326 segment_map->header->bin = new_bin;
2327 area->control->segment_bins[new_bin] = segment_index;
2328 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2330 dsa_segment_map *next;
2332 next = get_segment_by_index(area, segment_map->header->next);
2333 Assert(next->header->bin == new_bin);
2334 next->header->prev = segment_index;