1 /*-------------------------------------------------------------------------
3 * dsa.c
4 * Dynamic shared memory areas.
6 * This module provides dynamic shared memory areas which are built on top of
7 * DSM segments. While dsm.c allows segments of shared memory to be
8 * created and shared between backends, it isn't designed to deal with small
9 * objects. A DSA area is a shared memory heap usually backed by one or more
10 * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11 * Alternatively, it can be created in pre-existing shared memory, including a
12 * DSM segment, and then create extra DSM segments as required. Unlike the
13 * regular system heap, it deals in pseudo-pointers which must be converted to
14 * backend-local pointers before they are dereferenced. These pseudo-pointers
15 * can however be shared with other backends, and can be used to construct
16 * shared data structures.
18 * Each DSA area manages a set of DSM segments, adding new segments as
19 * required and detaching them when they are no longer needed. Each segment
20 * contains a number of 4KB pages, a free page manager for tracking
21 * consecutive runs of free pages, and a page map for tracking the source of
22 * objects allocated on each page. Allocation requests above 8KB are handled
23 * by choosing a segment and finding consecutive free pages in its free page
24 * manager. Allocation requests for smaller sizes are handled using pools of
25 * objects of a selection of sizes. Each pool consists of a number of 16 page
26 * (64KB) superblocks allocated in the same way as large objects. Allocation
27 * of large objects and new superblocks is serialized by a single LWLock, but
28 * allocation of small objects from pre-existing superblocks uses one LWLock
29 * per pool. Currently there is one pool, and therefore one lock, per size
30 * class. Per-core pools to increase concurrency and strategies for reducing
31 * the resulting fragmentation are areas for future research. Each superblock
32 * is managed with a 'span', which tracks the superblock's freelist. Free
33 * requests are handled by looking in the page map to find which span an
34 * address was allocated from, so that small objects can be returned to the
35 * appropriate free list, and large object pages can be returned directly to
36 * the free page map. When allocating, simple heuristics for selecting
37 * segments and superblocks try to encourage occupied memory to be
38 * concentrated, increasing the likelihood that whole superblocks can become
39 * empty and be returned to the free page manager, and whole segments can
40 * become empty and be returned to the operating system.
42 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
43 * Portions Copyright (c) 1994, Regents of the University of California
45 * IDENTIFICATION
46 * src/backend/utils/mmgr/dsa.c
48 *-------------------------------------------------------------------------
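/*
 * Illustrative usage sketch (not part of the implementation below): these
 * are this module's public entry points, but the tranche id, request size,
 * and the way the handle reaches another backend are assumptions made only
 * for the example.
 *
 *     dsa_area   *area = dsa_create(my_tranche_id);
 *     dsa_pointer dp = dsa_allocate(area, 128);
 *     char       *p = dsa_get_address(area, dp);
 *     ...
 *     dsa_free(area, dp);
 *
 * Another process can attach with dsa_attach(dsa_get_handle(area)) and
 * translate the same dsa_pointer with dsa_get_address() in its own address
 * space.
 */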
51 #include "postgres.h"
53 #include "port/atomics.h"
54 #include "port/pg_bitutils.h"
55 #include "storage/dsm.h"
56 #include "storage/ipc.h"
57 #include "storage/lwlock.h"
58 #include "storage/shmem.h"
59 #include "utils/dsa.h"
60 #include "utils/freepage.h"
61 #include "utils/memutils.h"
64 * The size of the initial DSM segment that backs a dsa_area created by
65 * dsa_create. After creating some number of segments of this size we'll
66 * double this size, and so on. Larger segments may be created if necessary
67 * to satisfy large requests.
69 #define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
72 * How many segments to create before we double the segment size. If this is
73 * low, then there is likely to be a lot of wasted space in the largest
74 * segment. If it is high, then we risk running out of segment slots (see
75 * dsm.c's limits on total number of segments), or limiting the total size
76 * an area can manage when using small pointers.
78 #define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
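/*
 * Illustration only, derived from the two constants above: assuming no
 * oversized request forces a larger segment, new segments are created at
 * 1MB, 1MB, 2MB, 2MB, 4MB, 4MB, ... until DSA_MAX_SEGMENT_SIZE is reached.
 */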
81 * The number of bits used to represent the offset part of a dsa_pointer.
82 * This controls the maximum size of a segment, the maximum possible
83 * allocation size and also the maximum number of segments per area.
85 #if SIZEOF_DSA_POINTER == 4
86 #define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
87 #else
88 #define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
89 #endif
92 * The maximum number of DSM segments that an area can own, determined by
93 * the number of bits remaining (but capped at 1024).
95 #define DSA_MAX_SEGMENTS \
96 Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
98 /* The bitmask for extracting the offset from a dsa_pointer. */
99 #define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
101 /* The maximum size of a DSM segment. */
102 #define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
104 /* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
105 #define DSA_PAGES_PER_SUPERBLOCK 16
108 * A magic number used as a sanity check for following DSM segments belonging
109 * to a DSA area (this number will be XORed with the area handle and
110 * the segment index).
112 #define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
114 /* Build a dsa_pointer given a segment number and offset. */
115 #define DSA_MAKE_POINTER(segment_number, offset) \
116 (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
118 /* Extract the segment number from a dsa_pointer. */
119 #define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
121 /* Extract the offset from a dsa_pointer. */
122 #define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
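/*
 * Worked example (illustrative, assuming the 64-bit layout above with
 * DSA_OFFSET_WIDTH = 40): DSA_MAKE_POINTER(3, 0x2000) produces
 * ((dsa_pointer) 3 << 40) | 0x2000; DSA_EXTRACT_SEGMENT_NUMBER() then
 * recovers 3 and DSA_EXTRACT_OFFSET() recovers 0x2000.
 */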
124 /* The type used for segment indexes (zero based). */
125 typedef size_t dsa_segment_index;
127 /* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
128 #define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
131 * How many bins of segments do we have? The bins are used to categorize
132 * segments by their largest contiguous run of free pages.
134 #define DSA_NUM_SEGMENT_BINS 16
137 * What is the lowest bin that holds segments that *might* have n contiguous
138 * free pages? There is no point in looking in segments in lower bins; they
139 * definitely can't service a request for n free pages.
141 static inline size_t
142 contiguous_pages_to_segment_bin(size_t n)
144 size_t bin;
146 if (n == 0)
147 bin = 0;
148 else
149 bin = pg_leftmost_one_pos_size_t(n) + 1;
151 return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
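/*
 * Worked example (illustrative): contiguous_pages_to_segment_bin(0) is 0,
 * (1) is 1, (16) is 5 and (300) is 9, since pg_leftmost_one_pos_size_t(300)
 * is 8; requests of 2^14 pages or more all map to the last bin, 15.
 */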
154 /* Macros for access to locks. */
155 #define DSA_AREA_LOCK(area) (&area->control->lock)
156 #define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
159 * The header for an individual segment. This lives at the start of each DSM
160 * segment owned by a DSA area including the first segment (where it appears
161 * as part of the dsa_area_control struct).
163 typedef struct
165 /* Sanity check magic value. */
166 uint32 magic;
167 /* Total number of pages in this segment (excluding metadata area). */
168 size_t usable_pages;
169 /* Total size of this segment in bytes. */
170 size_t size;
173 * Index of the segment that precedes this one in the same segment bin, or
174 * DSA_SEGMENT_INDEX_NONE if this is the first one.
176 dsa_segment_index prev;
179 * Index of the segment that follows this one in the same segment bin, or
180 * DSA_SEGMENT_INDEX_NONE if this is the last one.
182 dsa_segment_index next;
183 /* The index of the bin that contains this segment. */
184 size_t bin;
187 * A flag raised to indicate that this segment is being returned to the
188 * operating system and has been unpinned.
190 bool freed;
191 } dsa_segment_header;
194 * Metadata for one superblock.
196 * For most blocks, span objects are stored out-of-line; that is, the span
197 * object is not stored within the block itself. But, as an exception, for a
198 * "span of spans", the span object is stored "inline". The allocation is
199 * always exactly one page, and the dsa_area_span object is located at
200 * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
201 * and the remaining fields are used just as they would be in an ordinary
202 * block. We can't allocate spans out of ordinary superblocks because
203 * creating an ordinary superblock requires us to be able to allocate a span
204 * *first*. Doing it this way avoids that circularity.
206 typedef struct
208 dsa_pointer pool; /* Containing pool. */
209 dsa_pointer prevspan; /* Previous span. */
210 dsa_pointer nextspan; /* Next span. */
211 dsa_pointer start; /* Starting address. */
212 size_t npages; /* Length of span in pages. */
213 uint16 size_class; /* Size class. */
214 uint16 ninitialized; /* Maximum number of objects ever allocated. */
215 uint16 nallocatable; /* Number of objects currently allocatable. */
216 uint16 firstfree; /* First object on free list. */
217 uint16 nmax; /* Maximum number of objects ever possible. */
218 uint16 fclass; /* Current fullness class. */
219 } dsa_area_span;
222 * Given a pointer to an object in a span, access the index of the next free
223 * object in the same span (ie in the span's freelist) as an L-value.
225 #define NextFreeObjectIndex(object) (* (uint16 *) (object))
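/*
 * Illustrative example: the freelist is threaded through the free objects
 * themselves.  If a span's firstfree is 5, object 9 is also free and nothing
 * else is, then the first two bytes of object 5 hold 9 and the first two
 * bytes of object 9 hold DSA_SPAN_NOTHING_FREE (defined below).
 */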
228 * Small allocations are handled by dividing a single block of memory into
229 * many small objects of equal size. The possible allocation sizes are
230 * defined by the following array. Larger size classes are spaced more widely
231 * than smaller size classes. We fudge the spacing for size classes >1kB to
232 * avoid space wastage: based on the knowledge that we plan to allocate 64kB
233 * blocks, we bump the maximum object size up to the largest multiple of
234 * 8 bytes that still lets us fit the same number of objects into one block.
236 * NB: Because of this fudging, if we were ever to use differently-sized blocks
237 * for small allocations, these size classes would need to be reworked to be
238 * optimal for the new size.
240 * NB: The optimal spacing for size classes, as well as the size of the blocks
241 * out of which small objects are allocated, is not a question that has one
242 * right answer. Some allocators (such as tcmalloc) use more closely-spaced
243 * size classes than we do here, while others (like aset.c) use more
244 * widely-spaced classes. Spacing the classes more closely avoids wasting
245 * memory within individual chunks, but also means a larger number of
246 * potentially-unfilled blocks.
248 static const uint16 dsa_size_classes[] = {
249 sizeof(dsa_area_span), 0, /* special size classes */
250 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
251 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
252 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
253 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
254 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
255 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
256 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
257 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
259 #define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
261 /* Special size classes. */
262 #define DSA_SCLASS_BLOCK_OF_SPANS 0
263 #define DSA_SCLASS_SPAN_LARGE 1
266 * The following lookup table is used to map the size of small objects
267 * (less than 1kB) onto the corresponding size class. To use this table,
268 * round the size of the object up to the next multiple of 8 bytes, and then
269 * index into this array.
271 static const uint8 dsa_size_class_map[] = {
272 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
273 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
274 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
275 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
276 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
277 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
278 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
279 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
281 #define DSA_SIZE_CLASS_MAP_QUANTUM 8
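/*
 * Worked example (illustrative): a 20-byte request rounds up to 24, giving
 * map index (24 / DSA_SIZE_CLASS_MAP_QUANTUM) - 1 = 2; dsa_size_class_map[2]
 * is 4 and dsa_size_classes[4] is 24, so the request is served from the
 * 24-byte size class.
 */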
284 * Superblocks are binned by how full they are. Generally, each fullness
285 * class corresponds to one quartile, but the block being used for
286 * allocations is always at the head of the list for fullness class 1,
287 * regardless of how full it really is.
289 #define DSA_FULLNESS_CLASSES 4
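/*
 * Worked example (illustrative, using the classification formula found in
 * ensure_active_superblock): with nmax = 100 objects per block, a block with
 * 30 objects in use maps to fullness class (30 * 3) / 100 = 0, one with 60
 * in use maps to class 1, and a completely full block maps to class 3.
 */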
292 * A dsa_area_pool represents a set of objects of a given size class.
294 * Perhaps there should be multiple pools for the same size class for
295 * contention avoidance, but for now there is just one!
297 typedef struct
299 /* A lock protecting access to this pool. */
300 LWLock lock;
301 /* A set of linked lists of spans, arranged by fullness. */
302 dsa_pointer spans[DSA_FULLNESS_CLASSES];
303 /* Should we pad this out to a cacheline boundary? */
304 } dsa_area_pool;
307 * The control block for an area. This lives in shared memory, at the start of
308 * the first DSM segment controlled by this area.
310 typedef struct
312 /* The segment header for the first segment. */
313 dsa_segment_header segment_header;
314 /* The handle for this area. */
315 dsa_handle handle;
316 /* The handles of the segments owned by this area. */
317 dsm_handle segment_handles[DSA_MAX_SEGMENTS];
318 /* Lists of segments, binned by maximum contiguous run of free pages. */
319 dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
320 /* The object pools for each size class. */
321 dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
322 /* The total size of all active segments. */
323 size_t total_segment_size;
324 /* The maximum total size of backing storage we are allowed. */
325 size_t max_total_segment_size;
326 /* Highest used segment index in the history of this area. */
327 dsa_segment_index high_segment_index;
328 /* The reference count for this area. */
329 int refcnt;
330 /* A flag indicating that this area has been pinned. */
331 bool pinned;
332 /* The number of times that segments have been freed. */
333 size_t freed_segment_counter;
334 /* The LWLock tranche ID. */
335 int lwlock_tranche_id;
336 /* The general lock (protects everything except object pools). */
337 LWLock lock;
338 } dsa_area_control;
340 /* Given a pointer to a pool, find a dsa_pointer. */
341 #define DsaAreaPoolToDsaPointer(area, p) \
342 DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
345 * A dsa_segment_map is stored within the backend-private memory of each
346 * individual backend. It holds the base address of the segment within that
347 * backend, plus the addresses of key objects within the segment. Those
348 * could instead be derived from the base address but it's handy to have them
349 * around.
351 typedef struct
353 dsm_segment *segment; /* DSM segment */
354 char *mapped_address; /* Address at which segment is mapped */
355 dsa_segment_header *header; /* Header (same as mapped_address) */
356 FreePageManager *fpm; /* Free page manager within segment. */
357 dsa_pointer *pagemap; /* Page map within segment. */
358 } dsa_segment_map;
361 * Per-backend state for a storage area. Backends obtain one of these by
362 * creating an area or attaching to an existing one using a handle. Each
363 * process that needs to use an area uses its own object to track where the
364 * segments are mapped.
366 struct dsa_area
368 /* Pointer to the control object in shared memory. */
369 dsa_area_control *control;
371 /* Has the mapping been pinned? */
372 bool mapping_pinned;
375 * This backend's array of segment maps, ordered by segment index
376 * corresponding to control->segment_handles. Some of the area's segments
377 * may not be mapped in this backend yet, and some slots may have been
378 * freed and need to be detached; these operations happen on demand.
380 dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
382 /* The highest segment index this backend has ever mapped. */
383 dsa_segment_index high_segment_index;
385 /* The last observed freed_segment_counter. */
386 size_t freed_segment_counter;
389 #define DSA_SPAN_NOTHING_FREE ((uint16) -1)
390 #define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
392 /* Given a pointer to a segment_map, obtain a segment index number. */
393 #define get_segment_index(area, segment_map_ptr) \
394 (segment_map_ptr - &area->segment_maps[0])
396 static void init_span(dsa_area *area, dsa_pointer span_pointer,
397 dsa_area_pool *pool, dsa_pointer start, size_t npages,
398 uint16 size_class);
399 static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
400 int fromclass, int toclass);
401 static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
402 static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
403 int size_class);
404 static dsa_segment_map *get_segment_by_index(dsa_area *area,
405 dsa_segment_index index);
406 static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
407 static void unlink_span(dsa_area *area, dsa_area_span *span);
408 static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
409 dsa_pointer span_pointer, int fclass);
410 static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
411 static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
412 static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
413 static dsa_area *create_internal(void *place, size_t size,
414 int tranche_id,
415 dsm_handle control_handle,
416 dsm_segment *control_segment);
417 static dsa_area *attach_internal(void *place, dsm_segment *segment,
418 dsa_handle handle);
419 static void check_for_freed_segments(dsa_area *area);
420 static void check_for_freed_segments_locked(dsa_area *area);
423 * Create a new shared area in a new DSM segment. Further DSM segments will
424 * be allocated as required to extend the available space.
426 * We can't allocate a LWLock tranche_id within this function, because tranche
427 * IDs are a scarce resource; there are only 64k available, using low numbers
428 * when possible matters, and we have no provision for recycling them. So,
429 * we require the caller to provide one.
431 dsa_area *
432 dsa_create(int tranche_id)
434 dsm_segment *segment;
435 dsa_area *area;
438 * Create the DSM segment that will hold the shared control object and the
439 * first segment of usable space.
441 segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
444 * All segments backing this area are pinned, so that DSA can explicitly
445 * control their lifetime (otherwise a newly created segment belonging to
446 * this area might be freed when the only backend that happens to have it
447 * mapped in ends, corrupting the area).
449 dsm_pin_segment(segment);
451 /* Create a new DSA area with the control object in this segment. */
452 area = create_internal(dsm_segment_address(segment),
453 DSA_INITIAL_SEGMENT_SIZE,
454 tranche_id,
455 dsm_segment_handle(segment), segment);
457 /* Clean up when the control segment detaches. */
458 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
459 PointerGetDatum(dsm_segment_address(segment)));
461 return area;
465 * Create a new shared area in an existing shared memory space, which may be
466 * either DSM or Postmaster-initialized memory. DSM segments will be
467 * allocated as required to extend the available space, though that can be
468 * prevented with dsa_set_size_limit(area, size) using the same size provided
469 * to dsa_create_in_place.
471 * Areas created in-place must eventually be released by the backend that
472 * created them and all backends that attach to them. This can be done
473 * explicitly with dsa_release_in_place, or, in the special case that 'place'
474 * happens to be in a pre-existing DSM segment, by passing in a pointer to the
475 * segment so that a detach hook can be registered with the containing DSM
476 * segment.
478 * See dsa_create() for a note about the tranche arguments.
480 dsa_area *
481 dsa_create_in_place(void *place, size_t size,
482 int tranche_id, dsm_segment *segment)
484 dsa_area *area;
486 area = create_internal(place, size, tranche_id,
487 DSM_HANDLE_INVALID, NULL);
490 * Clean up when the control segment detaches, if a containing DSM segment
491 * was provided.
493 if (segment != NULL)
494 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
495 PointerGetDatum(place));
497 return area;
501 * Obtain a handle that can be passed to other processes so that they can
502 * attach to the given area. Cannot be called for areas created with
503 * dsa_create_in_place.
505 dsa_handle
506 dsa_get_handle(dsa_area *area)
508 Assert(area->control->handle != DSM_HANDLE_INVALID);
509 return area->control->handle;
513 * Attach to an area given a handle generated (possibly in another process) by
514 * dsa_get_handle. The area must have been created with dsa_create (not
515 * dsa_create_in_place).
517 dsa_area *
518 dsa_attach(dsa_handle handle)
520 dsm_segment *segment;
521 dsa_area *area;
524 * An area handle is really a DSM segment handle for the first segment, so
525 * we go ahead and attach to that.
527 segment = dsm_attach(handle);
528 if (segment == NULL)
529 ereport(ERROR,
530 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
531 errmsg("could not attach to dynamic shared area")));
533 area = attach_internal(dsm_segment_address(segment), segment, handle);
535 /* Clean up when the control segment detaches. */
536 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
537 PointerGetDatum(dsm_segment_address(segment)));
539 return area;
543 * Attach to an area that was created with dsa_create_in_place. The caller
544 * must somehow know the location in memory that was used when the area was
545 * created, though it may be mapped at a different virtual address in this
546 * process.
548 * See dsa_create_in_place for note about releasing in-place areas, and the
549 * optional 'segment' argument which can be provided to allow automatic
550 * release if the containing memory happens to be a DSM segment.
552 dsa_area *
553 dsa_attach_in_place(void *place, dsm_segment *segment)
555 dsa_area *area;
557 area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
560 * Clean up when the control segment detaches, if a containing DSM segment
561 * was provided.
563 if (segment != NULL)
564 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
565 PointerGetDatum(place));
567 return area;
571 * Release a DSA area that was produced by dsa_create_in_place or
572 * dsa_attach_in_place. The 'segment' argument is ignored but provides an
573 * interface suitable for on_dsm_detach, for the convenience of users who want
574 * to create a DSA segment inside an existing DSM segment and have it
575 * automatically released when the containing DSM segment is detached.
576 * 'place' should be the address of the place where the area was created.
578 * This callback is automatically registered for the DSM segment containing
579 * the control object of in-place areas when a segment is provided to
580 * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
581 * with dsa_create.
583 void
584 dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
586 dsa_release_in_place(DatumGetPointer(place));
590 * Release a DSA area that was produced by dsa_create_in_place or
591 * dsa_attach_in_place. The 'code' argument is ignored but provides an
592 * interface suitable for on_shmem_exit or before_shmem_exit, for the
593 * convenience of users who want to create a DSA segment inside shared memory
594 * other than a DSM segment and have it automatically release at backend exit.
595 * 'place' should be the address of the place where the area was created.
597 void
598 dsa_on_shmem_exit_release_in_place(int code, Datum place)
600 dsa_release_in_place(DatumGetPointer(place));
604 * Release a DSA area that was produced by dsa_create_in_place or
605 * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
606 * callbacks so that this is managed automatically, because failure to release
607 * an area created in-place leaks its segments permanently.
609 * This is also called automatically for areas produced by dsa_create or
610 * dsa_attach as an implementation detail.
612 void
613 dsa_release_in_place(void *place)
615 dsa_area_control *control = (dsa_area_control *) place;
616 int i;
618 LWLockAcquire(&control->lock, LW_EXCLUSIVE);
619 Assert(control->segment_header.magic ==
620 (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
621 Assert(control->refcnt > 0);
622 if (--control->refcnt == 0)
624 for (i = 0; i <= control->high_segment_index; ++i)
626 dsm_handle handle;
628 handle = control->segment_handles[i];
629 if (handle != DSM_HANDLE_INVALID)
630 dsm_unpin_segment(handle);
633 LWLockRelease(&control->lock);
637 * Keep a DSA area attached until end of session or explicit detach.
639 * By default, areas are owned by the current resource owner, which means they
640 * are detached automatically when that scope ends.
642 void
643 dsa_pin_mapping(dsa_area *area)
645 int i;
647 Assert(!area->mapping_pinned);
648 area->mapping_pinned = true;
650 for (i = 0; i <= area->high_segment_index; ++i)
651 if (area->segment_maps[i].segment != NULL)
652 dsm_pin_mapping(area->segment_maps[i].segment);
656 * Allocate memory in this storage area. The return value is a dsa_pointer
657 * that can be passed to other processes, and converted to a local pointer
658 * with dsa_get_address. 'flags' is a bitmap which should be constructed
659 * from the following values:
661 * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
662 * will result in an ERROR.
664 * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
665 * no memory is available or a size limit established by dsa_set_size_limit
666 * would be exceeded. Otherwise, such allocations will result in an ERROR.
668 * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
669 * contents of newly-allocated memory are indeterminate.
671 * These flags correspond to similarly named flags used by
672 * MemoryContextAllocExtended(). See also the macros dsa_allocate and
673 * dsa_allocate0 which expand to a call to this function with commonly used
674 * flags.
676 dsa_pointer
677 dsa_allocate_extended(dsa_area *area, size_t size, int flags)
679 uint16 size_class;
680 dsa_pointer start_pointer;
681 dsa_segment_map *segment_map;
682 dsa_pointer result;
684 Assert(size > 0);
686 /* Sanity check on huge individual allocation size. */
687 if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
688 ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
689 elog(ERROR, "invalid DSA memory alloc request size %zu", size);
692 * If bigger than the largest size class, just grab a run of pages from
693 * the free page manager, instead of allocating an object from a pool.
694 * There will still be a span, but it's a special class of span that
695 * manages this whole allocation and simply gives all pages back to the
696 * free page manager when dsa_free is called.
698 if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
700 size_t npages = fpm_size_to_pages(size);
701 size_t first_page;
702 dsa_pointer span_pointer;
703 dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
705 /* Obtain a span object. */
706 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
707 if (!DsaPointerIsValid(span_pointer))
709 /* Raise error unless asked not to. */
710 if ((flags & DSA_ALLOC_NO_OOM) == 0)
711 ereport(ERROR,
712 (errcode(ERRCODE_OUT_OF_MEMORY),
713 errmsg("out of memory"),
714 errdetail("Failed on DSA request of size %zu.",
715 size)));
716 return InvalidDsaPointer;
719 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
721 /* Find a segment from which to allocate. */
722 segment_map = get_best_segment(area, npages);
723 if (segment_map == NULL)
724 segment_map = make_new_segment(area, npages);
725 if (segment_map == NULL)
727 /* Can't make any more segments: game over. */
728 LWLockRelease(DSA_AREA_LOCK(area));
729 dsa_free(area, span_pointer);
731 /* Raise error unless asked not to. */
732 if ((flags & DSA_ALLOC_NO_OOM) == 0)
733 ereport(ERROR,
734 (errcode(ERRCODE_OUT_OF_MEMORY),
735 errmsg("out of memory"),
736 errdetail("Failed on DSA request of size %zu.",
737 size)));
738 return InvalidDsaPointer;
742 * Ask the free page manager for a run of pages. This should always
743 * succeed, since both get_best_segment and make_new_segment should
744 * only return a non-NULL pointer if it actually contains enough
745 * contiguous freespace. If it does fail, something in our backend
746 * private state is out of whack, so use FATAL to kill the process.
748 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
749 elog(FATAL,
750 "dsa_allocate could not find %zu free pages", npages);
751 LWLockRelease(DSA_AREA_LOCK(area));
753 start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
754 first_page * FPM_PAGE_SIZE);
756 /* Initialize span and pagemap. */
757 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
758 LW_EXCLUSIVE);
759 init_span(area, span_pointer, pool, start_pointer, npages,
760 DSA_SCLASS_SPAN_LARGE);
761 segment_map->pagemap[first_page] = span_pointer;
762 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
764 /* Zero-initialize the memory if requested. */
765 if ((flags & DSA_ALLOC_ZERO) != 0)
766 memset(dsa_get_address(area, start_pointer), 0, size);
768 return start_pointer;
771 /* Map allocation to a size class. */
772 if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
774 int mapidx;
776 /* For smaller sizes we have a lookup table... */
777 mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
778 DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
779 size_class = dsa_size_class_map[mapidx];
781 else
783 uint16 min;
784 uint16 max;
786 /* ... and for the rest we search by binary chop. */
787 min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
788 max = lengthof(dsa_size_classes) - 1;
790 while (min < max)
792 uint16 mid = (min + max) / 2;
793 uint16 class_size = dsa_size_classes[mid];
795 if (class_size < size)
796 min = mid + 1;
797 else
798 max = mid;
801 size_class = min;
803 Assert(size <= dsa_size_classes[size_class]);
804 Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
806 /* Attempt to allocate an object from the appropriate pool. */
807 result = alloc_object(area, size_class);
809 /* Check for failure to allocate. */
810 if (!DsaPointerIsValid(result))
812 /* Raise error unless asked not to. */
813 if ((flags & DSA_ALLOC_NO_OOM) == 0)
814 ereport(ERROR,
815 (errcode(ERRCODE_OUT_OF_MEMORY),
816 errmsg("out of memory"),
817 errdetail("Failed on DSA request of size %zu.", size)));
818 return InvalidDsaPointer;
821 /* Zero-initialize the memory if requested. */
822 if ((flags & DSA_ALLOC_ZERO) != 0)
823 memset(dsa_get_address(area, result), 0, size);
825 return result;
829 * Free memory obtained with dsa_allocate.
831 void
832 dsa_free(dsa_area *area, dsa_pointer dp)
834 dsa_segment_map *segment_map;
835 int pageno;
836 dsa_pointer span_pointer;
837 dsa_area_span *span;
838 char *superblock;
839 char *object;
840 size_t size;
841 int size_class;
843 /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
844 check_for_freed_segments(area);
846 /* Locate the object, span and pool. */
847 segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
848 pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
849 span_pointer = segment_map->pagemap[pageno];
850 span = dsa_get_address(area, span_pointer);
851 superblock = dsa_get_address(area, span->start);
852 object = dsa_get_address(area, dp);
853 size_class = span->size_class;
854 size = dsa_size_classes[size_class];
857 * Special case for large objects that live in a special span: we return
858 * those pages directly to the free page manager and free the span.
860 if (span->size_class == DSA_SCLASS_SPAN_LARGE)
863 #ifdef CLOBBER_FREED_MEMORY
864 memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
865 #endif
867 /* Give pages back to free page manager. */
868 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
869 FreePageManagerPut(segment_map->fpm,
870 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
871 span->npages);
872 LWLockRelease(DSA_AREA_LOCK(area));
873 /* Unlink span. */
874 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
875 LW_EXCLUSIVE);
876 unlink_span(area, span);
877 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
878 /* Free the span object so it can be reused. */
879 dsa_free(area, span_pointer);
880 return;
883 #ifdef CLOBBER_FREED_MEMORY
884 memset(object, 0x7f, size);
885 #endif
887 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
889 /* Put the object on the span's freelist. */
890 Assert(object >= superblock);
891 Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
892 Assert((object - superblock) % size == 0);
893 NextFreeObjectIndex(object) = span->firstfree;
894 span->firstfree = (object - superblock) / size;
895 ++span->nallocatable;
898 * See if the span needs to be moved to a different fullness class, or be
899 * freed so its pages can be given back to the segment.
901 if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
904 * The block was completely full and is located in the
905 * highest-numbered fullness class, which is never scanned for free
906 * chunks. We must move it to the next-lower fullness class.
908 unlink_span(area, span);
909 add_span_to_fullness_class(area, span, span_pointer,
910 DSA_FULLNESS_CLASSES - 2);
913 * If this is the only span, and there is no active span, then we
914 * should probably move this span to fullness class 1. (Otherwise if
915 * you allocate exactly all the objects in the only span, it moves to
916 * class 3, then you free them all, it moves to 2, and then is given
917 * back, leaving no active span).
920 else if (span->nallocatable == span->nmax &&
921 (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
924 * This entire block is free, and it's not the active block for this
925 * size class. Return the memory to the free page manager. We don't
926 * do this for the active block to prevent hysteresis: if we
927 * repeatedly allocate and free the only chunk in the active block, it
928 * will be very inefficient if we deallocate and reallocate the block
929 * every time.
931 destroy_superblock(area, span_pointer);
934 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
938 * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
939 * memory allocated by the given area (possibly in another process) that
940 * hasn't yet been freed. This may cause a segment to be mapped into the
941 * current process if required, and may cause freed segments to be unmapped.
943 void *
944 dsa_get_address(dsa_area *area, dsa_pointer dp)
946 dsa_segment_index index;
947 size_t offset;
949 /* Convert InvalidDsaPointer to NULL. */
950 if (!DsaPointerIsValid(dp))
951 return NULL;
953 /* Process any requests to detach from freed segments. */
954 check_for_freed_segments(area);
956 /* Break the dsa_pointer into its components. */
957 index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
958 offset = DSA_EXTRACT_OFFSET(dp);
959 Assert(index < DSA_MAX_SEGMENTS);
961 /* Check if we need to cause this segment to be mapped in. */
962 if (unlikely(area->segment_maps[index].mapped_address == NULL))
964 /* Call for effect (we don't need the result). */
965 get_segment_by_index(area, index);
968 return area->segment_maps[index].mapped_address + offset;
972 * Pin this area, so that it will continue to exist even if all backends
973 * detach from it. In that case, the area can still be reattached to if a
974 * handle has been recorded somewhere.
976 void
977 dsa_pin(dsa_area *area)
979 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
980 if (area->control->pinned)
982 LWLockRelease(DSA_AREA_LOCK(area));
983 elog(ERROR, "dsa_area already pinned");
985 area->control->pinned = true;
986 ++area->control->refcnt;
987 LWLockRelease(DSA_AREA_LOCK(area));
991 * Undo the effects of dsa_pin, so that the given area can be freed when no
992 * backends are attached to it. May be called only if dsa_pin has been
993 * called.
995 void
996 dsa_unpin(dsa_area *area)
998 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
999 Assert(area->control->refcnt > 1);
1000 if (!area->control->pinned)
1002 LWLockRelease(DSA_AREA_LOCK(area));
1003 elog(ERROR, "dsa_area not pinned");
1005 area->control->pinned = false;
1006 --area->control->refcnt;
1007 LWLockRelease(DSA_AREA_LOCK(area));
1011 * Set the total size limit for this area. This limit is checked whenever new
1012 * segments need to be allocated from the operating system. If the new size
1013 * limit is already exceeded, this has no immediate effect.
1015 * Note that the total virtual memory usage may be temporarily larger than
1016 * this limit when segments have been freed, but not yet detached by all
1017 * backends that have attached to them.
1019 void
1020 dsa_set_size_limit(dsa_area *area, size_t limit)
1022 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1023 area->control->max_total_segment_size = limit;
1024 LWLockRelease(DSA_AREA_LOCK(area));
1028 * Aggressively free all spare memory in the hope of returning DSM segments to
1029 * the operating system.
1031 void
1032 dsa_trim(dsa_area *area)
1034 int size_class;
1037 * Trim in reverse pool order so we get to the spans-of-spans last, just
1038 * in case any become entirely free while processing all the other pools.
1040 for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1042 dsa_area_pool *pool = &area->control->pools[size_class];
1043 dsa_pointer span_pointer;
1045 if (size_class == DSA_SCLASS_SPAN_LARGE)
1047 /* Large object frees give back segments aggressively already. */
1048 continue;
1052 * Search fullness class 1 only. That is where we expect to find an
1053 * entirely empty superblock (entirely empty superblocks in other
1054 * fullness classes are returned to the free page map by dsa_free).
1056 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1057 span_pointer = pool->spans[1];
1058 while (DsaPointerIsValid(span_pointer))
1060 dsa_area_span *span = dsa_get_address(area, span_pointer);
1061 dsa_pointer next = span->nextspan;
1063 if (span->nallocatable == span->nmax)
1064 destroy_superblock(area, span_pointer);
1066 span_pointer = next;
1068 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1073 * Print out debugging information about the internal state of the shared
1074 * memory area.
1076 void
1077 dsa_dump(dsa_area *area)
1079 size_t i,
1083 * Note: This gives an inconsistent snapshot as it acquires and releases
1084 * individual locks as it goes...
1087 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1088 check_for_freed_segments_locked(area);
1089 fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1090 fprintf(stderr, " max_total_segment_size: %zu\n",
1091 area->control->max_total_segment_size);
1092 fprintf(stderr, " total_segment_size: %zu\n",
1093 area->control->total_segment_size);
1094 fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1095 fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1096 fprintf(stderr, " segment bins:\n");
1097 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1099 if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1101 dsa_segment_index segment_index;
1103 fprintf(stderr,
1104 " segment bin %zu (at least %d contiguous pages free):\n",
1105 i, 1 << (i - 1));
1106 segment_index = area->control->segment_bins[i];
1107 while (segment_index != DSA_SEGMENT_INDEX_NONE)
1109 dsa_segment_map *segment_map;
1111 segment_map =
1112 get_segment_by_index(area, segment_index);
1114 fprintf(stderr,
1115 " segment index %zu, usable_pages = %zu, "
1116 "contiguous_pages = %zu, mapped at %p\n",
1117 segment_index,
1118 segment_map->header->usable_pages,
1119 fpm_largest(segment_map->fpm),
1120 segment_map->mapped_address);
1121 segment_index = segment_map->header->next;
1125 LWLockRelease(DSA_AREA_LOCK(area));
1127 fprintf(stderr, " pools:\n");
1128 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1130 bool found = false;
1132 LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1133 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1134 if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1135 found = true;
1136 if (found)
1138 if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1139 fprintf(stderr, " pool for blocks of span objects:\n");
1140 else if (i == DSA_SCLASS_SPAN_LARGE)
1141 fprintf(stderr, " pool for large object spans:\n");
1142 else
1143 fprintf(stderr,
1144 " pool for size class %zu (object size %hu bytes):\n",
1145 i, dsa_size_classes[i]);
1146 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1148 if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1149 fprintf(stderr, " fullness class %zu is empty\n", j);
1150 else
1152 dsa_pointer span_pointer = area->control->pools[i].spans[j];
1154 fprintf(stderr, " fullness class %zu:\n", j);
1155 while (DsaPointerIsValid(span_pointer))
1157 dsa_area_span *span;
1159 span = dsa_get_address(area, span_pointer);
1160 fprintf(stderr,
1161 " span descriptor at "
1162 DSA_POINTER_FORMAT ", superblock at "
1163 DSA_POINTER_FORMAT
1164 ", pages = %zu, objects free = %hu/%hu\n",
1165 span_pointer, span->start, span->npages,
1166 span->nallocatable, span->nmax);
1167 span_pointer = span->nextspan;
1172 LWLockRelease(DSA_SCLASS_LOCK(area, i));
1177 * Return the smallest size that you can successfully provide to
1178 * dsa_create_in_place.
1180 size_t
1181 dsa_minimum_size(void)
1183 size_t size;
1184 int pages = 0;
1186 size = MAXALIGN(sizeof(dsa_area_control)) +
1187 MAXALIGN(sizeof(FreePageManager));
1189 /* Figure out how many pages we need, including the page map... */
1190 while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1192 ++pages;
1193 size += sizeof(dsa_pointer);
1196 return pages * FPM_PAGE_SIZE;
1200 * Workhorse function for dsa_create and dsa_create_in_place.
1202 static dsa_area *
1203 create_internal(void *place, size_t size,
1204 int tranche_id,
1205 dsm_handle control_handle,
1206 dsm_segment *control_segment)
1208 dsa_area_control *control;
1209 dsa_area *area;
1210 dsa_segment_map *segment_map;
1211 size_t usable_pages;
1212 size_t total_pages;
1213 size_t metadata_bytes;
1214 int i;
1216 /* Sanity check on the space we have to work in. */
1217 if (size < dsa_minimum_size())
1218 elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1219 dsa_minimum_size(), size);
1221 /* Now figure out how much space is usable */
1222 total_pages = size / FPM_PAGE_SIZE;
1223 metadata_bytes =
1224 MAXALIGN(sizeof(dsa_area_control)) +
1225 MAXALIGN(sizeof(FreePageManager)) +
1226 total_pages * sizeof(dsa_pointer);
1227 /* Add padding up to next page boundary. */
1228 if (metadata_bytes % FPM_PAGE_SIZE != 0)
1229 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1230 Assert(metadata_bytes <= size);
1231 usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1234 * Initialize the dsa_area_control object located at the start of the
1235 * space.
1237 control = (dsa_area_control *) place;
1238 memset(place, 0, sizeof(*control));
1239 control->segment_header.magic =
1240 DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1241 control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1242 control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1243 control->segment_header.usable_pages = usable_pages;
1244 control->segment_header.freed = false;
1245 control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
1246 control->handle = control_handle;
1247 control->max_total_segment_size = (size_t) -1;
1248 control->total_segment_size = size;
1249 control->segment_handles[0] = control_handle;
1250 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1251 control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1252 control->refcnt = 1;
1253 control->lwlock_tranche_id = tranche_id;
1256 * Create the dsa_area object that this backend will use to access the
1257 * area. Other backends will need to obtain their own dsa_area object by
1258 * attaching.
1260 area = palloc(sizeof(dsa_area));
1261 area->control = control;
1262 area->mapping_pinned = false;
1263 memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1264 area->high_segment_index = 0;
1265 area->freed_segment_counter = 0;
1266 LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1267 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1268 LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1269 control->lwlock_tranche_id);
1271 /* Set up the segment map for this process's mapping. */
1272 segment_map = &area->segment_maps[0];
1273 segment_map->segment = control_segment;
1274 segment_map->mapped_address = place;
1275 segment_map->header = (dsa_segment_header *) place;
1276 segment_map->fpm = (FreePageManager *)
1277 (segment_map->mapped_address +
1278 MAXALIGN(sizeof(dsa_area_control)));
1279 segment_map->pagemap = (dsa_pointer *)
1280 (segment_map->mapped_address +
1281 MAXALIGN(sizeof(dsa_area_control)) +
1282 MAXALIGN(sizeof(FreePageManager)));
1284 /* Set up the free page map. */
1285 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1286 /* There can be 0 usable pages if size is dsa_minimum_size(). */
1288 if (usable_pages > 0)
1289 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1290 usable_pages);
1292 /* Put this segment into the appropriate bin. */
1293 control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1294 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1296 return area;
1300 * Workhorse function for dsa_attach and dsa_attach_in_place.
1302 static dsa_area *
1303 attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1305 dsa_area_control *control;
1306 dsa_area *area;
1307 dsa_segment_map *segment_map;
1309 control = (dsa_area_control *) place;
1310 Assert(control->handle == handle);
1311 Assert(control->segment_handles[0] == handle);
1312 Assert(control->segment_header.magic ==
1313 (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1315 /* Build the backend-local area object. */
1316 area = palloc(sizeof(dsa_area));
1317 area->control = control;
1318 area->mapping_pinned = false;
1319 memset(&area->segment_maps[0], 0,
1320 sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1321 area->high_segment_index = 0;
1323 /* Set up the segment map for this process's mapping. */
1324 segment_map = &area->segment_maps[0];
1325 segment_map->segment = segment; /* NULL for in-place */
1326 segment_map->mapped_address = place;
1327 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1328 segment_map->fpm = (FreePageManager *)
1329 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1330 segment_map->pagemap = (dsa_pointer *)
1331 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1332 MAXALIGN(sizeof(FreePageManager)));
1334 /* Bump the reference count. */
1335 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1336 if (control->refcnt == 0)
1338 /* We can't attach to a DSA area that has already been destroyed. */
1339 ereport(ERROR,
1340 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1341 errmsg("could not attach to dynamic shared area")));
1343 ++control->refcnt;
1344 area->freed_segment_counter = area->control->freed_segment_counter;
1345 LWLockRelease(DSA_AREA_LOCK(area));
1347 return area;
1351 * Add a new span to fullness class 1 of the indicated pool.
1353 static void
1354 init_span(dsa_area *area,
1355 dsa_pointer span_pointer,
1356 dsa_area_pool *pool, dsa_pointer start, size_t npages,
1357 uint16 size_class)
1359 dsa_area_span *span = dsa_get_address(area, span_pointer);
1360 size_t obsize = dsa_size_classes[size_class];
1363 * The per-pool lock must be held because we manipulate the span list for
1364 * this pool.
1366 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1368 /* Push this span onto the front of the span list for fullness class 1. */
1369 if (DsaPointerIsValid(pool->spans[1]))
1371 dsa_area_span *head = (dsa_area_span *)
1372 dsa_get_address(area, pool->spans[1]);
1374 head->prevspan = span_pointer;
1376 span->pool = DsaAreaPoolToDsaPointer(area, pool);
1377 span->nextspan = pool->spans[1];
1378 span->prevspan = InvalidDsaPointer;
1379 pool->spans[1] = span_pointer;
1381 span->start = start;
1382 span->npages = npages;
1383 span->size_class = size_class;
1384 span->ninitialized = 0;
1385 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1388 * A block-of-spans contains its own descriptor, so mark one object as
1389 * initialized and reduce the count of allocatable objects by one.
1390 * Doing this here has the side effect of also reducing nmax by one,
1391 * which is important to make sure we free this object at the correct
1392 * time.
1394 span->ninitialized = 1;
1395 span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1397 else if (size_class != DSA_SCLASS_SPAN_LARGE)
1398 span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1399 span->firstfree = DSA_SPAN_NOTHING_FREE;
1400 span->nmax = span->nallocatable;
1401 span->fclass = 1;
1405 * Transfer the first span in one fullness class to the head of another
1406 * fullness class.
1408 static bool
1409 transfer_first_span(dsa_area *area,
1410 dsa_area_pool *pool, int fromclass, int toclass)
1412 dsa_pointer span_pointer;
1413 dsa_area_span *span;
1414 dsa_area_span *nextspan;
1416 /* Can't do it if source list is empty. */
1417 span_pointer = pool->spans[fromclass];
1418 if (!DsaPointerIsValid(span_pointer))
1419 return false;
1421 /* Remove span from head of source list. */
1422 span = dsa_get_address(area, span_pointer);
1423 pool->spans[fromclass] = span->nextspan;
1424 if (DsaPointerIsValid(span->nextspan))
1426 nextspan = (dsa_area_span *)
1427 dsa_get_address(area, span->nextspan);
1428 nextspan->prevspan = InvalidDsaPointer;
1431 /* Add span to head of target list. */
1432 span->nextspan = pool->spans[toclass];
1433 pool->spans[toclass] = span_pointer;
1434 if (DsaPointerIsValid(span->nextspan))
1436 nextspan = (dsa_area_span *)
1437 dsa_get_address(area, span->nextspan);
1438 nextspan->prevspan = span_pointer;
1440 span->fclass = toclass;
1442 return true;
1446 * Allocate one object of the requested size class from the given area.
1448 static inline dsa_pointer
1449 alloc_object(dsa_area *area, int size_class)
1451 dsa_area_pool *pool = &area->control->pools[size_class];
1452 dsa_area_span *span;
1453 dsa_pointer block;
1454 dsa_pointer result;
1455 char *object;
1456 size_t size;
1459 * Even though ensure_active_superblock can in turn call alloc_object if
1460 * it needs to allocate a new span, that's always from a different pool,
1461 * and the order of lock acquisition is always the same, so it's OK that
1462 * we hold this lock for the duration of this function.
1464 Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1465 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1468 * If there's no active superblock, we must successfully obtain one or
1469 * fail the request.
1471 if (!DsaPointerIsValid(pool->spans[1]) &&
1472 !ensure_active_superblock(area, pool, size_class))
1474 result = InvalidDsaPointer;
1476 else
1479 * There should be a block in fullness class 1 at this point, and it
1480 * should never be completely full. Thus we can either pop an object
1481 * from the free list or, failing that, initialize a new object.
1483 Assert(DsaPointerIsValid(pool->spans[1]));
1484 span = (dsa_area_span *)
1485 dsa_get_address(area, pool->spans[1]);
1486 Assert(span->nallocatable > 0);
1487 block = span->start;
1488 Assert(size_class < DSA_NUM_SIZE_CLASSES);
1489 size = dsa_size_classes[size_class];
1490 if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1492 result = block + span->firstfree * size;
1493 object = dsa_get_address(area, result);
1494 span->firstfree = NextFreeObjectIndex(object);
1496 else
1498 result = block + span->ninitialized * size;
1499 ++span->ninitialized;
1501 --span->nallocatable;
1503 /* If it's now full, move it to the highest-numbered fullness class. */
1504 if (span->nallocatable == 0)
1505 transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1508 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1509 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1511 return result;
1515 * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1516 * superblocks are completely full and no more can be allocated.
1518 * Fullness classes K of 0..N are loosely intended to represent blocks whose
1519 * utilization percentage is at least K/N, but we only enforce this rigorously
1520 * for the highest-numbered fullness class, which always contains exactly
1521 * those blocks that are completely full. It's otherwise acceptable for a
1522 * block to be in a higher-numbered fullness class than the one to which it
1523 * logically belongs. In addition, the active block, which is always the
1524 * first block in fullness class 1, is permitted to have a higher allocation
1525 * percentage than would normally be allowable for that fullness class; we
1526 * don't move it until it's completely full, and then it goes to the
1527 * highest-numbered fullness class.
1529 * It might seem odd that the active block is the head of fullness class 1
1530 * rather than fullness class 0, but experience with other allocators has
1531 * shown that it's usually better to allocate from a block that's moderately
1532 * full rather than one that's nearly empty. Insofar as is reasonably
1533 * possible, we want to avoid performing new allocations in a block that would
1534 * otherwise become empty soon.
1536 static bool
1537 ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1538 int size_class)
1540 dsa_pointer span_pointer;
1541 dsa_pointer start_pointer;
1542 size_t obsize = dsa_size_classes[size_class];
1543 size_t nmax;
1544 int fclass;
1545 size_t npages = 1;
1546 size_t first_page;
1547 size_t i;
1548 dsa_segment_map *segment_map;
1550 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1553 * Compute the number of objects that will fit in a block of this size
1554 * class. Span-of-spans blocks are just a single page, and the first
1555 * object isn't available for use because it describes the block-of-spans
1556 * itself.
1558 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1559 nmax = FPM_PAGE_SIZE / obsize - 1;
1560 else
1561 nmax = DSA_SUPERBLOCK_SIZE / obsize;
1564 * If fullness class 1 is empty, try to find a span to put in it by
1565 * scanning higher-numbered fullness classes (excluding the last one,
1566 * whose blocks are certain to all be completely full).
1568 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1570 span_pointer = pool->spans[fclass];
1572 while (DsaPointerIsValid(span_pointer))
1574 int tfclass;
1575 dsa_area_span *span;
1576 dsa_area_span *nextspan;
1577 dsa_area_span *prevspan;
1578 dsa_pointer next_span_pointer;
1580 span = (dsa_area_span *)
1581 dsa_get_address(area, span_pointer);
1582 next_span_pointer = span->nextspan;
1584 /* Figure out what fullness class should contain this span. */
1585 tfclass = (nmax - span->nallocatable)
1586 * (DSA_FULLNESS_CLASSES - 1) / nmax;
1588 /* Look up next span. */
1589 if (DsaPointerIsValid(span->nextspan))
1590 nextspan = (dsa_area_span *)
1591 dsa_get_address(area, span->nextspan);
1592 else
1593 nextspan = NULL;
1596 * If utilization has dropped enough that this now belongs in some
1597 * other fullness class, move it there.
1599 if (tfclass < fclass)
1601 /* Remove from the current fullness class list. */
1602 if (pool->spans[fclass] == span_pointer)
1604 /* It was the head; remove it. */
1605 Assert(!DsaPointerIsValid(span->prevspan));
1606 pool->spans[fclass] = span->nextspan;
1607 if (nextspan != NULL)
1608 nextspan->prevspan = InvalidDsaPointer;
1610 else
1612 /* It was not the head. */
1613 Assert(DsaPointerIsValid(span->prevspan));
1614 prevspan = (dsa_area_span *)
1615 dsa_get_address(area, span->prevspan);
1616 prevspan->nextspan = span->nextspan;
1618 if (nextspan != NULL)
1619 nextspan->prevspan = span->prevspan;
1621 /* Push onto the head of the new fullness class list. */
1622 span->nextspan = pool->spans[tfclass];
1623 pool->spans[tfclass] = span_pointer;
1624 span->prevspan = InvalidDsaPointer;
1625 if (DsaPointerIsValid(span->nextspan))
1627 nextspan = (dsa_area_span *)
1628 dsa_get_address(area, span->nextspan);
1629 nextspan->prevspan = span_pointer;
1631 span->fclass = tfclass;
1634 /* Advance to next span on list. */
1635 span_pointer = next_span_pointer;
1638 /* Stop now if we found a suitable block. */
1639 if (DsaPointerIsValid(pool->spans[1]))
1640 return true;
1644 * If there are no blocks that properly belong in fullness class 1, pick
1645 * one from some other fullness class and move it there anyway, so that we
1646 * have an allocation target. Our last choice is to transfer a block
1647 * that's almost empty (and might become completely empty soon if left
1648 * alone), but even that is better than failing, which is what we must do
1649 * if there are no blocks at all with free space.
1651 Assert(!DsaPointerIsValid(pool->spans[1]));
1652 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1653 if (transfer_first_span(area, pool, fclass, 1))
1654 return true;
1655 if (!DsaPointerIsValid(pool->spans[1]) &&
1656 transfer_first_span(area, pool, 0, 1))
1657 return true;
1660 * We failed to find an existing span with free objects, so we need to
1661 * allocate a new superblock and construct a new span to manage it.
1663 * First, get a dsa_area_span object to describe the new superblock
1664 * ... unless this allocation is for a dsa_area_span object, in which case
1665 * that's surely not going to work. We handle that case by storing the
1666 * span describing a block-of-spans inline.
1668 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1670 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1671 if (!DsaPointerIsValid(span_pointer))
1672 return false;
1673 npages = DSA_PAGES_PER_SUPERBLOCK;
1676 /* Find or create a segment and allocate the superblock. */
1677 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1678 segment_map = get_best_segment(area, npages);
1679 if (segment_map == NULL)
1681 segment_map = make_new_segment(area, npages);
1682 if (segment_map == NULL)
1684 LWLockRelease(DSA_AREA_LOCK(area));
1685 return false;
1690 * This shouldn't happen: get_best_segment() or make_new_segment()
1691 * promised that we can successfully allocate npages.
1693 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1694 elog(FATAL,
1695 "dsa_allocate could not find %zu free pages for superblock",
1696 npages);
1697 LWLockRelease(DSA_AREA_LOCK(area));
1699 /* Compute the start of the superblock. */
1700 start_pointer =
1701 DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1702 first_page * FPM_PAGE_SIZE);
1705 * If this is a block-of-spans, carve the descriptor right out of the
1706 * allocated space.
1708 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1711 * We have a pointer into the segment. We need to build a dsa_pointer
1712 * from the segment index and offset into the segment.
1714 span_pointer = start_pointer;
1717 /* Initialize span and pagemap. */
1718 init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1719 for (i = 0; i < npages; ++i)
1720 segment_map->pagemap[first_page + i] = span_pointer;
1722 return true;
1726 * Return the segment map corresponding to a given segment index, mapping the
1727 * segment in if necessary. For internal segment book-keeping, this is called
1728 * with the area lock held. It is also called by dsa_free and dsa_get_address
1729 * without any locking, relying on the fact that they have a known live segment
1730 * index and that they always call check_for_freed_segments to ensure that any
1731 * freed segment occupying the same slot is detached first.
1733 static dsa_segment_map *
1734 get_segment_by_index(dsa_area *area, dsa_segment_index index)
1736 if (unlikely(area->segment_maps[index].mapped_address == NULL))
1738 dsm_handle handle;
1739 dsm_segment *segment;
1740 dsa_segment_map *segment_map;
1743 * If we are reached by dsa_free or dsa_get_address, there must be at
1744 * least one object allocated in the referenced segment. Otherwise,
1745 * their caller has a double-free or access-after-free bug, which we
1746 * have no hope of detecting. So we know it's safe to access this
1747 * array slot without holding a lock; it won't change underneath us.
1748 * Furthermore, we know that we can see the latest contents of the
1749 * slot, as explained in check_for_freed_segments, which those
1750 * functions call before arriving here.
1752 handle = area->control->segment_handles[index];
1754 /* It's an error to try to access an unused slot. */
1755 if (handle == DSM_HANDLE_INVALID)
1756 elog(ERROR,
1757 "dsa_area could not attach to a segment that has been freed");
1759 segment = dsm_attach(handle);
1760 if (segment == NULL)
1761 elog(ERROR, "dsa_area could not attach to segment");
1762 if (area->mapping_pinned)
1763 dsm_pin_mapping(segment);
1764 segment_map = &area->segment_maps[index];
1765 segment_map->segment = segment;
1766 segment_map->mapped_address = dsm_segment_address(segment);
1767 segment_map->header =
1768 (dsa_segment_header *) segment_map->mapped_address;
1769 segment_map->fpm = (FreePageManager *)
1770 (segment_map->mapped_address +
1771 MAXALIGN(sizeof(dsa_segment_header)));
1772 segment_map->pagemap = (dsa_pointer *)
1773 (segment_map->mapped_address +
1774 MAXALIGN(sizeof(dsa_segment_header)) +
1775 MAXALIGN(sizeof(FreePageManager)));
1777 /* Remember the highest index this backend has ever mapped. */
1778 if (area->high_segment_index < index)
1779 area->high_segment_index = index;
1781 Assert(segment_map->header->magic ==
1782 (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1786 * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1787 * but it's a bug in the calling code and undefined behavior if the
1788 * address is not live (ie if the segment might possibly have been freed,
1789 * they're trying to use a dangling pointer).
1791 * For dsa.c code that holds the area lock to manipulate segment_bins
1792 * lists, it would be a bug if we ever reach a freed segment here. After
1793 * it's marked as freed, the only thing any backend should do with it is
1794 * unmap it, and it should always have done that in
1795 * check_for_freed_segments_locked() before arriving here to resolve an
1796 * index to a segment_map.
1798 * Either way we can assert that we aren't returning a freed segment.
1800 Assert(!area->segment_maps[index].header->freed);
1802 return &area->segment_maps[index];
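/*
 * Illustrative sketch, not part of the original source, of how a caller
 * such as dsa_get_address() is expected to combine the pieces above,
 * assuming dp is a live dsa_pointer:
 *
 *     check_for_freed_segments(area);
 *     index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
 *     segment_map = get_segment_by_index(area, index);
 *     return segment_map->mapped_address + DSA_EXTRACT_OFFSET(dp);
 *
 * The check_for_freed_segments() call is what makes the unlocked slot
 * access in get_segment_by_index() safe, per the header comment above.
 */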
1806 * Return a superblock to the free page manager. If the underlying segment
1807 * has become entirely free, then return it to the operating system.
1809 * The appropriate pool lock must be held.
1811 static void
1812 destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1814 dsa_area_span *span = dsa_get_address(area, span_pointer);
1815 int size_class = span->size_class;
1816 dsa_segment_map *segment_map;
1819 /* Remove it from its fullness class list. */
1820 unlink_span(area, span);
1823 * Note: Here we acquire the area lock while we already hold a per-pool
1824 * lock. We never hold the area lock and then take a pool lock, or we
1825 * could deadlock.
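/*
 * Summary added for clarity (not in the original source): the only lock
 * orders used are pool lock then area lock, and "some other pool" lock
 * then span pool lock (see the dsa_free call at the bottom of this
 * function); the area lock is never held while acquiring a pool lock.
 */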
1827 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1828 check_for_freed_segments_locked(area);
1829 segment_map =
1830 get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1831 FreePageManagerPut(segment_map->fpm,
1832 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1833 span->npages);
1834 /* Check if the segment is now entirely free. */
1835 if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1837 dsa_segment_index index = get_segment_index(area, segment_map);
1839 /* If it's not the segment with extra control data, free it. */
1840 if (index != 0)
1843 * Give it back to the OS, and allow other backends to detect that
1844 * they need to detach.
1846 unlink_segment(area, segment_map);
1847 segment_map->header->freed = true;
1848 Assert(area->control->total_segment_size >=
1849 segment_map->header->size);
1850 area->control->total_segment_size -=
1851 segment_map->header->size;
1852 dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1853 dsm_detach(segment_map->segment);
1854 area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1855 ++area->control->freed_segment_counter;
1856 segment_map->segment = NULL;
1857 segment_map->header = NULL;
1858 segment_map->mapped_address = NULL;
1861 LWLockRelease(DSA_AREA_LOCK(area));
1864 * Span-of-spans blocks store the span which describes them within the
1865 * block itself, so freeing the storage implicitly frees the descriptor
1866 * also. If this is a block of any other type, we need to separately free
1867 * the span object also. This recursive call to dsa_free will acquire the
1868 * span pool's lock. We can't deadlock because the acquisition order is
1869 * always some other pool and then the span pool.
1871 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1872 dsa_free(area, span_pointer);
1875 static void
1876 unlink_span(dsa_area *area, dsa_area_span *span)
1878 if (DsaPointerIsValid(span->nextspan))
1880 dsa_area_span *next = dsa_get_address(area, span->nextspan);
1882 next->prevspan = span->prevspan;
1884 if (DsaPointerIsValid(span->prevspan))
1886 dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1888 prev->nextspan = span->nextspan;
1890 else
1892 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1894 pool->spans[span->fclass] = span->nextspan;
1898 static void
1899 add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1900 dsa_pointer span_pointer,
1901 int fclass)
1903 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1905 if (DsaPointerIsValid(pool->spans[fclass]))
1907 dsa_area_span *head = dsa_get_address(area,
1908 pool->spans[fclass]);
1910 head->prevspan = span_pointer;
1912 span->prevspan = InvalidDsaPointer;
1913 span->nextspan = pool->spans[fclass];
1914 pool->spans[fclass] = span_pointer;
1915 span->fclass = fclass;
1919 * Detach from an area that was either created or attached to by this process.
1921 void
1922 dsa_detach(dsa_area *area)
1924 int i;
1926 /* Detach from all segments. */
1927 for (i = 0; i <= area->high_segment_index; ++i)
1928 if (area->segment_maps[i].segment != NULL)
1929 dsm_detach(area->segment_maps[i].segment);
1932 * Note that 'detaching' (= detaching from DSM segments) doesn't include
1933 * 'releasing' (= adjusting the reference count). It would be nice to
1934 * combine these operations, but client code might never get around to
1935 * calling dsa_detach because of an error path, and a detach hook on any
1936 * particular segment is too late to detach other segments in the area
1937 * without risking a 'leak' warning in the non-error path.
1940 /* Free the backend-local area object. */
1941 pfree(area);
1945 * Unlink a segment from the bin that contains it.
1947 static void
1948 unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1950 if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1952 dsa_segment_map *prev;
1954 prev = get_segment_by_index(area, segment_map->header->prev);
1955 prev->header->next = segment_map->header->next;
1957 else
1959 Assert(area->control->segment_bins[segment_map->header->bin] ==
1960 get_segment_index(area, segment_map));
1961 area->control->segment_bins[segment_map->header->bin] =
1962 segment_map->header->next;
1964 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
1966 dsa_segment_map *next;
1968 next = get_segment_by_index(area, segment_map->header->next);
1969 next->header->prev = segment_map->header->prev;
1974 * Find a segment that could satisfy a request for 'npages' of contiguous
1975 * memory, or return NULL if none can be found. This may involve attaching to
1976 * segments that weren't previously attached so that we can query their free
1977 * pages map.
1979 static dsa_segment_map *
1980 get_best_segment(dsa_area *area, size_t npages)
1982 size_t bin;
1984 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
1985 check_for_freed_segments_locked(area);
1988 * Start searching from the first bin that *might* have enough contiguous
1989 * pages.
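/*
 * Illustrative example, not part of the original source, assuming
 * contiguous_pages_to_segment_bin() buckets segments by the position of
 * the most significant bit of their largest free run: a request for
 * npages = 20 starts at the bin whose members are guaranteed at least 16
 * contiguous free pages (threshold = 1 << (bin - 1) below), which may or
 * may not be enough for 20, and then continues into higher-numbered bins
 * whose members are guaranteed progressively larger runs.
 */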
1991 for (bin = contiguous_pages_to_segment_bin(npages);
1992 bin < DSA_NUM_SEGMENT_BINS;
1993 ++bin)
1996 * The minimum contiguous size that any segment in this bin should
1997 * have. We'll re-bin if we see segments with fewer.
1999 size_t threshold = (size_t) 1 << (bin - 1);
2000 dsa_segment_index segment_index;
2002 /* Search this bin for a segment with enough contiguous space. */
2003 segment_index = area->control->segment_bins[bin];
2004 while (segment_index != DSA_SEGMENT_INDEX_NONE)
2006 dsa_segment_map *segment_map;
2007 dsa_segment_index next_segment_index;
2008 size_t contiguous_pages;
2010 segment_map = get_segment_by_index(area, segment_index);
2011 next_segment_index = segment_map->header->next;
2012 contiguous_pages = fpm_largest(segment_map->fpm);
2014 /* Not enough for the request, still enough for this bin. */
2015 if (contiguous_pages >= threshold && contiguous_pages < npages)
2017 segment_index = next_segment_index;
2018 continue;
2021 /* Re-bin it if it's no longer in the appropriate bin. */
2022 if (contiguous_pages < threshold)
2024 size_t new_bin;
2026 new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
2028 /* Remove it from its current bin. */
2029 unlink_segment(area, segment_map);
2031 /* Push it onto the front of its new bin. */
2032 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2033 segment_map->header->next =
2034 area->control->segment_bins[new_bin];
2035 segment_map->header->bin = new_bin;
2036 area->control->segment_bins[new_bin] = segment_index;
2037 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2039 dsa_segment_map *next;
2041 next = get_segment_by_index(area,
2042 segment_map->header->next);
2043 Assert(next->header->bin == new_bin);
2044 next->header->prev = segment_index;
2048 * But fall through to see if it's enough to satisfy this
2049 * request anyway....
2053 /* Check if we are done. */
2054 if (contiguous_pages >= npages)
2055 return segment_map;
2057 /* Continue searching the same bin. */
2058 segment_index = next_segment_index;
2062 /* Not found. */
2063 return NULL;
2067 * Create a new segment that can handle at least requested_pages. Returns
2068 * NULL if doing so would exceed the area's total size limit or the maximum
2069 * allowed number of segments.
2071 static dsa_segment_map *
2072 make_new_segment(dsa_area *area, size_t requested_pages)
2074 dsa_segment_index new_index;
2075 size_t metadata_bytes;
2076 size_t total_size;
2077 size_t total_pages;
2078 size_t usable_pages;
2079 dsa_segment_map *segment_map;
2080 dsm_segment *segment;
2082 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2084 /* Find a segment slot that is not in use (linearly for now). */
2085 for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2087 if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2088 break;
2090 if (new_index == DSA_MAX_SEGMENTS)
2091 return NULL;
2094 * If the total size limit is already exceeded, then we exit early and
2095 * avoid arithmetic wraparound in the unsigned expressions below.
2097 if (area->control->total_segment_size >=
2098 area->control->max_total_segment_size)
2099 return NULL;
2102 * The size should be at least as big as requested, and at least big
2103 * enough to follow a geometric series that approximately doubles the
2104 * total storage each time we create a new segment. We use geometric
2105 * growth because the underlying DSM system isn't designed for large
2106 * numbers of segments (otherwise we might even consider just using one
2107 * DSM segment for each large allocation and for each superblock, and then
2108 * we wouldn't need to use FreePageManager).
2110 * We decide on a total segment size first, so that we produce tidy
2111 * power-of-two sized segments. This is a good property to have if we
2112 * move to huge pages in the future. Then we work back to the number of
2113 * pages we can fit.
2115 total_size = DSA_INITIAL_SEGMENT_SIZE *
2116 ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2117 total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
2118 total_size = Min(total_size,
2119 area->control->max_total_segment_size -
2120 area->control->total_segment_size);
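/*
 * Illustrative example, not part of the original source: with
 * DSA_INITIAL_SEGMENT_SIZE = 1MB and DSA_NUM_SEGMENTS_AT_EACH_SIZE = 2
 * (hypothetical values used only for this example), segment index 1 gets
 * 1MB, indexes 2 and 3 get 2MB, indexes 4 and 5 get 4MB, and so on,
 * clamped to DSA_MAX_SEGMENT_SIZE and to whatever headroom remains under
 * max_total_segment_size.
 */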
2122 total_pages = total_size / FPM_PAGE_SIZE;
2123 metadata_bytes =
2124 MAXALIGN(sizeof(dsa_segment_header)) +
2125 MAXALIGN(sizeof(FreePageManager)) +
2126 sizeof(dsa_pointer) * total_pages;
2128 /* Add padding up to next page boundary. */
2129 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2130 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2131 if (total_size <= metadata_bytes)
2132 return NULL;
2133 usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2134 Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
2136 /* See if that is enough... */
2137 if (requested_pages > usable_pages)
2140 * We'll make an odd-sized segment, working forward from the requested
2141 * number of pages.
2143 usable_pages = requested_pages;
2144 metadata_bytes =
2145 MAXALIGN(sizeof(dsa_segment_header)) +
2146 MAXALIGN(sizeof(FreePageManager)) +
2147 usable_pages * sizeof(dsa_pointer);
2149 /* Add padding up to next page boundary. */
2150 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2151 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2152 total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
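/*
 * Illustrative example, not part of the original source: a request for
 * 100 pages needs a pagemap of 100 dsa_pointer entries; assuming, for
 * this example only, that the segment header, the FreePageManager and
 * that pagemap together fit within a single 4kB page after padding,
 * metadata_bytes rounds up to 4kB and total_size becomes 4kB + 100 * 4kB.
 */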
2154 /* Is that too large for dsa_pointer's addressing scheme? */
2155 if (total_size > DSA_MAX_SEGMENT_SIZE)
2156 return NULL;
2158 /* Would that exceed the limit? */
2159 if (total_size > area->control->max_total_segment_size -
2160 area->control->total_segment_size)
2161 return NULL;
2164 /* Create the segment. */
2165 segment = dsm_create(total_size, 0);
2166 if (segment == NULL)
2167 return NULL;
2168 dsm_pin_segment(segment);
2169 if (area->mapping_pinned)
2170 dsm_pin_mapping(segment);
2172 /* Store the handle in shared memory to be found by index. */
2173 area->control->segment_handles[new_index] =
2174 dsm_segment_handle(segment);
2175 /* Track the highest segment index in the history of the area. */
2176 if (area->control->high_segment_index < new_index)
2177 area->control->high_segment_index = new_index;
2178 /* Track the highest segment index this backend has ever mapped. */
2179 if (area->high_segment_index < new_index)
2180 area->high_segment_index = new_index;
2181 /* Track total size of all segments. */
2182 area->control->total_segment_size += total_size;
2183 Assert(area->control->total_segment_size <=
2184 area->control->max_total_segment_size);
2186 /* Build a segment map for this segment in this backend. */
2187 segment_map = &area->segment_maps[new_index];
2188 segment_map->segment = segment;
2189 segment_map->mapped_address = dsm_segment_address(segment);
2190 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2191 segment_map->fpm = (FreePageManager *)
2192 (segment_map->mapped_address +
2193 MAXALIGN(sizeof(dsa_segment_header)));
2194 segment_map->pagemap = (dsa_pointer *)
2195 (segment_map->mapped_address +
2196 MAXALIGN(sizeof(dsa_segment_header)) +
2197 MAXALIGN(sizeof(FreePageManager)));
2199 /* Set up the free page map. */
2200 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2201 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2202 usable_pages);
2204 /* Set up the segment header and put it in the appropriate bin. */
2205 segment_map->header->magic =
2206 DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2207 segment_map->header->usable_pages = usable_pages;
2208 segment_map->header->size = total_size;
2209 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2210 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2211 segment_map->header->next =
2212 area->control->segment_bins[segment_map->header->bin];
2213 segment_map->header->freed = false;
2214 area->control->segment_bins[segment_map->header->bin] = new_index;
2215 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2217 dsa_segment_map *next =
2218 get_segment_by_index(area, segment_map->header->next);
2220 Assert(next->header->bin == segment_map->header->bin);
2221 next->header->prev = new_index;
2224 return segment_map;
2228 * Check if any segments have been freed by destroy_superblock, so we can
2229 * detach from them in this backend. This function is called by
2230 * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2231 * received can be resolved to the correct segment.
2233 * The danger we want to defend against is that there could be an old segment
2234 * mapped into a given slot in this backend, and the dsa_pointer they have
2235 * might refer to some new segment in the same slot. So, before they call
2236 * get_segment_by_index, those functions must be sure to process every
2237 * instruction to detach from a freed segment that had been generated by the
2238 * time this process received the dsa_pointer.
2240 static void
2241 check_for_freed_segments(dsa_area *area)
2243 size_t freed_segment_counter;
2246 * Any other process that has freed a segment has incremented
2247 * freed_segment_counter while holding an LWLock, and that must precede
2248 * any backend creating a new segment in the same slot while holding an
2249 * LWLock, and that must precede the creation of any dsa_pointer pointing
2250 * into the new segment which might reach us here, and the caller must
2251 * have sent the dsa_pointer to this process using appropriate memory
2252 * synchronization (some kind of locking or atomic primitive or system
2253 * call). So all we need to do on the reading side is ask for the load of
2254 * freed_segment_counter to follow the caller's load of the dsa_pointer it
2255 * has, and we can be sure to detect any segments that had been freed as
2256 * of the time that the dsa_pointer reached this process.
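/*
 * Schematic summary added for clarity (not in the original source):
 *
 *   freeing backend: mark segment freed; ++freed_segment_counter
 *                    (under the area lock); slot possibly reused for a
 *                    new segment; dsa_pointer into it published
 *   this backend:    receive dsa_pointer; pg_read_barrier(); load
 *                    freed_segment_counter
 *
 * Any counter increment that preceded the pointer's publication is
 * therefore visible by the time we compare counters below.
 */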
2258 pg_read_barrier();
2259 freed_segment_counter = area->control->freed_segment_counter;
2260 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2262 /* Check all currently mapped segments to find what's been freed. */
2263 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2264 check_for_freed_segments_locked(area);
2265 LWLockRelease(DSA_AREA_LOCK(area));
2270 * Workhorse for check_for_freed_segments(), and also used directly in paths
2271 * where the area lock is already held. This should be called after acquiring
2272 * the lock but before looking up any segment by index number, to make sure we
2273 * unmap any stale segments that might have previously had the same index as a
2274 * current segment.
2276 static void
2277 check_for_freed_segments_locked(dsa_area *area)
2279 size_t freed_segment_counter;
2280 int i;
2282 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2283 freed_segment_counter = area->control->freed_segment_counter;
2284 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2286 for (i = 0; i <= area->high_segment_index; ++i)
2288 if (area->segment_maps[i].header != NULL &&
2289 area->segment_maps[i].header->freed)
2291 dsm_detach(area->segment_maps[i].segment);
2292 area->segment_maps[i].segment = NULL;
2293 area->segment_maps[i].header = NULL;
2294 area->segment_maps[i].mapped_address = NULL;
2297 area->freed_segment_counter = freed_segment_counter;