/* frame_malloc.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  Copyright (C) 2009-2013, Intel Corporation
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/
#include "frame_malloc.h"
#include "bug.h"           /* CILK_ASSERT, __cilkrts_bug */
#include "local_state.h"
#include "cilk_malloc.h"
/* #define USE_MMAP 1 */
// Define to fill the stack frame header with the fill character when pushing
// it on a free list.  Note that this should be #ifdef'd out when checked in!
#define HEADER_FILL_CHAR 0xbf

// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning
// message if this is a release build
#if defined(NDEBUG) && defined(HEADER_FILL_CHAR)
#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build")
#endif
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size);
/* Two alternative mappings from bucket index to frame size follow; exactly
 * one of them is selected at compile time (the table-driven variant is
 * disabled here). */
#if 0

const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] =
{
    64, 128, 256, 512, 1024, 2048
};

#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket]

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE 2048

#else

/* Note that this must match the implementation of framesz_to_bucket in
 * asmilator/layout.ml! */
#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket)))

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE \
    FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1)

#endif
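
/* With FRAME_MALLOC_NBUCKETS == 6 the shift-based mapping yields the same
 * sizes as the table above:
 *   bucket 0 -> 64, 1 -> 128, 2 -> 256, 3 -> 512, 4 -> 1024, 5 -> 2048
 * so FRAME_MALLOC_MAX_SIZE works out to 2048 bytes either way. */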
/* utility procedures */
static void push(struct free_list **b, struct free_list *p)
{
#ifdef HEADER_FILL_CHAR
    memset(p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0));
#endif
    /* cons! onto free list */
    p->cdr = *b;
    *b = p;
}

static struct free_list *pop(struct free_list **b)
{
    struct free_list *p = *b;
    if (p)
        *b = p->cdr;
    return p;
}
/*************************************************************
 *************************************************************/
/* request slightly less than 2^K from the OS, which after malloc
   overhead and alignment should end up filling each VM page almost
   completely.  128 is a guess of the total malloc overhead and cache
   line alignment. */
#define FRAME_MALLOC_CHUNK (32 * 1024 - 128)
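
/* 32 * 1024 - 128 = 32640 bytes per request, so the request plus roughly
 * 128 bytes of allocator overhead still fits within eight 4096-byte pages. */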
/** Implements linked list of frames */
struct pool_cons {
    char *p;               /**< This element of the list */
    struct pool_cons *cdr; /**< Remainder of the list */
};
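
/* Grab another FRAME_MALLOC_CHUNK from the OS, record it on the pool list so
 * it can be freed at shutdown, and make it the current allocation pool. */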
static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    g->frame_malloc.pool_begin =
        (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);
    g->frame_malloc.pool_end =
        g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;
    c->p = g->frame_malloc.pool_begin;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}
/* the size is already canonicalized at this point */
static struct free_list *global_alloc(global_state_t *g, int bucket)
{
    struct free_list *mem;
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool += size;

    if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) {
        CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end);
        if (g->frame_malloc.pool_begin + size > g->frame_malloc.pool_end) {
            /* We waste the fragment of pool. */
            g->frame_malloc.wasted +=
                g->frame_malloc.pool_end - g->frame_malloc.pool_begin;
            extend_global_pool(g);
        }
        mem = (struct free_list *)g->frame_malloc.pool_begin;
        g->frame_malloc.pool_begin += size;
    }

    return mem;
}
static void global_free(global_state_t *g, void *mem, int bucket)
{
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool -= size;

    push(&g->frame_malloc.global_free_list[bucket], mem);
}
void __cilkrts_frame_malloc_global_init(global_state_t *g)
{
    int i;

    __cilkrts_mutex_init(&g->frame_malloc.lock);
    g->frame_malloc.check_for_leaks = 1;
    g->frame_malloc.pool_list = 0;
    g->frame_malloc.pool_begin = 0;
    g->frame_malloc.pool_end = 0;
    g->frame_malloc.batch_size = 8000;
    g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size;
    g->frame_malloc.allocated_from_os = 0;
    g->frame_malloc.allocated_from_global_pool = 0;
    g->frame_malloc.wasted = 0;
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        g->frame_malloc.global_free_list[i] = 0;
}
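
// Used by the leak check in __cilkrts_frame_malloc_global_cleanup() below.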
// Counts how many bytes are in the global free list.
static size_t count_memory_in_global_list(global_state_t *g)
{
    int i;
    struct free_list *p;

    // Count the memory remaining in the global free list.
    size_t size_remaining_in_global_list = 0;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        size_t size_in_bucket = 0;
        p = g->frame_malloc.global_free_list[i];

        while (p) {
            size_in_bucket += FRAME_MALLOC_BUCKET_TO_SIZE(i);
            p = p->cdr;
        }
        size_remaining_in_global_list += size_in_bucket;
    }
    return size_remaining_in_global_list;
}
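
// Tear down the global frame allocator: optionally check for leaks, return
// every chunk on the pool list to the OS, and destroy the pool lock.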
void __cilkrts_frame_malloc_global_cleanup(global_state_t *g)
{
    struct pool_cons *c;

    if (g->frame_malloc.check_for_leaks) {
        size_t memory_in_global_list = count_memory_in_global_list(g);
        // TBD: This check is weak.  Short of memory corruption,
        // I don't see how we have more memory in the free list
        // than allocated from the os.
        // Ideally, we should count the memory in the global free list
        // and check that we have it all.  But I believe the runtime
        // itself also uses some memory, which is not being tracked.
        if (memory_in_global_list > g->frame_malloc.allocated_from_os) {
            __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n");
        }
    }

    while ((c = g->frame_malloc.pool_list)) {
        g->frame_malloc.pool_list = c->cdr;
        __cilkrts_free(c->p);
        __cilkrts_free(c);
    }

    __cilkrts_mutex_destroy(0, &g->frame_malloc.lock);

    // Check that all the memory moved from the global pool into
    // workers has been returned to the global pool.
    if (g->frame_malloc.check_for_leaks
        && (g->frame_malloc.allocated_from_global_pool != 0)) {
        __cilkrts_bug("\n"
                      "---------------------------" "\n"
                      "  MEMORY LEAK DETECTED!!!  " "\n"
                      "---------------------------" "\n");
    }
}
/*************************************************************
 *************************************************************/
/* allocate a batch of frames of size SIZE from the global pool and
   store them in the worker's free list */
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size)
{
    global_state_t *g = w->g;

    __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
#if USE_MMAP
        /* Debugging aid: give each frame its own mapping with inaccessible
           guard pages on either side, so overruns fault immediately. */
        char *p = mmap(0, 12288, PROT_READ|PROT_WRITE,
                       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            __cilkrts_bug("mmap failed %d", errno);
        assert(p != MAP_FAILED);
        mprotect(p, 4096, PROT_NONE);
        mprotect(p + 8192, 4096, PROT_NONE);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size));
#else
        size_t bytes_allocated = 0;
        do {
            w->l->bucket_potential[bucket] += size;
            bytes_allocated += size;
            push(&w->l->free_list[bucket], global_alloc(g, bucket));
        } while (bytes_allocated < g->frame_malloc.batch_size);
#endif
    } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
}
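
/* Each worker tracks a per-bucket "potential": the number of bytes it has
 * pulled into or pushed onto its private free list for that bucket.  When a
 * free pushes the potential past g->frame_malloc.potential_limit, gc_bucket()
 * trims the list and returns the excess to the global pool. */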
static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;
    size_t pot = w->l->bucket_potential[bucket];
    size_t newpot;

    /* Keep up to POT/2 elements in the free list.  The cost of
       counting up to POT/2 is amortized against POT. */
    for (newpot = 0, p = w->l->free_list[bucket]; p && 2 * newpot < pot;
         p = p->cdr, newpot += size)
        ;
    w->l->bucket_potential[bucket] = newpot;

    if (p) {
        /* free the rest of the list.  The cost of grabbing the lock
           is amortized against POT/2; the cost of traversing the rest
           of the list is amortized against the free operation that
           puts the element on the list. */
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p->cdr))) {
#if USE_MMAP
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
            }
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }
}
// Free all the memory in this bucket for the specified worker,
// returning it to the global pool's free list.
static void move_bucket_to_global_free_list(__cilkrts_worker *w,
                                            int bucket)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;

    p = w->l->free_list[bucket];

    if (p) {
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p))) {
#if USE_MMAP
                size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
            }
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }

    // I'm not sure this does anything useful now, since
    // the worker is about to be destroyed.  But why not?
    w->l->bucket_potential[bucket] = 0;
}
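
/* Map a request size to the smallest bucket that can hold it; for example,
 * a 100-byte request falls into the 128-byte bucket (index 1). */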
static int bucket_of_size(size_t size)
{
    int i;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(i))
            return i;

    CILK_ASSERT(0 /* can't happen */);
    return -1;
}
size_t __cilkrts_frame_malloc_roundup(size_t size)
{
    if (size > FRAME_MALLOC_MAX_SIZE) {
        /* nothing, leave it alone */
    } else {
        int bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    }
    return size;
}
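
/* For example, __cilkrts_frame_malloc_roundup(100) returns 128, while any
 * request larger than FRAME_MALLOC_MAX_SIZE (2048) is returned unchanged. */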
size_t __cilkrts_size_of_bucket(int bucket)
{
    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);
    return FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
}
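
/* Fast path of the frame allocator: oversized requests (or calls with no
 * worker bound) fall back to __cilkrts_malloc(); everything else is served
 * from the worker's per-bucket free list, refilled in batches from the
 * global pool when it runs dry. */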
void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
    int bucket;
    void *mem;

    /* if too large, or if no worker, fall back to __cilkrts_malloc() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
        return __cilkrts_malloc(size);
    }

    START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);

        while (!(mem = pop(&w->l->free_list[bucket]))) {
            /* get a batch of frames from the global pool */
            START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
                allocate_batch(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);

    return mem;
}
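
/* The SIZE passed here must map to the same bucket as the size that was
 * passed to __cilkrts_frame_malloc() for this block, so the block goes back
 * onto the matching free list (or, for oversized blocks, to __cilkrts_free). */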
void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size)
{
    int bucket;
    struct free_list *p = (struct free_list *)p0;

    /* if too large, or if no worker, fall back to __cilkrts_free() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE);
        __cilkrts_free(p0);
        return;
    }

    START_INTERVAL(w, INTERVAL_FRAME_FREE); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], p);
        if (w->l->bucket_potential[bucket] >
            w->g->frame_malloc.potential_limit) {
            START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) {
                gc_bucket(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_FREE);
}
void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w)
{
    int i;
    local_state *l = w->l;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        l->free_list[i] = 0;
        l->bucket_potential[i] = 0;
    }
}
void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w)
{
    int i;

    // Move memory to the global pool.  This operation
    // ensures the memory does not become unreachable / leak
    // when the worker is destroyed.
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        move_bucket_to_global_free_list(w, i);
    }
}
/* Local Variables: **
 * c-file-style:"bsd" **
 * indent-tabs-mode:nil **
 * End: **
 */