1 /* cilk-abi-vla.cpp -*-C++-*-
3 *************************************************************************
6 * Copyright (C) 2013, Intel Corporation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
20 * * Neither the name of Intel Corporation nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
38 **************************************************************************/
41 * Implementation of Variable Length Array (VLA) ABI.
43 * __cilkrts_stack_alloc() and __cilkrts_stack_free must be compiled
44 * such that ebp/rbp is used for the stack frames. This is done by having
45 * each of them use alloca, which forces the special frame types needed on
46 * each of the ABIs. Additionally, for some forms of stack frame, special
47 * care must be taken because the alloca space may not be at the bottom of the
48 * stack frame of the caller. For Intel64 windows, and for some options
49 * with other ABIs, a preallocated parameter block may exist on the stack
50 * at a lower address than the alloca. If this is the case, the parameter
51 * distance_from_sp_to_alloca_area will be non-zero, and will indicate how
52 * much pre-allocated parameter space resides in the caller's stack frame
53 * between the alloca area, and the bottom of the stack when the call to
54 * the cilkrts is made. As such, when non-zero it also includes any space
55 * used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters.
62 // Getting a definition for alloca appears to be a pain in the butt. Here's
63 // a variant on what's recommended in the autoconf doc
66 # define alloca _alloca
67 #elif defined HAVE_ALLOCA_H
69 #elif defined __GNUC__
70 # define alloca __builtin_alloca
72 # define alloca __alloca
78 void *alloca (size_t);
82 # define INLINE static __inline
83 # pragma warning(disable:1025) // Don't whine about zero extending result of unary operation
85 # define INLINE static inline
89 #include "internal/abi.h"
90 #include "cilk-abi-vla-internal.h"
92 #if defined(__x86_64) || defined(_M_X64)
93 INLINE
void setsp(void *val
)
95 __asm__("movq %0, %%rsp" : : "r"(val
): "rsp");
97 INLINE
char* getsp(void)
101 __asm__("movq %%rsp, %0" : "=r"(res
): : "rsp");
104 INLINE
char* getbp(void)
108 __asm__("movq %%rbp, %0" : "=r"(res
): : "rbp");
111 INLINE
void copy_frame_down_and_move_bp(
118 // In this version, dst is guaranteed to be lower address than src,
119 // therefore copying upwards from src into dst is safe in case
120 // there is overlap. The number of bytes is also guaranteed to be
121 // a multiple of 8, and the copy is done in 64 bit word chunks for
131 "rm"(dst
), "rm"(src
), "rm"(cpy_bytes
), "rm"(new_ebp
) :
132 "rsi", "rdi", "rcx", "rbp", "memory");
134 INLINE
void copy_frame_up_and_move_bp(
141 // In this version, dst is guaranteed to be higher address than src,
142 // therefore copying downwards from src into dst is safe in case
143 // there is overlap. The number of bytes is also guaranteed to be
144 // a multiple of 8, and the copy is done in 64 bit word chunks for
146 dst
+= cpy_bytes
- 8;
147 src
+= cpy_bytes
- 8;
153 "std; rep movsq; cld;"
156 "rm"(dst
), "rm"(src
), "rm"(cpy_bytes
), "rm"(new_ebp
) :
157 "rsi", "rdi", "rcx", "rbp", "memory");
160 INLINE
void setsp(void *val
)
162 __asm__("movl %0, %%esp" : : "r"(val
): "esp");
164 INLINE
char* getsp(void)
168 __asm__("movl %%esp, %0" : "=r"(res
): : "esp");
171 INLINE
char* getbp(void)
175 __asm__("movl %%ebp, %0" : "=r"(res
): : "ebp");
178 INLINE
void copy_frame_down_and_move_bp(
185 // In this version, dst is guaranteed to be lower address than src,
186 // therefore copying upwards from src into dst is safe in case
187 // there is overlap. The number of bytes is also guaranteed to be
188 // a multiple of 4, and the copy is done in 32 bit word chunks for
198 "rm"(dst
), "rm"(src
), "rm"(cpy_bytes
), "rm"(new_ebp
) :
199 "esi", "edi", "ecx", "ebp", "memory");
201 INLINE
void copy_frame_up_and_move_bp(
208 // In this version, dst is guaranteed to be higher address than src,
209 // therefore copying downwards from src into dst is safe in case
210 // there is overlap. The number of bytes is also guaranteed to be
211 // a multiple of 4, and the copy is done in 32 bit word chunks for
213 dst
+= cpy_bytes
- 4;
214 src
+= cpy_bytes
- 4;
220 "std; rep movsd; cld;"
222 // "=D"(dst), "=S"(src), "=C"(cpy_bytes) :
224 "rm"(dst
), "rm"(src
), "rm"(cpy_bytes
), "rm"(new_ebp
) :
225 "esi", "edi", "ecx", "ebp", "memory");
230 #define c_cilk_ptr_from_heap 0xc2f2f00d
231 #define c_cilk_ptr_from_stack 0xc3f30d0f
233 CILK_ABI(__cilkrts_void_ptr
)
234 __cilkrts_stack_alloc(
235 __cilkrts_stack_frame
*sf
,
237 size_t distance_from_sp_to_alloca_area
,
238 uint32_t align
, // align is always >= minimum stack alignment and
239 // >= ptr_size as well, and must be a power of 2.
240 uint32_t needs_tag
// non-zero if the pointer being returned needs to
244 #ifdef __INTEL_COMPILER
245 // full_size will be a multiple of align, and contains
246 // enough extra space to allocate a marker.
247 size_t full_size
= (size
+ align
- 1) & ~(align
- 1);
254 if (sf
->worker
!= 0 &&
255 ((sf
->flags
& CILK_FRAME_UNSYNCHED
) != 0)) {
256 t
= vla_internal_heap_alloc(sf
, full_size
, align
);
259 ((uint32_t*)t
)[-1] = c_cilk_ptr_from_heap
;
264 // stack is still synced, allocate full_size from esp,
265 // and record in 32 bits immediately below the space
266 // allocated that this was space that this was
267 // allocated in the stack.
268 char *old_ebp
= getbp();
269 char *old_esp
= getsp();
271 // make top_ptr point to base of first parameter.
272 char *top_ptr
= ((char *)(_AddressOfReturnAddress()) +
274 size_t param_size
= 0;
276 #if defined(__x86_64)
277 // For Intel64 linux & MACH ABI, all the parameters were passed in
278 // register, so top of the stack frame above the return address
279 // is just the size of the return address plus
280 // distance_from_sp_to_alloca_area on the chance that the alloca
281 // area isn't at the very bottom of the calling functions stack.
282 #elif defined(__MACH__)
283 // For ia32 MACH, parameter size is always a mutliple of 16
284 // bytes to keep the stack 16 byte aligned. So we need to round
285 // number of parameters up to multiple of 4.
286 param_size
= 8 * sizeof(char *);
288 // For both windows Intel64 ABI, and the IA32 windows and
289 // linux ABIs, space is reserved on the stack for all these
290 // parameters. param_size is 5 * size of a stack slot.
291 param_size
= 5 * sizeof(char *);
294 // now make top_ptr point above the params, or if
295 // distance_from_sp_to_alloca_area is not zero, make
296 // it point above that area. When non-zero,
297 // distance_from_sp_to_alloca area is expected to contain
298 // the parameter space, so we only add one or the other,
300 top_ptr
+= (distance_from_sp_to_alloca_area
!= 0) ?
301 distance_from_sp_to_alloca_area
: param_size
;
303 // t needs to end up at current value of top_ptr less full_size and less
304 // distance_from_sp_to_alloca_area and
305 // then rounded down to the alignment needed. Then we have to bump
306 // esp down by current frame_size, so that when all is done with respect
307 // to executing the return sequence, the final value of esp will be the
309 t
= (top_ptr
- full_size
) - distance_from_sp_to_alloca_area
;
310 intptr_t temp
= (intptr_t)t
;
311 temp
&= ~((intptr_t)(align
- 1));
314 // ok, the value of t is set where we need it. Now set esp
315 // to the value of t less the current frame size.
316 // So now when we do regular return esp should be left such
317 // that it has moved down by full_size.
318 size_t cur_fm_size
= (top_ptr
- old_esp
);
319 char *new_esp
= t
- cur_fm_size
;
320 char *new_ebp
= old_ebp
- (old_esp
- new_esp
);
322 // extend the stack down by at least the difference between where
323 // I want it to be and where it currently is. This should take care
324 // of touching any pages necessary.
325 char *foo
= alloca(old_esp
- new_esp
);
326 setsp(foo
< new_esp
? foo
: new_esp
);
328 // Now set esp exactly where I want it.
331 copy_frame_down_and_move_bp(new_esp
, old_esp
, cur_fm_size
, new_ebp
);
335 ((uint32_t*)t
)[-1] = c_cilk_ptr_from_stack
;
339 #else // Not __INTEL_COMPILER
340 // Not supported unless we can figure out how to get the size of the frame
345 // This frees the space allocated for a variable length array.
347 __cilkrts_stack_free(
348 __cilkrts_stack_frame
*sf
,
351 size_t distance_from_sp_to_alloca_area
,
352 uint32_t align
, // same requirements as for align in allocation,
353 // and must match alignment that was passed when
354 // doing the allocation
355 uint32_t known_from_stack
// non-zero if this is known to be allocated
356 // on the stack, and therefore has no tag
359 #ifdef __INTEL_COMPILER
360 uint32_t *t
= (uint32_t*)p
;
362 // full_size will be a multiple of align, and contains
363 // enough extra space to allocate a marker if one was needed.
364 size_t full_size
= (size
+ align
- 1) & ~(align
- 1);
365 if (known_from_stack
== 0) {
366 // if the compiler hasn't told the run-time that this is
367 // known to be on the stack, then this pointer must have been
368 // tagged such that the run-time can tell.
369 assert(t
[-1] == c_cilk_ptr_from_stack
||
370 t
[-1] == c_cilk_ptr_from_heap
);
372 known_from_stack
= t
[-1] == c_cilk_ptr_from_stack
;
373 full_size
+= align
; // accounts for extra space for marker
374 t
= (uint32_t *)(((char *)t
) - align
);
377 if (known_from_stack
) {
378 // alloca useage forces an ebp/rbp based stack frame even though
380 char *foo
= alloca(0);
381 if (sf
->worker
== 0 || (sf
->flags
& CILK_FRAME_UNSYNCHED
) == 0) {
382 // p was allocated from current stack frame and we
383 // are synced on current stack frame. Return the
384 // amount of the stack that needs to be freed.
385 char *old_ebp
= getbp();
386 char *old_esp
= getsp();
388 // make top_ptr point to base of first parameter.
389 char *top_ptr
= ((char *)(_AddressOfReturnAddress()) +
391 size_t param_size
= 0;
393 #if defined(__x86_64)
394 // For Intel64 linux & MACH ABI, all the parameters were passed in
395 // register, so top of the stack frame above the return address
396 // is just the size of the return address plus
397 // distance_from_sp_to_alloca_area on the chance that the alloca
398 // area isn't at the very bottom of the calling functions stack.
399 #elif defined(__MACH__)
400 // For ia32 MACH, parameter size is always a mutliple of 16
401 // bytes to keep the stack 16 byte aligned. So we need to round
402 // number of parameters up to multiple of 4.
403 param_size
= 8 * sizeof(char *);
405 // For both windows Intel64 ABI, and the IA32 windows and
406 // linux ABIs, space is reserved on the stack for all these
407 // parameters. param_size is 5 * size of a stack slot.
408 param_size
= 6 * sizeof(char *);
411 // now make top_ptr point above the params, or if
412 // distance_from_sp_to_alloca_area is not zero, make
413 // it point above that area. When non-zero,
414 // distance_from_sp_to_alloca area is expected to contain
415 // the parameter space, so we only add one or the other,
417 top_ptr
+= (distance_from_sp_to_alloca_area
!= 0) ?
418 distance_from_sp_to_alloca_area
: param_size
;
420 size_t cur_fm_size
= (top_ptr
- old_esp
);
421 char *new_esp
= old_esp
+ full_size
;
422 char *new_ebp
= old_ebp
+ full_size
;
424 copy_frame_up_and_move_bp(new_esp
, old_esp
, cur_fm_size
, new_ebp
);
428 // p was allocated on stack frame, but that is
429 // no longer the current stack frame. Need to adjust the
430 // saved esp that is somewhere in the cilk runtime so that
431 // on sync, esp will be cut back correctly.
432 vla_free_from_original_stack(sf
, full_size
);
436 vla_internal_heap_free(t
, full_size
);
438 #else // Not __INTEL_COMPILER
439 // Not supported unless we can figure out how to get the size of the frame