1 /* -----------------------------------------------------------------------
2 unix64.S - Copyright (c) 2013 The Written Word, Inc.
3 - Copyright (c) 2008 Red Hat, Inc
4 - Copyright (c) 2002 Bo Thorsen <bo@suse.de>
6 x86-64 Foreign Function Interface
8 Permission is hereby granted, free of charge, to any person obtaining
9 a copy of this software and associated documentation files (the
10 ``Software''), to deal in the Software without restriction, including
11 without limitation the rights to use, copy, modify, merge, publish,
12 distribute, sublicense, and/or sell copies of the Software, and to
13 permit persons to whom the Software is furnished to do so, subject to
14 the following conditions:
16 The above copyright notice and this permission notice shall be included
17 in all copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
20 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 ----------------------------------------------------------------------- */
31 #include <fficonfig.h>
33 #include "internal64.h"
/* Token-pasting helpers: C2 pastes its two arguments together; C1 goes
   through C2 so that macro arguments are expanded before being pasted. */
37 #define C2(X, Y) X ## Y
38 #define C1(X, Y) C2(X, Y)
/* C(X) prepends __USER_LABEL_PREFIX__ to X to form a global symbol name. */
39 #ifdef __USER_LABEL_PREFIX__
40 # define C(X) C1(__USER_LABEL_PREFIX__, X)
/* L(X) forms a local label name. Two spellings are defined below, one with
   an "L" prefix and one with a ".L" prefix.
   NOTE(review): the conditionals that choose between them (and that close
   the #ifdef above) are not visible in this listing -- confirm against the
   complete file. */
46 # define L(X) C1(L, X)
48 # define L(X) C1(.L, X)
/* ENDF(X) is placed after a function body: it types X as a function symbol
   and sets its size to the distance from X to the current location. */
53 # define ENDF(X) .type X,@function; .size X, . - X
59 /* This macro allows the safe creation of jump tables without an
60 actual table. The entry points into the table are all 8 bytes.
61 The use of ORG asserts that we're at the correct location. */
/* E(BASE, X) is written immediately before the code for table slot X.
   Both definitions align to 8 bytes; the second one additionally uses
   .org to place the slot at BASE + X * 8. */
62 /* ??? The clang assembler doesn't handle .org with symbolic expressions. */
/* For the toolchains tested below the .org check is therefore dropped and
   only the 8-byte alignment remains.
   NOTE(review): the #else/#endif separating the two definitions are not
   visible in this listing. */
63 #if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
64 # define E(BASE, X) .balign 8
66 # define E(BASE, X) .balign 8; .org BASE + X * 8
69 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
70 void *raddr, void (*fnaddr)(void));
72 A bit of trickiness here -- ARGS+BYTES is the base of the stack frame
73 for this function. This has been allocated by ffi_call. We also
74 deallocate some of the stack that has been alloca'd. */
/* Incoming registers, as used by the code below:
     %rdi = args    base of the register-argument area (copied to %r10)
     %rsi = bytes   args + bytes is where the local frame is written
     %rdx = flags   saved at 0(%rbp); its low byte is compared against
                    UNIX64_RET_LAST after the call
     %rcx = raddr   saved at 8(%rbp), reloaded into %rdi after the call
     %r8  = fnaddr  copied to %r11
   Local frame layout at args + bytes (8-byte slots):
     +0 flags, +8 raddr, +16 caller's %rbp, +24 return address.
   NOTE(review): the embedded line numbers in this listing skip, so labels,
   branches and some instructions of this function are not shown here. */
77 .globl C(ffi_call_unix64)
78 FFI_HIDDEN(C(ffi_call_unix64))
/* Build the local frame at args + bytes, then make %rbp point at it. */
82 movq (%rsp), %r10 /* Load return address. */
83 leaq (%rdi, %rsi), %rax /* Find local stack base. */
84 movq %rdx, (%rax) /* Save flags. */
85 movq %rcx, 8(%rax) /* Save raddr. */
86 movq %rbp, 16(%rax) /* Save old frame pointer. */
87 movq %r10, 24(%rax) /* Relocate return address. */
88 movq %rax, %rbp /* Finalize local stack frame. */
90 /* New stack frame based off rbp. This is an itty bit of unwind
91 trickery in that the CFA *has* changed. There is no easy way
92 to describe it correctly on entry to the function. Fortunately,
93 it doesn't matter too much since at all points we can correctly
94 unwind back to ffi_call. Note that the location to which we
95 moved the return address is (the new) CFA-8, so from the
96 perspective of the unwind info, it hasn't moved. */
98 /* cfi_def_cfa(%rbp, 32) */
99 /* cfi_rel_offset(%rbp, 16) */
/* The argument registers are reloaded below, so the area base and the
   call target are first moved into the scratch registers %r10 and %r11. */
101 movq %rdi, %r10 /* Save a copy of the register area. */
102 movq %r8, %r11 /* Save a copy of the target fn. */
103 movl %r9d, %eax /* Set number of SSE registers. */
/* NOTE(review): %r9 is not part of the five-parameter prototype in the
   header comment, and %eax is loaded again from 0xb0(%r10) below --
   confirm whether the move above is still required. */
105 /* Load up all argument registers. */
/* Integer argument registers come from consecutive 8-byte slots of the
   register area. NOTE(review): only the %rsi, %rdx and %rcx loads are
   visible in this listing. */
107 movq 0x08(%r10), %rsi
108 movq 0x10(%r10), %rdx
109 movq 0x18(%r10), %rcx
/* Load %eax from offset 0xb0 of the register area. NOTE(review):
   presumably the SSE register count that decides whether the SSE-load
   code further down is entered; the test/branch is not visible here. */
112 movl 0xb0(%r10), %eax
/* The SSE-load code near the end of this function jumps back to this label. */
115 L(ret_from_load_sse):
117 /* Deallocate the reg arg area, except for r10, then load via pop. */
118 leaq 0xb8(%r10), %rsp
121 /* Call the user function. */
124 /* Deallocate stack arg area; local stack frame in redzone. */
/* Recover the values stored in the local frame on entry. */
127 movq 0(%rbp), %rcx /* Reload flags. */
128 movq 8(%rbp), %rdi /* Reload raddr. */
129 movq 16(%rbp), %rbp /* Reload old frame pointer. */
131 /* cfi_remember_state */
132 /* cfi_def_cfa(%rsp, 8) */
133 /* cfi_restore(%rbp) */
135 /* The first byte of the flags contains the FFI_TYPE. */
136 cmpb $UNIX64_RET_LAST, %cl
/* Compute the address of a store_table slot: table base + 8 * %r10.
   NOTE(review): the instruction that sets %r10 to the slot index and the
   jump through %r10 are not visible in this listing. */
138 leaq L(store_table)(%rip), %r11
140 leaq (%r11, %r10, 8), %r10
142 /* Prep for the structure cases: scratch area in redzone. */
/* One E() per UNIX64_RET_* code; each marks the start of that code's
   8-byte slot in store_table. NOTE(review): the slot bodies are not
   visible in this listing. */
148 E(L(store_table), UNIX64_RET_VOID)
150 E(L(store_table), UNIX64_RET_UINT8)
154 E(L(store_table), UNIX64_RET_UINT16)
158 E(L(store_table), UNIX64_RET_UINT32)
162 E(L(store_table), UNIX64_RET_SINT8)
166 E(L(store_table), UNIX64_RET_SINT16)
170 E(L(store_table), UNIX64_RET_SINT32)
174 E(L(store_table), UNIX64_RET_INT64)
177 E(L(store_table), UNIX64_RET_XMM32)
180 E(L(store_table), UNIX64_RET_XMM64)
183 E(L(store_table), UNIX64_RET_X87)
186 E(L(store_table), UNIX64_RET_X87_2)
190 E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
193 E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
196 E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
199 E(L(store_table), UNIX64_RET_ST_RAX_RDX)
/* Shift the flags word right by UNIX64_SIZE_SHIFT. NOTE(review):
   presumably this leaves a size field used by the structure cases; the
   surrounding labels and copy code are not visible here. */
203 shrl $UNIX64_SIZE_SHIFT, %ecx
209 shrl $UNIX64_SIZE_SHIFT, %ecx
/* Calls abort() through the PLT. NOTE(review): presumably the target for
   a return-type byte above UNIX64_RET_LAST; the branch to this label is
   not visible in this listing. */
213 L(sa): call PLT(C(abort))
215 /* Many times we can avoid loading any SSE registers at all.
216 It's not worth an indirect jump to load the exact set of
217 SSE registers needed; zero or all is a good compromise. */
220 /* cfi_restore_state */
/* Load %xmm0-%xmm7 from offsets 0x30..0xa0 of the register area (16 bytes
   apart). movdqa requires each of these addresses to be 16-byte aligned. */
222 movdqa 0x30(%r10), %xmm0
223 movdqa 0x40(%r10), %xmm1
224 movdqa 0x50(%r10), %xmm2
225 movdqa 0x60(%r10), %xmm3
226 movdqa 0x70(%r10), %xmm4
227 movdqa 0x80(%r10), %xmm5
228 movdqa 0x90(%r10), %xmm6
229 movdqa 0xa0(%r10), %xmm7
230 jmp L(ret_from_load_sse)
233 ENDF(C(ffi_call_unix64))
235 /* 6 general registers, 8 vector registers,
236 32 bytes of rvalue, 8 bytes of alignment. */
/* Closure frame layout, as offsets from %rsp after the frame is allocated:
     ffi_closure_OFS_G      =   0   general registers, 6 * 8 bytes
     ffi_closure_OFS_V      =  48   vector registers,  8 * 16 bytes
     ffi_closure_OFS_RVALUE = 176   32-byte return value buffer
     ffi_closure_FS         = 216   total frame size (176 + 32 + 8) */
237 #define ffi_closure_OFS_G 0
238 #define ffi_closure_OFS_V (6*8)
239 #define ffi_closure_OFS_RVALUE (ffi_closure_OFS_V + 8*16)
240 #define ffi_closure_FS (ffi_closure_OFS_RVALUE + 32 + 8)
242 /* The location of rvalue within the red zone after deallocating the frame. */
/* Evaluates to 176 - 216 = -40, i.e. 40 bytes below the restored %rsp. */
243 #define ffi_closure_RED_RVALUE (ffi_closure_OFS_RVALUE - ffi_closure_FS)
/* ffi_closure_unix64_sse: closure entry point that saves the SSE argument
   registers. It allocates the ffi_closure_FS-byte frame and stores
   %xmm0-%xmm7 into the vector area at ffi_closure_OFS_V.
   NOTE(review): how control continues after the stores is not visible in
   this listing (ENDF follows directly); presumably it joins the common
   path in ffi_closure_unix64 -- confirm against the complete file. */
246 .globl C(ffi_closure_unix64_sse)
247 FFI_HIDDEN(C(ffi_closure_unix64_sse))
249 C(ffi_closure_unix64_sse):
251 subq $ffi_closure_FS, %rsp
253 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
/* One 16-byte slot per vector register; movdqa requires each slot address
   to be 16-byte aligned. */
255 movdqa %xmm0, ffi_closure_OFS_V+0x00(%rsp)
256 movdqa %xmm1, ffi_closure_OFS_V+0x10(%rsp)
257 movdqa %xmm2, ffi_closure_OFS_V+0x20(%rsp)
258 movdqa %xmm3, ffi_closure_OFS_V+0x30(%rsp)
259 movdqa %xmm4, ffi_closure_OFS_V+0x40(%rsp)
260 movdqa %xmm5, ffi_closure_OFS_V+0x50(%rsp)
261 movdqa %xmm6, ffi_closure_OFS_V+0x60(%rsp)
262 movdqa %xmm7, ffi_closure_OFS_V+0x70(%rsp)
266 ENDF(C(ffi_closure_unix64_sse))
/* ffi_closure_unix64: closure entry point. It allocates the
   ffi_closure_FS-byte frame, saves the six integer argument registers,
   loads cif/fun/user_data from memory addressed through %r10, calls
   ffi_closure_unix64_inner, and then selects a load_table slot from the
   value returned in %al.
   NOTE(review): the embedded line numbers in this listing skip, so labels,
   branches, preprocessor conditionals and the table slot bodies are not
   shown here. */
269 .globl C(ffi_closure_unix64)
270 FFI_HIDDEN(C(ffi_closure_unix64))
272 C(ffi_closure_unix64):
274 subq $ffi_closure_FS, %rsp
276 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
/* Save the integer argument registers into consecutive 8-byte slots of
   the general-register area. */
278 movq %rdi, ffi_closure_OFS_G+0x00(%rsp)
279 movq %rsi, ffi_closure_OFS_G+0x08(%rsp)
280 movq %rdx, ffi_closure_OFS_G+0x10(%rsp)
281 movq %rcx, ffi_closure_OFS_G+0x18(%rsp)
282 movq %r8, ffi_closure_OFS_G+0x20(%rsp)
283 movq %r9, ffi_closure_OFS_G+0x28(%rsp)
/* Two variants of the cif/fun/user_data loads follow: 32-bit loads from
   fields 4 bytes apart, then 64-bit loads from fields 8 bytes apart, both
   starting FFI_TRAMPOLINE_SIZE bytes past the address in %r10.
   NOTE(review): the preprocessor conditional selecting one variant is not
   visible in this listing; presumably the 32-bit form is for an ILP32
   build -- confirm. */
286 movl FFI_TRAMPOLINE_SIZE(%r10), %edi /* Load cif */
287 movl FFI_TRAMPOLINE_SIZE+4(%r10), %esi /* Load fun */
288 movl FFI_TRAMPOLINE_SIZE+8(%r10), %edx /* Load user_data */
290 movq FFI_TRAMPOLINE_SIZE(%r10), %rdi /* Load cif */
291 movq FFI_TRAMPOLINE_SIZE+8(%r10), %rsi /* Load fun */
292 movq FFI_TRAMPOLINE_SIZE+16(%r10), %rdx /* Load user_data */
/* Remaining arguments for ffi_closure_unix64_inner: %rcx = rvalue buffer
   inside this frame, %r8 = start of the saved-register area, %r9 = the
   address ffi_closure_FS + 8 bytes above %rsp (just past this frame and
   the 8-byte return address). */
295 leaq ffi_closure_OFS_RVALUE(%rsp), %rcx /* Load rvalue */
296 movq %rsp, %r8 /* Load reg_args */
297 leaq ffi_closure_FS+8(%rsp), %r9 /* Load argp */
298 call C(ffi_closure_unix64_inner)
300 /* Deallocate stack frame early; return value is now in redzone. */
301 addq $ffi_closure_FS, %rsp
303 /* cfi_adjust_cfa_offset(-ffi_closure_FS) */
305 /* The first byte of the return value contains the FFI_TYPE. */
306 cmpb $UNIX64_RET_LAST, %al
/* Compute the address of a load_table slot: table base + 8 * %r10, and
   point %rsi at the return value buffer, which now lies below %rsp.
   NOTE(review): the instruction that sets %r10 to the slot index and the
   jump through %r10 are not visible in this listing. */
308 leaq L(load_table)(%rip), %r11
310 leaq (%r11, %r10, 8), %r10
311 leaq ffi_closure_RED_RVALUE(%rsp), %rsi
/* One E() per UNIX64_RET_* code; each marks the start of that code's
   8-byte slot in load_table. NOTE(review): the slot bodies are not
   visible in this listing. */
316 E(L(load_table), UNIX64_RET_VOID)
318 E(L(load_table), UNIX64_RET_UINT8)
321 E(L(load_table), UNIX64_RET_UINT16)
324 E(L(load_table), UNIX64_RET_UINT32)
327 E(L(load_table), UNIX64_RET_SINT8)
330 E(L(load_table), UNIX64_RET_SINT16)
333 E(L(load_table), UNIX64_RET_SINT32)
336 E(L(load_table), UNIX64_RET_INT64)
339 E(L(load_table), UNIX64_RET_XMM32)
342 E(L(load_table), UNIX64_RET_XMM64)
345 E(L(load_table), UNIX64_RET_X87)
348 E(L(load_table), UNIX64_RET_X87_2)
352 E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
355 E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
358 E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
361 E(L(load_table), UNIX64_RET_ST_RAX_RDX)
/* Calls abort() through the PLT. NOTE(review): presumably the target for
   a return-type byte above UNIX64_RET_LAST; the branch to this label is
   not visible in this listing. */
371 L(la): call PLT(C(abort))
374 ENDF(C(ffi_closure_unix64))
/* ffi_go_closure_unix64_sse: Go-closure entry point that saves the SSE
   argument registers. It allocates the ffi_closure_FS-byte frame and
   stores %xmm0-%xmm7 into the vector area at ffi_closure_OFS_V.
   NOTE(review): how control continues after the stores is not visible in
   this listing (ENDF follows directly); presumably it joins the common
   path in ffi_go_closure_unix64 -- confirm against the complete file. */
377 .globl C(ffi_go_closure_unix64_sse)
378 FFI_HIDDEN(C(ffi_go_closure_unix64_sse))
380 C(ffi_go_closure_unix64_sse):
382 subq $ffi_closure_FS, %rsp
384 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
/* One 16-byte slot per vector register; movdqa requires each slot address
   to be 16-byte aligned. */
386 movdqa %xmm0, ffi_closure_OFS_V+0x00(%rsp)
387 movdqa %xmm1, ffi_closure_OFS_V+0x10(%rsp)
388 movdqa %xmm2, ffi_closure_OFS_V+0x20(%rsp)
389 movdqa %xmm3, ffi_closure_OFS_V+0x30(%rsp)
390 movdqa %xmm4, ffi_closure_OFS_V+0x40(%rsp)
391 movdqa %xmm5, ffi_closure_OFS_V+0x50(%rsp)
392 movdqa %xmm6, ffi_closure_OFS_V+0x60(%rsp)
393 movdqa %xmm7, ffi_closure_OFS_V+0x70(%rsp)
397 ENDF(C(ffi_go_closure_unix64_sse))
/* ffi_go_closure_unix64: Go-closure entry point. It allocates the
   ffi_closure_FS-byte frame, saves the six integer argument registers,
   and loads cif and fun from the object addressed by %r10; %r10 itself is
   passed on as user_data.
   NOTE(review): the continuation after these loads is not visible in this
   listing (ENDF follows directly); presumably it jumps into the common
   path of ffi_closure_unix64 -- confirm against the complete file. */
400 .globl C(ffi_go_closure_unix64)
401 FFI_HIDDEN(C(ffi_go_closure_unix64))
403 C(ffi_go_closure_unix64):
405 subq $ffi_closure_FS, %rsp
407 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
/* Save the integer argument registers into consecutive 8-byte slots of
   the general-register area. */
409 movq %rdi, ffi_closure_OFS_G+0x00(%rsp)
410 movq %rsi, ffi_closure_OFS_G+0x08(%rsp)
411 movq %rdx, ffi_closure_OFS_G+0x10(%rsp)
412 movq %rcx, ffi_closure_OFS_G+0x18(%rsp)
413 movq %r8, ffi_closure_OFS_G+0x20(%rsp)
414 movq %r9, ffi_closure_OFS_G+0x28(%rsp)
/* Two variants of the loads follow: 32-bit loads from offsets 4 and 8,
   then 64-bit loads from offsets 8 and 16.
   NOTE(review): the preprocessor conditional selecting one variant is not
   visible in this listing; presumably the 32-bit form is for an ILP32
   build -- confirm. */
417 movl 4(%r10), %edi /* Load cif */
418 movl 8(%r10), %esi /* Load fun */
419 movl %r10d, %edx /* Load closure (user_data) */
421 movq 8(%r10), %rdi /* Load cif */
422 movq 16(%r10), %rsi /* Load fun */
423 movq %r10, %rdx /* Load closure (user_data) */
428 ENDF(C(ffi_go_closure_unix64))
430 /* Sadly, OSX cctools-as doesn't understand .cfi directives at all. */
/* The unwind information is therefore written out by hand below. Three
   alternative .section directives select where it goes: a Mach-O
   __TEXT,__eh_frame section, or an ELF .eh_frame section typed either
   @unwind or @progbits.
   NOTE(review): the opening #if and the #else/#endif of this conditional
   are not visible in this listing. */
433 .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
435 #elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
436 .section .eh_frame,"a",@unwind
438 .section .eh_frame,"a",@progbits
/* PCREL(X) spells a reference to X relative to the current location,
   either as "X - ." or with the @rel suffix.
   NOTE(review): the #else/#endif between the two definitions are not
   visible in this listing. */
441 #ifdef HAVE_AS_X86_PCREL
442 # define PCREL(X) X - .
444 # define PCREL(X) X@rel
447 /* Simplify advancing between labels. Assume DW_CFA_advance_loc1 fits. */
/* ADV(N, P) emits opcode byte 2 followed by a one-byte delta L(N)-L(P),
   so the two labels must be less than 256 bytes apart. */
448 #define ADV(N, P) .byte 2, L(N)-L(P)
/* Common Information Entry shared by the FDEs that follow. Its initial
   rules are CFA = %rsp + 8 with the return address saved at CFA - 8.
   NOTE(review): the L(CIE), L(SCIE) and L(ECIE) labels referenced here
   are not visible in this listing. */
452 .set L(set0),L(ECIE)-L(SCIE)
453 .long L(set0) /* CIE Length */
455 .long 0 /* CIE Identifier Tag */
456 .byte 1 /* CIE Version */
457 .ascii "zR\0" /* CIE Augmentation */
458 .byte 1 /* CIE Code Alignment Factor */
459 .byte 0x78 /* CIE Data Alignment Factor (sleb128 -8) */
460 .byte 0x10 /* CIE RA Column (register 16) */
461 .byte 1 /* Augmentation size */
462 .byte 0x1b /* FDE Encoding (pcrel sdata4) */
463 .byte 0xc, 7, 8 /* DW_CFA_def_cfa, %rsp offset 8 */
464 .byte 0x80+16, 1 /* DW_CFA_offset, %rip offset 1*-8 */
/* FDE 1, covering L(UW0)..L(UW4). Its rules (CFA = %rbp + 32, %rbp saved
   at CFA - 16, then a remembered/restored switch to CFA = %rsp + 8) match
   the cfi_* comments in ffi_call_unix64.
   NOTE(review): the L(UWn) labels and the ADV() lines that separate these
   instructions are not visible in this listing. */
468 .set L(set1),L(EFDE1)-L(SFDE1)
469 .long L(set1) /* FDE Length */
471 .long L(SFDE1)-L(CIE) /* FDE CIE offset */
472 .long PCREL(L(UW0)) /* Initial location */
473 .long L(UW4)-L(UW0) /* Address range */
474 .byte 0 /* Augmentation size */
476 .byte 0xc, 6, 32 /* DW_CFA_def_cfa, %rbp 32 */
477 .byte 0x80+6, 2 /* DW_CFA_offset, %rbp 2*-8 */
479 .byte 0xa /* DW_CFA_remember_state */
480 .byte 0xc, 7, 8 /* DW_CFA_def_cfa, %rsp 8 */
481 .byte 0xc0+6 /* DW_CFA_restore, %rbp */
483 .byte 0xb /* DW_CFA_restore_state */
/* FDE 2, covering L(UW5)..L(UW7): after the frame is allocated the CFA
   offset becomes ffi_closure_FS + 8.
   NOTE(review): presumably this describes ffi_closure_unix64_sse; the
   L(UWn) labels and ADV() lines are not visible in this listing. */
487 .set L(set2),L(EFDE2)-L(SFDE2)
488 .long L(set2) /* FDE Length */
490 .long L(SFDE2)-L(CIE) /* FDE CIE offset */
491 .long PCREL(L(UW5)) /* Initial location */
492 .long L(UW7)-L(UW5) /* Address range */
493 .byte 0 /* Augmentation size */
495 .byte 0xe /* DW_CFA_def_cfa_offset */
496 .byte ffi_closure_FS + 8, 1 /* two-byte uleb128, assuming 128 <= FS + 8 <= 255 */
/* FDE 3, covering L(UW8)..L(UW11): the CFA offset becomes
   ffi_closure_FS + 8 while the frame is allocated and returns to 8 once
   it is deallocated, matching the cfi_adjust_cfa_offset comments in
   ffi_closure_unix64.
   NOTE(review): the L(UWn) labels and ADV() lines are not visible in this
   listing. */
500 .set L(set3),L(EFDE3)-L(SFDE3)
501 .long L(set3) /* FDE Length */
503 .long L(SFDE3)-L(CIE) /* FDE CIE offset */
504 .long PCREL(L(UW8)) /* Initial location */
505 .long L(UW11)-L(UW8) /* Address range */
506 .byte 0 /* Augmentation size */
508 .byte 0xe /* DW_CFA_def_cfa_offset */
509 .byte ffi_closure_FS + 8, 1 /* two-byte uleb128, assuming 128 <= FS + 8 <= 255 */
511 .byte 0xe, 8 /* DW_CFA_def_cfa_offset 8 */
/* FDE 4, covering L(UW12)..L(UW14): after the frame is allocated the CFA
   offset becomes ffi_closure_FS + 8.
   NOTE(review): presumably this describes ffi_go_closure_unix64_sse; the
   L(UWn) labels and ADV() lines are not visible in this listing. */
514 .set L(set4),L(EFDE4)-L(SFDE4)
515 .long L(set4) /* FDE Length */
517 .long L(SFDE4)-L(CIE) /* FDE CIE offset */
518 .long PCREL(L(UW12)) /* Initial location */
519 .long L(UW14)-L(UW12) /* Address range */
520 .byte 0 /* Augmentation size */
522 .byte 0xe /* DW_CFA_def_cfa_offset */
523 .byte ffi_closure_FS + 8, 1 /* two-byte uleb128, assuming 128 <= FS + 8 <= 255 */
/* FDE 5, covering L(UW15)..L(UW17): after the frame is allocated the CFA
   offset becomes ffi_closure_FS + 8.
   NOTE(review): presumably this describes ffi_go_closure_unix64; the
   L(UWn) labels and ADV() lines are not visible in this listing. */
527 .set L(set5),L(EFDE5)-L(SFDE5)
528 .long L(set5) /* FDE Length */
530 .long L(SFDE5)-L(CIE) /* FDE CIE offset */
531 .long PCREL(L(UW15)) /* Initial location */
532 .long L(UW17)-L(UW15) /* Address range */
533 .byte 0 /* Augmentation size */
535 .byte 0xe /* DW_CFA_def_cfa_offset */
536 .byte ffi_closure_FS + 8, 1 /* two-byte uleb128, assuming 128 <= FS + 8 <= 255 */
/* NOTE(review): .subsections_via_symbols is a Mach-O assembler directive;
   presumably it sits inside an Apple-only conditional that is not visible
   in this listing -- confirm. */
540 .subsections_via_symbols
543 #endif /* __x86_64__ */
/* On ELF Linux, emit an empty .note.GNU-stack section, the conventional
   marker that this object does not need an executable stack.
   NOTE(review): the #endif closing this conditional is not visible in
   this listing. */
544 #if defined __ELF__ && defined __linux__
545 .section .note.GNU-stack,"",@progbits