From cbce6d2bac82351154edb69d6e8f78b4b8dcaa65 Mon Sep 17 00:00:00 2001 From: James Lyon Date: Fri, 19 Apr 2013 22:05:49 +0100 Subject: [PATCH] Improved x86-64 XMM register argument passing. Also made XMM0-7 available for use as temporary registers, since they are not used by the ABI. I'd like to do the same with RSI and RDI but that's trickier since they can be used by gv() as temporary registers and there isn't a way to disable that. --- tccgen.c | 2 +- x86_64-gen.c | 193 ++++++++++++++++++++++++++++++++++++----------------------- 2 files changed, 119 insertions(+), 76 deletions(-) diff --git a/tccgen.c b/tccgen.c index f4de9f4e..8f2f1bd3 100644 --- a/tccgen.c +++ b/tccgen.c @@ -764,7 +764,7 @@ ST_FUNC int gv(int rc) #endif r = vtop->r & VT_VALMASK; - rc2 = RC_INT; + rc2 = (rc & RC_FLOAT) ? RC_FLOAT : RC_INT; if (rc == RC_IRET) rc2 = RC_LRET; #ifdef TCC_TARGET_X86_64 diff --git a/x86_64-gen.c b/x86_64-gen.c index 0ba928c0..4b9d28d7 100644 --- a/x86_64-gen.c +++ b/x86_64-gen.c @@ -23,7 +23,7 @@ #ifdef TARGET_DEFS_ONLY /* number of available registers */ -#define NB_REGS 18 +#define NB_REGS 24 #define NB_ASM_REGS 8 /* a register can belong to several classes. The classes must be @@ -34,13 +34,19 @@ #define RC_RAX 0x0004 #define RC_RCX 0x0008 #define RC_RDX 0x0010 +#define RC_ST0 0x0080 /* only for long double */ #define RC_R8 0x0100 #define RC_R9 0x0200 #define RC_R10 0x0400 #define RC_R11 0x0800 -#define RC_XMM0 0x0020 -#define RC_XMM1 0x0040 -#define RC_ST0 0x0080 /* only for long double */ +#define RC_XMM0 0x1000 +#define RC_XMM1 0x2000 +#define RC_XMM2 0x4000 +#define RC_XMM3 0x8000 +#define RC_XMM4 0x10000 +#define RC_XMM5 0x20000 +#define RC_XMM6 0x40000 +#define RC_XMM7 0x80000 #define RC_IRET RC_RAX /* function return: integer register */ #define RC_LRET RC_RDX /* function return: second integer register */ #define RC_FRET RC_XMM0 /* function return: float register */ @@ -61,6 +67,12 @@ enum { TREG_XMM0 = 16, TREG_XMM1 = 17, + TREG_XMM2 = 18, + TREG_XMM3 = 19, + TREG_XMM4 = 20, + TREG_XMM5 = 21, + TREG_XMM6 = 22, + TREG_XMM7 = 23, TREG_ST0 = 4, // SP slot won't be used @@ -117,16 +129,22 @@ ST_DATA const int reg_classes[NB_REGS] = { 0, 0, 0, - /*RC_INT |*/ RC_R8, - /*RC_INT |*/ RC_R9, - /*RC_INT |*/ RC_R10, - /*RC_INT |*/ RC_R11, + RC_R8, + RC_R9, + RC_R10, + RC_R11, 0, 0, 0, 0, /* xmm0 */ RC_FLOAT | RC_XMM0, /* xmm1 */ RC_FLOAT | RC_XMM1, + /* xmm2 */ RC_FLOAT | RC_XMM2, + /* xmm3 */ RC_FLOAT | RC_XMM3, + /* xmm4 */ RC_FLOAT | RC_XMM4, + /* xmm5 */ RC_FLOAT | RC_XMM5, + /* xmm6 */ RC_FLOAT | RC_XMM6, + /* xmm7 */ RC_FLOAT | RC_XMM7, }; static unsigned long func_sub_sp_offset; @@ -141,8 +159,6 @@ void g(int c) section_realloc(cur_text_section, ind1); cur_text_section->data[ind] = c; ind = ind1; - assert((ind < 4) || (cur_text_section->data[ind-4] != ('\362'&0xFF)) || (cur_text_section->data[ind-3] != '\017') - || (cur_text_section->data[ind-2] != 'X') || (cur_text_section->data[ind-1] != '\001')); } void o(unsigned int c) @@ -1055,32 +1071,48 @@ void gfunc_call(int nb_args) oad(0xec81, args_size); /* sub $xxx, %rsp */ } - for(i = 0; i < nb_args; i++) { + for(i = 0; i < nb_args;) { /* Swap argument to top, it will possibly be changed here, - and might use more temps. All arguments must remain on the - stack, so that get_reg can correctly evict some of them onto - stack. We could use also use a vrott(nb_args) at the end - of this loop, but this seems faster. */ + and might use more temps. At the end of the loop we keep + in on the stack and swap it back to its original position + if it is a register. */ SValue tmp = vtop[0]; vtop[0] = vtop[-i]; vtop[-i] = tmp; + mode = classify_x86_64_arg(&vtop->type, NULL, &size, ®_count); - switch (mode) { - case x86_64_mode_memory: - /* allocate the necessary size on stack */ - o(0x48); - oad(0xec81, size); /* sub $xxx, %rsp */ - /* generate structure store */ - r = get_reg(RC_INT); - orex(1, r, 0, 0x89); /* mov %rsp, r */ - o(0xe0 + REG_VALUE(r)); - vset(&vtop->type, r | VT_LVAL, 0); - vswap(); - vstore(); - args_size += size; + + int arg_stored = 1; + switch (vtop->type.t & VT_BTYPE) { + case VT_STRUCT: + if (mode == x86_64_mode_sse) { + if (sse_reg > 8) + sse_reg -= reg_count; + else + arg_stored = 0; + } else if (mode == x86_64_mode_integer) { + if (gen_reg > REGN) + gen_reg -= reg_count; + else + arg_stored = 0; + } + + if (arg_stored) { + /* allocate the necessary size on stack */ + o(0x48); + oad(0xec81, size); /* sub $xxx, %rsp */ + /* generate structure store */ + r = get_reg(RC_INT); + orex(1, r, 0, 0x89); /* mov %rsp, r */ + o(0xe0 + REG_VALUE(r)); + vset(&vtop->type, r | VT_LVAL, 0); + vswap(); + vstore(); + args_size += size; + } break; - case x86_64_mode_x87: + case VT_LDOUBLE: gv(RC_ST0); size = LDOUBLE_SIZE; oad(0xec8148, size); /* sub $xxx, %rsp */ @@ -1090,8 +1122,11 @@ void gfunc_call(int nb_args) args_size += size; break; - case x86_64_mode_sse: + case VT_FLOAT: + case VT_DOUBLE: + assert(mode == x86_64_mode_sse); if (sse_reg > 8) { + --sse_reg; r = gv(RC_FLOAT); o(0x50); /* push $rax */ /* movq %xmm0, (%rsp) */ @@ -1099,26 +1134,39 @@ void gfunc_call(int nb_args) o(0x04 + REG_VALUE(r)*8); o(0x24); args_size += size; + } else { + arg_stored = 0; } - sse_reg -= reg_count; break; - case x86_64_mode_integer: + default: + assert(mode == x86_64_mode_integer); /* simple type */ /* XXX: implicit cast ? */ if (gen_reg > REGN) { + --gen_reg; r = gv(RC_INT); orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */ args_size += size; + } else { + arg_stored = 0; } - gen_reg -= reg_count; break; } - - /* And swap the argument back to it's original position. */ + + /* And swap the argument back to it's original position. */ tmp = vtop[0]; - vtop[0] = vtop[-i]; - vtop[-i] = tmp; + vtop[0] = vtop[-i]; + vtop[-i] = tmp; + + if (arg_stored) { + vrotb(i+1); + assert(vtop->type.t == tmp.type.t); + vpop(); + --nb_args; + } else { + ++i; + } } /* XXX This should be superfluous. */ @@ -1128,55 +1176,50 @@ void gfunc_call(int nb_args) Note that we cannot set RDX and RCX in this loop because gv() may break these temporary registers. Let's use R10 and R11 instead of them */ - gen_reg = nb_reg_args; - sse_reg = nb_sse_args; + assert(gen_reg <= REGN); + assert(sse_reg <= 8); for(i = 0; i < nb_args; i++) { mode = classify_x86_64_arg(&vtop->type, &type, &size, ®_count); /* Alter stack entry type so that gv() knows how to treat it */ vtop->type = type; - switch (mode) { - default: - break; - - case x86_64_mode_sse: - sse_reg -= reg_count; - if (sse_reg + reg_count <= 8) { - gv(RC_FRET); /* only one float register */ - if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */ - /* movaps %xmm0, %xmmN */ - o(0x280f); - o(0xc0 + (sse_reg << 3)); - if (reg_count == 2) { - /* movaps %xmm1, %xmmN */ - o(0x280f); - o(0xc1 + ((sse_reg+1) << 3)); - } - } - } - break; - - case x86_64_mode_integer: + if (mode == x86_64_mode_sse) { + if (reg_count == 2) { + sse_reg -= 2; + gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */ + if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */ + /* movaps %xmm0, %xmmN */ + o(0x280f); + o(0xc0 + (sse_reg << 3)); + /* movaps %xmm1, %xmmN */ + o(0x280f); + o(0xc1 + ((sse_reg+1) << 3)); + } + } else { + assert(reg_count == 1); + --sse_reg; + /* Load directly to register */ + gv(RC_XMM0 << sse_reg); + } + } else if (mode == x86_64_mode_integer) { /* simple type */ /* XXX: implicit cast ? */ gen_reg -= reg_count; - if (gen_reg + reg_count <= REGN) { - r = gv((reg_count == 1) ? RC_INT : RC_IRET); - int d = arg_prepare_reg(gen_reg); - orex(1,d,r,0x89); /* mov */ - o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d)); - if (reg_count == 2) { - /* Second word of two-word value should always be in rdx - this case is handled via RC_IRET */ - assert(vtop->r2 == TREG_RDX); - d = arg_prepare_reg(gen_reg+1); - orex(1,d,vtop->r2,0x89); /* mov */ - o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d)); - } + r = gv(RC_INT); + int d = arg_prepare_reg(gen_reg); + orex(1,d,r,0x89); /* mov */ + o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d)); + if (reg_count == 2) { + /* Second word of two-word value should always be in rdx + this case is handled via RC_IRET */ + d = arg_prepare_reg(gen_reg+1); + orex(1,d,vtop->r2,0x89); /* mov */ + o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d)); } - break; } vtop--; } + assert(gen_reg == 0); + assert(sse_reg == 0); /* We shouldn't have many operands on the stack anymore, but the call address itself is still there, and it might be in %eax -- 2.11.4.GIT