x86_64-gen.c

   1 /*
   2  *  x86-64 code generator for TCC
   3  *
   4  *  Copyright (c) 2008 Shinichiro Hamaji
   5  *
   6  *  Based on i386-gen.c by Fabrice Bellard
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #ifdef TARGET_DEFS_ONLY
  24
  25 /* number of available registers */
  26 #define NB_REGS         25
  27 #define NB_ASM_REGS     16
  28 #define CONFIG_TCC_ASM
  29
  30 /* a register can belong to several classes. The classes must be
  31    sorted from more general to more precise (see gv2() code which does
  32    assumptions on it). */
  33 #define RC_INT     0x0001 /* generic integer register */
  34 #define RC_FLOAT   0x0002 /* generic float register */
  35 #define RC_RAX     0x0004
  36 #define RC_RCX     0x0008
  37 #define RC_RDX     0x0010
  38 #define RC_ST0     0x0080 /* only for long double */
  39 #define RC_R8      0x0100
  40 #define RC_R9      0x0200
  41 #define RC_R10     0x0400
  42 #define RC_R11     0x0800
  43 #define RC_XMM0    0x1000
  44 #define RC_XMM1    0x2000
  45 #define RC_XMM2    0x4000
  46 #define RC_XMM3    0x8000
  47 #define RC_XMM4    0x10000
  48 #define RC_XMM5    0x20000
  49 #define RC_XMM6    0x40000
  50 #define RC_XMM7    0x80000
  51 #define RC_IRET    RC_RAX /* function return: integer register */
  52 #define RC_IRE2    RC_RDX /* function return: second integer register */
  53 #define RC_FRET    RC_XMM0 /* function return: float register */
  54 #define RC_FRE2    RC_XMM1 /* function return: second float register */
  55
  56 /* pretty names for the registers */
  57 enum {
  58     TREG_RAX = 0,
  59     TREG_RCX = 1,
  60     TREG_RDX = 2,
  61     TREG_RSP = 4,
  62     TREG_RSI = 6,
  63     TREG_RDI = 7,
  64
  65     TREG_R8  = 8,
  66     TREG_R9  = 9,
  67     TREG_R10 = 10,
  68     TREG_R11 = 11,
  69
  70     TREG_XMM0 = 16,
  71     TREG_XMM1 = 17,
  72     TREG_XMM2 = 18,
  73     TREG_XMM3 = 19,
  74     TREG_XMM4 = 20,
  75     TREG_XMM5 = 21,
  76     TREG_XMM6 = 22,
  77     TREG_XMM7 = 23,
  78
  79     TREG_ST0 = 24,
  80
  81     TREG_MEM = 0x20
  82 };
  83
  84 #define REX_BASE(reg) (((reg) >> 3) & 1)
  85 #define REG_VALUE(reg) ((reg) & 7)
  86
  87 /* return registers for function */
  88 #define REG_IRET TREG_RAX /* single word int return register */
  89 #define REG_IRE2 TREG_RDX /* second word return register (for long long) */
  90 #define REG_FRET TREG_XMM0 /* float return register */
  91 #define REG_FRE2 TREG_XMM1 /* second float return register */
  92
  93 /* defined if function parameters must be evaluated in reverse order */
  94 #define INVERT_FUNC_PARAMS
  95
  96 /* pointer size, in bytes */
  97 #define PTR_SIZE 8
  98
  99 /* long double size and alignment, in bytes */
 100 #define LDOUBLE_SIZE  16
 101 #define LDOUBLE_ALIGN 16
 102 /* maximum alignment (for aligned attribute support) */
 103 #define MAX_ALIGN     16
 104
 105 /* define if return values need to be extended explicitely
 106    at caller side (for interfacing with non-TCC compilers) */
 107 #define PROMOTE_RET
 108 /******************************************************/
 109 #else /* ! TARGET_DEFS_ONLY */
 110 /******************************************************/
 111 #define USING_GLOBALS
 112 #include "tcc.h"
 113 #include <assert.h>
 114
 115 ST_DATA const int reg_classes[NB_REGS] = {
 116     /* eax */ RC_INT | RC_RAX,
 117     /* ecx */ RC_INT | RC_RCX,
 118     /* edx */ RC_INT | RC_RDX,
 119     0,
 120     0,
 121     0,
 122     0,
 123     0,
 124     RC_R8,
 125     RC_R9,
 126     RC_R10,
 127     RC_R11,
 128     0,
 129     0,
 130     0,
 131     0,
 132     /* xmm0 */ RC_FLOAT | RC_XMM0,
 133     /* xmm1 */ RC_FLOAT | RC_XMM1,
 134     /* xmm2 */ RC_FLOAT | RC_XMM2,
 135     /* xmm3 */ RC_FLOAT | RC_XMM3,
 136     /* xmm4 */ RC_FLOAT | RC_XMM4,
 137     /* xmm5 */ RC_FLOAT | RC_XMM5,
 138     /* xmm6 an xmm7 are included so gv() can be used on them,
 139        but they are not tagged with RC_FLOAT because they are
 140        callee saved on Windows */
 141     RC_XMM6,
 142     RC_XMM7,
 143     /* st0 */ RC_ST0
 144 };
 145
 146 static unsigned long func_sub_sp_offset;
 147 static int func_ret_sub;
 148
 149 #if defined(CONFIG_TCC_BCHECK)
 150 static addr_t func_bound_offset;
 151 static unsigned long func_bound_ind;
 152 ST_DATA int func_bound_add_epilog;
 153 #endif
 154
 155 #ifdef TCC_TARGET_PE
 156 static int func_scratch, func_alloca;
 157 #endif
 158
 159 /* XXX: make it faster ? */
 160 ST_FUNC void g(int c)
 161 {
 162     int ind1;
 163     if (nocode_wanted)
 164         return;
 165     ind1 = ind + 1;
 166     if (ind1 > cur_text_section->data_allocated)
 167         section_realloc(cur_text_section, ind1);
 168     cur_text_section->data[ind] = c;
 169     ind = ind1;
 170 }
 171
 172 ST_FUNC void o(unsigned int c)
 173 {
 174     while (c) {
 175         g(c);
 176         c = c >> 8;
 177     }
 178 }
 179
 180 ST_FUNC void gen_le16(int v)
 181 {
 182     g(v);
 183     g(v >> 8);
 184 }
 185
 186 ST_FUNC void gen_le32(int c)
 187 {
 188     g(c);
 189     g(c >> 8);
 190     g(c >> 16);
 191     g(c >> 24);
 192 }
 193
 194 ST_FUNC void gen_le64(int64_t c)
 195 {
 196     g(c);
 197     g(c >> 8);
 198     g(c >> 16);
 199     g(c >> 24);
 200     g(c >> 32);
 201     g(c >> 40);
 202     g(c >> 48);
 203     g(c >> 56);
 204 }
 205
 206 static void orex(int ll, int r, int r2, int b)
 207 {
 208     if ((r & VT_VALMASK) >= VT_CONST)
 209         r = 0;
 210     if ((r2 & VT_VALMASK) >= VT_CONST)
 211         r2 = 0;
 212     if (ll || REX_BASE(r) || REX_BASE(r2))
 213         o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
 214     o(b);
 215 }
 216
 217 /* output a symbol and patch all calls to it */
 218 ST_FUNC void gsym_addr(int t, int a)
 219 {
 220     while (t) {
 221         unsigned char *ptr = cur_text_section->data + t;
 222         uint32_t n = read32le(ptr); /* next value */
 223         write32le(ptr, a < 0 ? -a : a - t - 4);
 224         t = n;
 225     }
 226 }
 227
 228 static int is64_type(int t)
 229 {
 230     return ((t & VT_BTYPE) == VT_PTR ||
 231             (t & VT_BTYPE) == VT_FUNC ||
 232             (t & VT_BTYPE) == VT_LLONG);
 233 }
 234
 235 /* instruction + 4 bytes data. Return the address of the data */
 236 static int oad(int c, int s)
 237 {
 238     int t;
 239     if (nocode_wanted)
 240         return s;
 241     o(c);
 242     t = ind;
 243     gen_le32(s);
 244     return t;
 245 }
 246
 247 /* generate jmp to a label */
 248 #define gjmp2(instr,lbl) oad(instr,lbl)
 249
 250 ST_FUNC void gen_addr32(int r, Sym *sym, int c)
 251 {
 252     if (r & VT_SYM)
 253         greloca(cur_text_section, sym, ind, R_X86_64_32S, c), c=0;
 254     gen_le32(c);
 255 }
 256
 257 /* output constant with relocation if 'r & VT_SYM' is true */
 258 ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
 259 {
 260     if (r & VT_SYM)
 261         greloca(cur_text_section, sym, ind, R_X86_64_64, c), c=0;
 262     gen_le64(c);
 263 }
 264
 265 /* output constant with relocation if 'r & VT_SYM' is true */
 266 ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
 267 {
 268     if (r & VT_SYM)
 269         greloca(cur_text_section, sym, ind, R_X86_64_PC32, c-4), c=4;
 270     gen_le32(c-4);
 271 }
 272
 273 /* output got address with relocation */
 274 static void gen_gotpcrel(int r, Sym *sym, int c)
 275 {
 276 #ifdef TCC_TARGET_PE
 277     tcc_error("internal error: no GOT on PE: %s %x %x | %02x %02x %02x\n",
 278         get_tok_str(sym->v, NULL), c, r,
 279         cur_text_section->data[ind-3],
 280         cur_text_section->data[ind-2],
 281         cur_text_section->data[ind-1]
 282         );
 283 #endif
 284     greloca(cur_text_section, sym, ind, R_X86_64_GOTPCREL, -4);
 285     gen_le32(0);
 286     if (c) {
 287         /* we use add c, %xxx for displacement */
 288         orex(1, r, 0, 0x81);
 289         o(0xc0 + REG_VALUE(r));
 290         gen_le32(c);
 291     }
 292 }
 293
 294 static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
 295 {
 296     op_reg = REG_VALUE(op_reg) << 3;
 297     if ((r & VT_VALMASK) == VT_CONST) {
 298         /* constant memory reference */
 299         if (!(r & VT_SYM)) {
 300             /* Absolute memory reference */
 301             o(0x04 | op_reg); /* [sib] | destreg */
 302             oad(0x25, c);     /* disp32 */
 303         } else {
 304             o(0x05 | op_reg); /* (%rip)+disp32 | destreg */
 305             if (is_got) {
 306                 gen_gotpcrel(r, sym, c);
 307             } else {
 308                 gen_addrpc32(r, sym, c);
 309             }
 310         }
 311     } else if ((r & VT_VALMASK) == VT_LOCAL) {
 312         /* currently, we use only ebp as base */
 313         if (c == (char)c) {
 314             /* short reference */
 315             o(0x45 | op_reg);
 316             g(c);
 317         } else {
 318             oad(0x85 | op_reg, c);
 319         }
 320     } else if ((r & VT_VALMASK) >= TREG_MEM) {
 321         if (c) {
 322             g(0x80 | op_reg | REG_VALUE(r));
 323             gen_le32(c);
 324         } else {
 325             g(0x00 | op_reg | REG_VALUE(r));
 326         }
 327     } else {
 328         g(0x00 | op_reg | REG_VALUE(r));
 329     }
 330 }
 331
 332 /* generate a modrm reference. 'op_reg' contains the additional 3
 333    opcode bits */
 334 static void gen_modrm(int op_reg, int r, Sym *sym, int c)
 335 {
 336     gen_modrm_impl(op_reg, r, sym, c, 0);
 337 }
 338
 339 /* generate a modrm reference. 'op_reg' contains the additional 3
 340    opcode bits */
 341 static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
 342 {
 343     int is_got;
 344     is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
 345     orex(1, r, op_reg, opcode);
 346     gen_modrm_impl(op_reg, r, sym, c, is_got);
 347 }
 348
 349
 350 /* load 'r' from value 'sv' */
 351 void load(int r, SValue *sv)
 352 {
 353     int v, t, ft, fc, fr;
 354     SValue v1;
 355
 356 #ifdef TCC_TARGET_PE
 357     SValue v2;
 358     sv = pe_getimport(sv, &v2);
 359 #endif
 360
 361     fr = sv->r;
 362     ft = sv->type.t & ~VT_DEFSIGN;
 363     fc = sv->c.i;
 364     if (fc != sv->c.i && (fr & VT_SYM))
 365       tcc_error("64 bit addend in load");
 366
 367     ft &= ~(VT_VOLATILE | VT_CONSTANT);
 368
 369 #ifndef TCC_TARGET_PE
 370     /* we use indirect access via got */
 371     if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
 372         (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
 373         /* use the result register as a temporal register */
 374         int tr = r | TREG_MEM;
 375         if (is_float(ft)) {
 376             /* we cannot use float registers as a temporal register */
 377             tr = get_reg(RC_INT) | TREG_MEM;
 378         }
 379         gen_modrm64(0x8b, tr, fr, sv->sym, 0);
 380
 381         /* load from the temporal register */
 382         fr = tr | VT_LVAL;
 383     }
 384 #endif
 385
 386     v = fr & VT_VALMASK;
 387     if (fr & VT_LVAL) {
 388         int b, ll;
 389         if (v == VT_LLOCAL) {
 390             v1.type.t = VT_PTR;
 391             v1.r = VT_LOCAL | VT_LVAL;
 392             v1.c.i = fc;
 393             fr = r;
 394             if (!(reg_classes[fr] & (RC_INT|RC_R11)))
 395                 fr = get_reg(RC_INT);
 396             load(fr, &v1);
 397         }
 398         if (fc != sv->c.i) {
 399             /* If the addends doesn't fit into a 32bit signed
 400                we must use a 64bit move.  We've checked above
 401                that this doesn't have a sym associated.  */
 402             v1.type.t = VT_LLONG;
 403             v1.r = VT_CONST;
 404             v1.c.i = sv->c.i;
 405             fr = r;
 406             if (!(reg_classes[fr] & (RC_INT|RC_R11)))
 407                 fr = get_reg(RC_INT);
 408             load(fr, &v1);
 409             fc = 0;
 410         }
 411         ll = 0;
 412         /* Like GCC we can load from small enough properly sized
 413            structs and unions as well.
 414            XXX maybe move to generic operand handling, but should
 415            occur only with asm, so tccasm.c might also be a better place */
 416         if ((ft & VT_BTYPE) == VT_STRUCT) {
 417             int align;
 418             switch (type_size(&sv->type, &align)) {
 419                 case 1: ft = VT_BYTE; break;
 420                 case 2: ft = VT_SHORT; break;
 421                 case 4: ft = VT_INT; break;
 422                 case 8: ft = VT_LLONG; break;
 423                 default:
 424                     tcc_error("invalid aggregate type for register load");
 425                     break;
 426             }
 427         }
 428         if ((ft & VT_BTYPE) == VT_FLOAT) {
 429             b = 0x6e0f66;
 430             r = REG_VALUE(r); /* movd */
 431         } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
 432             b = 0x7e0ff3; /* movq */
 433             r = REG_VALUE(r);
 434         } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
 435             b = 0xdb, r = 5; /* fldt */
 436         } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
 437             b = 0xbe0f;   /* movsbl */
 438         } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
 439             b = 0xb60f;   /* movzbl */
 440         } else if ((ft & VT_TYPE) == VT_SHORT) {
 441             b = 0xbf0f;   /* movswl */
 442         } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
 443             b = 0xb70f;   /* movzwl */
 444         } else if ((ft & VT_TYPE) == (VT_VOID)) {
 445             /* Can happen with zero size structs */
 446             return;
 447         } else {
 448             assert(((ft & VT_BTYPE) == VT_INT)
 449                    || ((ft & VT_BTYPE) == VT_LLONG)
 450                    || ((ft & VT_BTYPE) == VT_PTR)
 451                    || ((ft & VT_BTYPE) == VT_FUNC)
 452                 );
 453             ll = is64_type(ft);
 454             b = 0x8b;
 455         }
 456         if (ll) {
 457             gen_modrm64(b, r, fr, sv->sym, fc);
 458         } else {
 459             orex(ll, fr, r, b);
 460             gen_modrm(r, fr, sv->sym, fc);
 461         }
 462     } else {
 463         if (v == VT_CONST) {
 464             if (fr & VT_SYM) {
 465 #ifdef TCC_TARGET_PE
 466                 orex(1,0,r,0x8d);
 467                 o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 468                 gen_addrpc32(fr, sv->sym, fc);
 469 #else
 470                 if (sv->sym->type.t & VT_STATIC) {
 471                     orex(1,0,r,0x8d);
 472                     o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 473                     gen_addrpc32(fr, sv->sym, fc);
 474                 } else {
 475                     orex(1,0,r,0x8b);
 476                     o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
 477                     gen_gotpcrel(r, sv->sym, fc);
 478                 }
 479 #endif
 480             } else if (is64_type(ft)) {
 481                 orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 482                 gen_le64(sv->c.i);
 483             } else {
 484                 orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 485                 gen_le32(fc);
 486             }
 487         } else if (v == VT_LOCAL) {
 488             orex(1,0,r,0x8d); /* lea xxx(%ebp), r */
 489             gen_modrm(r, VT_LOCAL, sv->sym, fc);
 490         } else if (v == VT_CMP) {
 491             if (fc & 0x100)
 492               {
 493                 v = vtop->cmp_r;
 494                 fc &= ~0x100;
 495                 /* This was a float compare.  If the parity bit is
 496                    set the result was unordered, meaning false for everything
 497                    except TOK_NE, and true for TOK_NE.  */
 498                 orex(0, r, 0, 0xb0 + REG_VALUE(r)); /* mov $0/1,%al */
 499                 g(v ^ fc ^ (v == TOK_NE));
 500                 o(0x037a + (REX_BASE(r) << 8));
 501               }
 502             orex(0,r,0, 0x0f); /* setxx %br */
 503             o(fc);
 504             o(0xc0 + REG_VALUE(r));
 505             orex(0,r,0, 0x0f);
 506             o(0xc0b6 + REG_VALUE(r) * 0x900); /* movzbl %al, %eax */
 507         } else if (v == VT_JMP || v == VT_JMPI) {
 508             t = v & 1;
 509             orex(0,r,0,0);
 510             oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
 511             o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
 512             gsym(fc);
 513             orex(0,r,0,0);
 514             oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
 515         } else if (v != r) {
 516             if ((r >= TREG_XMM0) && (r <= TREG_XMM7)) {
 517                 if (v == TREG_ST0) {
 518                     /* gen_cvt_ftof(VT_DOUBLE); */
 519                     o(0xf0245cdd); /* fstpl -0x10(%rsp) */
 520                     /* movsd -0x10(%rsp),%xmmN */
 521                     o(0x100ff2);
 522                     o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 523                     o(0xf024);
 524                 } else {
 525                     assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 526                     if ((ft & VT_BTYPE) == VT_FLOAT) {
 527                         o(0x100ff3);
 528                     } else {
 529                         assert((ft & VT_BTYPE) == VT_DOUBLE);
 530                         o(0x100ff2);
 531                     }
 532                     o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
 533                 }
 534             } else if (r == TREG_ST0) {
 535                 assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 536                 /* gen_cvt_ftof(VT_LDOUBLE); */
 537                 /* movsd %xmmN,-0x10(%rsp) */
 538                 o(0x110ff2);
 539                 o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 540                 o(0xf024);
 541                 o(0xf02444dd); /* fldl -0x10(%rsp) */
 542             } else {
 543                 orex(is64_type(ft), r, v, 0x89);
 544                 o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
 545             }
 546         }
 547     }
 548 }
 549
 550 /* store register 'r' in lvalue 'v' */
 551 void store(int r, SValue *v)
 552 {
 553     int fr, bt, ft, fc;
 554     int op64 = 0;
 555     /* store the REX prefix in this variable when PIC is enabled */
 556     int pic = 0;
 557
 558 #ifdef TCC_TARGET_PE
 559     SValue v2;
 560     v = pe_getimport(v, &v2);
 561 #endif
 562
 563     fr = v->r & VT_VALMASK;
 564     ft = v->type.t;
 565     fc = v->c.i;
 566     if (fc != v->c.i && (fr & VT_SYM))
 567       tcc_error("64 bit addend in store");
 568     ft &= ~(VT_VOLATILE | VT_CONSTANT);
 569     bt = ft & VT_BTYPE;
 570
 571 #ifndef TCC_TARGET_PE
 572     /* we need to access the variable via got */
 573     if (fr == VT_CONST && (v->r & VT_SYM)) {
 574         /* mov xx(%rip), %r11 */
 575         o(0x1d8b4c);
 576         gen_gotpcrel(TREG_R11, v->sym, v->c.i);
 577         pic = is64_type(bt) ? 0x49 : 0x41;
 578     }
 579 #endif
 580
 581     /* XXX: incorrect if float reg to reg */
 582     if (bt == VT_FLOAT) {
 583         o(0x66);
 584         o(pic);
 585         o(0x7e0f); /* movd */
 586         r = REG_VALUE(r);
 587     } else if (bt == VT_DOUBLE) {
 588         o(0x66);
 589         o(pic);
 590         o(0xd60f); /* movq */
 591         r = REG_VALUE(r);
 592     } else if (bt == VT_LDOUBLE) {
 593         o(0xc0d9); /* fld %st(0) */
 594         o(pic);
 595         o(0xdb); /* fstpt */
 596         r = 7;
 597     } else {
 598         if (bt == VT_SHORT)
 599             o(0x66);
 600         o(pic);
 601         if (bt == VT_BYTE || bt == VT_BOOL)
 602             orex(0, 0, r, 0x88);
 603         else if (is64_type(bt))
 604             op64 = 0x89;
 605         else
 606             orex(0, 0, r, 0x89);
 607     }
 608     if (pic) {
 609         /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
 610         if (op64)
 611             o(op64);
 612         o(3 + (r << 3));
 613     } else if (op64) {
 614         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 615             gen_modrm64(op64, r, v->r, v->sym, fc);
 616         } else if (fr != r) {
 617             orex(1, fr, r, op64);
 618             o(0xc0 + fr + r * 8); /* mov r, fr */
 619         }
 620     } else {
 621         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 622             gen_modrm(r, v->r, v->sym, fc);
 623         } else if (fr != r) {
 624             o(0xc0 + fr + r * 8); /* mov r, fr */
 625         }
 626     }
 627 }
 628
 629 /* 'is_jmp' is '1' if it is a jump */
 630 static void gcall_or_jmp(int is_jmp)
 631 {
 632     int r;
 633     if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST &&
 634         ((vtop->r & VT_SYM) && (vtop->c.i-4) == (int)(vtop->c.i-4))) {
 635         /* constant symbolic case -> simple relocation */
 636 #ifdef TCC_TARGET_PE
 637         greloca(cur_text_section, vtop->sym, ind + 1, R_X86_64_PC32, (int)(vtop->c.i-4));
 638 #else
 639         greloca(cur_text_section, vtop->sym, ind + 1, R_X86_64_PLT32, (int)(vtop->c.i-4));
 640 #endif
 641         oad(0xe8 + is_jmp, 0); /* call/jmp im */
 642     } else {
 643         /* otherwise, indirect call */
 644         r = TREG_R11;
 645         load(r, vtop);
 646         o(0x41); /* REX */
 647         o(0xff); /* call/jmp *r */
 648         o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
 649     }
 650 }
 651
 652 #if defined(CONFIG_TCC_BCHECK)
 653
 654 static void gen_bounds_call(int v)
 655 {
 656     Sym *sym = external_helper_sym(v);
 657     oad(0xe8, 0);
 658 #ifdef TCC_TARGET_PE
 659     greloca(cur_text_section, sym, ind-4, R_X86_64_PC32, -4);
 660 #else
 661     greloca(cur_text_section, sym, ind-4, R_X86_64_PLT32, -4);
 662 #endif
 663 }
 664
 665 #ifdef TCC_TARGET_PE
 666 # define TREG_FASTCALL_1 TREG_RCX
 667 #else
 668 # define TREG_FASTCALL_1 TREG_RDI
 669 #endif
 670
 671 static void gen_bounds_prolog(void)
 672 {
 673     /* leave some room for bound checking code */
 674     func_bound_offset = lbounds_section->data_offset;
 675     func_bound_ind = ind;
 676     func_bound_add_epilog = 0;
 677     o(0x0d8d48 + ((TREG_FASTCALL_1 == TREG_RDI) * 0x300000)); /*lbound section pointer */
 678     gen_le32 (0);
 679     oad(0xb8, 0); /* call to function */
 680 }
 681
 682 static void gen_bounds_epilog(void)
 683 {
 684     addr_t saved_ind;
 685     addr_t *bounds_ptr;
 686     Sym *sym_data;
 687     int offset_modified = func_bound_offset != lbounds_section->data_offset;
 688
 689     if (!offset_modified && !func_bound_add_epilog)
 690         return;
 691
 692     /* add end of table info */
 693     bounds_ptr = section_ptr_add(lbounds_section, sizeof(addr_t));
 694     *bounds_ptr = 0;
 695
 696     sym_data = get_sym_ref(&char_pointer_type, lbounds_section,
 697                            func_bound_offset, lbounds_section->data_offset);
 698
 699     /* generate bound local allocation */
 700     if (offset_modified) {
 701         saved_ind = ind;
 702         ind = func_bound_ind;
 703         greloca(cur_text_section, sym_data, ind + 3, R_X86_64_PC32, -4);
 704         ind = ind + 7;
 705         gen_bounds_call(TOK___bound_local_new);
 706         ind = saved_ind;
 707     }
 708
 709     /* generate bound check local freeing */
 710     o(0x5250); /* save returned value, if any */
 711     greloca(cur_text_section, sym_data, ind + 3, R_X86_64_PC32, -4);
 712     o(0x0d8d48 + ((TREG_FASTCALL_1 == TREG_RDI) * 0x300000)); /* lea xxx(%rip), %rcx/rdi */
 713     gen_le32 (0);
 714     gen_bounds_call(TOK___bound_local_delete);
 715     o(0x585a); /* restore returned value, if any */
 716 }
 717 #endif
 718
 719 #ifdef TCC_TARGET_PE
 720
 721 #define REGN 4
 722 static const uint8_t arg_regs[REGN] = {
 723     TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
 724 };
 725
 726 /* Prepare arguments in R10 and R11 rather than RCX and RDX
 727    because gv() will not ever use these */
 728 static int arg_prepare_reg(int idx) {
 729   if (idx == 0 || idx == 1)
 730       /* idx=0: r10, idx=1: r11 */
 731       return idx + 10;
 732   else
 733       return arg_regs[idx];
 734 }
 735
 736 /* Generate function call. The function address is pushed first, then
 737    all the parameters in call order. This functions pops all the
 738    parameters and the function address. */
 739
 740 static void gen_offs_sp(int b, int r, int d)
 741 {
 742     orex(1,0,r & 0x100 ? 0 : r, b);
 743     if (d == (char)d) {
 744         o(0x2444 | (REG_VALUE(r) << 3));
 745         g(d);
 746     } else {
 747         o(0x2484 | (REG_VALUE(r) << 3));
 748         gen_le32(d);
 749     }
 750 }
 751
 752 static int using_regs(int size)
 753 {
 754     return !(size > 8 || (size & (size - 1)));
 755 }
 756
 757 /* Return the number of registers needed to return the struct, or 0 if
 758    returning via struct pointer. */
 759 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
 760 {
 761     int size, align;
 762     *ret_align = 1; // Never have to re-align return values for x86-64
 763     *regsize = 8;
 764     size = type_size(vt, &align);
 765     if (!using_regs(size))
 766         return 0;
 767     if (size == 8)
 768         ret->t = VT_LLONG;
 769     else if (size == 4)
 770         ret->t = VT_INT;
 771     else if (size == 2)
 772         ret->t = VT_SHORT;
 773     else
 774         ret->t = VT_BYTE;
 775     ret->ref = NULL;
 776     return 1;
 777 }
 778
 779 static int is_sse_float(int t) {
 780     int bt;
 781     bt = t & VT_BTYPE;
 782     return bt == VT_DOUBLE || bt == VT_FLOAT;
 783 }
 784
 785 static int gfunc_arg_size(CType *type) {
 786     int align;
 787     if (type->t & (VT_ARRAY|VT_BITFIELD))
 788         return 8;
 789     return type_size(type, &align);
 790 }
 791
 792 void gfunc_call(int nb_args)
 793 {
 794     int size, r, args_size, i, d, bt, struct_size;
 795     int arg;
 796
 797 #ifdef CONFIG_TCC_BCHECK
 798     if (tcc_state->do_bounds_check)
 799         gbound_args(nb_args);
 800 #endif
 801
 802     args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
 803     arg = nb_args;
 804
 805     /* for struct arguments, we need to call memcpy and the function
 806        call breaks register passing arguments we are preparing.
 807        So, we process arguments which will be passed by stack first. */
 808     struct_size = args_size;
 809     for(i = 0; i < nb_args; i++) {
 810         SValue *sv;
 811
 812         --arg;
 813         sv = &vtop[-i];
 814         bt = (sv->type.t & VT_BTYPE);
 815         size = gfunc_arg_size(&sv->type);
 816
 817         if (using_regs(size))
 818             continue; /* arguments smaller than 8 bytes passed in registers or on stack */
 819
 820         if (bt == VT_STRUCT) {
 821             /* align to stack align size */
 822             size = (size + 15) & ~15;
 823             /* generate structure store */
 824             r = get_reg(RC_INT);
 825             gen_offs_sp(0x8d, r, struct_size);
 826             struct_size += size;
 827
 828             /* generate memcpy call */
 829             vset(&sv->type, r | VT_LVAL, 0);
 830             vpushv(sv);
 831             vstore();
 832             --vtop;
 833         } else if (bt == VT_LDOUBLE) {
 834             gv(RC_ST0);
 835             gen_offs_sp(0xdb, 0x107, struct_size);
 836             struct_size += 16;
 837         }
 838     }
 839
 840     if (func_scratch < struct_size)
 841         func_scratch = struct_size;
 842
 843     arg = nb_args;
 844     struct_size = args_size;
 845
 846     for(i = 0; i < nb_args; i++) {
 847         --arg;
 848         bt = (vtop->type.t & VT_BTYPE);
 849
 850         size = gfunc_arg_size(&vtop->type);
 851         if (!using_regs(size)) {
 852             /* align to stack align size */
 853             size = (size + 15) & ~15;
 854             if (arg >= REGN) {
 855                 d = get_reg(RC_INT);
 856                 gen_offs_sp(0x8d, d, struct_size);
 857                 gen_offs_sp(0x89, d, arg*8);
 858             } else {
 859                 d = arg_prepare_reg(arg);
 860                 gen_offs_sp(0x8d, d, struct_size);
 861             }
 862             struct_size += size;
 863         } else {
 864             if (is_sse_float(vtop->type.t)) {
 865                 if (tcc_state->nosse)
 866                   tcc_error("SSE disabled");
 867                 if (arg >= REGN) {
 868                     gv(RC_XMM0);
 869                     /* movq %xmm0, j*8(%rsp) */
 870                     gen_offs_sp(0xd60f66, 0x100, arg*8);
 871                 } else {
 872                     /* Load directly to xmmN register */
 873                     gv(RC_XMM0 << arg);
 874                     d = arg_prepare_reg(arg);
 875                     /* mov %xmmN, %rxx */
 876                     o(0x66);
 877                     orex(1,d,0, 0x7e0f);
 878                     o(0xc0 + arg*8 + REG_VALUE(d));
 879                 }
 880             } else {
 881                 if (bt == VT_STRUCT) {
 882                     vtop->type.ref = NULL;
 883                     vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
 884                         : size > 1 ? VT_SHORT : VT_BYTE;
 885                 }
 886
 887                 r = gv(RC_INT);
 888                 if (arg >= REGN) {
 889                     gen_offs_sp(0x89, r, arg*8);
 890                 } else {
 891                     d = arg_prepare_reg(arg);
 892                     orex(1,d,r,0x89); /* mov */
 893                     o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
 894                 }
 895             }
 896         }
 897         vtop--;
 898     }
 899     save_regs(0);
 900     /* Copy R10 and R11 into RCX and RDX, respectively */
 901     if (nb_args > 0) {
 902         o(0xd1894c); /* mov %r10, %rcx */
 903         if (nb_args > 1) {
 904             o(0xda894c); /* mov %r11, %rdx */
 905         }
 906     }
 907
 908     gcall_or_jmp(0);
 909
 910     if ((vtop->r & VT_SYM) && vtop->sym->v == TOK_alloca) {
 911         /* need to add the "func_scratch" area after alloca */
 912         o(0x48); func_alloca = oad(0x05, func_alloca); /* add $NN, %rax */
 913 #ifdef CONFIG_TCC_BCHECK
 914         if (tcc_state->do_bounds_check)
 915             gen_bounds_call(TOK___bound_alloca_nr); /* new region */
 916 #endif
 917     }
 918     vtop--;
 919 }
 920
 921
 922 #define FUNC_PROLOG_SIZE 11
 923
 924 /* generate function prolog of type 't' */
 925 void gfunc_prolog(Sym *func_sym)
 926 {
 927     CType *func_type = &func_sym->type;
 928     int addr, reg_param_index, bt, size;
 929     Sym *sym;
 930     CType *type;
 931
 932     func_ret_sub = 0;
 933     func_scratch = 32;
 934     func_alloca = 0;
 935     loc = 0;
 936
 937     addr = PTR_SIZE * 2;
 938     ind += FUNC_PROLOG_SIZE;
 939     func_sub_sp_offset = ind;
 940     reg_param_index = 0;
 941
 942     sym = func_type->ref;
 943
 944     /* if the function returns a structure, then add an
 945        implicit pointer parameter */
 946     size = gfunc_arg_size(&func_vt);
 947     if (!using_regs(size)) {
 948         gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 949         func_vc = addr;
 950         reg_param_index++;
 951         addr += 8;
 952     }
 953
 954     /* define parameters */
 955     while ((sym = sym->next) != NULL) {
 956         type = &sym->type;
 957         bt = type->t & VT_BTYPE;
 958         size = gfunc_arg_size(type);
 959         if (!using_regs(size)) {
 960             if (reg_param_index < REGN) {
 961                 gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 962             }
 963             sym_push(sym->v & ~SYM_FIELD, type,
 964                      VT_LLOCAL | VT_LVAL, addr);
 965         } else {
 966             if (reg_param_index < REGN) {
 967                 /* save arguments passed by register */
 968                 if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
 969                     if (tcc_state->nosse)
 970                       tcc_error("SSE disabled");
 971                     o(0xd60f66); /* movq */
 972                     gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
 973                 } else {
 974                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 975                 }
 976             }
 977             sym_push(sym->v & ~SYM_FIELD, type,
 978                      VT_LOCAL | VT_LVAL, addr);
 979         }
 980         addr += 8;
 981         reg_param_index++;
 982     }
 983
 984     while (reg_param_index < REGN) {
 985         if (func_var) {
 986             gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 987             addr += 8;
 988         }
 989         reg_param_index++;
 990     }
 991 #ifdef CONFIG_TCC_BCHECK
 992     if (tcc_state->do_bounds_check)
 993         gen_bounds_prolog();
 994 #endif
 995 }
 996
 997 /* generate function epilog */
 998 void gfunc_epilog(void)
 999 {
1000     int v, saved_ind;
1001
1002     /* align local size to word & save local variables */
1003     func_scratch = (func_scratch + 15) & -16;
1004     loc = (loc & -16) - func_scratch;
1005
1006 #ifdef CONFIG_TCC_BCHECK
1007     if (tcc_state->do_bounds_check)
1008         gen_bounds_epilog();
1009 #endif
1010
1011     o(0xc9); /* leave */
1012     if (func_ret_sub == 0) {
1013         o(0xc3); /* ret */
1014     } else {
1015         o(0xc2); /* ret n */
1016         g(func_ret_sub);
1017         g(func_ret_sub >> 8);
1018     }
1019
1020     saved_ind = ind;
1021     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
1022     v = -loc;
1023
1024     if (v >= 4096) {
1025         Sym *sym = external_helper_sym(TOK___chkstk);
1026         oad(0xb8, v); /* mov stacksize, %eax */
1027         oad(0xe8, 0); /* call __chkstk, (does the stackframe too) */
1028         greloca(cur_text_section, sym, ind-4, R_X86_64_PC32, -4);
1029         o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
1030     } else {
1031         o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
1032         o(0xec8148);  /* sub rsp, stacksize */
1033         gen_le32(v);
1034     }
1035
1036     /* add the "func_scratch" area after each alloca seen */
1037     gsym_addr(func_alloca, -func_scratch);
1038
1039     cur_text_section->data_offset = saved_ind;
1040     pe_add_unwind_data(ind, saved_ind, v);
1041     ind = cur_text_section->data_offset;
1042 }
1043
1044 #else
1045
1046 static void gadd_sp(int val)
1047 {
1048     if (val == (char)val) {
1049         o(0xc48348);
1050         g(val);
1051     } else {
1052         oad(0xc48148, val); /* add $xxx, %rsp */
1053     }
1054 }
1055
1056 typedef enum X86_64_Mode {
1057   x86_64_mode_none,
1058   x86_64_mode_memory,
1059   x86_64_mode_integer,
1060   x86_64_mode_sse,
1061   x86_64_mode_x87
1062 } X86_64_Mode;
1063
1064 static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b)
1065 {
1066     if (a == b)
1067         return a;
1068     else if (a == x86_64_mode_none)
1069         return b;
1070     else if (b == x86_64_mode_none)
1071         return a;
1072     else if ((a == x86_64_mode_memory) || (b == x86_64_mode_memory))
1073         return x86_64_mode_memory;
1074     else if ((a == x86_64_mode_integer) || (b == x86_64_mode_integer))
1075         return x86_64_mode_integer;
1076     else if ((a == x86_64_mode_x87) || (b == x86_64_mode_x87))
1077         return x86_64_mode_memory;
1078     else
1079         return x86_64_mode_sse;
1080 }
1081
1082 static X86_64_Mode classify_x86_64_inner(CType *ty)
1083 {
1084     X86_64_Mode mode;
1085     Sym *f;
1086
1087     switch (ty->t & VT_BTYPE) {
1088     case VT_VOID: return x86_64_mode_none;
1089
1090     case VT_INT:
1091     case VT_BYTE:
1092     case VT_SHORT:
1093     case VT_LLONG:
1094     case VT_BOOL:
1095     case VT_PTR:
1096     case VT_FUNC:
1097         return x86_64_mode_integer;
1098
1099     case VT_FLOAT:
1100     case VT_DOUBLE: return x86_64_mode_sse;
1101
1102     case VT_LDOUBLE: return x86_64_mode_x87;
1103
1104     case VT_STRUCT:
1105         f = ty->ref;
1106
1107         mode = x86_64_mode_none;
1108         for (f = f->next; f; f = f->next)
1109             mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
1110
1111         return mode;
1112     }
1113     assert(0);
1114     return 0;
1115 }
1116
1117 static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
1118 {
1119     X86_64_Mode mode;
1120     int size, align, ret_t = 0;
1121
1122     if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
1123         *psize = 8;
1124         *palign = 8;
1125         *reg_count = 1;
1126         ret_t = ty->t;
1127         mode = x86_64_mode_integer;
1128     } else {
1129         size = type_size(ty, &align);
1130         *psize = (size + 7) & ~7;
1131         *palign = (align + 7) & ~7;
1132
1133         if (size > 16) {
1134             mode = x86_64_mode_memory;
1135         } else {
1136             mode = classify_x86_64_inner(ty);
1137             switch (mode) {
1138             case x86_64_mode_integer:
1139                 if (size > 8) {
1140                     *reg_count = 2;
1141                     ret_t = VT_QLONG;
1142                 } else {
1143                     *reg_count = 1;
1144                     if (size > 4)
1145                         ret_t = VT_LLONG;
1146                     else if (size > 2)
1147                         ret_t = VT_INT;
1148                     else if (size > 1)
1149                         ret_t = VT_SHORT;
1150                     else
1151                         ret_t = VT_BYTE;
1152                     if ((ty->t & VT_BTYPE) == VT_STRUCT || (ty->t & VT_UNSIGNED))
1153                         ret_t |= VT_UNSIGNED;
1154                 }
1155                 break;
1156
1157             case x86_64_mode_x87:
1158                 *reg_count = 1;
1159                 ret_t = VT_LDOUBLE;
1160                 break;
1161
1162             case x86_64_mode_sse:
1163                 if (size > 8) {
1164                     *reg_count = 2;
1165                     ret_t = VT_QFLOAT;
1166                 } else {
1167                     *reg_count = 1;
1168                     ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
1169                 }
1170                 break;
1171             default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none*/
1172             }
1173         }
1174     }
1175
1176     if (ret) {
1177         ret->ref = NULL;
1178         ret->t = ret_t;
1179     }
1180
1181     return mode;
1182 }
1183
1184 ST_FUNC int classify_x86_64_va_arg(CType *ty)
1185 {
1186     /* This definition must be synced with stdarg.h */
1187     enum __va_arg_type {
1188         __va_gen_reg, __va_float_reg, __va_stack
1189     };
1190     int size, align, reg_count;
1191     X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, &reg_count);
1192     switch (mode) {
1193     default: return __va_stack;
1194     case x86_64_mode_integer: return __va_gen_reg;
1195     case x86_64_mode_sse: return __va_float_reg;
1196     }
1197 }
1198
1199 /* Return the number of registers needed to return the struct, or 0 if
1200    returning via struct pointer. */
1201 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
1202 {
1203     int size, align, reg_count;
1204     *ret_align = 1; // Never have to re-align return values for x86-64
1205     *regsize = 8;
1206     return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
1207 }
1208
1209 #define REGN 6
1210 static const uint8_t arg_regs[REGN] = {
1211     TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
1212 };
1213
1214 static int arg_prepare_reg(int idx) {
1215   if (idx == 2 || idx == 3)
1216       /* idx=2: r10, idx=3: r11 */
1217       return idx + 8;
1218   else
1219       return arg_regs[idx];
1220 }
1221
1222 /* Generate function call. The function address is pushed first, then
1223    all the parameters in call order. This functions pops all the
1224    parameters and the function address. */
1225 void gfunc_call(int nb_args)
1226 {
1227     X86_64_Mode mode;
1228     CType type;
1229     int size, align, r, args_size, stack_adjust, i, reg_count, k;
1230     int nb_reg_args = 0;
1231     int nb_sse_args = 0;
1232     int sse_reg, gen_reg;
1233     char *onstack = tcc_malloc((nb_args + 1) * sizeof (char));
1234
1235 #ifdef CONFIG_TCC_BCHECK
1236     if (tcc_state->do_bounds_check)
1237         gbound_args(nb_args);
1238 #endif
1239
1240     /* calculate the number of integer/float register arguments, remember
1241        arguments to be passed via stack (in onstack[]), and also remember
1242        if we have to align the stack pointer to 16 (onstack[i] == 2).  Needs
1243        to be done in a left-to-right pass over arguments.  */
1244     stack_adjust = 0;
1245     for(i = nb_args - 1; i >= 0; i--) {
1246         mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1247         if (size == 0) continue;
1248         if (mode == x86_64_mode_sse && nb_sse_args + reg_count <= 8) {
1249             nb_sse_args += reg_count;
1250             onstack[i] = 0;
1251         } else if (mode == x86_64_mode_integer && nb_reg_args + reg_count <= REGN) {
1252             nb_reg_args += reg_count;
1253             onstack[i] = 0;
1254         } else if (mode == x86_64_mode_none) {
1255             onstack[i] = 0;
1256         } else {
1257             if (align == 16 && (stack_adjust &= 15)) {
1258                 onstack[i] = 2;
1259                 stack_adjust = 0;
1260             } else
1261               onstack[i] = 1;
1262             stack_adjust += size;
1263         }
1264     }
1265
1266     if (nb_sse_args && tcc_state->nosse)
1267       tcc_error("SSE disabled but floating point arguments passed");
1268
1269     /* fetch cpu flag before generating any code */
1270     if ((vtop->r & VT_VALMASK) == VT_CMP)
1271       gv(RC_INT);
1272
1273     /* for struct arguments, we need to call memcpy and the function
1274        call breaks register passing arguments we are preparing.
1275        So, we process arguments which will be passed by stack first. */
1276     gen_reg = nb_reg_args;
1277     sse_reg = nb_sse_args;
1278     args_size = 0;
1279     stack_adjust &= 15;
1280     for (i = k = 0; i < nb_args;) {
1281         mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1282         if (size) {
1283             if (!onstack[i + k]) {
1284                 ++i;
1285                 continue;
1286             }
1287             /* Possibly adjust stack to align SSE boundary.  We're processing
1288                args from right to left while allocating happens left to right
1289                (stack grows down), so the adjustment needs to happen _after_
1290                an argument that requires it.  */
1291             if (stack_adjust) {
1292                 o(0x50); /* push %rax; aka sub $8,%rsp */
1293                 args_size += 8;
1294                 stack_adjust = 0;
1295             }
1296             if (onstack[i + k] == 2)
1297                 stack_adjust = 1;
1298         }
1299
1300         vrotb(i+1);
1301
1302         switch (vtop->type.t & VT_BTYPE) {
1303             case VT_STRUCT:
1304                 /* allocate the necessary size on stack */
1305                 o(0x48);
1306                 oad(0xec81, size); /* sub $xxx, %rsp */
1307                 /* generate structure store */
1308                 r = get_reg(RC_INT);
1309                 orex(1, r, 0, 0x89); /* mov %rsp, r */
1310                 o(0xe0 + REG_VALUE(r));
1311                 vset(&vtop->type, r | VT_LVAL, 0);
1312                 vswap();
1313                 vstore();
1314                 break;
1315
1316             case VT_LDOUBLE:
1317                 gv(RC_ST0);
1318                 oad(0xec8148, size); /* sub $xxx, %rsp */
1319                 o(0x7cdb); /* fstpt 0(%rsp) */
1320                 g(0x24);
1321                 g(0x00);
1322                 break;
1323
1324             case VT_FLOAT:
1325             case VT_DOUBLE:
1326                 assert(mode == x86_64_mode_sse);
1327                 r = gv(RC_FLOAT);
1328                 o(0x50); /* push $rax */
1329                 /* movq %xmmN, (%rsp) */
1330                 o(0xd60f66);
1331                 o(0x04 + REG_VALUE(r)*8);
1332                 o(0x24);
1333                 break;
1334
1335             default:
1336                 assert(mode == x86_64_mode_integer);
1337                 /* simple type */
1338                 /* XXX: implicit cast ? */
1339                 r = gv(RC_INT);
1340                 orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
1341                 break;
1342         }
1343         args_size += size;
1344
1345         vpop();
1346         --nb_args;
1347         k++;
1348     }
1349
1350     tcc_free(onstack);
1351
1352     /* XXX This should be superfluous.  */
1353     save_regs(0); /* save used temporary registers */
1354
1355     /* then, we prepare register passing arguments.
1356        Note that we cannot set RDX and RCX in this loop because gv()
1357        may break these temporary registers. Let's use R10 and R11
1358        instead of them */
1359     assert(gen_reg <= REGN);
1360     assert(sse_reg <= 8);
1361     for(i = 0; i < nb_args; i++) {
1362         mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
1363         if (size == 0) continue;
1364         /* Alter stack entry type so that gv() knows how to treat it */
1365         vtop->type = type;
1366         if (mode == x86_64_mode_sse) {
1367             if (reg_count == 2) {
1368                 sse_reg -= 2;
1369                 gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
1370                 if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
1371                     /* movaps %xmm1, %xmmN */
1372                     o(0x280f);
1373                     o(0xc1 + ((sse_reg+1) << 3));
1374                     /* movaps %xmm0, %xmmN */
1375                     o(0x280f);
1376                     o(0xc0 + (sse_reg << 3));
1377                 }
1378             } else {
1379                 assert(reg_count == 1);
1380                 --sse_reg;
1381                 /* Load directly to register */
1382                 gv(RC_XMM0 << sse_reg);
1383             }
1384         } else if (mode == x86_64_mode_integer) {
1385             /* simple type */
1386             /* XXX: implicit cast ? */
1387             int d;
1388             gen_reg -= reg_count;
1389             r = gv(RC_INT);
1390             d = arg_prepare_reg(gen_reg);
1391             orex(1,d,r,0x89); /* mov */
1392             o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
1393             if (reg_count == 2) {
1394                 d = arg_prepare_reg(gen_reg+1);
1395                 orex(1,d,vtop->r2,0x89); /* mov */
1396                 o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
1397             }
1398         }
1399         vtop--;
1400     }
1401     assert(gen_reg == 0);
1402     assert(sse_reg == 0);
1403
1404     /* We shouldn't have many operands on the stack anymore, but the
1405        call address itself is still there, and it might be in %eax
1406        (or edx/ecx) currently, which the below writes would clobber.
1407        So evict all remaining operands here.  */
1408     save_regs(0);
1409
1410     /* Copy R10 and R11 into RDX and RCX, respectively */
1411     if (nb_reg_args > 2) {
1412         o(0xd2894c); /* mov %r10, %rdx */
1413         if (nb_reg_args > 3) {
1414             o(0xd9894c); /* mov %r11, %rcx */
1415         }
1416     }
1417
1418     if (vtop->type.ref->f.func_type != FUNC_NEW) /* implies FUNC_OLD or FUNC_ELLIPSIS */
1419         oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
1420     gcall_or_jmp(0);
1421     if (args_size)
1422         gadd_sp(args_size);
1423     vtop--;
1424 }
1425
1426 #define FUNC_PROLOG_SIZE 11
1427
1428 static void push_arg_reg(int i) {
1429     loc -= 8;
1430     gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
1431 }
1432
1433 /* generate function prolog of type 't' */
1434 void gfunc_prolog(Sym *func_sym)
1435 {
1436     CType *func_type = &func_sym->type;
1437     X86_64_Mode mode;
1438     int i, addr, align, size, reg_count;
1439     int param_addr = 0, reg_param_index, sse_param_index;
1440     Sym *sym;
1441     CType *type;
1442
1443     sym = func_type->ref;
1444     addr = PTR_SIZE * 2;
1445     loc = 0;
1446     ind += FUNC_PROLOG_SIZE;
1447     func_sub_sp_offset = ind;
1448     func_ret_sub = 0;
1449
1450     if (func_var) {
1451         int seen_reg_num, seen_sse_num, seen_stack_size;
1452         seen_reg_num = seen_sse_num = 0;
1453         /* frame pointer and return address */
1454         seen_stack_size = PTR_SIZE * 2;
1455         /* count the number of seen parameters */
1456         sym = func_type->ref;
1457         while ((sym = sym->next) != NULL) {
1458             type = &sym->type;
1459             mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1460             switch (mode) {
1461             default:
1462             stack_arg:
1463                 seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
1464                 break;
1465
1466             case x86_64_mode_integer:
1467                 if (seen_reg_num + reg_count > REGN)
1468                     goto stack_arg;
1469                 seen_reg_num += reg_count;
1470                 break;
1471
1472             case x86_64_mode_sse:
1473                 if (seen_sse_num + reg_count > 8)
1474                     goto stack_arg;
1475                 seen_sse_num += reg_count;
1476                 break;
1477             }
1478         }
1479
1480         loc -= 24;
1481         /* movl $0x????????, -0x18(%rbp) */
1482         o(0xe845c7);
1483         gen_le32(seen_reg_num * 8);
1484         /* movl $0x????????, -0x14(%rbp) */
1485         o(0xec45c7);
1486         gen_le32(seen_sse_num * 16 + 48);
1487         /* leaq $0x????????, %r11 */
1488         o(0x9d8d4c);
1489         gen_le32(seen_stack_size);
1490         /* movq %r11, -0x10(%rbp) */
1491         o(0xf05d894c);
1492         /* leaq $-192(%rbp), %r11 */
1493         o(0x9d8d4c);
1494         gen_le32(-176 - 24);
1495         /* movq %r11, -0x8(%rbp) */
1496         o(0xf85d894c);
1497
1498         /* save all register passing arguments */
1499         for (i = 0; i < 8; i++) {
1500             loc -= 16;
1501             if (!tcc_state->nosse) {
1502                 o(0xd60f66); /* movq */
1503                 gen_modrm(7 - i, VT_LOCAL, NULL, loc);
1504             }
1505             /* movq $0, loc+8(%rbp) */
1506             o(0x85c748);
1507             gen_le32(loc + 8);
1508             gen_le32(0);
1509         }
1510         for (i = 0; i < REGN; i++) {
1511             push_arg_reg(REGN-1-i);
1512         }
1513     }
1514
1515     sym = func_type->ref;
1516     reg_param_index = 0;
1517     sse_param_index = 0;
1518
1519     /* if the function returns a structure, then add an
1520        implicit pointer parameter */
1521     mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
1522     if (mode == x86_64_mode_memory) {
1523         push_arg_reg(reg_param_index);
1524         func_vc = loc;
1525         reg_param_index++;
1526     }
1527     /* define parameters */
1528     while ((sym = sym->next) != NULL) {
1529         type = &sym->type;
1530         mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1531         switch (mode) {
1532         case x86_64_mode_sse:
1533             if (tcc_state->nosse)
1534                 tcc_error("SSE disabled but floating point arguments used");
1535             if (sse_param_index + reg_count <= 8) {
1536                 /* save arguments passed by register */
1537                 loc -= reg_count * 8;
1538                 param_addr = loc;
1539                 for (i = 0; i < reg_count; ++i) {
1540                     o(0xd60f66); /* movq */
1541                     gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
1542                     ++sse_param_index;
1543                 }
1544             } else {
1545                 addr = (addr + align - 1) & -align;
1546                 param_addr = addr;
1547                 addr += size;
1548             }
1549             break;
1550
1551         case x86_64_mode_memory:
1552         case x86_64_mode_x87:
1553             addr = (addr + align - 1) & -align;
1554             param_addr = addr;
1555             addr += size;
1556             break;
1557
1558         case x86_64_mode_integer: {
1559             if (reg_param_index + reg_count <= REGN) {
1560                 /* save arguments passed by register */
1561                 loc -= reg_count * 8;
1562                 param_addr = loc;
1563                 for (i = 0; i < reg_count; ++i) {
1564                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
1565                     ++reg_param_index;
1566                 }
1567             } else {
1568                 addr = (addr + align - 1) & -align;
1569                 param_addr = addr;
1570                 addr += size;
1571             }
1572             break;
1573         }
1574         default: break; /* nothing to be done for x86_64_mode_none */
1575         }
1576         sym_push(sym->v & ~SYM_FIELD, type,
1577                  VT_LOCAL | VT_LVAL, param_addr);
1578     }
1579
1580 #ifdef CONFIG_TCC_BCHECK
1581     if (tcc_state->do_bounds_check)
1582         gen_bounds_prolog();
1583 #endif
1584 }
1585
1586 /* generate function epilog */
1587 void gfunc_epilog(void)
1588 {
1589     int v, saved_ind;
1590
1591 #ifdef CONFIG_TCC_BCHECK
1592     if (tcc_state->do_bounds_check)
1593         gen_bounds_epilog();
1594 #endif
1595     o(0xc9); /* leave */
1596     if (func_ret_sub == 0) {
1597         o(0xc3); /* ret */
1598     } else {
1599         o(0xc2); /* ret n */
1600         g(func_ret_sub);
1601         g(func_ret_sub >> 8);
1602     }
1603     /* align local size to word & save local variables */
1604     v = (-loc + 15) & -16;
1605     saved_ind = ind;
1606     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
1607     o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
1608     o(0xec8148);  /* sub rsp, stacksize */
1609     gen_le32(v);
1610     ind = saved_ind;
1611 }
1612
1613 #endif /* not PE */
1614
1615 ST_FUNC void gen_fill_nops(int bytes)
1616 {
1617     while (bytes--)
1618       g(0x90);
1619 }
1620
1621 /* generate a jump to a label */
1622 int gjmp(int t)
1623 {
1624     return gjmp2(0xe9, t);
1625 }
1626
1627 /* generate a jump to a fixed address */
1628 void gjmp_addr(int a)
1629 {
1630     int r;
1631     r = a - ind - 2;
1632     if (r == (char)r) {
1633         g(0xeb);
1634         g(r);
1635     } else {
1636         oad(0xe9, a - ind - 5);
1637     }
1638 }
1639
1640 ST_FUNC int gjmp_append(int n, int t)
1641 {
1642     void *p;
1643     /* insert vtop->c jump list in t */
1644     if (n) {
1645         uint32_t n1 = n, n2;
1646         while ((n2 = read32le(p = cur_text_section->data + n1)))
1647             n1 = n2;
1648         write32le(p, t);
1649         t = n;
1650     }
1651     return t;
1652 }
1653
1654 ST_FUNC int gjmp_cond(int op, int t)
1655 {
1656         if (op & 0x100)
1657           {
1658             /* This was a float compare.  If the parity flag is set
1659                the result was unordered.  For anything except != this
1660                means false and we don't jump (anding both conditions).
1661                For != this means true (oring both).
1662                Take care about inverting the test.  We need to jump
1663                to our target if the result was unordered and test wasn't NE,
1664                otherwise if unordered we don't want to jump.  */
1665             int v = vtop->cmp_r;
1666             op &= ~0x100;
1667             if (op ^ v ^ (v != TOK_NE))
1668               o(0x067a);  /* jp +6 */
1669             else
1670               {
1671                 g(0x0f);
1672                 t = gjmp2(0x8a, t); /* jp t */
1673               }
1674           }
1675         g(0x0f);
1676         t = gjmp2(op - 16, t);
1677         return t;
1678 }
1679
1680 /* generate an integer binary operation */
1681 void gen_opi(int op)
1682 {
1683     int r, fr, opc, c;
1684     int ll, uu, cc;
1685
1686     ll = is64_type(vtop[-1].type.t);
1687     uu = (vtop[-1].type.t & VT_UNSIGNED) != 0;
1688     cc = (vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
1689
1690     switch(op) {
1691     case '+':
1692     case TOK_ADDC1: /* add with carry generation */
1693         opc = 0;
1694     gen_op8:
1695         if (cc && (!ll || (int)vtop->c.i == vtop->c.i)) {
1696             /* constant case */
1697             vswap();
1698             r = gv(RC_INT);
1699             vswap();
1700             c = vtop->c.i;
1701             if (c == (char)c) {
1702                 /* XXX: generate inc and dec for smaller code ? */
1703                 orex(ll, r, 0, 0x83);
1704                 o(0xc0 | (opc << 3) | REG_VALUE(r));
1705                 g(c);
1706             } else {
1707                 orex(ll, r, 0, 0x81);
1708                 oad(0xc0 | (opc << 3) | REG_VALUE(r), c);
1709             }
1710         } else {
1711             gv2(RC_INT, RC_INT);
1712             r = vtop[-1].r;
1713             fr = vtop[0].r;
1714             orex(ll, r, fr, (opc << 3) | 0x01);
1715             o(0xc0 + REG_VALUE(r) + REG_VALUE(fr) * 8);
1716         }
1717         vtop--;
1718         if (op >= TOK_ULT && op <= TOK_GT)
1719             vset_VT_CMP(op);
1720         break;
1721     case '-':
1722     case TOK_SUBC1: /* sub with carry generation */
1723         opc = 5;
1724         goto gen_op8;
1725     case TOK_ADDC2: /* add with carry use */
1726         opc = 2;
1727         goto gen_op8;
1728     case TOK_SUBC2: /* sub with carry use */
1729         opc = 3;
1730         goto gen_op8;
1731     case '&':
1732         opc = 4;
1733         goto gen_op8;
1734     case '^':
1735         opc = 6;
1736         goto gen_op8;
1737     case '|':
1738         opc = 1;
1739         goto gen_op8;
1740     case '*':
1741         gv2(RC_INT, RC_INT);
1742         r = vtop[-1].r;
1743         fr = vtop[0].r;
1744         orex(ll, fr, r, 0xaf0f); /* imul fr, r */
1745         o(0xc0 + REG_VALUE(fr) + REG_VALUE(r) * 8);
1746         vtop--;
1747         break;
1748     case TOK_SHL:
1749         opc = 4;
1750         goto gen_shift;
1751     case TOK_SHR:
1752         opc = 5;
1753         goto gen_shift;
1754     case TOK_SAR:
1755         opc = 7;
1756     gen_shift:
1757         opc = 0xc0 | (opc << 3);
1758         if (cc) {
1759             /* constant case */
1760             vswap();
1761             r = gv(RC_INT);
1762             vswap();
1763             orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
1764             o(opc | REG_VALUE(r));
1765             g(vtop->c.i & (ll ? 63 : 31));
1766         } else {
1767             /* we generate the shift in ecx */
1768             gv2(RC_INT, RC_RCX);
1769             r = vtop[-1].r;
1770             orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
1771             o(opc | REG_VALUE(r));
1772         }
1773         vtop--;
1774         break;
1775     case TOK_UDIV:
1776     case TOK_UMOD:
1777         uu = 1;
1778         goto divmod;
1779     case '/':
1780     case '%':
1781     case TOK_PDIV:
1782         uu = 0;
1783     divmod:
1784         /* first operand must be in eax */
1785         /* XXX: need better constraint for second operand */
1786         gv2(RC_RAX, RC_RCX);
1787         r = vtop[-1].r;
1788         fr = vtop[0].r;
1789         vtop--;
1790         save_reg(TREG_RDX);
1791         orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cqto */
1792         orex(ll, fr, 0, 0xf7); /* div fr, %eax */
1793         o((uu ? 0xf0 : 0xf8) + REG_VALUE(fr));
1794         if (op == '%' || op == TOK_UMOD)
1795             r = TREG_RDX;
1796         else
1797             r = TREG_RAX;
1798         vtop->r = r;
1799         break;
1800     default:
1801         opc = 7;
1802         goto gen_op8;
1803     }
1804 }
1805
1806 void gen_opl(int op)
1807 {
1808     gen_opi(op);
1809 }
1810
1811 /* generate a floating point operation 'v = t1 op t2' instruction. The
1812    two operands are guaranteed to have the same floating point type */
1813 /* XXX: need to use ST1 too */
1814 void gen_opf(int op)
1815 {
1816     int a, ft, fc, swapped, r;
1817     int float_type =
1818         (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;
1819
1820     /* convert constants to memory references */
1821     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
1822         vswap();
1823         gv(float_type);
1824         vswap();
1825     }
1826     if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
1827         gv(float_type);
1828
1829     /* must put at least one value in the floating point register */
1830     if ((vtop[-1].r & VT_LVAL) &&
1831         (vtop[0].r & VT_LVAL)) {
1832         vswap();
1833         gv(float_type);
1834         vswap();
1835     }
1836     swapped = 0;
1837     /* swap the stack if needed so that t1 is the register and t2 is
1838        the memory reference */
1839     if (vtop[-1].r & VT_LVAL) {
1840         vswap();
1841         swapped = 1;
1842     }
1843     if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1844         if (op >= TOK_ULT && op <= TOK_GT) {
1845             /* load on stack second operand */
1846             load(TREG_ST0, vtop);
1847             save_reg(TREG_RAX); /* eax is used by FP comparison code */
1848             if (op == TOK_GE || op == TOK_GT)
1849                 swapped = !swapped;
1850             else if (op == TOK_EQ || op == TOK_NE)
1851                 swapped = 0;
1852             if (swapped)
1853                 o(0xc9d9); /* fxch %st(1) */
1854             if (op == TOK_EQ || op == TOK_NE)
1855                 o(0xe9da); /* fucompp */
1856             else
1857                 o(0xd9de); /* fcompp */
1858             o(0xe0df); /* fnstsw %ax */
1859             if (op == TOK_EQ) {
1860                 o(0x45e480); /* and $0x45, %ah */
1861                 o(0x40fC80); /* cmp $0x40, %ah */
1862             } else if (op == TOK_NE) {
1863                 o(0x45e480); /* and $0x45, %ah */
1864                 o(0x40f480); /* xor $0x40, %ah */
1865                 op = TOK_NE;
1866             } else if (op == TOK_GE || op == TOK_LE) {
1867                 o(0x05c4f6); /* test $0x05, %ah */
1868                 op = TOK_EQ;
1869             } else {
1870                 o(0x45c4f6); /* test $0x45, %ah */
1871                 op = TOK_EQ;
1872             }
1873             vtop--;
1874             vset_VT_CMP(op);
1875         } else {
1876             /* no memory reference possible for long double operations */
1877             load(TREG_ST0, vtop);
1878             swapped = !swapped;
1879
1880             switch(op) {
1881             default:
1882             case '+':
1883                 a = 0;
1884                 break;
1885             case '-':
1886                 a = 4;
1887                 if (swapped)
1888                     a++;
1889                 break;
1890             case '*':
1891                 a = 1;
1892                 break;
1893             case '/':
1894                 a = 6;
1895                 if (swapped)
1896                     a++;
1897                 break;
1898             }
1899             ft = vtop->type.t;
1900             fc = vtop->c.i;
1901             o(0xde); /* fxxxp %st, %st(1) */
1902             o(0xc1 + (a << 3));
1903             vtop--;
1904         }
1905     } else {
1906         if (op >= TOK_ULT && op <= TOK_GT) {
1907             /* if saved lvalue, then we must reload it */
1908             r = vtop->r;
1909             fc = vtop->c.i;
1910             if ((r & VT_VALMASK) == VT_LLOCAL) {
1911                 SValue v1;
1912                 r = get_reg(RC_INT);
1913                 v1.type.t = VT_PTR;
1914                 v1.r = VT_LOCAL | VT_LVAL;
1915                 v1.c.i = fc;
1916                 load(r, &v1);
1917                 fc = 0;
1918                 vtop->r = r = r | VT_LVAL;
1919             }
1920
1921             if (op == TOK_EQ || op == TOK_NE) {
1922                 swapped = 0;
1923             } else {
1924                 if (op == TOK_LE || op == TOK_LT)
1925                     swapped = !swapped;
1926                 if (op == TOK_LE || op == TOK_GE) {
1927                     op = 0x93; /* setae */
1928                 } else {
1929                     op = 0x97; /* seta */
1930                 }
1931             }
1932
1933             if (swapped) {
1934                 gv(RC_FLOAT);
1935                 vswap();
1936             }
1937             assert(!(vtop[-1].r & VT_LVAL));
1938
1939             if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
1940                 o(0x66);
1941             if (op == TOK_EQ || op == TOK_NE)
1942                 o(0x2e0f); /* ucomisd */
1943             else
1944                 o(0x2f0f); /* comisd */
1945
1946             if (vtop->r & VT_LVAL) {
1947                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
1948             } else {
1949                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
1950             }
1951
1952             vtop--;
1953             vset_VT_CMP(op | 0x100);
1954             vtop->cmp_r = op;
1955         } else {
1956             assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
1957             switch(op) {
1958             default:
1959             case '+':
1960                 a = 0;
1961                 break;
1962             case '-':
1963                 a = 4;
1964                 break;
1965             case '*':
1966                 a = 1;
1967                 break;
1968             case '/':
1969                 a = 6;
1970                 break;
1971             }
1972             ft = vtop->type.t;
1973             fc = vtop->c.i;
1974             assert((ft & VT_BTYPE) != VT_LDOUBLE);
1975
1976             r = vtop->r;
1977             /* if saved lvalue, then we must reload it */
1978             if ((vtop->r & VT_VALMASK) == VT_LLOCAL) {
1979                 SValue v1;
1980                 r = get_reg(RC_INT);
1981                 v1.type.t = VT_PTR;
1982                 v1.r = VT_LOCAL | VT_LVAL;
1983                 v1.c.i = fc;
1984                 load(r, &v1);
1985                 fc = 0;
1986                 vtop->r = r = r | VT_LVAL;
1987             }
1988
1989             assert(!(vtop[-1].r & VT_LVAL));
1990             if (swapped) {
1991                 assert(vtop->r & VT_LVAL);
1992                 gv(RC_FLOAT);
1993                 vswap();
1994             }
1995
1996             if ((ft & VT_BTYPE) == VT_DOUBLE) {
1997                 o(0xf2);
1998             } else {
1999                 o(0xf3);
2000             }
2001             o(0x0f);
2002             o(0x58 + a);
2003
2004             if (vtop->r & VT_LVAL) {
2005                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
2006             } else {
2007                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
2008             }
2009
2010             vtop--;
2011         }
2012     }
2013 }
2014
2015 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
2016    and 'long long' cases. */
2017 void gen_cvt_itof(int t)
2018 {
2019     if ((t & VT_BTYPE) == VT_LDOUBLE) {
2020         save_reg(TREG_ST0);
2021         gv(RC_INT);
2022         if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
2023             /* signed long long to float/double/long double (unsigned case
2024                is handled generically) */
2025             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2026             o(0x242cdf); /* fildll (%rsp) */
2027             o(0x08c48348); /* add $8, %rsp */
2028         } else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2029                    (VT_INT | VT_UNSIGNED)) {
2030             /* unsigned int to float/double/long double */
2031             o(0x6a); /* push $0 */
2032             g(0x00);
2033             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2034             o(0x242cdf); /* fildll (%rsp) */
2035             o(0x10c48348); /* add $16, %rsp */
2036         } else {
2037             /* int to float/double/long double */
2038             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2039             o(0x2404db); /* fildl (%rsp) */
2040             o(0x08c48348); /* add $8, %rsp */
2041         }
2042         vtop->r = TREG_ST0;
2043     } else {
2044         int r = get_reg(RC_FLOAT);
2045         gv(RC_INT);
2046         o(0xf2 + ((t & VT_BTYPE) == VT_FLOAT?1:0));
2047         if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2048             (VT_INT | VT_UNSIGNED) ||
2049             (vtop->type.t & VT_BTYPE) == VT_LLONG) {
2050             o(0x48); /* REX */
2051         }
2052         o(0x2a0f);
2053         o(0xc0 + (vtop->r & VT_VALMASK) + REG_VALUE(r)*8); /* cvtsi2sd */
2054         vtop->r = r;
2055     }
2056 }
2057
2058 /* convert from one floating point type to another */
2059 void gen_cvt_ftof(int t)
2060 {
2061     int ft, bt, tbt;
2062
2063     ft = vtop->type.t;
2064     bt = ft & VT_BTYPE;
2065     tbt = t & VT_BTYPE;
2066
2067     if (bt == VT_FLOAT) {
2068         gv(RC_FLOAT);
2069         if (tbt == VT_DOUBLE) {
2070             o(0x140f); /* unpcklps */
2071             o(0xc0 + REG_VALUE(vtop->r)*9);
2072             o(0x5a0f); /* cvtps2pd */
2073             o(0xc0 + REG_VALUE(vtop->r)*9);
2074         } else if (tbt == VT_LDOUBLE) {
2075             save_reg(RC_ST0);
2076             /* movss %xmm0,-0x10(%rsp) */
2077             o(0x110ff3);
2078             o(0x44 + REG_VALUE(vtop->r)*8);
2079             o(0xf024);
2080             o(0xf02444d9); /* flds -0x10(%rsp) */
2081             vtop->r = TREG_ST0;
2082         }
2083     } else if (bt == VT_DOUBLE) {
2084         gv(RC_FLOAT);
2085         if (tbt == VT_FLOAT) {
2086             o(0x140f66); /* unpcklpd */
2087             o(0xc0 + REG_VALUE(vtop->r)*9);
2088             o(0x5a0f66); /* cvtpd2ps */
2089             o(0xc0 + REG_VALUE(vtop->r)*9);
2090         } else if (tbt == VT_LDOUBLE) {
2091             save_reg(RC_ST0);
2092             /* movsd %xmm0,-0x10(%rsp) */
2093             o(0x110ff2);
2094             o(0x44 + REG_VALUE(vtop->r)*8);
2095             o(0xf024);
2096             o(0xf02444dd); /* fldl -0x10(%rsp) */
2097             vtop->r = TREG_ST0;
2098         }
2099     } else {
2100         int r;
2101         gv(RC_ST0);
2102         r = get_reg(RC_FLOAT);
2103         if (tbt == VT_DOUBLE) {
2104             o(0xf0245cdd); /* fstpl -0x10(%rsp) */
2105             /* movsd -0x10(%rsp),%xmm0 */
2106             o(0x100ff2);
2107             o(0x44 + REG_VALUE(r)*8);
2108             o(0xf024);
2109             vtop->r = r;
2110         } else if (tbt == VT_FLOAT) {
2111             o(0xf0245cd9); /* fstps -0x10(%rsp) */
2112             /* movss -0x10(%rsp),%xmm0 */
2113             o(0x100ff3);
2114             o(0x44 + REG_VALUE(r)*8);
2115             o(0xf024);
2116             vtop->r = r;
2117         }
2118     }
2119 }
2120
2121 /* convert fp to int 't' type */
2122 void gen_cvt_ftoi(int t)
2123 {
2124     int ft, bt, size, r;
2125     ft = vtop->type.t;
2126     bt = ft & VT_BTYPE;
2127     if (bt == VT_LDOUBLE) {
2128         gen_cvt_ftof(VT_DOUBLE);
2129         bt = VT_DOUBLE;
2130     }
2131
2132     gv(RC_FLOAT);
2133     if (t != VT_INT)
2134         size = 8;
2135     else
2136         size = 4;
2137
2138     r = get_reg(RC_INT);
2139     if (bt == VT_FLOAT) {
2140         o(0xf3);
2141     } else if (bt == VT_DOUBLE) {
2142         o(0xf2);
2143     } else {
2144         assert(0);
2145     }
2146     orex(size == 8, r, 0, 0x2c0f); /* cvttss2si or cvttsd2si */
2147     o(0xc0 + REG_VALUE(vtop->r) + REG_VALUE(r)*8);
2148     vtop->r = r;
2149 }
2150
2151 // Generate sign extension from 32 to 64 bits:
2152 ST_FUNC void gen_cvt_sxtw(void)
2153 {
2154     int r = gv(RC_INT);
2155     /* x86_64 specific: movslq */
2156     o(0x6348);
2157     o(0xc0 + (REG_VALUE(r) << 3) + REG_VALUE(r));
2158 }
2159
2160 /* char/short to int conversion */
2161 ST_FUNC void gen_cvt_csti(int t)
2162 {
2163     int r, sz, xl, ll;
2164     r = gv(RC_INT);
2165     sz = !(t & VT_UNSIGNED);
2166     xl = (t & VT_BTYPE) == VT_SHORT;
2167     ll = (vtop->type.t & VT_BTYPE) == VT_LLONG;
2168     orex(ll, r, 0, 0xc0b60f /* mov[sz] %a[xl], %eax */
2169         | (sz << 3 | xl) << 8
2170         | (REG_VALUE(r) << 3 | REG_VALUE(r)) << 16
2171         );
2172 }
2173
2174 /* computed goto support */
2175 void ggoto(void)
2176 {
2177     gcall_or_jmp(1);
2178     vtop--;
2179 }
2180
2181 /* Save the stack pointer onto the stack and return the location of its address */
2182 ST_FUNC void gen_vla_sp_save(int addr) {
2183     /* mov %rsp,addr(%rbp)*/
2184     gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
2185 }
2186
2187 /* Restore the SP from a location on the stack */
2188 ST_FUNC void gen_vla_sp_restore(int addr) {
2189     gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr);
2190 }
2191
2192 #ifdef TCC_TARGET_PE
2193 /* Save result of gen_vla_alloc onto the stack */
2194 ST_FUNC void gen_vla_result(int addr) {
2195     /* mov %rax,addr(%rbp)*/
2196     gen_modrm64(0x89, TREG_RAX, VT_LOCAL, NULL, addr);
2197 }
2198 #endif
2199
2200 /* Subtract from the stack pointer, and push the resulting value onto the stack */
2201 ST_FUNC void gen_vla_alloc(CType *type, int align) {
2202     int use_call = 0;
2203
2204 #if defined(CONFIG_TCC_BCHECK)
2205     use_call = tcc_state->do_bounds_check;
2206 #endif
2207 #ifdef TCC_TARGET_PE    /* alloca does more than just adjust %rsp on Windows */
2208     use_call = 1;
2209 #endif
2210     if (use_call)
2211     {
2212         vpush_helper_func(TOK_alloca);
2213         vswap(); /* Move alloca ref past allocation size */
2214         gfunc_call(1);
2215     }
2216     else {
2217         int r;
2218         r = gv(RC_INT); /* allocation size */
2219         /* sub r,%rsp */
2220         o(0x2b48);
2221         o(0xe0 | REG_VALUE(r));
2222         /* We align to 16 bytes rather than align */
2223         /* and ~15, %rsp */
2224         o(0xf0e48348);
2225         vpop();
2226     }
2227 }
2228
2229
2230 /* end of x86-64 code generator */
2231 /*************************************************************/
2232 #endif /* ! TARGET_DEFS_ONLY */
2233 /******************************************************/