x86_64-gen.c

   1 /*
   2  *  x86-64 code generator for TCC
   3  *
   4  *  Copyright (c) 2008 Shinichiro Hamaji
   5  *
   6  *  Based on i386-gen.c by Fabrice Bellard
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #ifdef TARGET_DEFS_ONLY
  24
  25 /* number of available registers */
  26 #define NB_REGS         25
  27 #define NB_ASM_REGS     8
  28
  29 /* a register can belong to several classes. The classes must be
  30    sorted from more general to more precise (see gv2() code which does
  31    assumptions on it). */
  32 #define RC_INT     0x0001 /* generic integer register */
  33 #define RC_FLOAT   0x0002 /* generic float register */
  34 #define RC_RAX     0x0004
  35 #define RC_RCX     0x0008
  36 #define RC_RDX     0x0010
  37 #define RC_ST0     0x0080 /* only for long double */
  38 #define RC_R8      0x0100
  39 #define RC_R9      0x0200
  40 #define RC_R10     0x0400
  41 #define RC_R11     0x0800
  42 #define RC_XMM0    0x1000
  43 #define RC_XMM1    0x2000
  44 #define RC_XMM2    0x4000
  45 #define RC_XMM3    0x8000
  46 #define RC_XMM4    0x10000
  47 #define RC_XMM5    0x20000
  48 #define RC_XMM6    0x40000
  49 #define RC_XMM7    0x80000
  50 #define RC_IRET    RC_RAX /* function return: integer register */
  51 #define RC_LRET    RC_RDX /* function return: second integer register */
  52 #define RC_FRET    RC_XMM0 /* function return: float register */
  53 #define RC_QRET    RC_XMM1 /* function return: second float register */
  54
  55 /* pretty names for the registers */
  56 enum {
  57     TREG_RAX = 0,
  58     TREG_RCX = 1,
  59     TREG_RDX = 2,
  60     TREG_RSP = 4,
  61     TREG_RSI = 6,
  62     TREG_RDI = 7,
  63
  64     TREG_R8  = 8,
  65     TREG_R9  = 9,
  66     TREG_R10 = 10,
  67     TREG_R11 = 11,
  68
  69     TREG_XMM0 = 16,
  70     TREG_XMM1 = 17,
  71     TREG_XMM2 = 18,
  72     TREG_XMM3 = 19,
  73     TREG_XMM4 = 20,
  74     TREG_XMM5 = 21,
  75     TREG_XMM6 = 22,
  76     TREG_XMM7 = 23,
  77
  78     TREG_ST0 = 24,
  79
  80     TREG_MEM = 0x20,
  81 };
  82
  83 #define REX_BASE(reg) (((reg) >> 3) & 1)
  84 #define REG_VALUE(reg) ((reg) & 7)
  85
  86 /* return registers for function */
  87 #define REG_IRET TREG_RAX /* single word int return register */
  88 #define REG_LRET TREG_RDX /* second word return register (for long long) */
  89 #define REG_FRET TREG_XMM0 /* float return register */
  90 #define REG_QRET TREG_XMM1 /* second float return register */
  91
  92 /* defined if function parameters must be evaluated in reverse order */
  93 #define INVERT_FUNC_PARAMS
  94
  95 /* pointer size, in bytes */
  96 #define PTR_SIZE 8
  97
  98 /* long double size and alignment, in bytes */
  99 #define LDOUBLE_SIZE  16
 100 #define LDOUBLE_ALIGN 16
 101 /* maximum alignment (for aligned attribute support) */
 102 #define MAX_ALIGN     16
 103
 104 /******************************************************/
 105 /* ELF defines */
 106
 107 #define EM_TCC_TARGET EM_X86_64
 108
 109 /* relocation type for 32 bit data relocation */
 110 #define R_DATA_32   R_X86_64_32
 111 #define R_DATA_PTR  R_X86_64_64
 112 #define R_JMP_SLOT  R_X86_64_JUMP_SLOT
 113 #define R_COPY      R_X86_64_COPY
 114
 115 #define ELF_START_ADDR 0x08048000
 116 #define ELF_PAGE_SIZE  0x1000
 117
 118 /******************************************************/
 119 #else /* ! TARGET_DEFS_ONLY */
 120 /******************************************************/
 121 #include "tcc.h"
 122 #include <assert.h>
 123
 124 ST_DATA const int reg_classes[NB_REGS] = {
 125     /* eax */ RC_INT | RC_RAX,
 126     /* ecx */ RC_INT | RC_RCX,
 127     /* edx */ RC_INT | RC_RDX,
 128     0,
 129     0,
 130     0,
 131     0,
 132     0,
 133     RC_R8,
 134     RC_R9,
 135     RC_R10,
 136     RC_R11,
 137     0,
 138     0,
 139     0,
 140     0,
 141     /* xmm0 */ RC_FLOAT | RC_XMM0,
 142     /* xmm1 */ RC_FLOAT | RC_XMM1,
 143     /* xmm2 */ RC_FLOAT | RC_XMM2,
 144     /* xmm3 */ RC_FLOAT | RC_XMM3,
 145     /* xmm4 */ RC_FLOAT | RC_XMM4,
 146     /* xmm5 */ RC_FLOAT | RC_XMM5,
 147     /* xmm6 an xmm7 are included so gv() can be used on them,
 148        but they are not tagged with RC_FLOAT because they are
 149        callee saved on Windows */
 150     RC_XMM6,
 151     RC_XMM7,
 152     /* st0 */ RC_ST0
 153 };
 154
 155 static unsigned long func_sub_sp_offset;
 156 static int func_ret_sub;
 157
 158 /* XXX: make it faster ? */
 159 void g(int c)
 160 {
 161     int ind1;
 162     ind1 = ind + 1;
 163     if (ind1 > cur_text_section->data_allocated)
 164         section_realloc(cur_text_section, ind1);
 165     cur_text_section->data[ind] = c;
 166     ind = ind1;
 167 }
 168
 169 void o(unsigned int c)
 170 {
 171     while (c) {
 172         g(c);
 173         c = c >> 8;
 174     }
 175 }
 176
 177 void gen_le16(int v)
 178 {
 179     g(v);
 180     g(v >> 8);
 181 }
 182
 183 void gen_le32(int c)
 184 {
 185     g(c);
 186     g(c >> 8);
 187     g(c >> 16);
 188     g(c >> 24);
 189 }
 190
 191 void gen_le64(int64_t c)
 192 {
 193     g(c);
 194     g(c >> 8);
 195     g(c >> 16);
 196     g(c >> 24);
 197     g(c >> 32);
 198     g(c >> 40);
 199     g(c >> 48);
 200     g(c >> 56);
 201 }
 202
 203 void orex(int ll, int r, int r2, int b)
 204 {
 205     if ((r & VT_VALMASK) >= VT_CONST)
 206         r = 0;
 207     if ((r2 & VT_VALMASK) >= VT_CONST)
 208         r2 = 0;
 209     if (ll || REX_BASE(r) || REX_BASE(r2))
 210         o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
 211     o(b);
 212 }
 213
 214 /* output a symbol and patch all calls to it */
 215 void gsym_addr(int t, int a)
 216 {
 217     int n, *ptr;
 218     while (t) {
 219         ptr = (int *)(cur_text_section->data + t);
 220         n = *ptr; /* next value */
 221         *ptr = a - t - 4;
 222         t = n;
 223     }
 224 }
 225
 226 void gsym(int t)
 227 {
 228     gsym_addr(t, ind);
 229 }
 230
 231 /* psym is used to put an instruction with a data field which is a
 232    reference to a symbol. It is in fact the same as oad ! */
 233 #define psym oad
 234
 235 static int is64_type(int t)
 236 {
 237     return ((t & VT_BTYPE) == VT_PTR ||
 238             (t & VT_BTYPE) == VT_FUNC ||
 239             (t & VT_BTYPE) == VT_LLONG);
 240 }
 241
 242 /* instruction + 4 bytes data. Return the address of the data */
 243 ST_FUNC int oad(int c, int s)
 244 {
 245     int ind1;
 246
 247     o(c);
 248     ind1 = ind + 4;
 249     if (ind1 > cur_text_section->data_allocated)
 250         section_realloc(cur_text_section, ind1);
 251     *(int *)(cur_text_section->data + ind) = s;
 252     s = ind;
 253     ind = ind1;
 254     return s;
 255 }
 256
 257 ST_FUNC void gen_addr32(int r, Sym *sym, int c)
 258 {
 259     if (r & VT_SYM)
 260         greloc(cur_text_section, sym, ind, R_X86_64_32);
 261     gen_le32(c);
 262 }
 263
 264 /* output constant with relocation if 'r & VT_SYM' is true */
 265 ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
 266 {
 267     if (r & VT_SYM)
 268         greloc(cur_text_section, sym, ind, R_X86_64_64);
 269     gen_le64(c);
 270 }
 271
 272 /* output constant with relocation if 'r & VT_SYM' is true */
 273 ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
 274 {
 275     if (r & VT_SYM)
 276         greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 277     gen_le32(c-4);
 278 }
 279
 280 /* output got address with relocation */
 281 static void gen_gotpcrel(int r, Sym *sym, int c)
 282 {
 283 #ifndef TCC_TARGET_PE
 284     Section *sr;
 285     ElfW(Rela) *rel;
 286     greloc(cur_text_section, sym, ind, R_X86_64_GOTPCREL);
 287     sr = cur_text_section->reloc;
 288     rel = (ElfW(Rela) *)(sr->data + sr->data_offset - sizeof(ElfW(Rela)));
 289     rel->r_addend = -4;
 290 #else
 291     printf("picpic: %s %x %x | %02x %02x %02x\n", get_tok_str(sym->v, NULL), c, r,
 292         cur_text_section->data[ind-3],
 293         cur_text_section->data[ind-2],
 294         cur_text_section->data[ind-1]
 295         );
 296     greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 297 #endif
 298     gen_le32(0);
 299     if (c) {
 300         /* we use add c, %xxx for displacement */
 301         orex(1, r, 0, 0x81);
 302         o(0xc0 + REG_VALUE(r));
 303         gen_le32(c);
 304     }
 305 }
 306
 307 static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
 308 {
 309     op_reg = REG_VALUE(op_reg) << 3;
 310     if ((r & VT_VALMASK) == VT_CONST) {
 311         /* constant memory reference */
 312         o(0x05 | op_reg);
 313         if (is_got) {
 314             gen_gotpcrel(r, sym, c);
 315         } else {
 316             gen_addrpc32(r, sym, c);
 317         }
 318     } else if ((r & VT_VALMASK) == VT_LOCAL) {
 319         /* currently, we use only ebp as base */
 320         if (c == (char)c) {
 321             /* short reference */
 322             o(0x45 | op_reg);
 323             g(c);
 324         } else {
 325             oad(0x85 | op_reg, c);
 326         }
 327     } else if ((r & VT_VALMASK) >= TREG_MEM) {
 328         if (c) {
 329             g(0x80 | op_reg | REG_VALUE(r));
 330             gen_le32(c);
 331         } else {
 332             g(0x00 | op_reg | REG_VALUE(r));
 333         }
 334     } else {
 335         g(0x00 | op_reg | REG_VALUE(r));
 336     }
 337 }
 338
 339 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 340    opcode bits */
 341 static void gen_modrm(int op_reg, int r, Sym *sym, int c)
 342 {
 343     gen_modrm_impl(op_reg, r, sym, c, 0);
 344 }
 345
 346 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 347    opcode bits */
 348 static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
 349 {
 350     int is_got;
 351     is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
 352     orex(1, r, op_reg, opcode);
 353     gen_modrm_impl(op_reg, r, sym, c, is_got);
 354 }
 355
 356
 357 /* load 'r' from value 'sv' */
 358 void load(int r, SValue *sv)
 359 {
 360     int v, t, ft, fc, fr;
 361     SValue v1;
 362
 363 #ifdef TCC_TARGET_PE
 364     SValue v2;
 365     sv = pe_getimport(sv, &v2);
 366 #endif
 367
 368     fr = sv->r;
 369     ft = sv->type.t;
 370     fc = sv->c.ul;
 371
 372 #ifndef TCC_TARGET_PE
 373     /* we use indirect access via got */
 374     if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
 375         (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
 376         /* use the result register as a temporal register */
 377         int tr = r | TREG_MEM;
 378         if (is_float(ft)) {
 379             /* we cannot use float registers as a temporal register */
 380             tr = get_reg(RC_INT) | TREG_MEM;
 381         }
 382         gen_modrm64(0x8b, tr, fr, sv->sym, 0);
 383
 384         /* load from the temporal register */
 385         fr = tr | VT_LVAL;
 386     }
 387 #endif
 388
 389     v = fr & VT_VALMASK;
 390     if (fr & VT_LVAL) {
 391         int b, ll;
 392         if (v == VT_LLOCAL) {
 393             v1.type.t = VT_PTR;
 394             v1.r = VT_LOCAL | VT_LVAL;
 395             v1.c.ul = fc;
 396             fr = r;
 397             if (!(reg_classes[fr] & RC_INT))
 398                 fr = get_reg(RC_INT);
 399             load(fr, &v1);
 400         }
 401         ll = 0;
 402         if ((ft & VT_BTYPE) == VT_FLOAT) {
 403             b = 0x6e0f66;
 404             r = REG_VALUE(r); /* movd */
 405         } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
 406             b = 0x7e0ff3; /* movq */
 407             r = REG_VALUE(r);
 408         } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
 409             b = 0xdb, r = 5; /* fldt */
 410         } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
 411             b = 0xbe0f;   /* movsbl */
 412         } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
 413             b = 0xb60f;   /* movzbl */
 414         } else if ((ft & VT_TYPE) == VT_SHORT) {
 415             b = 0xbf0f;   /* movswl */
 416         } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
 417             b = 0xb70f;   /* movzwl */
 418         } else {
 419             assert(((ft & VT_BTYPE) == VT_INT) || ((ft & VT_BTYPE) == VT_LLONG)
 420                    || ((ft & VT_BTYPE) == VT_PTR) || ((ft & VT_BTYPE) == VT_ENUM)
 421                    || ((ft & VT_BTYPE) == VT_FUNC));
 422             ll = is64_type(ft);
 423             b = 0x8b;
 424         }
 425         if (ll) {
 426             gen_modrm64(b, r, fr, sv->sym, fc);
 427         } else {
 428             orex(ll, fr, r, b);
 429             gen_modrm(r, fr, sv->sym, fc);
 430         }
 431     } else {
 432         if (v == VT_CONST) {
 433             if (fr & VT_SYM) {
 434 #ifdef TCC_TARGET_PE
 435                 orex(1,0,r,0x8d);
 436                 o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 437                 gen_addrpc32(fr, sv->sym, fc);
 438 #else
 439                 if (sv->sym->type.t & VT_STATIC) {
 440                     orex(1,0,r,0x8d);
 441                     o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 442                     gen_addrpc32(fr, sv->sym, fc);
 443                 } else {
 444                     orex(1,0,r,0x8b);
 445                     o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
 446                     gen_gotpcrel(r, sv->sym, fc);
 447                 }
 448 #endif
 449             } else if (is64_type(ft)) {
 450                 orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 451                 gen_le64(sv->c.ull);
 452             } else {
 453                 orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 454                 gen_le32(fc);
 455             }
 456         } else if (v == VT_LOCAL) {
 457             orex(1,0,r,0x8d); /* lea xxx(%ebp), r */
 458             gen_modrm(r, VT_LOCAL, sv->sym, fc);
 459         } else if (v == VT_CMP) {
 460             orex(0,r,0,0);
 461             if ((fc & ~0x100) != TOK_NE)
 462               oad(0xb8 + REG_VALUE(r), 0); /* mov $0, r */
 463             else
 464               oad(0xb8 + REG_VALUE(r), 1); /* mov $1, r */
 465             if (fc & 0x100)
 466               {
 467                 /* This was a float compare.  If the parity bit is
 468                    set the result was unordered, meaning false for everything
 469                    except TOK_NE, and true for TOK_NE.  */
 470                 fc &= ~0x100;
 471                 o(0x037a + (REX_BASE(r) << 8));
 472               }
 473             orex(0,r,0, 0x0f); /* setxx %br */
 474             o(fc);
 475             o(0xc0 + REG_VALUE(r));
 476         } else if (v == VT_JMP || v == VT_JMPI) {
 477             t = v & 1;
 478             orex(0,r,0,0);
 479             oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
 480             o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
 481             gsym(fc);
 482             orex(0,r,0,0);
 483             oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
 484         } else if (v != r) {
 485             if ((r >= TREG_XMM0) && (r <= TREG_XMM7)) {
 486                 if (v == TREG_ST0) {
 487                     /* gen_cvt_ftof(VT_DOUBLE); */
 488                     o(0xf0245cdd); /* fstpl -0x10(%rsp) */
 489                     /* movsd -0x10(%rsp),%xmmN */
 490                     o(0x100ff2);
 491                     o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 492                     o(0xf024);
 493                 } else {
 494                     assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 495                     if ((ft & VT_BTYPE) == VT_FLOAT) {
 496                         o(0x100ff3);
 497                     } else {
 498                         assert((ft & VT_BTYPE) == VT_DOUBLE);
 499                         o(0x100ff2);
 500                     }
 501                     o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
 502                 }
 503             } else if (r == TREG_ST0) {
 504                 assert((v >= TREG_XMM0) || (v <= TREG_XMM7));
 505                 /* gen_cvt_ftof(VT_LDOUBLE); */
 506                 /* movsd %xmmN,-0x10(%rsp) */
 507                 o(0x110ff2);
 508                 o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 509                 o(0xf024);
 510                 o(0xf02444dd); /* fldl -0x10(%rsp) */
 511             } else {
 512                 orex(1,r,v, 0x89);
 513                 o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
 514             }
 515         }
 516     }
 517 }
 518
 519 /* store register 'r' in lvalue 'v' */
 520 void store(int r, SValue *v)
 521 {
 522     int fr, bt, ft, fc;
 523     int op64 = 0;
 524     /* store the REX prefix in this variable when PIC is enabled */
 525     int pic = 0;
 526
 527 #ifdef TCC_TARGET_PE
 528     SValue v2;
 529     v = pe_getimport(v, &v2);
 530 #endif
 531
 532     ft = v->type.t;
 533     fc = v->c.ul;
 534     fr = v->r & VT_VALMASK;
 535     bt = ft & VT_BTYPE;
 536
 537 #ifndef TCC_TARGET_PE
 538     /* we need to access the variable via got */
 539     if (fr == VT_CONST && (v->r & VT_SYM)) {
 540         /* mov xx(%rip), %r11 */
 541         o(0x1d8b4c);
 542         gen_gotpcrel(TREG_R11, v->sym, v->c.ul);
 543         pic = is64_type(bt) ? 0x49 : 0x41;
 544     }
 545 #endif
 546
 547     /* XXX: incorrect if float reg to reg */
 548     if (bt == VT_FLOAT) {
 549         o(0x66);
 550         o(pic);
 551         o(0x7e0f); /* movd */
 552         r = REG_VALUE(r);
 553     } else if (bt == VT_DOUBLE) {
 554         o(0x66);
 555         o(pic);
 556         o(0xd60f); /* movq */
 557         r = REG_VALUE(r);
 558     } else if (bt == VT_LDOUBLE) {
 559         o(0xc0d9); /* fld %st(0) */
 560         o(pic);
 561         o(0xdb); /* fstpt */
 562         r = 7;
 563     } else {
 564         if (bt == VT_SHORT)
 565             o(0x66);
 566         o(pic);
 567         if (bt == VT_BYTE || bt == VT_BOOL)
 568             orex(0, 0, r, 0x88);
 569         else if (is64_type(bt))
 570             op64 = 0x89;
 571         else
 572             orex(0, 0, r, 0x89);
 573     }
 574     if (pic) {
 575         /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
 576         if (op64)
 577             o(op64);
 578         o(3 + (r << 3));
 579     } else if (op64) {
 580         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 581             gen_modrm64(op64, r, v->r, v->sym, fc);
 582         } else if (fr != r) {
 583             /* XXX: don't we really come here? */
 584             abort();
 585             o(0xc0 + fr + r * 8); /* mov r, fr */
 586         }
 587     } else {
 588         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 589             gen_modrm(r, v->r, v->sym, fc);
 590         } else if (fr != r) {
 591             /* XXX: don't we really come here? */
 592             abort();
 593             o(0xc0 + fr + r * 8); /* mov r, fr */
 594         }
 595     }
 596 }
 597
 598 /* 'is_jmp' is '1' if it is a jump */
 599 static void gcall_or_jmp(int is_jmp)
 600 {
 601     int r;
 602     if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
 603         /* constant case */
 604         if (vtop->r & VT_SYM) {
 605             /* relocation case */
 606             greloc(cur_text_section, vtop->sym,
 607                    ind + 1, R_X86_64_PC32);
 608         } else {
 609             /* put an empty PC32 relocation */
 610             put_elf_reloc(symtab_section, cur_text_section,
 611                           ind + 1, R_X86_64_PC32, 0);
 612         }
 613         oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */
 614     } else {
 615         /* otherwise, indirect call */
 616         r = TREG_R11;
 617         load(r, vtop);
 618         o(0x41); /* REX */
 619         o(0xff); /* call/jmp *r */
 620         o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
 621     }
 622 }
 623
 624 #ifdef TCC_TARGET_PE
 625
 626 #define REGN 4
 627 static const uint8_t arg_regs[REGN] = {
 628     TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
 629 };
 630
 631 /* Prepare arguments in R10 and R11 rather than RCX and RDX
 632    because gv() will not ever use these */
 633 static int arg_prepare_reg(int idx) {
 634   if (idx == 0 || idx == 1)
 635       /* idx=0: r10, idx=1: r11 */
 636       return idx + 10;
 637   else
 638       return arg_regs[idx];
 639 }
 640
 641 static int func_scratch;
 642
 643 /* Generate function call. The function address is pushed first, then
 644    all the parameters in call order. This functions pops all the
 645    parameters and the function address. */
 646
 647 void gen_offs_sp(int b, int r, int d)
 648 {
 649     orex(1,0,r & 0x100 ? 0 : r, b);
 650     if (d == (char)d) {
 651         o(0x2444 | (REG_VALUE(r) << 3));
 652         g(d);
 653     } else {
 654         o(0x2484 | (REG_VALUE(r) << 3));
 655         gen_le32(d);
 656     }
 657 }
 658
 659 /* Return the number of registers needed to return the struct, or 0 if
 660    returning via struct pointer. */
 661 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align)
 662 {
 663     int size, align;
 664     *ret_align = 1; // Never have to re-align return values for x86-64
 665     size = type_size(vt, &align);
 666     ret->ref = NULL;
 667     if (size > 8) {
 668         return 0;
 669     } else if (size > 4) {
 670         ret->t = VT_LLONG;
 671         return 1;
 672     } else if (size > 2) {
 673         ret->t = VT_INT;
 674         return 1;
 675     } else if (size > 1) {
 676         ret->t = VT_SHORT;
 677         return 1;
 678     } else {
 679         ret->t = VT_BYTE;
 680         return 1;
 681     }
 682 }
 683
 684 static int is_sse_float(int t) {
 685     int bt;
 686     bt = t & VT_BTYPE;
 687     return bt == VT_DOUBLE || bt == VT_FLOAT;
 688 }
 689
 690 int gfunc_arg_size(CType *type) {
 691     int align;
 692     if (type->t & (VT_ARRAY|VT_BITFIELD))
 693         return 8;
 694     return type_size(type, &align);
 695 }
 696
 697 void gfunc_call(int nb_args)
 698 {
 699     int size, r, args_size, i, d, bt, struct_size;
 700     int arg;
 701
 702     args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
 703     arg = nb_args;
 704
 705     /* for struct arguments, we need to call memcpy and the function
 706        call breaks register passing arguments we are preparing.
 707        So, we process arguments which will be passed by stack first. */
 708     struct_size = args_size;
 709     for(i = 0; i < nb_args; i++) {
 710         SValue *sv;
 711
 712         --arg;
 713         sv = &vtop[-i];
 714         bt = (sv->type.t & VT_BTYPE);
 715         size = gfunc_arg_size(&sv->type);
 716
 717         if (size <= 8)
 718             continue; /* arguments smaller than 8 bytes passed in registers or on stack */
 719
 720         if (bt == VT_STRUCT) {
 721             /* align to stack align size */
 722             size = (size + 15) & ~15;
 723             /* generate structure store */
 724             r = get_reg(RC_INT);
 725             gen_offs_sp(0x8d, r, struct_size);
 726             struct_size += size;
 727
 728             /* generate memcpy call */
 729             vset(&sv->type, r | VT_LVAL, 0);
 730             vpushv(sv);
 731             vstore();
 732             --vtop;
 733         } else if (bt == VT_LDOUBLE) {
 734             gv(RC_ST0);
 735             gen_offs_sp(0xdb, 0x107, struct_size);
 736             struct_size += 16;
 737         }
 738     }
 739
 740     if (func_scratch < struct_size)
 741         func_scratch = struct_size;
 742
 743     arg = nb_args;
 744     struct_size = args_size;
 745
 746     for(i = 0; i < nb_args; i++) {
 747         --arg;
 748         bt = (vtop->type.t & VT_BTYPE);
 749
 750         size = gfunc_arg_size(&vtop->type);
 751         if (size > 8) {
 752             /* align to stack align size */
 753             size = (size + 15) & ~15;
 754             if (arg >= REGN) {
 755                 d = get_reg(RC_INT);
 756                 gen_offs_sp(0x8d, d, struct_size);
 757                 gen_offs_sp(0x89, d, arg*8);
 758             } else {
 759                 d = arg_prepare_reg(arg);
 760                 gen_offs_sp(0x8d, d, struct_size);
 761             }
 762             struct_size += size;
 763         } else {
 764             if (is_sse_float(vtop->type.t)) {
 765                 gv(RC_XMM0); /* only use one float register */
 766                 if (arg >= REGN) {
 767                     /* movq %xmm0, j*8(%rsp) */
 768                     gen_offs_sp(0xd60f66, 0x100, arg*8);
 769                 } else {
 770                     /* movaps %xmm0, %xmmN */
 771                     o(0x280f);
 772                     o(0xc0 + (arg << 3));
 773                     d = arg_prepare_reg(arg);
 774                     /* mov %xmm0, %rxx */
 775                     o(0x66);
 776                     orex(1,d,0, 0x7e0f);
 777                     o(0xc0 + REG_VALUE(d));
 778                 }
 779             } else {
 780                 if (bt == VT_STRUCT) {
 781                     vtop->type.ref = NULL;
 782                     vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
 783                         : size > 1 ? VT_SHORT : VT_BYTE;
 784                 }
 785
 786                 r = gv(RC_INT);
 787                 if (arg >= REGN) {
 788                     gen_offs_sp(0x89, r, arg*8);
 789                 } else {
 790                     d = arg_prepare_reg(arg);
 791                     orex(1,d,r,0x89); /* mov */
 792                     o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
 793                 }
 794             }
 795         }
 796         vtop--;
 797     }
 798     save_regs(0);
 799
 800     /* Copy R10 and R11 into RCX and RDX, respectively */
 801     if (nb_args > 0) {
 802         o(0xd1894c); /* mov %r10, %rcx */
 803         if (nb_args > 1) {
 804             o(0xda894c); /* mov %r11, %rdx */
 805         }
 806     }
 807
 808     gcall_or_jmp(0);
 809     vtop--;
 810 }
 811
 812
 813 #define FUNC_PROLOG_SIZE 11
 814
 815 /* generate function prolog of type 't' */
 816 void gfunc_prolog(CType *func_type)
 817 {
 818     int addr, reg_param_index, bt, size;
 819     Sym *sym;
 820     CType *type;
 821
 822     func_ret_sub = 0;
 823     func_scratch = 0;
 824     loc = 0;
 825
 826     addr = PTR_SIZE * 2;
 827     ind += FUNC_PROLOG_SIZE;
 828     func_sub_sp_offset = ind;
 829     reg_param_index = 0;
 830
 831     sym = func_type->ref;
 832
 833     /* if the function returns a structure, then add an
 834        implicit pointer parameter */
 835     func_vt = sym->type;
 836     func_var = (sym->c == FUNC_ELLIPSIS);
 837     size = gfunc_arg_size(&func_vt);
 838     if (size > 8) {
 839         gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 840         func_vc = addr;
 841         reg_param_index++;
 842         addr += 8;
 843     }
 844
 845     /* define parameters */
 846     while ((sym = sym->next) != NULL) {
 847         type = &sym->type;
 848         bt = type->t & VT_BTYPE;
 849         size = gfunc_arg_size(type);
 850         if (size > 8) {
 851             if (reg_param_index < REGN) {
 852                 gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 853             }
 854             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL | VT_REF, addr);
 855         } else {
 856             if (reg_param_index < REGN) {
 857                 /* save arguments passed by register */
 858                 if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
 859                     o(0xd60f66); /* movq */
 860                     gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
 861                 } else {
 862                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 863                 }
 864             }
 865             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL, addr);
 866         }
 867         addr += 8;
 868         reg_param_index++;
 869     }
 870
 871     while (reg_param_index < REGN) {
 872         if (func_type->ref->c == FUNC_ELLIPSIS) {
 873             gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 874             addr += 8;
 875         }
 876         reg_param_index++;
 877     }
 878 }
 879
 880 /* generate function epilog */
 881 void gfunc_epilog(void)
 882 {
 883     int v, saved_ind;
 884
 885     o(0xc9); /* leave */
 886     if (func_ret_sub == 0) {
 887         o(0xc3); /* ret */
 888     } else {
 889         o(0xc2); /* ret n */
 890         g(func_ret_sub);
 891         g(func_ret_sub >> 8);
 892     }
 893
 894     saved_ind = ind;
 895     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
 896     /* align local size to word & save local variables */
 897     v = (func_scratch + -loc + 15) & -16;
 898
 899     if (v >= 4096) {
 900         Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0);
 901         oad(0xb8, v); /* mov stacksize, %eax */
 902         oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
 903         greloc(cur_text_section, sym, ind-4, R_X86_64_PC32);
 904         o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
 905     } else {
 906         o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
 907         o(0xec8148);  /* sub rsp, stacksize */
 908         gen_le32(v);
 909     }
 910
 911     cur_text_section->data_offset = saved_ind;
 912     pe_add_unwind_data(ind, saved_ind, v);
 913     ind = cur_text_section->data_offset;
 914 }
 915
 916 #else
 917
 918 static void gadd_sp(int val)
 919 {
 920     if (val == (char)val) {
 921         o(0xc48348);
 922         g(val);
 923     } else {
 924         oad(0xc48148, val); /* add $xxx, %rsp */
 925     }
 926 }
 927
 928 typedef enum X86_64_Mode {
 929   x86_64_mode_none,
 930   x86_64_mode_memory,
 931   x86_64_mode_integer,
 932   x86_64_mode_sse,
 933   x86_64_mode_x87
 934 } X86_64_Mode;
 935
 936 static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b) {
 937     if (a == b)
 938         return a;
 939     else if (a == x86_64_mode_none)
 940         return b;
 941     else if (b == x86_64_mode_none)
 942         return a;
 943     else if ((a == x86_64_mode_memory) || (b == x86_64_mode_memory))
 944         return x86_64_mode_memory;
 945     else if ((a == x86_64_mode_integer) || (b == x86_64_mode_integer))
 946         return x86_64_mode_integer;
 947     else if ((a == x86_64_mode_x87) || (b == x86_64_mode_x87))
 948         return x86_64_mode_memory;
 949     else
 950         return x86_64_mode_sse;
 951 }
 952
 953 static X86_64_Mode classify_x86_64_inner(CType *ty) {
 954     X86_64_Mode mode;
 955     Sym *f;
 956
 957     switch (ty->t & VT_BTYPE) {
 958     case VT_VOID: return x86_64_mode_none;
 959
 960     case VT_INT:
 961     case VT_BYTE:
 962     case VT_SHORT:
 963     case VT_LLONG:
 964     case VT_BOOL:
 965     case VT_PTR:
 966     case VT_FUNC:
 967     case VT_ENUM: return x86_64_mode_integer;
 968
 969     case VT_FLOAT:
 970     case VT_DOUBLE: return x86_64_mode_sse;
 971
 972     case VT_LDOUBLE: return x86_64_mode_x87;
 973
 974     case VT_STRUCT:
 975         f = ty->ref;
 976
 977         // Detect union
 978         if (f->next && (f->c == f->next->c))
 979           return x86_64_mode_memory;
 980
 981         mode = x86_64_mode_none;
 982         for (; f; f = f->next)
 983             mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
 984
 985         return mode;
 986     }
 987
 988     assert(0);
 989 }
 990
 991 static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count) {
 992     X86_64_Mode mode;
 993     int size, align, ret_t = 0;
 994
 995     if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
 996         *psize = 8;
 997         *palign = 8;
 998         *reg_count = 1;
 999         ret_t = ty->t;
1000         mode = x86_64_mode_integer;
1001     } else {
1002         size = type_size(ty, &align);
1003         *psize = (size + 7) & ~7;
1004         *palign = (align + 7) & ~7;
1005
1006         if (size > 16) {
1007             mode = x86_64_mode_memory;
1008         } else {
1009             mode = classify_x86_64_inner(ty);
1010             switch (mode) {
1011             case x86_64_mode_integer:
1012                 if (size > 8) {
1013                     *reg_count = 2;
1014                     ret_t = VT_QLONG;
1015                 } else {
1016                     *reg_count = 1;
1017                     ret_t = (size > 4) ? VT_LLONG : VT_INT;
1018                 }
1019                 break;
1020
1021             case x86_64_mode_x87:
1022                 *reg_count = 1;
1023                 ret_t = VT_LDOUBLE;
1024                 break;
1025
1026             case x86_64_mode_sse:
1027                 if (size > 8) {
1028                     *reg_count = 2;
1029                     ret_t = VT_QFLOAT;
1030                 } else {
1031                     *reg_count = 1;
1032                     ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
1033                 }
1034                 break;
1035             default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none*/
1036             }
1037         }
1038     }
1039
1040     if (ret) {
1041         ret->ref = NULL;
1042         ret->t = ret_t;
1043     }
1044
1045     return mode;
1046 }
1047
1048 ST_FUNC int classify_x86_64_va_arg(CType *ty) {
1049     /* This definition must be synced with stdarg.h */
1050     enum __va_arg_type {
1051         __va_gen_reg, __va_float_reg, __va_stack
1052     };
1053     int size, align, reg_count;
1054     X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, &reg_count);
1055     switch (mode) {
1056     default: return __va_stack;
1057     case x86_64_mode_integer: return __va_gen_reg;
1058     case x86_64_mode_sse: return __va_float_reg;
1059     }
1060 }
1061
1062 /* Return the number of registers needed to return the struct, or 0 if
1063    returning via struct pointer. */
1064 int gfunc_sret(CType *vt, CType *ret, int *ret_align) {
1065     int size, align, reg_count;
1066     *ret_align = 1; // Never have to re-align return values for x86-64
1067     return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
1068 }
1069
1070 #define REGN 6
1071 static const uint8_t arg_regs[REGN] = {
1072     TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
1073 };
1074
1075 static int arg_prepare_reg(int idx) {
1076   if (idx == 2 || idx == 3)
1077       /* idx=2: r10, idx=3: r11 */
1078       return idx + 8;
1079   else
1080       return arg_regs[idx];
1081 }
1082
1083 /* Generate function call. The function address is pushed first, then
1084    all the parameters in call order. This functions pops all the
1085    parameters and the function address. */
1086 void gfunc_call(int nb_args)
1087 {
1088     X86_64_Mode mode;
1089     CType type;
1090     int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
1091     int nb_reg_args = 0;
1092     int nb_sse_args = 0;
1093     int sse_reg, gen_reg;
1094
1095     /* calculate the number of integer/float register arguments */
1096     for(i = 0; i < nb_args; i++) {
1097         mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1098         if (mode == x86_64_mode_sse)
1099             nb_sse_args += reg_count;
1100         else if (mode == x86_64_mode_integer)
1101             nb_reg_args += reg_count;
1102     }
1103
1104     /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
1105        and ended by a 16-byte aligned argument. This is because, from the point of view of
1106        the callee, argument alignment is computed from the bottom up. */
1107     /* for struct arguments, we need to call memcpy and the function
1108        call breaks register passing arguments we are preparing.
1109        So, we process arguments which will be passed by stack first. */
1110     gen_reg = nb_reg_args;
1111     sse_reg = nb_sse_args;
1112     run_start = 0;
1113     args_size = 0;
1114     while (run_start != nb_args) {
1115         int run_gen_reg = gen_reg, run_sse_reg = sse_reg;
1116
1117         run_end = nb_args;
1118         stack_adjust = 0;
1119         for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
1120             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1121             switch (mode) {
1122             case x86_64_mode_memory:
1123             case x86_64_mode_x87:
1124             stack_arg:
1125                 if (align == 16)
1126                     run_end = i;
1127                 else
1128                     stack_adjust += size;
1129                 break;
1130
1131             case x86_64_mode_sse:
1132                 sse_reg -= reg_count;
1133                 if (sse_reg + reg_count > 8) goto stack_arg;
1134                 break;
1135
1136             case x86_64_mode_integer:
1137                 gen_reg -= reg_count;
1138                 if (gen_reg + reg_count > REGN) goto stack_arg;
1139                 break;
1140             default: break; /* nothing to be done for x86_64_mode_none */
1141             }
1142         }
1143
1144         gen_reg = run_gen_reg;
1145         sse_reg = run_sse_reg;
1146
1147         /* adjust stack to align SSE boundary */
1148         if (stack_adjust &= 15) {
1149             /* fetch cpu flag before the following sub will change the value */
1150             if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
1151                 gv(RC_INT);
1152
1153             stack_adjust = 16 - stack_adjust;
1154             o(0x48);
1155             oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
1156             args_size += stack_adjust;
1157         }
1158
1159         for(i = run_start; i < run_end;) {
1160             /* Swap argument to top, it will possibly be changed here,
1161               and might use more temps. At the end of the loop we keep
1162               in on the stack and swap it back to its original position
1163               if it is a register. */
1164             SValue tmp = vtop[0];
1165             vtop[0] = vtop[-i];
1166             vtop[-i] = tmp;
1167
1168             mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);
1169
1170             int arg_stored = 1;
1171             switch (vtop->type.t & VT_BTYPE) {
1172             case VT_STRUCT:
1173                 if (mode == x86_64_mode_sse) {
1174                     if (sse_reg > 8)
1175                         sse_reg -= reg_count;
1176                     else
1177                         arg_stored = 0;
1178                 } else if (mode == x86_64_mode_integer) {
1179                     if (gen_reg > REGN)
1180                         gen_reg -= reg_count;
1181                     else
1182                         arg_stored = 0;
1183                 }
1184
1185                 if (arg_stored) {
1186                     /* allocate the necessary size on stack */
1187                     o(0x48);
1188                     oad(0xec81, size); /* sub $xxx, %rsp */
1189                     /* generate structure store */
1190                     r = get_reg(RC_INT);
1191                     orex(1, r, 0, 0x89); /* mov %rsp, r */
1192                     o(0xe0 + REG_VALUE(r));
1193                     vset(&vtop->type, r | VT_LVAL, 0);
1194                     vswap();
1195                     vstore();
1196                     args_size += size;
1197                 }
1198                 break;
1199
1200             case VT_LDOUBLE:
1201                 assert(0);
1202                 break;
1203
1204             case VT_FLOAT:
1205             case VT_DOUBLE:
1206                 assert(mode == x86_64_mode_sse);
1207                 if (sse_reg > 8) {
1208                     --sse_reg;
1209                     r = gv(RC_FLOAT);
1210                     o(0x50); /* push $rax */
1211                     /* movq %xmmN, (%rsp) */
1212                     o(0xd60f66);
1213                     o(0x04 + REG_VALUE(r)*8);
1214                     o(0x24);
1215                     args_size += size;
1216                 } else {
1217                     arg_stored = 0;
1218                 }
1219                 break;
1220
1221             default:
1222                 assert(mode == x86_64_mode_integer);
1223                 /* simple type */
1224                 /* XXX: implicit cast ? */
1225                 if (gen_reg > REGN) {
1226                     --gen_reg;
1227                     r = gv(RC_INT);
1228                     orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
1229                     args_size += size;
1230                 } else {
1231                     arg_stored = 0;
1232                 }
1233                 break;
1234             }
1235
1236             /* And swap the argument back to it's original position.  */
1237             tmp = vtop[0];
1238             vtop[0] = vtop[-i];
1239             vtop[-i] = tmp;
1240
1241             if (arg_stored) {
1242               vrotb(i+1);
1243               assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
1244               vpop();
1245               --nb_args;
1246               --run_end;
1247             } else {
1248               ++i;
1249             }
1250         }
1251
1252         /* handle 16 byte aligned arguments at end of run */
1253         run_start = i = run_end;
1254         while (i < nb_args) {
1255             /* Rotate argument to top since it will always be popped */
1256             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1257             if (align != 16)
1258               break;
1259
1260             vrotb(i+1);
1261
1262             if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1263                 gv(RC_ST0);
1264                 oad(0xec8148, size); /* sub $xxx, %rsp */
1265                 o(0x7cdb); /* fstpt 0(%rsp) */
1266                 g(0x24);
1267                 g(0x00);
1268                 args_size += size;
1269             } else {
1270                 assert(mode == x86_64_mode_memory);
1271
1272                 /* allocate the necessary size on stack */
1273                 o(0x48);
1274                 oad(0xec81, size); /* sub $xxx, %rsp */
1275                 /* generate structure store */
1276                 r = get_reg(RC_INT);
1277                 orex(1, r, 0, 0x89); /* mov %rsp, r */
1278                 o(0xe0 + REG_VALUE(r));
1279                 vset(&vtop->type, r | VT_LVAL, 0);
1280                 vswap();
1281                 vstore();
1282                 args_size += size;
1283             }
1284
1285             vpop();
1286             --nb_args;
1287         }
1288     }
1289
1290     /* XXX This should be superfluous.  */
1291     save_regs(0); /* save used temporary registers */
1292
1293     /* then, we prepare register passing arguments.
1294        Note that we cannot set RDX and RCX in this loop because gv()
1295        may break these temporary registers. Let's use R10 and R11
1296        instead of them */
1297     assert(gen_reg <= REGN);
1298     assert(sse_reg <= 8);
1299     for(i = 0; i < nb_args; i++) {
1300         mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
1301         /* Alter stack entry type so that gv() knows how to treat it */
1302         vtop->type = type;
1303         if (mode == x86_64_mode_sse) {
1304             if (reg_count == 2) {
1305                 sse_reg -= 2;
1306                 gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
1307                 if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
1308                     /* movaps %xmm0, %xmmN */
1309                     o(0x280f);
1310                     o(0xc0 + (sse_reg << 3));
1311                     /* movaps %xmm1, %xmmN */
1312                     o(0x280f);
1313                     o(0xc1 + ((sse_reg+1) << 3));
1314                 }
1315             } else {
1316                 assert(reg_count == 1);
1317                 --sse_reg;
1318                 /* Load directly to register */
1319                 gv(RC_XMM0 << sse_reg);
1320             }
1321         } else if (mode == x86_64_mode_integer) {
1322             /* simple type */
1323             /* XXX: implicit cast ? */
1324             gen_reg -= reg_count;
1325             r = gv(RC_INT);
1326             int d = arg_prepare_reg(gen_reg);
1327             orex(1,d,r,0x89); /* mov */
1328             o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
1329             if (reg_count == 2) {
1330                 d = arg_prepare_reg(gen_reg+1);
1331                 orex(1,d,vtop->r2,0x89); /* mov */
1332                 o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
1333             }
1334         }
1335         vtop--;
1336     }
1337     assert(gen_reg == 0);
1338     assert(sse_reg == 0);
1339
1340     /* We shouldn't have many operands on the stack anymore, but the
1341        call address itself is still there, and it might be in %eax
1342        (or edx/ecx) currently, which the below writes would clobber.
1343        So evict all remaining operands here.  */
1344     save_regs(0);
1345
1346     /* Copy R10 and R11 into RDX and RCX, respectively */
1347     if (nb_reg_args > 2) {
1348         o(0xd2894c); /* mov %r10, %rdx */
1349         if (nb_reg_args > 3) {
1350             o(0xd9894c); /* mov %r11, %rcx */
1351         }
1352     }
1353
1354     oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
1355     gcall_or_jmp(0);
1356     if (args_size)
1357         gadd_sp(args_size);
1358     vtop--;
1359 }
1360
1361
1362 #define FUNC_PROLOG_SIZE 11
1363
1364 static void push_arg_reg(int i) {
1365     loc -= 8;
1366     gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
1367 }
1368
1369 /* generate function prolog of type 't' */
1370 void gfunc_prolog(CType *func_type)
1371 {
1372     X86_64_Mode mode;
1373     int i, addr, align, size, reg_count;
1374     int param_addr = 0, reg_param_index, sse_param_index;
1375     Sym *sym;
1376     CType *type;
1377
1378     sym = func_type->ref;
1379     addr = PTR_SIZE * 2;
1380     loc = 0;
1381     ind += FUNC_PROLOG_SIZE;
1382     func_sub_sp_offset = ind;
1383     func_ret_sub = 0;
1384
1385     if (func_type->ref->c == FUNC_ELLIPSIS) {
1386         int seen_reg_num, seen_sse_num, seen_stack_size;
1387         seen_reg_num = seen_sse_num = 0;
1388         /* frame pointer and return address */
1389         seen_stack_size = PTR_SIZE * 2;
1390         /* count the number of seen parameters */
1391         sym = func_type->ref;
1392         while ((sym = sym->next) != NULL) {
1393             type = &sym->type;
1394             mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1395             switch (mode) {
1396             default:
1397             stack_arg:
1398                 seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
1399                 break;
1400
1401             case x86_64_mode_integer:
1402                 if (seen_reg_num + reg_count <= 8) {
1403                     seen_reg_num += reg_count;
1404                 } else {
1405                     seen_reg_num = 8;
1406                     goto stack_arg;
1407                 }
1408                 break;
1409
1410             case x86_64_mode_sse:
1411                 if (seen_sse_num + reg_count <= 8) {
1412                     seen_sse_num += reg_count;
1413                 } else {
1414                     seen_sse_num = 8;
1415                     goto stack_arg;
1416                 }
1417                 break;
1418             }
1419         }
1420
1421         loc -= 16;
1422         /* movl $0x????????, -0x10(%rbp) */
1423         o(0xf045c7);
1424         gen_le32(seen_reg_num * 8);
1425         /* movl $0x????????, -0xc(%rbp) */
1426         o(0xf445c7);
1427         gen_le32(seen_sse_num * 16 + 48);
1428         /* movl $0x????????, -0x8(%rbp) */
1429         o(0xf845c7);
1430         gen_le32(seen_stack_size);
1431
1432         /* save all register passing arguments */
1433         for (i = 0; i < 8; i++) {
1434             loc -= 16;
1435             o(0xd60f66); /* movq */
1436             gen_modrm(7 - i, VT_LOCAL, NULL, loc);
1437             /* movq $0, loc+8(%rbp) */
1438             o(0x85c748);
1439             gen_le32(loc + 8);
1440             gen_le32(0);
1441         }
1442         for (i = 0; i < REGN; i++) {
1443             push_arg_reg(REGN-1-i);
1444         }
1445     }
1446
1447     sym = func_type->ref;
1448     reg_param_index = 0;
1449     sse_param_index = 0;
1450
1451     /* if the function returns a structure, then add an
1452        implicit pointer parameter */
1453     func_vt = sym->type;
1454     mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
1455     if (mode == x86_64_mode_memory) {
1456         push_arg_reg(reg_param_index);
1457         func_vc = loc;
1458         reg_param_index++;
1459     }
1460     /* define parameters */
1461     while ((sym = sym->next) != NULL) {
1462         type = &sym->type;
1463         mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1464         switch (mode) {
1465         case x86_64_mode_sse:
1466             if (sse_param_index + reg_count <= 8) {
1467                 /* save arguments passed by register */
1468                 loc -= reg_count * 8;
1469                 param_addr = loc;
1470                 for (i = 0; i < reg_count; ++i) {
1471                     o(0xd60f66); /* movq */
1472                     gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
1473                     ++sse_param_index;
1474                 }
1475             } else {
1476                 addr = (addr + align - 1) & -align;
1477                 param_addr = addr;
1478                 addr += size;
1479                 sse_param_index += reg_count;
1480             }
1481             break;
1482
1483         case x86_64_mode_memory:
1484         case x86_64_mode_x87:
1485             addr = (addr + align - 1) & -align;
1486             param_addr = addr;
1487             addr += size;
1488             break;
1489
1490         case x86_64_mode_integer: {
1491             if (reg_param_index + reg_count <= REGN) {
1492                 /* save arguments passed by register */
1493                 loc -= reg_count * 8;
1494                 param_addr = loc;
1495                 for (i = 0; i < reg_count; ++i) {
1496                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
1497                     ++reg_param_index;
1498                 }
1499             } else {
1500                 addr = (addr + align - 1) & -align;
1501                 param_addr = addr;
1502                 addr += size;
1503                 reg_param_index += reg_count;
1504             }
1505             break;
1506         }
1507         default: break; /* nothing to be done for x86_64_mode_none */
1508         }
1509         sym_push(sym->v & ~SYM_FIELD, type,
1510                  VT_LOCAL | VT_LVAL, param_addr);
1511     }
1512 }
1513
1514 /* generate function epilog */
1515 void gfunc_epilog(void)
1516 {
1517     int v, saved_ind;
1518
1519     o(0xc9); /* leave */
1520     if (func_ret_sub == 0) {
1521         o(0xc3); /* ret */
1522     } else {
1523         o(0xc2); /* ret n */
1524         g(func_ret_sub);
1525         g(func_ret_sub >> 8);
1526     }
1527     /* align local size to word & save local variables */
1528     v = (-loc + 15) & -16;
1529     saved_ind = ind;
1530     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
1531     o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
1532     o(0xec8148);  /* sub rsp, stacksize */
1533     gen_le32(v);
1534     ind = saved_ind;
1535 }
1536
1537 #endif /* not PE */
1538
1539 /* generate a jump to a label */
1540 int gjmp(int t)
1541 {
1542     return psym(0xe9, t);
1543 }
1544
1545 /* generate a jump to a fixed address */
1546 void gjmp_addr(int a)
1547 {
1548     int r;
1549     r = a - ind - 2;
1550     if (r == (char)r) {
1551         g(0xeb);
1552         g(r);
1553     } else {
1554         oad(0xe9, a - ind - 5);
1555     }
1556 }
1557
1558 /* generate a test. set 'inv' to invert test. Stack entry is popped */
1559 int gtst(int inv, int t)
1560 {
1561     int v, *p;
1562
1563     v = vtop->r & VT_VALMASK;
1564     if (v == VT_CMP) {
1565         /* fast case : can jump directly since flags are set */
1566         if (vtop->c.i & 0x100)
1567           {
1568             /* This was a float compare.  If the parity flag is set
1569                the result was unordered.  For anything except != this
1570                means false and we don't jump (anding both conditions).
1571                For != this means true (oring both).
1572                Take care about inverting the test.  We need to jump
1573                to our target if the result was unordered and test wasn't NE,
1574                otherwise if unordered we don't want to jump.  */
1575             vtop->c.i &= ~0x100;
1576             if (!inv == (vtop->c.i != TOK_NE))
1577               o(0x067a);  /* jp +6 */
1578             else
1579               {
1580                 g(0x0f);
1581                 t = psym(0x8a, t); /* jp t */
1582               }
1583           }
1584         g(0x0f);
1585         t = psym((vtop->c.i - 16) ^ inv, t);
1586     } else { /* VT_JMP || VT_JMPI */
1587         /* && or || optimization */
1588         if ((v & 1) == inv) {
1589             /* insert vtop->c jump list in t */
1590             p = &vtop->c.i;
1591             while (*p != 0)
1592                 p = (int *)(cur_text_section->data + *p);
1593             *p = t;
1594             t = vtop->c.i;
1595         } else {
1596             t = gjmp(t);
1597             gsym(vtop->c.i);
1598         }
1599     }
1600     vtop--;
1601     return t;
1602 }
1603
1604 /* generate an integer binary operation */
1605 void gen_opi(int op)
1606 {
1607     int r, fr, opc, c;
1608     int ll, uu, cc;
1609
1610     ll = is64_type(vtop[-1].type.t);
1611     uu = (vtop[-1].type.t & VT_UNSIGNED) != 0;
1612     cc = (vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
1613
1614     switch(op) {
1615     case '+':
1616     case TOK_ADDC1: /* add with carry generation */
1617         opc = 0;
1618     gen_op8:
1619         if (cc && (!ll || (int)vtop->c.ll == vtop->c.ll)) {
1620             /* constant case */
1621             vswap();
1622             r = gv(RC_INT);
1623             vswap();
1624             c = vtop->c.i;
1625             if (c == (char)c) {
1626                 /* XXX: generate inc and dec for smaller code ? */
1627                 orex(ll, r, 0, 0x83);
1628                 o(0xc0 | (opc << 3) | REG_VALUE(r));
1629                 g(c);
1630             } else {
1631                 orex(ll, r, 0, 0x81);
1632                 oad(0xc0 | (opc << 3) | REG_VALUE(r), c);
1633             }
1634         } else {
1635             gv2(RC_INT, RC_INT);
1636             r = vtop[-1].r;
1637             fr = vtop[0].r;
1638             orex(ll, r, fr, (opc << 3) | 0x01);
1639             o(0xc0 + REG_VALUE(r) + REG_VALUE(fr) * 8);
1640         }
1641         vtop--;
1642         if (op >= TOK_ULT && op <= TOK_GT) {
1643             vtop->r = VT_CMP;
1644             vtop->c.i = op;
1645         }
1646         break;
1647     case '-':
1648     case TOK_SUBC1: /* sub with carry generation */
1649         opc = 5;
1650         goto gen_op8;
1651     case TOK_ADDC2: /* add with carry use */
1652         opc = 2;
1653         goto gen_op8;
1654     case TOK_SUBC2: /* sub with carry use */
1655         opc = 3;
1656         goto gen_op8;
1657     case '&':
1658         opc = 4;
1659         goto gen_op8;
1660     case '^':
1661         opc = 6;
1662         goto gen_op8;
1663     case '|':
1664         opc = 1;
1665         goto gen_op8;
1666     case '*':
1667         gv2(RC_INT, RC_INT);
1668         r = vtop[-1].r;
1669         fr = vtop[0].r;
1670         orex(ll, fr, r, 0xaf0f); /* imul fr, r */
1671         o(0xc0 + REG_VALUE(fr) + REG_VALUE(r) * 8);
1672         vtop--;
1673         break;
1674     case TOK_SHL:
1675         opc = 4;
1676         goto gen_shift;
1677     case TOK_SHR:
1678         opc = 5;
1679         goto gen_shift;
1680     case TOK_SAR:
1681         opc = 7;
1682     gen_shift:
1683         opc = 0xc0 | (opc << 3);
1684         if (cc) {
1685             /* constant case */
1686             vswap();
1687             r = gv(RC_INT);
1688             vswap();
1689             orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
1690             o(opc | REG_VALUE(r));
1691             g(vtop->c.i & (ll ? 63 : 31));
1692         } else {
1693             /* we generate the shift in ecx */
1694             gv2(RC_INT, RC_RCX);
1695             r = vtop[-1].r;
1696             orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
1697             o(opc | REG_VALUE(r));
1698         }
1699         vtop--;
1700         break;
1701     case TOK_UDIV:
1702     case TOK_UMOD:
1703         uu = 1;
1704         goto divmod;
1705     case '/':
1706     case '%':
1707     case TOK_PDIV:
1708         uu = 0;
1709     divmod:
1710         /* first operand must be in eax */
1711         /* XXX: need better constraint for second operand */
1712         gv2(RC_RAX, RC_RCX);
1713         r = vtop[-1].r;
1714         fr = vtop[0].r;
1715         vtop--;
1716         save_reg(TREG_RDX);
1717         orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cqto */
1718         orex(ll, fr, 0, 0xf7); /* div fr, %eax */
1719         o((uu ? 0xf0 : 0xf8) + REG_VALUE(fr));
1720         if (op == '%' || op == TOK_UMOD)
1721             r = TREG_RDX;
1722         else
1723             r = TREG_RAX;
1724         vtop->r = r;
1725         break;
1726     default:
1727         opc = 7;
1728         goto gen_op8;
1729     }
1730 }
1731
1732 void gen_opl(int op)
1733 {
1734     gen_opi(op);
1735 }
1736
1737 /* generate a floating point operation 'v = t1 op t2' instruction. The
1738    two operands are guaranted to have the same floating point type */
1739 /* XXX: need to use ST1 too */
1740 void gen_opf(int op)
1741 {
1742     int a, ft, fc, swapped, r;
1743     int float_type =
1744         (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;
1745
1746     /* convert constants to memory references */
1747     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
1748         vswap();
1749         gv(float_type);
1750         vswap();
1751     }
1752     if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
1753         gv(float_type);
1754
1755     /* must put at least one value in the floating point register */
1756     if ((vtop[-1].r & VT_LVAL) &&
1757         (vtop[0].r & VT_LVAL)) {
1758         vswap();
1759         gv(float_type);
1760         vswap();
1761     }
1762     swapped = 0;
1763     /* swap the stack if needed so that t1 is the register and t2 is
1764        the memory reference */
1765     if (vtop[-1].r & VT_LVAL) {
1766         vswap();
1767         swapped = 1;
1768     }
1769     if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1770         if (op >= TOK_ULT && op <= TOK_GT) {
1771             /* load on stack second operand */
1772             load(TREG_ST0, vtop);
1773             save_reg(TREG_RAX); /* eax is used by FP comparison code */
1774             if (op == TOK_GE || op == TOK_GT)
1775                 swapped = !swapped;
1776             else if (op == TOK_EQ || op == TOK_NE)
1777                 swapped = 0;
1778             if (swapped)
1779                 o(0xc9d9); /* fxch %st(1) */
1780             if (op == TOK_EQ || op == TOK_NE)
1781                 o(0xe9da); /* fucompp */
1782             else
1783                 o(0xd9de); /* fcompp */
1784             o(0xe0df); /* fnstsw %ax */
1785             if (op == TOK_EQ) {
1786                 o(0x45e480); /* and $0x45, %ah */
1787                 o(0x40fC80); /* cmp $0x40, %ah */
1788             } else if (op == TOK_NE) {
1789                 o(0x45e480); /* and $0x45, %ah */
1790                 o(0x40f480); /* xor $0x40, %ah */
1791                 op = TOK_NE;
1792             } else if (op == TOK_GE || op == TOK_LE) {
1793                 o(0x05c4f6); /* test $0x05, %ah */
1794                 op = TOK_EQ;
1795             } else {
1796                 o(0x45c4f6); /* test $0x45, %ah */
1797                 op = TOK_EQ;
1798             }
1799             vtop--;
1800             vtop->r = VT_CMP;
1801             vtop->c.i = op;
1802         } else {
1803             /* no memory reference possible for long double operations */
1804             load(TREG_ST0, vtop);
1805             swapped = !swapped;
1806
1807             switch(op) {
1808             default:
1809             case '+':
1810                 a = 0;
1811                 break;
1812             case '-':
1813                 a = 4;
1814                 if (swapped)
1815                     a++;
1816                 break;
1817             case '*':
1818                 a = 1;
1819                 break;
1820             case '/':
1821                 a = 6;
1822                 if (swapped)
1823                     a++;
1824                 break;
1825             }
1826             ft = vtop->type.t;
1827             fc = vtop->c.ul;
1828             o(0xde); /* fxxxp %st, %st(1) */
1829             o(0xc1 + (a << 3));
1830             vtop--;
1831         }
1832     } else {
1833         if (op >= TOK_ULT && op <= TOK_GT) {
1834             /* if saved lvalue, then we must reload it */
1835             r = vtop->r;
1836             fc = vtop->c.ul;
1837             if ((r & VT_VALMASK) == VT_LLOCAL) {
1838                 SValue v1;
1839                 r = get_reg(RC_INT);
1840                 v1.type.t = VT_PTR;
1841                 v1.r = VT_LOCAL | VT_LVAL;
1842                 v1.c.ul = fc;
1843                 load(r, &v1);
1844                 fc = 0;
1845             }
1846
1847             if (op == TOK_EQ || op == TOK_NE) {
1848                 swapped = 0;
1849             } else {
1850                 if (op == TOK_LE || op == TOK_LT)
1851                     swapped = !swapped;
1852                 if (op == TOK_LE || op == TOK_GE) {
1853                     op = 0x93; /* setae */
1854                 } else {
1855                     op = 0x97; /* seta */
1856                 }
1857             }
1858
1859             if (swapped) {
1860                 gv(RC_FLOAT);
1861                 vswap();
1862             }
1863             assert(!(vtop[-1].r & VT_LVAL));
1864
1865             if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
1866                 o(0x66);
1867             if (op == TOK_EQ || op == TOK_NE)
1868                 o(0x2e0f); /* ucomisd */
1869             else
1870                 o(0x2f0f); /* comisd */
1871
1872             if (vtop->r & VT_LVAL) {
1873                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
1874             } else {
1875                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
1876             }
1877
1878             vtop--;
1879             vtop->r = VT_CMP;
1880             vtop->c.i = op | 0x100;
1881         } else {
1882             assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
1883             switch(op) {
1884             default:
1885             case '+':
1886                 a = 0;
1887                 break;
1888             case '-':
1889                 a = 4;
1890                 break;
1891             case '*':
1892                 a = 1;
1893                 break;
1894             case '/':
1895                 a = 6;
1896                 break;
1897             }
1898             ft = vtop->type.t;
1899             fc = vtop->c.ul;
1900             assert((ft & VT_BTYPE) != VT_LDOUBLE);
1901
1902             r = vtop->r;
1903             /* if saved lvalue, then we must reload it */
1904             if ((vtop->r & VT_VALMASK) == VT_LLOCAL) {
1905                 SValue v1;
1906                 r = get_reg(RC_INT);
1907                 v1.type.t = VT_PTR;
1908                 v1.r = VT_LOCAL | VT_LVAL;
1909                 v1.c.ul = fc;
1910                 load(r, &v1);
1911                 fc = 0;
1912             }
1913
1914             assert(!(vtop[-1].r & VT_LVAL));
1915             if (swapped) {
1916                 assert(vtop->r & VT_LVAL);
1917                 gv(RC_FLOAT);
1918                 vswap();
1919             }
1920
1921             if ((ft & VT_BTYPE) == VT_DOUBLE) {
1922                 o(0xf2);
1923             } else {
1924                 o(0xf3);
1925             }
1926             o(0x0f);
1927             o(0x58 + a);
1928
1929             if (vtop->r & VT_LVAL) {
1930                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
1931             } else {
1932                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
1933             }
1934
1935             vtop--;
1936         }
1937     }
1938 }
1939
1940 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
1941    and 'long long' cases. */
1942 void gen_cvt_itof(int t)
1943 {
1944     if ((t & VT_BTYPE) == VT_LDOUBLE) {
1945         save_reg(TREG_ST0);
1946         gv(RC_INT);
1947         if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
1948             /* signed long long to float/double/long double (unsigned case
1949                is handled generically) */
1950             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
1951             o(0x242cdf); /* fildll (%rsp) */
1952             o(0x08c48348); /* add $8, %rsp */
1953         } else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
1954                    (VT_INT | VT_UNSIGNED)) {
1955             /* unsigned int to float/double/long double */
1956             o(0x6a); /* push $0 */
1957             g(0x00);
1958             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
1959             o(0x242cdf); /* fildll (%rsp) */
1960             o(0x10c48348); /* add $16, %rsp */
1961         } else {
1962             /* int to float/double/long double */
1963             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
1964             o(0x2404db); /* fildl (%rsp) */
1965             o(0x08c48348); /* add $8, %rsp */
1966         }
1967         vtop->r = TREG_ST0;
1968     } else {
1969         int r = get_reg(RC_FLOAT);
1970         gv(RC_INT);
1971         o(0xf2 + ((t & VT_BTYPE) == VT_FLOAT?1:0));
1972         if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
1973             (VT_INT | VT_UNSIGNED) ||
1974             (vtop->type.t & VT_BTYPE) == VT_LLONG) {
1975             o(0x48); /* REX */
1976         }
1977         o(0x2a0f);
1978         o(0xc0 + (vtop->r & VT_VALMASK) + REG_VALUE(r)*8); /* cvtsi2sd */
1979         vtop->r = r;
1980     }
1981 }
1982
1983 /* convert from one floating point type to another */
1984 void gen_cvt_ftof(int t)
1985 {
1986     int ft, bt, tbt;
1987
1988     ft = vtop->type.t;
1989     bt = ft & VT_BTYPE;
1990     tbt = t & VT_BTYPE;
1991
1992     if (bt == VT_FLOAT) {
1993         gv(RC_FLOAT);
1994         if (tbt == VT_DOUBLE) {
1995             o(0x140f); /* unpcklps */
1996             o(0xc0 + REG_VALUE(vtop->r)*9);
1997             o(0x5a0f); /* cvtps2pd */
1998             o(0xc0 + REG_VALUE(vtop->r)*9);
1999         } else if (tbt == VT_LDOUBLE) {
2000             save_reg(RC_ST0);
2001             /* movss %xmm0,-0x10(%rsp) */
2002             o(0x110ff3);
2003             o(0x44 + REG_VALUE(vtop->r)*8);
2004             o(0xf024);
2005             o(0xf02444d9); /* flds -0x10(%rsp) */
2006             vtop->r = TREG_ST0;
2007         }
2008     } else if (bt == VT_DOUBLE) {
2009         gv(RC_FLOAT);
2010         if (tbt == VT_FLOAT) {
2011             o(0x140f66); /* unpcklpd */
2012             o(0xc0 + REG_VALUE(vtop->r)*9);
2013             o(0x5a0f66); /* cvtpd2ps */
2014             o(0xc0 + REG_VALUE(vtop->r)*9);
2015         } else if (tbt == VT_LDOUBLE) {
2016             save_reg(RC_ST0);
2017             /* movsd %xmm0,-0x10(%rsp) */
2018             o(0x110ff2);
2019             o(0x44 + REG_VALUE(vtop->r)*8);
2020             o(0xf024);
2021             o(0xf02444dd); /* fldl -0x10(%rsp) */
2022             vtop->r = TREG_ST0;
2023         }
2024     } else {
2025         int r;
2026         gv(RC_ST0);
2027         r = get_reg(RC_FLOAT);
2028         if (tbt == VT_DOUBLE) {
2029             o(0xf0245cdd); /* fstpl -0x10(%rsp) */
2030             /* movsd -0x10(%rsp),%xmm0 */
2031             o(0x100ff2);
2032             o(0x44 + REG_VALUE(r)*8);
2033             o(0xf024);
2034             vtop->r = r;
2035         } else if (tbt == VT_FLOAT) {
2036             o(0xf0245cd9); /* fstps -0x10(%rsp) */
2037             /* movss -0x10(%rsp),%xmm0 */
2038             o(0x100ff3);
2039             o(0x44 + REG_VALUE(r)*8);
2040             o(0xf024);
2041             vtop->r = r;
2042         }
2043     }
2044 }
2045
2046 /* convert fp to int 't' type */
2047 void gen_cvt_ftoi(int t)
2048 {
2049     int ft, bt, size, r;
2050     ft = vtop->type.t;
2051     bt = ft & VT_BTYPE;
2052     if (bt == VT_LDOUBLE) {
2053         gen_cvt_ftof(VT_DOUBLE);
2054         bt = VT_DOUBLE;
2055     }
2056
2057     gv(RC_FLOAT);
2058     if (t != VT_INT)
2059         size = 8;
2060     else
2061         size = 4;
2062
2063     r = get_reg(RC_INT);
2064     if (bt == VT_FLOAT) {
2065         o(0xf3);
2066     } else if (bt == VT_DOUBLE) {
2067         o(0xf2);
2068     } else {
2069         assert(0);
2070     }
2071     orex(size == 8, r, 0, 0x2c0f); /* cvttss2si or cvttsd2si */
2072     o(0xc0 + REG_VALUE(vtop->r) + REG_VALUE(r)*8);
2073     vtop->r = r;
2074 }
2075
2076 /* computed goto support */
2077 void ggoto(void)
2078 {
2079     gcall_or_jmp(1);
2080     vtop--;
2081 }
2082
2083 /* Save the stack pointer onto the stack and return the location of its address */
2084 ST_FUNC void gen_vla_sp_save(int addr) {
2085     /* mov %rsp,addr(%rbp)*/
2086     gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
2087 }
2088
2089 /* Restore the SP from a location on the stack */
2090 ST_FUNC void gen_vla_sp_restore(int addr) {
2091     gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr);
2092 }
2093
2094 /* Subtract from the stack pointer, and push the resulting value onto the stack */
2095 ST_FUNC void gen_vla_alloc(CType *type, int align) {
2096 #ifdef TCC_TARGET_PE
2097     /* alloca does more than just adjust %rsp on Windows */
2098     vpush_global_sym(&func_old_type, TOK_alloca);
2099     vswap(); /* Move alloca ref past allocation size */
2100     gfunc_call(1);
2101     vset(type, REG_IRET, 0);
2102 #else
2103     int r;
2104     r = gv(RC_INT); /* allocation size */
2105     /* sub r,%rsp */
2106     o(0x2b48);
2107     o(0xe0 | REG_VALUE(r));
2108     /* We align to 16 bytes rather than align */
2109     /* and ~15, %rsp */
2110     o(0xf0e48348);
2111     /* mov %rsp, r */
2112     o(0x8948);
2113     o(0xe0 | REG_VALUE(r));
2114     vpop();
2115     vset(type, r, 0);
2116 #endif
2117 }
2118
2119
2120 /* end of x86-64 code generator */
2121 /*************************************************************/
2122 #endif /* ! TARGET_DEFS_ONLY */
2123 /******************************************************/