/*
 *  x86-64 code generator for TCC
 *
 *  Copyright (c) 2008 Shinichiro Hamaji
 *
 *  Based on i386-gen.c by Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#ifdef TARGET_DEFS_ONLY

/* number of available registers */
#define NB_REGS 25
#define NB_ASM_REGS 8

/* a register can belong to several classes. The classes must be
   sorted from more general to more precise (see gv2(), which relies
   on this ordering). */
#define RC_INT     0x0001 /* generic integer register */
#define RC_FLOAT   0x0002 /* generic float register */
#define RC_RAX     0x0004
#define RC_RCX     0x0008
#define RC_RDX     0x0010
#define RC_ST0     0x0020 /* only for long double */
#define RC_R8      0x0040
#define RC_R9      0x0080
#define RC_XMM0    0x0100
#define RC_XMM1    0x0200
#define RC_XMM2    0x0400
#define RC_XMM3    0x0800
#define RC_XMM4    0x1000
#define RC_XMM5    0x2000
#define RC_XMM6    0x4000
#define RC_XMM7    0x8000
#define RC_RSI     0x10000
#define RC_RDI     0x20000
#define RC_INT1    0x40000 /* function_pointer */
#define RC_INT2    0x80000
#define RC_RBX     0x100000
#define RC_R10     0x200000
#define RC_R11     0x400000
#define RC_R12     0x800000
#define RC_R13     0x1000000
#define RC_R14     0x2000000
#define RC_R15     0x4000000
#define RC_IRET    RC_RAX  /* function return: integer register */
#define RC_LRET    RC_RDX  /* function return: second integer register */
#define RC_FRET    RC_XMM0 /* function return: float register */
#define RC_QRET    RC_XMM1 /* function return: second float register */
#define RC_MASK    (RC_INT|RC_INT1|RC_INT2|RC_FLOAT)
/* pretty names for the registers */
enum {
    TREG_RAX = 0,
    TREG_RCX = 1,
    TREG_RDX = 2,
    TREG_RSP = 4,
    TREG_ST0 = 5,
    TREG_RSI = 6,
    TREG_RDI = 7,

    TREG_R8  = 8,
    TREG_R9  = 9,
    TREG_R10 = 10,
    TREG_R11 = 11,

    TREG_XMM0 = 16,
    TREG_XMM1 = 17,
    TREG_XMM2 = 18,
    TREG_XMM3 = 19,
    TREG_XMM4 = 20,
    TREG_XMM5 = 21,
    TREG_XMM6 = 22,
    TREG_XMM7 = 23,
};
#define REX_BASE(reg) (((reg) >> 3) & 1)
#define REG_VALUE(reg) ((reg) & 7)
#define FLAG_GOT 0x01
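
/* REX_BASE extracts the high bit of a 4-bit register number (it is set
   for r8-r15 and xmm8-xmm15); REG_VALUE keeps the low 3 bits that go
   into the ModRM/opcode byte.  For example, TREG_R10 = 10 = 0b1010
   gives REX_BASE = 1 and REG_VALUE = 2. */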
/* return registers for function */
#define REG_IRET TREG_RAX  /* single word int return register */
#define REG_LRET TREG_RDX  /* second word return register (for long long) */
#define REG_FRET TREG_XMM0 /* float return register */
#define REG_QRET TREG_XMM1 /* second float return register */

/* defined if function parameters must be evaluated in reverse order */
#define INVERT_FUNC_PARAMS

/* pointer size, in bytes */
#define PTR_SIZE 8

/* long double size and alignment, in bytes */
#define LDOUBLE_SIZE 16
#define LDOUBLE_ALIGN 16
/* maximum alignment (for aligned attribute support) */
#define MAX_ALIGN 16

/******************************************************/
/* ELF defines */

#define EM_TCC_TARGET EM_X86_64

/* relocation type for 32 bit data relocation */
#define R_DATA_32 R_X86_64_32
#define R_DATA_PTR R_X86_64_64
#define R_JMP_SLOT R_X86_64_JUMP_SLOT
#define R_COPY R_X86_64_COPY

#define ELF_START_ADDR 0x400000
#define ELF_PAGE_SIZE 0x200000

/******************************************************/
#else /* ! TARGET_DEFS_ONLY */
/******************************************************/
#include "tcc.h"
#include <assert.h>
ST_DATA const int reg_classes[NB_REGS] = {
    /* eax */ RC_INT|RC_RAX|RC_INT2,
    /* ecx */ RC_INT|RC_RCX|RC_INT2,
    /* edx */ RC_INT|RC_RDX,
    RC_INT|RC_INT1|RC_INT2|RC_RBX,
    /* rsp */ 0,
    /* st0 */ RC_ST0,
    RC_RSI|RC_INT2,
    RC_RDI|RC_INT2,
    RC_INT|RC_R8|RC_INT2,
    RC_INT|RC_R9|RC_INT2,
    RC_INT|RC_INT1|RC_INT2|RC_R10,
    RC_INT|RC_INT1|RC_INT2|RC_R11,
    RC_INT|RC_INT1|RC_INT2|RC_R12,
    RC_INT|RC_INT1|RC_INT2|RC_R13,
    RC_INT|RC_INT1|RC_INT2|RC_R14,
    RC_INT|RC_INT1|RC_INT2|RC_R15,
    /* xmm0 */ RC_FLOAT | RC_XMM0,
    RC_FLOAT|RC_XMM1,
    RC_FLOAT|RC_XMM2,
    RC_FLOAT|RC_XMM3,
    RC_FLOAT|RC_XMM4,
    RC_FLOAT|RC_XMM5,
    RC_FLOAT|RC_XMM6,
    RC_FLOAT|RC_XMM7,
};
static unsigned long func_sub_sp_offset;
static int func_ret_sub;

/* XXX: make it faster ? */
void g(int c)
{
    int ind1;
    ind1 = ind + 1;
    if (ind1 > cur_text_section->data_allocated)
        section_realloc(cur_text_section, ind1);
    cur_text_section->data[ind] = c;
    ind = ind1;
}

void o(unsigned int c)
{
    while (c) {
        g(c);
        c = c >> 8;
    }
}
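
/* Note that o() emits the low byte first and stops once the remaining
   value is zero, so multi-byte opcode constants are written
   byte-reversed: o(0x8b48) emits 0x48 0x8b (REX.W + mov).  A trailing
   zero byte cannot be emitted through o(); g() is used for that. */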
void gen_le16(int v)
{
    g(v);
    g(v >> 8);
}

void gen_le32(int c)
{
    g(c);
    g(c >> 8);
    g(c >> 16);
    g(c >> 24);
}

void gen_le64(int64_t c)
{
    g(c);
    g(c >> 8);
    g(c >> 16);
    g(c >> 24);
    g(c >> 32);
    g(c >> 40);
    g(c >> 48);
    g(c >> 56);
}
void orex(int ll, int r, int r2, int b)
{
    if ((r & VT_VALMASK) >= VT_CONST)
        r = 0;
    if ((r2 & VT_VALMASK) >= VT_CONST)
        r2 = 0;
    if (ll || REX_BASE(r) || REX_BASE(r2))
        o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
    o(b);
}
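
/* orex() emits a REX prefix when one is needed: bit 3 (REX.W, from
   'll') selects 64-bit operand size, bit 0 (REX.B) extends the
   register encoded in 'r', bit 2 (REX.R) the one in 'r2'.  For
   example, orex(1, TREG_R10, TREG_RAX, 0x8b) emits 0x49 0x8b
   (0x40 | REX.W | REX.B).  Operands >= VT_CONST are not registers and
   are masked to 0 first. */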
/* output a symbol and patch all calls to it */
void gsym_addr(int t, int a)
{
    int n, *ptr;
    while (t) {
        ptr = (int *)(cur_text_section->data + t);
        n = *ptr; /* next value */
        *ptr = a - t - 4;
        t = n;
    }
}

void gsym(int t)
{
    gsym_addr(t, ind);
}
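
/* Unresolved forward jumps are kept as a linked list threaded through
   the 32-bit displacement fields themselves: each field holds the
   offset of the next patch site, 0 terminates the chain.  gsym_addr()
   walks that chain and replaces every field with the pc-relative
   displacement 'a - t - 4' (relative to the end of the 4-byte
   field). */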
/* psym is used to put an instruction with a data field which is a
   reference to a symbol. It is in fact the same as oad ! */
#define psym oad

static int is64_type(int t)
{
    return ((t & VT_BTYPE) == VT_PTR ||
            (t & VT_BTYPE) == VT_FUNC ||
            (t & VT_BTYPE) == VT_LLONG);
}

/* instruction + 4 bytes data. Return the address of the data */
ST_FUNC int oad(int c, int s)
{
    int ind1;

    o(c);
    ind1 = ind + 4;
    if (ind1 > cur_text_section->data_allocated)
        section_realloc(cur_text_section, ind1);
    *(int *)(cur_text_section->data + ind) = s;
    s = ind;
    ind = ind1;
    return s;
}
ST_FUNC void gen_addr32(int r, Sym *sym, int c)
{
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_X86_64_32);
    gen_le32(c);
}

/* output constant with relocation if 'r & VT_SYM' is true */
ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
{
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_X86_64_64);
    gen_le64(c);
}

/* output constant with relocation if 'r & VT_SYM' is true */
ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
{
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_X86_64_PC32);
    gen_le32(c-4);
}

/* output got address with relocation */
static void gen_gotpcrel(int r, Sym *sym, int c)
{
#ifndef TCC_TARGET_PE
    Section *sr;
    ElfW(Rela) *rel;
    greloc(cur_text_section, sym, ind, R_X86_64_GOTPCREL);
    sr = cur_text_section->reloc;
    rel = (ElfW(Rela) *)(sr->data + sr->data_offset - sizeof(ElfW(Rela)));
    rel->r_addend = -4;
#else
    printf("picpic: %s %x %x | %02x %02x %02x\n", get_tok_str(sym->v, NULL), c, r,
        cur_text_section->data[ind-3],
        cur_text_section->data[ind-2],
        cur_text_section->data[ind-1]
        );
    greloc(cur_text_section, sym, ind, R_X86_64_PC32);
#endif
    gen_le32(0);
    if (c) {
        /* we use add c, %xxx for displacement */
        orex(1, r, 0, 0x81);
        o(0xc0 + REG_VALUE(r));
        gen_le32(c);
    }
}
static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
{
    op_reg = REG_VALUE(op_reg) << 3;
    if ((r & VT_VALMASK) == VT_CONST) {
        /* constant memory reference */
        o(0x05 | op_reg);
        if (is_got) {
            gen_gotpcrel(r, sym, c);
        } else {
            gen_addrpc32(r, sym, c);
        }
    } else if ((r & VT_VALMASK) == VT_LOCAL) {
        /* currently, we use only ebp as base */
        if (c == (char)c) {
            /* short reference */
            o(0x45 | op_reg);
            g(c);
        } else {
            oad(0x85 | op_reg, c);
        }
    } else if (r & TREG_MEM) {
        if (c) {
            g(0x80 | op_reg | REG_VALUE(r));
            gen_le32(c);
        } else {
            g(0x00 | op_reg | REG_VALUE(r));
        }
    } else {
        g(0x00 | op_reg | REG_VALUE(r));
    }
}
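
/* Reminder: a ModRM byte is mod(2) | reg(3) | r/m(3).  The constants
   above are the mod/r-m part with reg = 0: 0x05 is mod=00,r/m=101
   (RIP-relative), 0x45 is mod=01,r/m=101 (disp8(%rbp)) and 0x85 is
   mod=10,r/m=101 (disp32(%rbp)); 'op_reg', already shifted left by 3,
   is or'ed in as the reg field. */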
/* generate a modrm reference. 'op_reg' contains the additional 3
   opcode bits */
static void gen_modrm(int op_reg, int r, Sym *sym, int c)
{
    gen_modrm_impl(op_reg, r, sym, c, 0);
}

/* generate a modrm reference. 'op_reg' contains the additional 3
   opcode bits */
static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
{
    int is_got;
    is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
    orex(1, r, op_reg, opcode);
    gen_modrm_impl(op_reg, r, sym, c, is_got);
}
/* load 'r' from value 'sv' */
void load(int r, SValue *sv)
{
    int v, t, ft, fc, fr;
    SValue v1;

#ifdef TCC_TARGET_PE
    SValue v2;
    sv = pe_getimport(sv, &v2);
#endif

    fr = sv->r;
    ft = sv->type.t & ~VT_DEFSIGN;
    fc = sv->c.ul;

#ifndef TCC_TARGET_PE
    /* we use indirect access via got */
    if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
        (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
        /* use the result register as a temporary register */
        int tr = r | TREG_MEM;
        if (is_float(ft)) {
            /* we cannot use float registers as a temporary register */
            tr = get_reg(RC_INT) | TREG_MEM;
        }
        gen_modrm64(0x8b, tr, fr, sv->sym, 0);

        /* load from the temporary register */
        fr = tr | VT_LVAL;
    }
#endif
    v = fr & VT_VALMASK;
    if (fr & VT_LVAL) {
        int b, ll;
        if (v == VT_LLOCAL) {
            v1.type.t = VT_PTR;
            v1.r = VT_LOCAL | VT_LVAL;
            v1.c.ul = fc;
            fr = r;
            if (!(reg_classes[fr] & RC_INT))
                fr = get_reg(RC_INT);
            load(fr, &v1);
        }
        ll = 0;
        if ((ft & VT_BTYPE) == VT_FLOAT) {
            b = 0x6e0f66;
            r = REG_VALUE(r); /* movd */
        } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
            b = 0x7e0ff3; /* movq */
            r = REG_VALUE(r);
        } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
            b = 0xdb, r = 5; /* fldt */
        } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
            b = 0xbe0f;   /* movsbl */
        } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
            b = 0xb60f;   /* movzbl */
        } else if ((ft & VT_TYPE) == VT_SHORT) {
            b = 0xbf0f;   /* movswl */
        } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
            b = 0xb70f;   /* movzwl */
        } else {
            assert(((ft & VT_BTYPE) == VT_INT) || ((ft & VT_BTYPE) == VT_LLONG)
                   || ((ft & VT_BTYPE) == VT_PTR) || ((ft & VT_BTYPE) == VT_ENUM)
                   || ((ft & VT_BTYPE) == VT_FUNC));
            ll = is64_type(ft);
            b = 0x8b;
        }
        if (ll) {
            gen_modrm64(b, r, fr, sv->sym, fc);
        } else {
            orex(ll, fr, r, b);
            gen_modrm(r, fr, sv->sym, fc);
        }
    } else {
        if (v == VT_CONST) {
            if (fr & VT_SYM) {
#ifdef TCC_TARGET_PE
                orex(1,0,r,0x8d);
                o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
                gen_addrpc32(fr, sv->sym, fc);
#else
                if (sv->sym->type.t & VT_STATIC) {
                    orex(1,0,r,0x8d);
                    o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
                    gen_addrpc32(fr, sv->sym, fc);
                } else {
                    orex(1,0,r,0x8b);
                    o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
                    gen_gotpcrel(r, sv->sym, fc);
                }
#endif
            } else if (is64_type(ft)) {
                orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
                gen_le64(sv->c.ull);
            } else {
                orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
                gen_le32(fc);
            }
        } else if (v == VT_LOCAL) {
            orex(1,0,r,0x8d); /* lea xxx(%ebp), r */
            gen_modrm(r, VT_LOCAL, sv->sym, fc);
        } else if (v == VT_CMP) {
            orex(0,r,0,0);
            if ((fc & ~0x100) != TOK_NE)
                oad(0xb8 + REG_VALUE(r), 0); /* mov $0, r */
            else
                oad(0xb8 + REG_VALUE(r), 1); /* mov $1, r */
            if (fc & 0x100) {
                /* This was a float compare.  If the parity bit is
                   set the result was unordered, meaning false for everything
                   except TOK_NE, and true for TOK_NE. */
                fc &= ~0x100;
                o(0x037a + (REX_BASE(r) << 8));
            }
            orex(0,r,0, 0x0f); /* setxx %br */
            o(fc);
            o(0xc0 + REG_VALUE(r));
        } else if (v == VT_JMP || v == VT_JMPI) {
            t = v & 1;
            orex(0,r,0,0);
            oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
            o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
            gsym(fc);
            orex(0,r,0,0);
            oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
        } else if (v != r) {
            if (reg_classes[r] & RC_FLOAT) {
                if (v == TREG_ST0) {
                    /* gen_cvt_ftof(VT_DOUBLE); */
                    o(0xf0245cdd); /* fstpl -0x10(%rsp) */
                    /* movsd -0x10(%rsp),%xmm0 */
                    o(0x100ff2);
                    o(0xf02444 + REG_VALUE(r)*8);
                } else if (reg_classes[v] & RC_FLOAT) {
                    o(0x7e0ff3);
                    o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
                } else
                    assert(0);
            } else if (r == TREG_ST0) {
                assert(reg_classes[v] & RC_FLOAT);
                /* gen_cvt_ftof(VT_LDOUBLE); */
                /* movsd %xmm0,-0x10(%rsp) */
                o(0x110ff2);
                o(0xf02444 + REG_VALUE(v)*8);
                o(0xf02444dd); /* fldl -0x10(%rsp) */
            } else {
                orex(1,r,v, 0x89);
                o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
            }
        }
    }
}
/* store register 'r' in lvalue 'v' */
void store(int r, SValue *sv)
{
    int fr, bt, ft, fc, ll, v;

#ifdef TCC_TARGET_PE
    SValue v2;
    sv = pe_getimport(sv, &v2);
#endif

    ft = sv->type.t & ~VT_DEFSIGN;
    fc = sv->c.ul;
    fr = sv->r;
    bt = ft & VT_BTYPE;
    ll = is64_type(ft);
    v = fr & VT_VALMASK;

//#ifndef TCC_TARGET_PE
    /* we need to access the variable via got */
    // if (fr == VT_CONST && (v->r & VT_SYM)) {
        /* mov xx(%rip), %r11 */
    //     o(0x1d8b4c);
    //     gen_gotpcrel(TREG_R11, v->sym, v->c.ul);
    //     pic = is64_type(bt) ? 0x49 : 0x41;
    // }
//#endif
    /* XXX: incorrect if float reg to reg */
    if (bt == VT_FLOAT) {
        orex(0, fr, r, 0x110ff3); /* movss */
    } else if (bt == VT_DOUBLE) {
        orex(0, fr, r, 0x110ff2); /* movsd */
    } else if (bt == VT_LDOUBLE) {
        o(0xc0d9); /* fld %st(0) */
        orex(0, fr, r, 0xdb); /* fstpt */
        r = 7;
    } else {
        if (bt == VT_SHORT)
            o(0x66);
        if (bt == VT_BYTE || bt == VT_BOOL)
            orex(ll, fr, r, 0x88);
        else {
            orex(ll, fr, r, 0x89);
        }
    }
    if (v == VT_CONST || v == VT_LOCAL || (fr & VT_LVAL)) {
        gen_modrm(r, fr, sv->sym, fc);
    } else if (v != r) {
        /* XXX: is this ever reached? */
        abort();
        o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8); /* mov r, fr */
    }
}
/* 'is_jmp' is '1' if it is a jump */
static void gcall_or_jmp(int is_jmp)
{
    int r;
    if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
        /* constant case */
        if (vtop->r & VT_SYM) {
            /* relocation case */
            greloc(cur_text_section, vtop->sym,
                   ind + 1, R_X86_64_PLT32);
        } else {
            /* put an empty PC32 relocation */
            put_elf_reloc(symtab_section, cur_text_section,
                          ind + 1, R_X86_64_PC32, 0);
        }
        oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */
    } else {
        /* otherwise, indirect call */
        r = get_reg(RC_INT1);
        load(r, vtop);
        orex(0, r, 0, 0xff); /* REX call/jmp *r */
        o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
    }
}
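
/* The direct case emits 0xe8 (call rel32) or 0xe9 (jmp rel32), i.e.
   0xe8 + is_jmp.  The indirect case emits 0xff with ModRM 0xd0+reg
   (call *r, /2) or 0xe0+reg (jmp *r, /4): '(is_jmp << 4)' turns the
   /2 form into the /4 form. */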
#ifdef TCC_TARGET_PE

#define REGN 4
static const uint8_t arg_regs[REGN] = {
    TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
};

/* Prepare arguments in R10 and R11 rather than RCX and RDX
   because gv() will not ever use these */
static int arg_prepare_reg(int idx) {
    if (idx == 0 || idx == 1)
        /* idx=0: r10, idx=1: r11 */
        return idx + 10;
    else
        return arg_regs[idx];
}

static int func_scratch;

/* Generate function call. The function address is pushed first, then
   all the parameters in call order. This function pops all the
   parameters and the function address. */

void gen_offs_sp(int b, int r, int d)
{
    orex(1,0,r & 0x100 ? 0 : r, b);
    if (d == (char)d) {
        o(0x2444 | (REG_VALUE(r) << 3));
        g(d);
    } else {
        o(0x2484 | (REG_VALUE(r) << 3));
        gen_le32(d);
    }
}
/* Return the number of registers needed to return the struct, or 0 if
   returning via struct pointer. */
ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align)
{
    int size, align;
    *ret_align = 1; // Never have to re-align return values for x86-64
    size = type_size(vt, &align);
    ret->ref = NULL;
    if (size > 8) {
        return 0;
    } else if (size > 4) {
        ret->t = VT_LLONG;
        return 1;
    } else if (size > 2) {
        ret->t = VT_INT;
        return 1;
    } else if (size > 1) {
        ret->t = VT_SHORT;
        return 1;
    } else {
        ret->t = VT_BYTE;
        return 1;
    }
}

static int is_sse_float(int t) {
    int bt;
    bt = t & VT_BTYPE;
    return bt == VT_DOUBLE || bt == VT_FLOAT;
}

int gfunc_arg_size(CType *type) {
    int align;
    if (type->t & (VT_ARRAY|VT_BITFIELD))
        return 8;
    return type_size(type, &align);
}
void gfunc_call(int nb_args)
{
    int size, r, args_size, i, d, bt, struct_size;
    int arg;

    args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
    arg = nb_args;

    /* for struct arguments, we need to call memcpy and the function
       call breaks register passing arguments we are preparing.
       So, we process arguments which will be passed by stack first. */
    struct_size = args_size;
    for(i = 0; i < nb_args; i++) {
        SValue *sv;

        --arg;
        sv = &vtop[-i];
        bt = (sv->type.t & VT_BTYPE);
        size = gfunc_arg_size(&sv->type);

        if (size <= 8)
            continue; /* arguments smaller than 8 bytes passed in registers or on stack */

        if (bt == VT_STRUCT) {
            /* align to stack align size */
            size = (size + 15) & ~15;
            /* generate structure store */
            r = get_reg(RC_INT);
            gen_offs_sp(0x8d, r, struct_size);
            struct_size += size;

            /* generate memcpy call */
            vset(&sv->type, r | VT_LVAL, 0);
            vpushv(sv);
            vstore();
            --vtop;
        } else if (bt == VT_LDOUBLE) {
            gv(RC_ST0);
            gen_offs_sp(0xdb, 0x107, struct_size);
            struct_size += 16;
        }
    }

    if (func_scratch < struct_size)
        func_scratch = struct_size;

    arg = nb_args;
    struct_size = args_size;

    for(i = 0; i < nb_args; i++) {
        --arg;
        bt = (vtop->type.t & VT_BTYPE);

        size = gfunc_arg_size(&vtop->type);
        if (size > 8) {
            /* align to stack align size */
            size = (size + 15) & ~15;
            if (arg >= REGN) {
                d = get_reg(RC_INT);
                gen_offs_sp(0x8d, d, struct_size);
                gen_offs_sp(0x89, d, arg*8);
            } else {
                d = arg_prepare_reg(arg);
                gen_offs_sp(0x8d, d, struct_size);
            }
            struct_size += size;
        } else {
            if (is_sse_float(vtop->type.t)) {
                gv(RC_XMM0); /* only use one float register */
                if (arg >= REGN) {
                    /* movq %xmm0, j*8(%rsp) */
                    gen_offs_sp(0xd60f66, 0x100, arg*8);
                } else {
                    /* movaps %xmm0, %xmmN */
                    o(0x280f);
                    o(0xc0 + (arg << 3));
                    d = arg_prepare_reg(arg);
                    /* mov %xmm0, %rxx */
                    o(0x66);
                    orex(1,d,0, 0x7e0f);
                    o(0xc0 + REG_VALUE(d));
                }
            } else {
                if (bt == VT_STRUCT) {
                    vtop->type.ref = NULL;
                    vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
                        : size > 1 ? VT_SHORT : VT_BYTE;
                }

                r = gv(RC_INT);
                if (arg >= REGN) {
                    gen_offs_sp(0x89, r, arg*8);
                } else {
                    d = arg_prepare_reg(arg);
                    orex(1,d,r,0x89); /* mov */
                    o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
                }
            }
        }
        vtop--;
    }
    save_regs(0);

    /* Copy R10 and R11 into RCX and RDX, respectively */
    if (nb_args > 0) {
        o(0xd1894c); /* mov %r10, %rcx */
        if (nb_args > 1) {
            o(0xda894c); /* mov %r11, %rdx */
        }
    }

    gcall_or_jmp(0);
    vtop--;
}
#define FUNC_PROLOG_SIZE 11

/* generate function prolog of type 't' */
void gfunc_prolog(CType *func_type)
{
    int addr, reg_param_index, bt, size;
    Sym *sym;
    CType *type;

    func_ret_sub = 0;
    func_scratch = 0;
    pop_stack = loc = 0;

    addr = PTR_SIZE * 2;
    ind += FUNC_PROLOG_SIZE;
    func_sub_sp_offset = ind;
    reg_param_index = 0;

    sym = func_type->ref;

    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->type;
    func_var = (sym->c == FUNC_ELLIPSIS);
    size = gfunc_arg_size(&func_vt);
    if (size > 8) {
        gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
        func_vc = addr;
        reg_param_index++;
        addr += 8;
    }

    /* define parameters */
    while ((sym = sym->next) != NULL) {
        type = &sym->type;
        bt = type->t & VT_BTYPE;
        size = gfunc_arg_size(type);
        if (size > 8) {
            if (reg_param_index < REGN) {
                gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
            }
            sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL | VT_REF, addr);
        } else {
            if (reg_param_index < REGN) {
                /* save arguments passed by register */
                if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
                    o(0xd60f66); /* movq */
                    gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
                } else {
                    gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
                }
            }
            sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL, addr);
        }
        addr += 8;
        reg_param_index++;
    }

    while (reg_param_index < REGN) {
        if (func_type->ref->c == FUNC_ELLIPSIS) {
            gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
            addr += 8;
        }
        reg_param_index++;
    }
}
/* generate function epilog */
void gfunc_epilog(void)
{
    int v, saved_ind;

    o(0xc9); /* leave */
    if (func_ret_sub == 0) {
        o(0xc3); /* ret */
    } else {
        o(0xc2); /* ret n */
        g(func_ret_sub);
        g(func_ret_sub >> 8);
    }

    saved_ind = ind;
    ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
    /* align local size to word & save local variables */
    v = (func_scratch + -loc + 15) & -16;

    if (v >= 4096) {
        Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0);
        oad(0xb8, v); /* mov stacksize, %eax */
        oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
        greloc(cur_text_section, sym, ind-4, R_X86_64_PC32);
        o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
    } else {
        o(0xe5894855); /* push %rbp, mov %rsp, %rbp */
        o(0xec8148);   /* sub rsp, stacksize */
        gen_le32(v);
    }

    cur_text_section->data_offset = saved_ind;
    pe_add_unwind_data(ind, saved_ind, v);
    ind = cur_text_section->data_offset;
}
#else

static void gadd_sp(int val)
{
    if (val == (char)val) {
        o(0xc48348);
        g(val);
    } else {
        oad(0xc48148, val); /* add $xxx, %rsp */
    }
}

typedef enum X86_64_Mode {
    x86_64_mode_none,
    x86_64_mode_memory,
    x86_64_mode_integer,
    x86_64_mode_sse,
    x86_64_mode_x87
} X86_64_Mode;
static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b)
{
    if (a == b)
        return a;
    else if (a == x86_64_mode_none)
        return b;
    else if (b == x86_64_mode_none)
        return a;
    else if ((a == x86_64_mode_memory) || (b == x86_64_mode_memory))
        return x86_64_mode_memory;
    else if ((a == x86_64_mode_integer) || (b == x86_64_mode_integer))
        return x86_64_mode_integer;
    else if ((a == x86_64_mode_x87) || (b == x86_64_mode_x87))
        return x86_64_mode_memory;
    else
        return x86_64_mode_sse;
}
static X86_64_Mode classify_x86_64_inner(CType *ty)
{
    X86_64_Mode mode;
    Sym *f;

    switch (ty->t & VT_BTYPE) {
    case VT_VOID: return x86_64_mode_none;

    case VT_INT:
    case VT_BYTE:
    case VT_SHORT:
    case VT_LLONG:
    case VT_QLONG:
    case VT_BOOL:
    case VT_PTR:
    case VT_FUNC:
    case VT_ENUM: return x86_64_mode_integer;

    case VT_FLOAT:
    case VT_QFLOAT:
    case VT_DOUBLE: return x86_64_mode_sse;

    case VT_LDOUBLE: return x86_64_mode_x87;

    case VT_STRUCT:
        f = ty->ref;

        // Detect union
        if (f->next && (f->c == f->next->c))
            return x86_64_mode_memory;

        mode = x86_64_mode_none;
        for (f = f->next; f; f = f->next)
            mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));

        return mode;
    }

    assert(0);
}
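
/* Note that this merges the classes of all fields into a single mode
   for the whole struct: e.g. struct { int a; float b; } merges integer
   with sse and is passed in an integer register.  The full System V
   ABI classifies each eightbyte of a struct separately; this
   simplified version only distinguishes all-integer, all-sse, mixed
   (treated as integer) and in-memory structs. */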
static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
{
    X86_64_Mode mode;
    int size, align, ret_t = 0;

    if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
        *psize = 8;
        *palign = 8;
        *reg_count = 1;
        ret_t = ty->t;
        mode = x86_64_mode_integer;
    } else {
        size = type_size(ty, &align);
        *psize = (size + 7) & ~7;
        *palign = (align + 7) & ~7;

        if (size > 16) {
            mode = x86_64_mode_memory;
            ret_t = ty->t;
        } else {
            mode = classify_x86_64_inner(ty);
            switch (mode) {
            case x86_64_mode_integer:
                if (size > 8) {
                    *reg_count = 2;
                    ret_t = VT_QLONG;
                } else {
                    *reg_count = 1;
                    if (size > 4)
                        ret_t = VT_LLONG;
                    else if (size > 2)
                        ret_t = VT_INT;
                    else if (size > 1)
                        ret_t = VT_SHORT;
                    else
                        ret_t = VT_BYTE;
                }
                ret_t |= (ty->t & VT_UNSIGNED);
                break;
            case x86_64_mode_x87:
                *reg_count = 1;
                ret_t = VT_LDOUBLE;
                break;
            case x86_64_mode_sse:
                if (size > 8) {
                    *reg_count = 2;
                    ret_t = VT_QFLOAT;
                } else {
                    *reg_count = 1;
                    ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
                }
                break;
            default:
                ret_t = ty->t;
                break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none */
            }
        }
    }

    if (ret) {
        ret->ref = ty->ref;
        ret->t = ret_t;
    }

    return mode;
}
ST_FUNC int classify_x86_64_va_arg(CType *ty)
{
    /* This definition must be synced with stdarg.h */
    enum __va_arg_type {
        __va_gen_reg, __va_float_reg, __va_ld_reg, __va_stack
    };
    int size, align, reg_count;
    X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, &reg_count);
    switch (mode) {
    default: return __va_stack;
    case x86_64_mode_x87: return __va_ld_reg;
    case x86_64_mode_integer: return __va_gen_reg;
    case x86_64_mode_sse: return __va_float_reg;
    }
}
/* Return the number of registers needed to return the struct, or 0 if
   returning via struct pointer. */
ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align)
{
    int size, align, reg_count;
    *ret_align = 1; // Never have to re-align return values for x86-64
    return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
}

#define REGN 6
static const uint8_t arg_regs[REGN] = {
    TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
};

static int arg_prepare_reg(int idx) {
    if (idx == 2 || idx == 3)
        /* idx=2: r10, idx=3: r11 */
        return idx + 8;
    else
        return arg_regs[idx];
}
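
/* Argument slots 2 and 3 are staged in %r10/%r11 instead of %rdx/%rcx:
   gv() may still clobber %rdx and %rcx while the remaining arguments
   are evaluated (e.g. for divisions and shifts), so the values are
   copied into the real argument registers only right before the call
   (see the "Copy R10 and R11" step at the end of gfunc_call). */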
/* Generate function call. The function address is pushed first, then
   all the parameters in call order. This function pops all the
   parameters and the function address. */
void gfunc_call(int nb_args)
{
    X86_64_Mode mode;
    CType type;
    int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
    int nb_reg_args = 0;
    int nb_sse_args = 0;
    int sse_reg, gen_reg;

    /* calculate the number of integer/float register arguments */
    for(i = 0; i < nb_args; i++) {
        mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
        if (mode == x86_64_mode_sse)
            nb_sse_args += reg_count;
        else if (mode == x86_64_mode_integer)
            nb_reg_args += reg_count;
    }

    /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
       and ended by a 16-byte aligned argument. This is because, from the point of view of
       the callee, argument alignment is computed from the bottom up. */
    /* for struct arguments, we need to call memcpy and the function
       call breaks register passing arguments we are preparing.
       So, we process arguments which will be passed by stack first. */
    gen_reg = nb_reg_args;
    sse_reg = nb_sse_args;
    run_start = 0;
    args_size = 0;
    while (run_start != nb_args) {
        int run_gen_reg = gen_reg, run_sse_reg = sse_reg;

        run_end = nb_args;
        stack_adjust = 0;
        for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
            switch (mode) {
            case x86_64_mode_memory:
            case x86_64_mode_x87:
            stack_arg:
                if (align == 16)
                    run_end = i;
                else
                    stack_adjust += size;
                break;

            case x86_64_mode_sse:
                sse_reg -= reg_count;
                if (sse_reg + reg_count > 8) goto stack_arg;
                break;

            case x86_64_mode_integer:
                gen_reg -= reg_count;
                if (gen_reg + reg_count > REGN) goto stack_arg;
                break;
            default: break; /* nothing to be done for x86_64_mode_none */
            }
        }

        gen_reg = run_gen_reg;
        sse_reg = run_sse_reg;

        /* adjust stack to align SSE boundary */
        if (stack_adjust &= 15) {
            /* fetch cpu flag before the following sub will change the value */
            if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
                gv(RC_INT);

            stack_adjust = 16 - stack_adjust;
            o(0x48);
            oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
            args_size += stack_adjust;
        }
        for(i = run_start; i < run_end;) {
            /* Swap argument to top, it will possibly be changed here,
               and might use more temps. At the end of the loop we keep
               it on the stack and swap it back to its original position
               if it is a register. */
            SValue tmp = vtop[0];
            vtop[0] = vtop[-i];
            vtop[-i] = tmp;

            mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);

            int arg_stored = 1;
            switch (vtop->type.t & VT_BTYPE) {
            case VT_STRUCT:
                if (mode == x86_64_mode_sse) {
                    if (sse_reg > 8)
                        sse_reg -= reg_count;
                    else
                        arg_stored = 0;
                } else if (mode == x86_64_mode_integer) {
                    if (gen_reg > REGN)
                        gen_reg -= reg_count;
                    else
                        arg_stored = 0;
                }

                if (arg_stored) {
                    /* allocate the necessary size on stack */
                    o(0x48);
                    oad(0xec81, size); /* sub $xxx, %rsp */
                    /* generate structure store */
                    r = get_reg(RC_INT);
                    orex(1, r, 0, 0x89); /* mov %rsp, r */
                    o(0xe0 + REG_VALUE(r));
                    vset(&vtop->type, r | VT_LVAL, 0);
                    vswap();
                    vstore();
                    args_size += size;
                }
                break;

            case VT_LDOUBLE:
                assert(0);
                break;

            case VT_FLOAT:
            case VT_DOUBLE:
                assert(mode == x86_64_mode_sse);
                if (sse_reg > 8) {
                    --sse_reg;
                    r = gv(RC_FLOAT);
                    o(0x50); /* push $rax */
                    /* movq %xmmN, (%rsp) */
                    o(0xd60f66);
                    o(0x04 + REG_VALUE(r)*8);
                    o(0x24);
                    args_size += size;
                } else {
                    arg_stored = 0;
                }
                break;

            default:
                assert(mode == x86_64_mode_integer);
                /* simple type */
                /* XXX: implicit cast ? */
                if (gen_reg > REGN) {
                    --gen_reg;
                    r = gv(RC_INT);
                    orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
                    args_size += size;
                } else {
                    arg_stored = 0;
                }
                break;
            }

            /* And swap the argument back to its original position. */
            tmp = vtop[0];
            vtop[0] = vtop[-i];
            vtop[-i] = tmp;

            if (arg_stored) {
                vrotb(i+1);
                assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
                vpop();
                --nb_args;
                --run_end;
            } else {
                ++i;
            }
        }
        /* handle 16 byte aligned arguments at end of run */
        run_start = i = run_end;
        while (i < nb_args) {
            /* Rotate argument to top since it will always be popped */
            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
            if (align != 16)
                break;

            vrotb(i+1);

            if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
                gv(RC_ST0);
                oad(0xec8148, size); /* sub $xxx, %rsp */
                o(0x7cdb); /* fstpt 0(%rsp) */
                g(0x24);
                g(0x00);
                args_size += size;
            } else {
                //assert(mode == x86_64_mode_memory);

                /* allocate the necessary size on stack */
                o(0x48);
                oad(0xec81, size); /* sub $xxx, %rsp */
                /* generate structure store */
                r = get_reg(RC_INT);
                orex(1, r, 0, 0x89); /* mov %rsp, r */
                o(0xe0 + REG_VALUE(r));
                vset(&vtop->type, r | VT_LVAL, 0);
                vswap();
                vstore();
                args_size += size;
            }

            vpop();
            --nb_args;
        }
    }
    /* XXX This should be superfluous. */
    save_regs(0); /* save used temporary registers */

    /* then, we prepare register passing arguments.
       Note that we cannot set RDX and RCX in this loop because gv()
       may break these temporary registers. Let's use R10 and R11
       instead of them */
    assert(gen_reg <= REGN);
    assert(sse_reg <= 8);
    for(i = 0; i < nb_args; i++) {
        mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
        /* Alter stack entry type so that gv() knows how to treat it */
        vtop->type = type;
        if (mode == x86_64_mode_sse) {
            if (reg_count == 2) {
                sse_reg -= 2;
                gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
                if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
                    /* movaps %xmm0, %xmmN */
                    o(0x280f);
                    o(0xc0 + (sse_reg << 3));
                    /* movaps %xmm1, %xmmN */
                    o(0x280f);
                    o(0xc1 + ((sse_reg+1) << 3));
                }
            } else {
                assert(reg_count == 1);
                --sse_reg;
                /* Load directly to register */
                gv(RC_XMM0 << sse_reg);
            }
        } else if (mode == x86_64_mode_integer) {
            /* simple type */
            /* XXX: implicit cast ? */
            gen_reg -= reg_count;
            r = gv(RC_INT);
            int d = arg_prepare_reg(gen_reg);
            orex(1,d,r,0x89); /* mov */
            o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
            if (reg_count == 2) {
                d = arg_prepare_reg(gen_reg+1);
                orex(1,d,vtop->r2,0x89); /* mov */
                o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
            }
        }
        vtop--;
    }
    assert(gen_reg == 0);
    assert(sse_reg == 0);

    /* We shouldn't have many operands on the stack anymore, but the
       call address itself is still there, and it might be in %eax
       (or edx/ecx) currently, which the below writes would clobber.
       So evict all remaining operands here. */
    save_regs(0);

    /* Copy R10 and R11 into RDX and RCX, respectively */
    if (nb_reg_args > 2) {
        o(0xd2894c); /* mov %r10, %rdx */
        if (nb_reg_args > 3) {
            o(0xd9894c); /* mov %r11, %rcx */
        }
    }

    oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
    gcall_or_jmp(0);
    if (args_size)
        gadd_sp(args_size);
    vtop--;
}
#define FUNC_PROLOG_SIZE 11

static void push_arg_reg(int i) {
    loc -= 8;
    gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
}
/* generate function prolog of type 't' */
void gfunc_prolog(CType *func_type)
{
    X86_64_Mode mode;
    int i, addr, align, size, reg_count;
    int param_addr = 0, reg_param_index, sse_param_index;
    Sym *sym;
    CType *type;

    sym = func_type->ref;
    addr = PTR_SIZE * 2;
    pop_stack = loc = 0;
    ind += FUNC_PROLOG_SIZE;
    func_sub_sp_offset = ind;
    func_ret_sub = 0;

    if (func_type->ref->c == FUNC_ELLIPSIS) {
        int seen_reg_num, seen_sse_num, seen_stack_size;
        seen_reg_num = seen_sse_num = 0;
        /* frame pointer and return address */
        seen_stack_size = PTR_SIZE * 2;
        /* count the number of seen parameters */
        while ((sym = sym->next) != NULL) {
            type = &sym->type;
            mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
            switch (mode) {
            default:
            stack_arg:
                seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
                break;

            case x86_64_mode_integer:
                if (seen_reg_num + reg_count <= 8) {
                    seen_reg_num += reg_count;
                } else {
                    seen_reg_num = 8;
                    goto stack_arg;
                }
                break;

            case x86_64_mode_sse:
                if (seen_sse_num + reg_count <= 8) {
                    seen_sse_num += reg_count;
                } else {
                    seen_sse_num = 8;
                    goto stack_arg;
                }
                break;
            }
        }

        loc -= 16;
        /* movl $0x????????, -0x10(%rbp) */
        o(0xf045c7);
        gen_le32(seen_reg_num * 8);
        /* movl $0x????????, -0xc(%rbp) */
        o(0xf445c7);
        gen_le32(seen_sse_num * 16 + 48);
        /* movl $0x????????, -0x8(%rbp) */
        o(0xf845c7);
        gen_le32(seen_stack_size);

        o(0xc084); /* test %al,%al */
        o(0x74);   /* je */
        g(4*(8 - seen_sse_num) + 3);

        /* save all register passing arguments */
        for (i = 0; i < 8; i++) {
            loc -= 16;
            o(0x290f); /* movaps %xmm1-7,-XXX(%rbp) */
            gen_modrm(7 - i, VT_LOCAL, NULL, loc);
        }
        for (i = 0; i < (REGN - seen_reg_num); i++) {
            push_arg_reg(REGN-1 - i);
        }
    }
    sym = func_type->ref;
    reg_param_index = 0;
    sse_param_index = 0;

    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->type;
    mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
    if (mode == x86_64_mode_memory) {
        push_arg_reg(reg_param_index);
        func_vc = loc;
        reg_param_index++;
    }
    /* define parameters */
    while ((sym = sym->next) != NULL) {
        type = &sym->type;
        mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
        switch (mode) {
        case x86_64_mode_sse:
            if (sse_param_index + reg_count <= 8) {
                /* save arguments passed by register */
                loc -= reg_count * 8;
                param_addr = loc;
                for (i = 0; i < reg_count; ++i) {
                    o(0xd60f66); /* movq */
                    gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
                    ++sse_param_index;
                }
            } else {
                addr = (addr + align - 1) & -align;
                param_addr = addr;
                addr += size;
                sse_param_index += reg_count;
            }
            break;

        case x86_64_mode_memory:
        case x86_64_mode_x87:
            addr = (addr + align - 1) & -align;
            param_addr = addr;
            addr += size;
            break;

        case x86_64_mode_integer: {
            if (reg_param_index + reg_count <= REGN) {
                /* save arguments passed by register */
                loc -= reg_count * 8;
                param_addr = loc;
                for (i = 0; i < reg_count; ++i) {
                    gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
                    ++reg_param_index;
                }
            } else {
                addr = (addr + align - 1) & -align;
                param_addr = addr;
                addr += size;
                reg_param_index += reg_count;
            }
            break;
        }
        default: break; /* nothing to be done for x86_64_mode_none */
        }
        sym_push(sym->v & ~SYM_FIELD, type,
                 VT_LOCAL | VT_LVAL, param_addr);
    }
}
/* generate function epilog */
void gfunc_epilog(void)
{
    int v, saved_ind;

    o(0xc9); /* leave */
    if (func_ret_sub == 0) {
        o(0xc3); /* ret */
    } else {
        o(0xc2); /* ret n */
        g(func_ret_sub);
        g(func_ret_sub >> 8);
    }
    /* align local size to word & save local variables */
    v = (-loc + 15) & -16;
    saved_ind = ind;
    ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
    o(0xe5894855); /* push %rbp, mov %rsp, %rbp */
    o(0xec8148);   /* sub rsp, stacksize */
    gen_le32(v);
    ind = saved_ind;
}

#endif /* not PE */
/* generate a jump to a label */
int gjmp(int t)
{
    return psym(0xe9, t);
}

/* generate a jump to a fixed address */
void gjmp_addr(int a)
{
    int r;
    r = a - ind - 2;
    if (r == (char)r) {
        g(0xeb);
        g(r);
    } else {
        oad(0xe9, a - ind - 5);
    }
}
1546 int gtst(int inv, int t)
1548 int v, *p;
1550 v = vtop->r & VT_VALMASK;
1551 if (v == VT_CMP) {
1552 /* fast case : can jump directly since flags are set */
1553 if (vtop->c.i & 0x100)
1555 /* This was a float compare. If the parity flag is set
1556 the result was unordered. For anything except != this
1557 means false and we don't jump (anding both conditions).
1558 For != this means true (oring both).
1559 Take care about inverting the test. We need to jump
1560 to our target if the result was unordered and test wasn't NE,
1561 otherwise if unordered we don't want to jump. */
1562 vtop->c.i &= ~0x100;
1563 if (!inv == (vtop->c.i != TOK_NE))
1564 o(0x067a); /* jp +6 */
1565 else
1567 g(0x0f);
1568 t = psym(0x8a, t); /* jp t */
1571 g(0x0f);
1572 t = psym((vtop->c.i - 16) ^ inv, t);
1573 } else if (v == VT_JMP || v == VT_JMPI) {
1574 /* && or || optimization */
1575 if ((v & 1) == inv) {
1576 /* insert vtop->c jump list in t */
1577 p = &vtop->c.i;
1578 while (*p != 0)
1579 p = (int *)(cur_text_section->data + *p);
1580 *p = t;
1581 t = vtop->c.i;
1582 } else {
1583 t = gjmp(t);
1584 gsym(vtop->c.i);
1586 } else {
1587 if (is_float(vtop->type.t) ||
1588 (vtop->type.t & VT_BTYPE) == VT_LLONG) {
1589 vpushi(0);
1590 gen_op(TOK_NE);
1592 if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
1593 /* constant jmp optimization */
1594 if ((vtop->c.i != 0) != inv)
1595 t = gjmp(t);
1596 } else {
1597 v = gv(RC_INT);
1598 orex(0,v,v,0x85);
1599 o(0xc0 + REG_VALUE(v) * 9);
1600 g(0x0f);
1601 t = psym(0x85 ^ inv, t);
1604 vtop--;
1605 return t;
/* generate an integer binary operation */
void gen_opi(int op)
{
    int r, fr, opc, fc, c, ll, uu, cc, tt2;

    fr = vtop[0].r;
    fc = vtop->c.ul;
    ll = is64_type(vtop[-1].type.t);
    cc = (fr & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
    tt2 = (fr & (VT_LVAL | VT_LVAL_TYPE)) == VT_LVAL;

    switch(op) {
    case '+':
    case TOK_ADDC1: /* add with carry generation */
        opc = 0;
    gen_op8:
        vswap();
        r = gv(RC_INT);
        vswap();
        if (cc && (!ll || (int)vtop->c.ll == vtop->c.ll)) {
            /* constant case */
            c = vtop->c.i;
            if (c == (char)c) {
                /* XXX: generate inc and dec for smaller code ? */
                orex(ll, r, 0, 0x83);
                o(0xc0 + REG_VALUE(r) + opc*8);
                g(c);
            } else {
                orex(ll, r, 0, 0x81);
                oad(0xc0 + REG_VALUE(r) + opc*8, c);
            }
        } else {
            if (!tt2)
                fr = gv(RC_INT);
            orex(ll, fr, r, 0x03 + opc*8);
            if (fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
        }
        vtop--;
        if (op >= TOK_ULT && op <= TOK_GT) {
            vtop->r = VT_CMP;
            vtop->c.i = op;
        }
        break;
    case '-':
    case TOK_SUBC1: /* sub with carry generation */
        opc = 5;
        goto gen_op8;
    case TOK_ADDC2: /* add with carry use */
        opc = 2;
        goto gen_op8;
    case TOK_SUBC2: /* sub with carry use */
        opc = 3;
        goto gen_op8;
    case '&':
        opc = 4;
        goto gen_op8;
    case '^':
        opc = 6;
        goto gen_op8;
    case '|':
        opc = 1;
        goto gen_op8;
    case '*':
        opc = 5;
        vswap();
        r = gv(RC_INT);
        vswap();
        if (!tt2)
            fr = gv(RC_INT);
        if (r == TREG_RAX) {
            if (fr != TREG_RDX)
                save_reg(TREG_RDX);
            orex(ll, fr, r, 0xf7);
            if (fr >= VT_CONST)
                gen_modrm(opc, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + opc*8);
        } else {
            orex(ll, fr, r, 0xaf0f); /* imul fr, r */
            if (fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
        }
        vtop--;
        break;
    case TOK_SHL:
        opc = 4;
        goto gen_shift;
    case TOK_SHR:
        opc = 5;
        goto gen_shift;
    case TOK_SAR:
        opc = 7;
    gen_shift:
        if (cc) {
            /* constant case */
            vswap();
            r = gv(RC_INT);
            vswap();
            c = vtop->c.i;
            if (c == 1) {
                orex(ll, r, 0, 0xd1);
                o(0xc0 + REG_VALUE(r) + opc*8);
            } else {
                orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
                o(0xc0 + REG_VALUE(r) + opc*8);
                g(c & (ll ? 0x3f : 0x1f));
            }
        } else {
            /* we generate the shift in ecx */
            gv2(RC_INT, RC_RCX);
            r = vtop[-1].r;
            orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
            o(0xc0 + REG_VALUE(r) + opc*8);
        }
        vtop--;
        break;
    case TOK_UDIV:
    case TOK_UMOD:
        opc = 6;
        uu = 1;
        goto divmod;
    case '/':
    case '%':
    case TOK_PDIV:
        opc = 7;
        uu = 0;
    divmod:
        /* first operand must be in eax */
        /* XXX: need better constraint for second operand */
        if (!tt2) {
            gv2(RC_RAX, RC_INT2);
            fr = vtop[0].r;
        } else {
            vswap();
            gv(RC_RAX);
            vswap();
        }
        save_reg(TREG_RDX);
        orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cdq RDX:RAX <- sign-extend of RAX */
        orex(ll, fr, 0, 0xf7); /* div fr, %eax */
        if (fr >= VT_CONST)
            gen_modrm(opc, fr, vtop->sym, fc);
        else
            o(0xc0 + REG_VALUE(fr) + opc*8);
        if (op == '%' || op == TOK_UMOD)
            r = TREG_RDX;
        else
            r = TREG_RAX;
        vtop--;
        vtop->r = r;
        break;
    default:
        opc = 7;
        goto gen_op8;
    }
}
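
/* The 'opc' values used above are the /digit of the 0x81/0x83
   immediate group (add=0, or=1, adc=2, sbb=3, and=4, sub=5, xor=6,
   cmp=7); the same digit selects the register-register opcode via
   0x03 + opc*8 (add=0x03, or=0x0b, ..., cmp=0x3b), which is why the
   single gen_op8 tail serves all eight operations. */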
void gen_opl(int op)
{
    gen_opi(op);
}

/* generate a floating point operation 'v = t1 op t2' instruction. The
   two operands are guaranteed to have the same floating point type */
/* XXX: need to use ST1 too */
void gen_opf(int op)
{
    int a, ft, fc, swapped, fr, r;
    int float_type = (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;

    /* convert constants to memory references */
    if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
        vswap();
        gv(float_type);
        vswap();
    }
    if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
        gv(float_type);
    swapped = 0;
    fc = vtop->c.ul;
    ft = vtop->type.t;

    if ((ft & VT_BTYPE) == VT_LDOUBLE) {
        /* swap the stack if needed so that t1 is the register and t2 is
           the memory reference */
        /* must put at least one value in the floating point register */
        if ((vtop[-1].r & VT_LVAL) && (vtop[0].r & VT_LVAL)) {
            vswap();
            gv(float_type);
            vswap();
        }
        if (vtop[-1].r & VT_LVAL) {
            vswap();
            swapped = 1;
        }
        if (op >= TOK_ULT && op <= TOK_GT) {
            /* load on stack second operand */
            load(TREG_ST0, vtop);
            save_reg(TREG_RAX); /* eax is used by FP comparison code */
            if (op == TOK_GE || op == TOK_GT)
                swapped = !swapped;
            else if (op == TOK_EQ || op == TOK_NE)
                swapped = 0;
            if (swapped)
                o(0xc9d9); /* fxch %st(1) */
            if (op == TOK_EQ || op == TOK_NE)
                o(0xe9da); /* fucompp */
            else
                o(0xd9de); /* fcompp */
            o(0xe0df); /* fnstsw %ax */
            if (op == TOK_EQ) {
                o(0x45e480); /* and $0x45, %ah */
                o(0x40fC80); /* cmp $0x40, %ah */
            } else if (op == TOK_NE) {
                o(0x45e480); /* and $0x45, %ah */
                o(0x40f480); /* xor $0x40, %ah */
                op = TOK_NE;
            } else if (op == TOK_GE || op == TOK_LE) {
                o(0x05c4f6); /* test $0x05, %ah */
                op = TOK_EQ;
            } else {
                o(0x45c4f6); /* test $0x45, %ah */
                op = TOK_EQ;
            }
            vtop--;
            vtop->r = VT_CMP;
            vtop->c.i = op;
        } else {
            /* no memory reference possible for long double operations */
            load(TREG_ST0, vtop);
            swapped = !swapped;
            switch(op) {
            default:
            case '+':
                a = 0;
                break;
            case '-':
                a = 4;
                if (swapped)
                    a++;
                break;
            case '*':
                a = 1;
                break;
            case '/':
                a = 6;
                if (swapped)
                    a++;
                break;
            }
            o(0xde); /* fxxxp %st, %st(1) */
            o(0xc1 + (a << 3));
            vtop--;
        }
    } else {
        vswap();
        gv(float_type);
        vswap();
        fr = vtop->r;
        r = vtop[-1].r;
        if (op >= TOK_ULT && op <= TOK_GT) {
            switch(op) {
            case TOK_LE:
                op = TOK_ULE; /* setae */
                break;
            case TOK_LT:
                op = TOK_ULT;
                break;
            case TOK_GE:
                op = TOK_UGE;
                break;
            case TOK_GT:
                op = TOK_UGT; /* seta */
                break;
            }
            assert(!(vtop[-1].r & VT_LVAL));
            if ((ft & VT_BTYPE) == VT_DOUBLE)
                o(0x66);
            o(0x2e0f); /* ucomisd */
            if (fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
            vtop--;
            vtop->r = VT_CMP;
            vtop->c.i = op | 0x100;
        } else {
            assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
            /* no memory reference possible for long double operations */
            switch(op) {
            default:
            case '+':
                a = 0;
                break;
            case '-':
                a = 4;
                break;
            case '*':
                a = 1;
                break;
            case '/':
                a = 6;
                break;
            }
            assert((ft & VT_BTYPE) != VT_LDOUBLE);
            assert(!(vtop[-1].r & VT_LVAL));
            if ((ft & VT_BTYPE) == VT_DOUBLE) {
                o(0xf2);
            } else {
                o(0xf3);
            }
            o(0x0f);
            o(0x58 + a);
            if (fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
            vtop--;
        }
    }
}
/* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
   and 'long long' cases. */
void gen_cvt_itof(int t)
{
    int ft, bt, tbt, r;

    ft = vtop->type.t;
    bt = ft & VT_BTYPE;
    tbt = t & VT_BTYPE;
    r = gv(RC_INT);

    if (tbt == VT_LDOUBLE) {
        save_reg(TREG_ST0);
        if ((ft & VT_BTYPE) == VT_LLONG) {
            /* signed long long to float/double/long double (unsigned case
               is handled generically) */
            o(0x50 + REG_VALUE(r)); /* push r */
            o(0x242cdf);            /* fildll (%rsp) */
            o(0x08c48348);          /* add $8, %rsp */
        } else if ((ft & (VT_BTYPE | VT_UNSIGNED)) == (VT_INT | VT_UNSIGNED)) {
            /* unsigned int to float/double/long double */
            o(0x6a);                /* push $0 */
            g(0x00);
            o(0x50 + REG_VALUE(r)); /* push r */
            o(0x242cdf);            /* fildll (%rsp) */
            o(0x10c48348);          /* add $16, %rsp */
        } else {
            /* int to float/double/long double */
            o(0x50 + REG_VALUE(r)); /* push r */
            o(0x2404db);            /* fildl (%rsp) */
            o(0x08c48348);          /* add $8, %rsp */
        }
        vtop->r = TREG_ST0;
    } else {
        int r_xmm;
        r_xmm = get_reg(RC_FLOAT);
        o(0xf2 + (tbt == VT_FLOAT));
        if ((ft & (VT_BTYPE | VT_UNSIGNED)) == (VT_INT | VT_UNSIGNED) || bt == VT_LLONG) {
            o(0x48); /* REX */
        }
        o(0x2a0f);
        o(0xc0 + REG_VALUE(r) + REG_VALUE(r_xmm)*8); /* cvtsi2sd or cvtsi2ss */
        vtop->r = r_xmm;
    }
}
/* convert from one floating point type to another */
void gen_cvt_ftof(int t)
{
    int ft, bt, tbt, r;

    ft = vtop->type.t;
    bt = ft & VT_BTYPE;
    tbt = t & VT_BTYPE;

    if (bt == VT_LDOUBLE)
        r = get_reg(RC_FLOAT);
    else
        r = gv(RC_FLOAT);
    if (bt == VT_FLOAT) {
        if (tbt == VT_DOUBLE) {
            o(0x5a0f); /* cvtps2pd */
            o(0xc0 + REG_VALUE(r) + REG_VALUE(r) * 8);
        } else if (tbt == VT_LDOUBLE) {
            /* movss %xmm0-7,-0x10(%rsp) */
            o(0x110ff3);
            o(0xf02444 + REG_VALUE(r)*8);
            o(0xf02444d9); /* flds -0x10(%rsp) */
            vtop->r = TREG_ST0;
        }
    } else if (bt == VT_DOUBLE) {
        if (tbt == VT_FLOAT) {
            o(0x5a0f66); /* cvtpd2ps */
            o(0xc0 + REG_VALUE(r) + REG_VALUE(r) * 8);
        } else if (tbt == VT_LDOUBLE) {
            /* movsd %xmm0-7,-0x10(%rsp) */
            o(0x110ff2);
            o(0xf02444 + REG_VALUE(r)*8);
            o(0xf02444dd); /* fldl -0x10(%rsp) */
            vtop->r = TREG_ST0;
        }
    } else {
        gv(RC_ST0);
        if (tbt == VT_DOUBLE) {
            o(0xf0245cdd); /* fstpl -0x10(%rsp) */
            /* movsd -0x10(%rsp),%xmm0-7 */
            o(0x100ff2);
            o(0xf02444 + REG_VALUE(r)*8);
            vtop->r = r;
        } else if (tbt == VT_FLOAT) {
            o(0xf0245cd9); /* fstps -0x10(%rsp) */
            /* movss -0x10(%rsp),%xmm0-7 */
            o(0x100ff3);
            o(0xf02444 + REG_VALUE(r)*8);
            vtop->r = r;
        }
    }
}
/* convert fp to int 't' type */
void gen_cvt_ftoi(int t)
{
    int ft, bt, ll, r, r_xmm;

    ft = vtop->type.t;
    bt = ft & VT_BTYPE;

    if (bt == VT_LDOUBLE) {
        gen_cvt_ftof(VT_DOUBLE);
        bt = VT_DOUBLE;
    }
    r_xmm = gv(RC_FLOAT);
    if ((t & VT_BTYPE) == VT_INT)
        ll = 0;
    else
        ll = 1;
    r = get_reg(RC_INT);
    if (bt == VT_FLOAT) {
        o(0xf3);
    } else if (bt == VT_DOUBLE) {
        o(0xf2);
    } else {
        assert(0);
    }
    orex(ll, r, r_xmm, 0x2c0f); /* cvttss2si or cvttsd2si */
    o(0xc0 + REG_VALUE(r_xmm) + (REG_VALUE(r) << 3));
    vtop->r = r;
}
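
/* The 0xf3/0xf2 prefix above selects cvttss2si vs cvttsd2si (opcode
   0x0f 0x2c, emitted through orex() so that REX.W is added when the
   destination integer is 64-bit).  The truncating forms give C's
   round-toward-zero conversion semantics. */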
/* computed goto support */
void ggoto(void)
{
    gcall_or_jmp(1);
    vtop--;
}

/* Save the stack pointer onto the stack and return the location of its address */
ST_FUNC void gen_vla_sp_save(int addr) {
    /* mov %rsp,addr(%rbp)*/
    gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
}

/* Restore the SP from a location on the stack */
ST_FUNC void gen_vla_sp_restore(int addr) {
    gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr);
}

/* Subtract from the stack pointer, and push the resulting value onto the stack */
ST_FUNC void gen_vla_alloc(CType *type, int align) {
#ifdef TCC_TARGET_PE
    /* alloca does more than just adjust %rsp on Windows */
    vpush_global_sym(&func_old_type, TOK_alloca);
    vswap(); /* Move alloca ref past allocation size */
    gfunc_call(1);
    vset(type, REG_IRET, 0);
#else
    int r;
    r = gv(RC_INT); /* allocation size */
    /* sub r,%rsp */
    o(0x2b48);
    o(0xe0 | REG_VALUE(r));
    /* We align to 16 bytes rather than align */
    /* and ~15, %rsp */
    o(0xf0e48348);
    /* mov %rsp, r */
    o(0x8948);
    o(0xe0 | REG_VALUE(r));
    vpop();
    vset(type, r, 0);
#endif
}

/* end of x86-64 code generator */
/*************************************************************/
#endif /* ! TARGET_DEFS_ONLY */
/******************************************************/