/*
 *  x86-64 code generator for TCC
 *
 *  Copyright (c) 2008 Shinichiro Hamaji
 *
 *  Based on i386-gen.c by Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#ifdef TARGET_DEFS_ONLY

/* number of available registers */
#define NB_REGS     25
#define NB_ASM_REGS 8

/* a register can belong to several classes. The classes must be
   sorted from more general to more precise (see gv2() code which
   makes assumptions about it). */
#define RC_INT     0x0001 /* generic integer register */
#define RC_FLOAT   0x0002 /* generic float register */
#define RC_RAX     0x0004
#define RC_RCX     0x0008
#define RC_RDX     0x0010
#define RC_ST0     0x0020 /* only for long double */
#define RC_R8      0x0040
#define RC_R9      0x0080
#define RC_XMM0    0x0100
#define RC_XMM1    0x0200
#define RC_XMM2    0x0400
#define RC_XMM3    0x0800
#define RC_XMM4    0x1000
#define RC_XMM5    0x2000
#define RC_XMM6    0x4000
#define RC_XMM7    0x8000
#define RC_RSI     0x10000
#define RC_RDI     0x20000
#define RC_INT1    0x40000 /* function_pointer */
#define RC_INT2    0x80000
#define RC_RBX     0x100000
#define RC_R10     0x200000
#define RC_R11     0x400000
#define RC_R12     0x800000
#define RC_R13     0x1000000
#define RC_R14     0x2000000
#define RC_R15     0x4000000
#define RC_IRET    RC_RAX  /* function return: integer register */
#define RC_LRET    RC_RDX  /* function return: second integer register */
#define RC_FRET    RC_XMM0 /* function return: float register */
#define RC_QRET    RC_XMM1 /* function return: second float register */
#define RC_MASK    (RC_INT|RC_INT1|RC_INT2|RC_FLOAT)
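
/* Editor's note (an illustration, not in the original): get_reg(RC_INT)
   may return any register whose class below includes RC_INT, while a
   precise class pins a value to one register, e.g.:

       int r = get_reg(RC_INT);   // any general integer register
       int c = get_reg(RC_RCX);   // exactly %rcx (shift counts need %cl)

   Judging from reg_classes[] below, RC_INT2 appears to be the integer
   class minus %rdx (which div/idiv clobber), and %rsi/%rdi are only
   reachable through RC_INT2 or their own classes. */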

/* pretty names for the registers */
enum {
    TREG_RAX = 0,
    TREG_RCX = 1,
    TREG_RDX = 2,
    TREG_RSP = 4,
    TREG_ST0 = 5,
    TREG_RSI = 6,
    TREG_RDI = 7,

    TREG_R8  = 8,
    TREG_R9  = 9,
    TREG_R10 = 10,
    TREG_R11 = 11,

    TREG_XMM0 = 16,
    TREG_XMM1 = 17,
    TREG_XMM2 = 18,
    TREG_XMM3 = 19,
    TREG_XMM4 = 20,
    TREG_XMM5 = 21,
    TREG_XMM6 = 22,
    TREG_XMM7 = 23,

    TREG_MEM = 0x20,
};

#define REX_BASE(reg) (((reg) >> 3) & 1)
#define REG_VALUE(reg) ((reg) & 7)
#define FLAG_GOT 0x01

/* return registers for function */
#define REG_IRET TREG_RAX  /* single word int return register */
#define REG_LRET TREG_RDX  /* second word return register (for long long) */
#define REG_FRET TREG_XMM0 /* float return register */
#define REG_QRET TREG_XMM1 /* second float return register */

/* defined if function parameters must be evaluated in reverse order */
#define INVERT_FUNC_PARAMS

/* pointer size, in bytes */
#define PTR_SIZE 8

/* long double size and alignment, in bytes */
#define LDOUBLE_SIZE  16
#define LDOUBLE_ALIGN 16
/* maximum alignment (for aligned attribute support) */
#define MAX_ALIGN     16

/******************************************************/
/* ELF defines */

#define EM_TCC_TARGET EM_X86_64

/* relocation type for 32 bit data relocation */
#define R_DATA_32  R_X86_64_32
#define R_DATA_PTR R_X86_64_64
#define R_JMP_SLOT R_X86_64_JUMP_SLOT
#define R_COPY     R_X86_64_COPY

#define ELF_START_ADDR 0x400000
#define ELF_PAGE_SIZE  0x200000

/******************************************************/
#else /* ! TARGET_DEFS_ONLY */
/******************************************************/
#include "tcc.h"
#include <assert.h>

ST_DATA const int reg_classes[NB_REGS] = {
    /* eax */ RC_INT|RC_RAX|RC_INT2,
    /* ecx */ RC_INT|RC_RCX|RC_INT2,
    /* edx */ RC_INT|RC_RDX,
    RC_INT|RC_INT1|RC_INT2|RC_RBX,
    0, /* rsp: not allocatable */
    /* st0 */ RC_ST0,
    RC_RSI|RC_INT2,
    RC_RDI|RC_INT2,
    RC_INT|RC_R8|RC_INT2,
    RC_INT|RC_R9|RC_INT2,
    RC_INT|RC_INT1|RC_INT2|RC_R10,
    RC_INT|RC_INT1|RC_INT2|RC_R11,
    RC_INT|RC_INT1|RC_INT2|RC_R12,
    RC_INT|RC_INT1|RC_INT2|RC_R13,
    RC_INT|RC_INT1|RC_INT2|RC_R14,
    RC_INT|RC_INT1|RC_INT2|RC_R15,
    /* xmm0 */ RC_FLOAT | RC_XMM0,
    RC_FLOAT|RC_XMM1,
    RC_FLOAT|RC_XMM2,
    RC_FLOAT|RC_XMM3,
    RC_FLOAT|RC_XMM4,
    RC_FLOAT|RC_XMM5,
    RC_FLOAT|RC_XMM6,
    RC_FLOAT|RC_XMM7,
};

static unsigned long func_sub_sp_offset;
static int func_ret_sub;

/* XXX: make it faster ? */
void g(int c)
{
    int ind1;
    ind1 = ind + 1;
    if (ind1 > cur_text_section->data_allocated)
        section_realloc(cur_text_section, ind1);
    cur_text_section->data[ind] = c;
    ind = ind1;
}

void o(unsigned int c)
{
    while (c) {
        g(c);
        c = c >> 8;
    }
}

void gen_le16(int v)
{
    g(v);
    g(v >> 8);
}

void gen_le32(int c)
{
    g(c);
    g(c >> 8);
    g(c >> 16);
    g(c >> 24);
}

void gen_le64(int64_t c)
{
    g(c);
    g(c >> 8);
    g(c >> 16);
    g(c >> 24);
    g(c >> 32);
    g(c >> 40);
    g(c >> 48);
    g(c >> 56);
}

void orex(int ll, int r, int r2, int b)
{
    if ((r & VT_VALMASK) >= VT_CONST)
        r = 0;
    if ((r2 & VT_VALMASK) >= VT_CONST)
        r2 = 0;
    if (ll || REX_BASE(r) || REX_BASE(r2))
        o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
    o(b);
}
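
/* Editor's sketch (illustrative, not called anywhere): how orex() and a
   modrm byte combine. For "mov %rax, %r8" we need REX.W (64-bit) and
   REX.B (extended destination register):

       orex(1, TREG_R8, TREG_RAX, 0x89);
       // emits 0x40 | B | (R << 2) | (W << 3) = 0x49, then opcode 0x89
       o(0xc0 + REG_VALUE(TREG_R8) + REG_VALUE(TREG_RAX) * 8);
       // modrm 0xc0: "49 89 c0  mov %rax,%r8" */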

/* output a symbol and patch all calls to it */
void gsym_addr(int t, int a)
{
    int n, *ptr;
    while (t) {
        ptr = (int *)(cur_text_section->data + t);
        n = *ptr; /* next value */
        *ptr = a - t - 4;
        t = n;
    }
}
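
/* Editor's note (illustrative): forward jumps are emitted with a 4-byte
   placeholder that doubles as a "next" link, so all jumps to a not yet
   defined label form a chain through the code buffer. gsym_addr() walks
   the chain and rewrites each placeholder into a rel32 displacement:

       int t = gjmp(0);   // jmp to unknown target, placeholder = 0
       ...                // possibly more gjmp(t), extending the chain
       gsym(t);           // patch every linked jump to land at 'ind' */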

void gsym(int t)
{
    gsym_addr(t, ind);
}

/* psym is used to put an instruction with a data field which is a
   reference to a symbol. It is in fact the same as oad! */
#define psym oad

static int is64_type(int t)
{
    return ((t & VT_BTYPE) == VT_PTR ||
            (t & VT_BTYPE) == VT_FUNC ||
            (t & VT_BTYPE) == VT_LLONG);
}

/* instruction + 4 bytes data. Return the address of the data */
ST_FUNC int oad(int c, int s)
{
    int ind1;

    o(c);
    ind1 = ind + 4;
    if (ind1 > cur_text_section->data_allocated)
        section_realloc(cur_text_section, ind1);
    *(int *)(cur_text_section->data + ind) = s;
    s = ind;
    ind = ind1;
    return s;
}

ST_FUNC void gen_addr32(int r, Sym *sym, int c)
{
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_X86_64_32);
    gen_le32(c);
}

/* output constant with relocation if 'r & VT_SYM' is true */
ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
{
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_X86_64_64);
    gen_le64(c);
}

/* output constant with relocation if 'r & VT_SYM' is true */
ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
{
    if (r & VT_SYM)
        greloc(cur_text_section, sym, ind, R_X86_64_PC32);
    gen_le32(c-4);
}

/* output got address with relocation */
static void gen_gotpcrel(int r, Sym *sym, int c)
{
#ifndef TCC_TARGET_PE
    Section *sr;
    ElfW(Rela) *rel;
    greloc(cur_text_section, sym, ind, R_X86_64_GOTPCREL);
    sr = cur_text_section->reloc;
    rel = (ElfW(Rela) *)(sr->data + sr->data_offset - sizeof(ElfW(Rela)));
    rel->r_addend = -4;
#else
    printf("picpic: %s %x %x | %02x %02x %02x\n", get_tok_str(sym->v, NULL), c, r,
        cur_text_section->data[ind-3],
        cur_text_section->data[ind-2],
        cur_text_section->data[ind-1]
        );
    greloc(cur_text_section, sym, ind, R_X86_64_PC32);
#endif
    gen_le32(0);
    if (c) {
        /* we use add c, %xxx for displacement */
        orex(1, r, 0, 0x81);
        o(0xc0 + REG_VALUE(r));
        gen_le32(c);
    }
}

static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
{
    op_reg = REG_VALUE(op_reg) << 3;
    if ((r & VT_VALMASK) == VT_CONST) {
        /* constant memory reference */
        o(0x05 | op_reg);
        if (is_got) {
            gen_gotpcrel(r, sym, c);
        } else {
            gen_addrpc32(r, sym, c);
        }
    } else if ((r & VT_VALMASK) == VT_LOCAL) {
        /* currently, we use only ebp as base */
        if (c == (char)c) {
            /* short reference */
            o(0x45 | op_reg);
            g(c);
        } else {
            oad(0x85 | op_reg, c);
        }
    } else if (r & TREG_MEM) {
        if (c) {
            g(0x80 | op_reg | REG_VALUE(r));
            gen_le32(c);
        } else {
            g(0x00 | op_reg | REG_VALUE(r));
        }
    } else {
        g(0x00 | op_reg | REG_VALUE(r));
    }
}

/* generate a modrm reference. 'op_reg' contains the additional 3
   opcode bits */
static void gen_modrm(int op_reg, int r, Sym *sym, int c)
{
    gen_modrm_impl(op_reg, r, sym, c, 0);
}

/* generate a modrm reference. 'op_reg' contains the additional 3
   opcode bits */
static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
{
    int is_got;
    is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
    orex(1, r, op_reg, opcode);
    gen_modrm_impl(op_reg, r, sym, c, is_got);
}
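
/* Editor's sketch (illustrative): loading the 64-bit local at -8(%rbp)
   into %rcx with the helpers above:

       gen_modrm64(0x8b, TREG_RCX, VT_LOCAL, NULL, -8);
       // orex: REX.W -> 0x48, opcode 0x8b, then gen_modrm_impl takes
       // the VT_LOCAL branch: modrm 0x4d (disp8 off %rbp), disp 0xf8,
       // i.e. "48 8b 4d f8  mov -0x8(%rbp),%rcx" */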

/* load 'r' from value 'sv' */
void load(int r, SValue *sv)
{
    int v, t, ft, fc, fr;
    SValue v1;

#ifdef TCC_TARGET_PE
    SValue v2;
    sv = pe_getimport(sv, &v2);
#endif

    fr = sv->r;
    ft = sv->type.t & ~VT_DEFSIGN;
    fc = sv->c.ul;

#ifndef TCC_TARGET_PE
    /* we use indirect access via got */
    if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
        (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
        /* use the result register as a temporary register */
        int tr = r | TREG_MEM;
        if (is_float(ft)) {
            /* we cannot use float registers as a temporary register */
            tr = get_reg(RC_INT) | TREG_MEM;
        }
        gen_modrm64(0x8b, tr, fr, sv->sym, 0);

        /* load from the temporary register */
        fr = tr | VT_LVAL;
    }
#endif

    v = fr & VT_VALMASK;
    if (fr & VT_LVAL) {
        int b, ll;
        if (v == VT_LLOCAL) {
            v1.type.t = VT_PTR;
            v1.r = VT_LOCAL | VT_LVAL;
            v1.c.ul = fc;
            fr = r;
            if (!(reg_classes[fr] & RC_INT))
                fr = get_reg(RC_INT);
            load(fr, &v1);
        }
        ll = 0;
        if ((ft & VT_BTYPE) == VT_FLOAT) {
            b = 0x6e0f66;
            r = REG_VALUE(r); /* movd */
        } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
            b = 0x7e0ff3; /* movq */
            r = REG_VALUE(r);
        } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
            b = 0xdb, r = 5; /* fldt */
        } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
            b = 0xbe0f;   /* movsbl */
        } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
            b = 0xb60f;   /* movzbl */
        } else if ((ft & VT_TYPE) == VT_SHORT) {
            b = 0xbf0f;   /* movswl */
        } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
            b = 0xb70f;   /* movzwl */
        } else {
            assert(((ft & VT_BTYPE) == VT_INT) || ((ft & VT_BTYPE) == VT_LLONG)
                   || ((ft & VT_BTYPE) == VT_PTR) || ((ft & VT_BTYPE) == VT_ENUM)
                   || ((ft & VT_BTYPE) == VT_FUNC));
            ll = is64_type(ft);
            b = 0x8b;
        }
        if (ll) {
            gen_modrm64(b, r, fr, sv->sym, fc);
        } else {
            orex(ll, fr, r, b);
            gen_modrm(r, fr, sv->sym, fc);
        }
    } else {
        if (v == VT_CONST) {
            if (fr & VT_SYM) {
#ifdef TCC_TARGET_PE
                orex(1,0,r,0x8d);
                o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
                gen_addrpc32(fr, sv->sym, fc);
#else
                if (sv->sym->type.t & VT_STATIC) {
                    orex(1,0,r,0x8d);
                    o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
                    gen_addrpc32(fr, sv->sym, fc);
                } else {
                    orex(1,0,r,0x8b);
                    o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
                    gen_gotpcrel(r, sv->sym, fc);
                }
#endif
            } else if (is64_type(ft)) {
                orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
                gen_le64(sv->c.ull);
            } else {
                orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
                gen_le32(fc);
            }
        } else if (v == VT_LOCAL) {
            orex(1,0,r,0x8d); /* lea xxx(%rbp), r */
            gen_modrm(r, VT_LOCAL, sv->sym, fc);
        } else if (v == VT_CMP) {
            orex(0,r,0,0);
            if ((fc & ~0x100) != TOK_NE)
                oad(0xb8 + REG_VALUE(r), 0); /* mov $0, r */
            else
                oad(0xb8 + REG_VALUE(r), 1); /* mov $1, r */
            if (fc & 0x100)
            {
                /* This was a float compare. If the parity bit is
                   set the result was unordered, meaning false for everything
                   except TOK_NE, and true for TOK_NE. */
                fc &= ~0x100;
                o(0x037a + (REX_BASE(r) << 8));
            }
            orex(0,r,0, 0x0f); /* setxx %br */
            o(fc);
            o(0xc0 + REG_VALUE(r));
        } else if (v == VT_JMP || v == VT_JMPI) {
            t = v & 1;
            orex(0,r,0,0);
            oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
            o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
            gsym(fc);
            orex(0,r,0,0);
            oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
        } else if (v != r) {
            if ((r >= TREG_XMM0) && (r <= TREG_XMM7)) {
                if (v == TREG_ST0) {
                    /* gen_cvt_ftof(VT_DOUBLE); */
                    o(0xf0245cdd); /* fstpl -0x10(%rsp) */
                    /* movsd -0x10(%rsp),%xmmN */
                    o(0x100ff2);
                    o(0x44 + REG_VALUE(r)*8); /* %xmmN */
                    o(0xf024);
                } else {
                    assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
                    if ((ft & VT_BTYPE) == VT_FLOAT) {
                        o(0x100ff3);
                    } else {
                        assert((ft & VT_BTYPE) == VT_DOUBLE);
                        o(0x100ff2);
                    }
                    o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
                }
            } else if (r == TREG_ST0) {
                assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
                /* gen_cvt_ftof(VT_LDOUBLE); */
                /* movsd %xmmN,-0x10(%rsp) */
                o(0x110ff2);
                o(0x44 + REG_VALUE(r)*8); /* %xmmN */
                o(0xf024);
                o(0xf02444dd); /* fldl -0x10(%rsp) */
            } else {
                orex(1,r,v, 0x89);
                o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
            }
        }
    }
}

/* store register 'r' in lvalue 'v' */
void store(int r, SValue *v)
{
    int fr, bt, ft, fc;
    int op64 = 0;
    /* store the REX prefix in this variable when PIC is enabled */
    int pic = 0;

#ifdef TCC_TARGET_PE
    SValue v2;
    v = pe_getimport(v, &v2);
#endif

    ft = v->type.t;
    fc = v->c.ul;
    fr = v->r & VT_VALMASK;
    bt = ft & VT_BTYPE;

#ifndef TCC_TARGET_PE
    /* we need to access the variable via got */
    if (fr == VT_CONST && (v->r & VT_SYM)) {
        /* mov xx(%rip), %r11 */
        o(0x1d8b4c);
        gen_gotpcrel(TREG_R11, v->sym, v->c.ul);
        pic = is64_type(bt) ? 0x49 : 0x41;
    }
#endif

    /* XXX: incorrect if float reg to reg */
    if (bt == VT_FLOAT) {
        o(0x66);
        o(pic);
        o(0x7e0f); /* movd */
        r = REG_VALUE(r);
    } else if (bt == VT_DOUBLE) {
        o(0x66);
        o(pic);
        o(0xd60f); /* movq */
        r = REG_VALUE(r);
    } else if (bt == VT_LDOUBLE) {
        o(0xc0d9); /* fld %st(0) */
        o(pic);
        o(0xdb); /* fstpt */
        r = 7;
    } else {
        if (bt == VT_SHORT)
            o(0x66);
        o(pic);
        if (bt == VT_BYTE || bt == VT_BOOL)
            orex(0, 0, r, 0x88);
        else if (is64_type(bt))
            op64 = 0x89;
        else
            orex(0, 0, r, 0x89);
    }
    if (pic) {
        /* xxx r, (%r11) where xxx is mov, movq, fld, etc. */
        if (op64)
            o(op64);
        o(3 + (r << 3));
    } else if (op64) {
        if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
            gen_modrm64(op64, r, v->r, v->sym, fc);
        } else if (fr != r) {
            /* XXX: can we ever get here? */
            abort();
            o(0xc0 + fr + r * 8); /* mov r, fr */
        }
    } else {
        if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
            gen_modrm(r, v->r, v->sym, fc);
        } else if (fr != r) {
            /* XXX: can we ever get here? */
            abort();
            o(0xc0 + fr + r * 8); /* mov r, fr */
        }
    }
}

/* 'is_jmp' is '1' if it is a jump */
static void gcall_or_jmp(int is_jmp)
{
    int r;
    if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
        /* constant case */
        if (vtop->r & VT_SYM) {
            /* relocation case */
            greloc(cur_text_section, vtop->sym,
                   ind + 1, R_X86_64_PLT32);
        } else {
            /* put an empty PC32 relocation */
            put_elf_reloc(symtab_section, cur_text_section,
                          ind + 1, R_X86_64_PC32, 0);
        }
        oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */
    } else {
        /* otherwise, indirect call */
        r = TREG_R11;
        load(r, vtop);
        o(0x41); /* REX */
        o(0xff); /* call/jmp *r */
        o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
    }
}
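
/* Editor's note (illustrative): a call to a known symbol becomes
   "e8 xx xx xx xx" with an R_X86_64_PLT32 relocation on the rel32
   field; a computed target is loaded into %r11 first and becomes
   "41 ff d3  callq *%r11" (is_jmp=1 turns both forms into jumps). */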

#ifdef TCC_TARGET_PE

#define REGN 4
static const uint8_t arg_regs[REGN] = {
    TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
};

/* Prepare arguments in R10 and R11 rather than RCX and RDX
   because gv() will not ever use these */
static int arg_prepare_reg(int idx) {
    if (idx == 0 || idx == 1)
        /* idx=0: r10, idx=1: r11 */
        return idx + 10;
    else
        return arg_regs[idx];
}

static int func_scratch;

/* Generate function call. The function address is pushed first, then
   all the parameters in call order. This function pops all the
   parameters and the function address. */

void gen_offs_sp(int b, int r, int d)
{
    orex(1,0,r & 0x100 ? 0 : r, b);
    if (d == (char)d) {
        o(0x2444 | (REG_VALUE(r) << 3));
        g(d);
    } else {
        o(0x2484 | (REG_VALUE(r) << 3));
        gen_le32(d);
    }
}

/* Return the number of registers needed to return the struct, or 0 if
   returning via struct pointer. */
ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align)
{
    int size, align;
    *ret_align = 1; // Never have to re-align return values for x86-64
    size = type_size(vt, &align);
    ret->ref = NULL;
    if (size > 8) {
        return 0;
    } else if (size > 4) {
        ret->t = VT_LLONG;
        return 1;
    } else if (size > 2) {
        ret->t = VT_INT;
        return 1;
    } else if (size > 1) {
        ret->t = VT_SHORT;
        return 1;
    } else {
        ret->t = VT_BYTE;
        return 1;
    }
}

static int is_sse_float(int t) {
    int bt;
    bt = t & VT_BTYPE;
    return bt == VT_DOUBLE || bt == VT_FLOAT;
}

int gfunc_arg_size(CType *type) {
    int align;
    if (type->t & (VT_ARRAY|VT_BITFIELD))
        return 8;
    return type_size(type, &align);
}

void gfunc_call(int nb_args)
{
    int size, r, args_size, i, d, bt, struct_size;
    int arg;

    args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
    arg = nb_args;

    /* for struct arguments, we need to call memcpy and the function
       call breaks register passing arguments we are preparing.
       So, we process arguments which will be passed by stack first. */
    struct_size = args_size;
    for(i = 0; i < nb_args; i++) {
        SValue *sv;

        --arg;
        sv = &vtop[-i];
        bt = (sv->type.t & VT_BTYPE);
        size = gfunc_arg_size(&sv->type);

        if (size <= 8)
            continue; /* arguments smaller than 8 bytes passed in registers or on stack */

        if (bt == VT_STRUCT) {
            /* align to stack align size */
            size = (size + 15) & ~15;
            /* generate structure store */
            r = get_reg(RC_INT);
            gen_offs_sp(0x8d, r, struct_size);
            struct_size += size;

            /* generate memcpy call */
            vset(&sv->type, r | VT_LVAL, 0);
            vpushv(sv);
            vstore();
            --vtop;
        } else if (bt == VT_LDOUBLE) {
            gv(RC_ST0);
            gen_offs_sp(0xdb, 0x107, struct_size);
            struct_size += 16;
        }
    }

    if (func_scratch < struct_size)
        func_scratch = struct_size;

    arg = nb_args;
    struct_size = args_size;

    for(i = 0; i < nb_args; i++) {
        --arg;
        bt = (vtop->type.t & VT_BTYPE);

        size = gfunc_arg_size(&vtop->type);
        if (size > 8) {
            /* align to stack align size */
            size = (size + 15) & ~15;
            if (arg >= REGN) {
                d = get_reg(RC_INT);
                gen_offs_sp(0x8d, d, struct_size);
                gen_offs_sp(0x89, d, arg*8);
            } else {
                d = arg_prepare_reg(arg);
                gen_offs_sp(0x8d, d, struct_size);
            }
            struct_size += size;
        } else {
            if (is_sse_float(vtop->type.t)) {
                gv(RC_XMM0); /* only use one float register */
                if (arg >= REGN) {
                    /* movq %xmm0, arg*8(%rsp) */
                    gen_offs_sp(0xd60f66, 0x100, arg*8);
                } else {
                    /* movaps %xmm0, %xmmN */
                    o(0x280f);
                    o(0xc0 + (arg << 3));
                    d = arg_prepare_reg(arg);
                    /* mov %xmm0, %rxx */
                    o(0x66);
                    orex(1,d,0, 0x7e0f);
                    o(0xc0 + REG_VALUE(d));
                }
            } else {
                if (bt == VT_STRUCT) {
                    vtop->type.ref = NULL;
                    vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
                        : size > 1 ? VT_SHORT : VT_BYTE;
                }

                r = gv(RC_INT);
                if (arg >= REGN) {
                    gen_offs_sp(0x89, r, arg*8);
                } else {
                    d = arg_prepare_reg(arg);
                    orex(1,d,r,0x89); /* mov */
                    o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
                }
            }
        }
        vtop--;
    }
    save_regs(0);

    /* Copy R10 and R11 into RCX and RDX, respectively */
    if (nb_args > 0) {
        o(0xd1894c); /* mov %r10, %rcx */
        if (nb_args > 1) {
            o(0xda894c); /* mov %r11, %rdx */
        }
    }

    gcall_or_jmp(0);
    vtop--;
}

#define FUNC_PROLOG_SIZE 11

/* generate function prolog of type 't' */
void gfunc_prolog(CType *func_type)
{
    int addr, reg_param_index, bt, size;
    Sym *sym;
    CType *type;

    func_ret_sub = 0;
    func_scratch = 0;
    loc = 0;

    addr = PTR_SIZE * 2;
    ind += FUNC_PROLOG_SIZE;
    func_sub_sp_offset = ind;
    reg_param_index = 0;

    sym = func_type->ref;

    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->type;
    func_var = (sym->c == FUNC_ELLIPSIS);
    size = gfunc_arg_size(&func_vt);
    if (size > 8) {
        gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
        func_vc = addr;
        reg_param_index++;
        addr += 8;
    }

    /* define parameters */
    while ((sym = sym->next) != NULL) {
        type = &sym->type;
        bt = type->t & VT_BTYPE;
        size = gfunc_arg_size(type);
        if (size > 8) {
            if (reg_param_index < REGN) {
                gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
            }
            sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL | VT_REF, addr);
        } else {
            if (reg_param_index < REGN) {
                /* save arguments passed by register */
                if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
                    o(0xd60f66); /* movq */
                    gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
                } else {
                    gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
                }
            }
            sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL, addr);
        }
        addr += 8;
        reg_param_index++;
    }

    while (reg_param_index < REGN) {
        if (func_type->ref->c == FUNC_ELLIPSIS) {
            gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
            addr += 8;
        }
        reg_param_index++;
    }
}

/* generate function epilog */
void gfunc_epilog(void)
{
    int v, saved_ind;

    o(0xc9); /* leave */
    if (func_ret_sub == 0) {
        o(0xc3); /* ret */
    } else {
        o(0xc2); /* ret n */
        g(func_ret_sub);
        g(func_ret_sub >> 8);
    }

    saved_ind = ind;
    ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
    /* align local size to word & save local variables */
    v = (func_scratch + -loc + 15) & -16;

    if (v >= 4096) {
        Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0);
        oad(0xb8, v); /* mov stacksize, %eax */
        oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
        greloc(cur_text_section, sym, ind-4, R_X86_64_PC32);
        o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
    } else {
        o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
        o(0xec8148);    /* sub rsp, stacksize */
        gen_le32(v);
    }

    cur_text_section->data_offset = saved_ind;
    pe_add_unwind_data(ind, saved_ind, v);
    ind = cur_text_section->data_offset;
}

#else

static void gadd_sp(int val)
{
    if (val == (char)val) {
        o(0xc48348);
        g(val);
    } else {
        oad(0xc48148, val); /* add $xxx, %rsp */
    }
}

typedef enum X86_64_Mode {
    x86_64_mode_none,
    x86_64_mode_memory,
    x86_64_mode_integer,
    x86_64_mode_sse,
    x86_64_mode_x87
} X86_64_Mode;

static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b)
{
    if (a == b)
        return a;
    else if (a == x86_64_mode_none)
        return b;
    else if (b == x86_64_mode_none)
        return a;
    else if ((a == x86_64_mode_memory) || (b == x86_64_mode_memory))
        return x86_64_mode_memory;
    else if ((a == x86_64_mode_integer) || (b == x86_64_mode_integer))
        return x86_64_mode_integer;
    else if ((a == x86_64_mode_x87) || (b == x86_64_mode_x87))
        return x86_64_mode_memory;
    else
        return x86_64_mode_sse;
}
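
/* Editor's note (illustrative): the merge is ordered from most to least
   restrictive, for example:

       merge(integer, sse) -> integer    struct { int a; float b; }
       merge(sse, sse)     -> sse        struct { float a, b; }
       merge(x87, sse)     -> memory     long double mixed with sse

   so a single integer field is enough to classify a small struct as
   integer and pass it in general-purpose registers. */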

static X86_64_Mode classify_x86_64_inner(CType *ty)
{
    X86_64_Mode mode;
    Sym *f;

    switch (ty->t & VT_BTYPE) {
    case VT_VOID: return x86_64_mode_none;

    case VT_INT:
    case VT_BYTE:
    case VT_SHORT:
    case VT_LLONG:
    case VT_BOOL:
    case VT_PTR:
    case VT_FUNC:
    case VT_ENUM: return x86_64_mode_integer;

    case VT_FLOAT:
    case VT_DOUBLE: return x86_64_mode_sse;

    case VT_LDOUBLE: return x86_64_mode_x87;

    case VT_STRUCT:
        f = ty->ref;

        // Detect union
        if (f->next && (f->c == f->next->c))
            return x86_64_mode_memory;

        mode = x86_64_mode_none;
        for (f = f->next; f; f = f->next)
            mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));

        return mode;
    }

    assert(0);
}

static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
{
    X86_64_Mode mode;
    int size, align, ret_t = 0;

    if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
        *psize = 8;
        *palign = 8;
        *reg_count = 1;
        ret_t = ty->t;
        mode = x86_64_mode_integer;
    } else {
        size = type_size(ty, &align);
        *psize = (size + 7) & ~7;
        *palign = (align + 7) & ~7;

        if (size > 16) {
            mode = x86_64_mode_memory;
        } else {
            mode = classify_x86_64_inner(ty);
            switch (mode) {
            case x86_64_mode_integer:
                if (size > 8) {
                    *reg_count = 2;
                    ret_t = VT_QLONG;
                } else {
                    *reg_count = 1;
                    ret_t = (size > 4) ? VT_LLONG : VT_INT;
                }
                break;

            case x86_64_mode_x87:
                *reg_count = 1;
                ret_t = VT_LDOUBLE;
                break;

            case x86_64_mode_sse:
                if (size > 8) {
                    *reg_count = 2;
                    ret_t = VT_QFLOAT;
                } else {
                    *reg_count = 1;
                    ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
                }
                break;
            default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none */
            }
        }
    }

    if (ret) {
        ret->ref = NULL;
        ret->t = ret_t;
    }

    return mode;
}
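
/* Editor's note (illustrative): whole-argument classification:

       struct { int a, b; }      -> integer, reg_count 1, as VT_LLONG
       struct { long a, b; }     -> integer, reg_count 2, as VT_QLONG
       struct { float a, b, c; } -> sse,     reg_count 2, as VT_QFLOAT
       struct { char buf[24]; }  -> memory (size > 16), on the stack

   Unlike the full SysV algorithm, which classifies each eightbyte
   separately, this classifies the aggregate as a whole, so a mixed
   integer/sse struct ends up entirely in general-purpose registers. */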

ST_FUNC int classify_x86_64_va_arg(CType *ty)
{
    /* This definition must be synced with stdarg.h */
    enum __va_arg_type {
        __va_gen_reg, __va_float_reg, __va_stack
    };
    int size, align, reg_count;
    X86_64_Mode mode = classify_x86_64_arg(ty, NULL, &size, &align, &reg_count);
    switch (mode) {
    default: return __va_stack;
    case x86_64_mode_integer: return __va_gen_reg;
    case x86_64_mode_sse: return __va_float_reg;
    }
}

/* Return the number of registers needed to return the struct, or 0 if
   returning via struct pointer. */
ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align)
{
    int size, align, reg_count;
    *ret_align = 1; // Never have to re-align return values for x86-64
    return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
}

#define REGN 6
static const uint8_t arg_regs[REGN] = {
    TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
};

static int arg_prepare_reg(int idx) {
    if (idx == 2 || idx == 3)
        /* idx=2: r10, idx=3: r11 */
        return idx + 8;
    else
        return arg_regs[idx];
}

/* Generate function call. The function address is pushed first, then
   all the parameters in call order. This function pops all the
   parameters and the function address. */
void gfunc_call(int nb_args)
{
    X86_64_Mode mode;
    CType type;
    int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
    int nb_reg_args = 0;
    int nb_sse_args = 0;
    int sse_reg, gen_reg;

    /* calculate the number of integer/float register arguments */
    for(i = 0; i < nb_args; i++) {
        mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
        if (mode == x86_64_mode_sse)
            nb_sse_args += reg_count;
        else if (mode == x86_64_mode_integer)
            nb_reg_args += reg_count;
    }

    /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
       and ended by a 16-byte aligned argument. This is because, from the point of view of
       the callee, argument alignment is computed from the bottom up. */
    /* for struct arguments, we need to call memcpy and the function
       call breaks register passing arguments we are preparing.
       So, we process arguments which will be passed by stack first. */
    gen_reg = nb_reg_args;
    sse_reg = nb_sse_args;
    run_start = 0;
    args_size = 0;
    while (run_start != nb_args) {
        int run_gen_reg = gen_reg, run_sse_reg = sse_reg;

        run_end = nb_args;
        stack_adjust = 0;
        for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
            switch (mode) {
            case x86_64_mode_memory:
            case x86_64_mode_x87:
            stack_arg:
                if (align == 16)
                    run_end = i;
                else
                    stack_adjust += size;
                break;

            case x86_64_mode_sse:
                sse_reg -= reg_count;
                if (sse_reg + reg_count > 8) goto stack_arg;
                break;

            case x86_64_mode_integer:
                gen_reg -= reg_count;
                if (gen_reg + reg_count > REGN) goto stack_arg;
                break;
            default: break; /* nothing to be done for x86_64_mode_none */
            }
        }

        gen_reg = run_gen_reg;
        sse_reg = run_sse_reg;

        /* adjust stack to align SSE boundary */
        if (stack_adjust &= 15) {
            /* fetch cpu flag before the following sub will change the value */
            if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
                gv(RC_INT);

            stack_adjust = 16 - stack_adjust;
            o(0x48);
            oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
            args_size += stack_adjust;
        }

        for(i = run_start; i < run_end;) {
            /* Swap argument to top, it will possibly be changed here,
               and might use more temps. At the end of the loop we keep
               it on the stack and swap it back to its original position
               if it is a register. */
            SValue tmp = vtop[0];
            vtop[0] = vtop[-i];
            vtop[-i] = tmp;

            mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);

            int arg_stored = 1;
            switch (vtop->type.t & VT_BTYPE) {
            case VT_STRUCT:
                if (mode == x86_64_mode_sse) {
                    if (sse_reg > 8)
                        sse_reg -= reg_count;
                    else
                        arg_stored = 0;
                } else if (mode == x86_64_mode_integer) {
                    if (gen_reg > REGN)
                        gen_reg -= reg_count;
                    else
                        arg_stored = 0;
                }

                if (arg_stored) {
                    /* allocate the necessary size on stack */
                    o(0x48);
                    oad(0xec81, size); /* sub $xxx, %rsp */
                    /* generate structure store */
                    r = get_reg(RC_INT);
                    orex(1, r, 0, 0x89); /* mov %rsp, r */
                    o(0xe0 + REG_VALUE(r));
                    vset(&vtop->type, r | VT_LVAL, 0);
                    vswap();
                    vstore();
                    args_size += size;
                }
                break;

            case VT_LDOUBLE:
                assert(0);
                break;

            case VT_FLOAT:
            case VT_DOUBLE:
                assert(mode == x86_64_mode_sse);
                if (sse_reg > 8) {
                    --sse_reg;
                    r = gv(RC_FLOAT);
                    o(0x50); /* push %rax */
                    /* movq %xmmN, (%rsp) */
                    o(0xd60f66);
                    o(0x04 + REG_VALUE(r)*8);
                    o(0x24);
                    args_size += size;
                } else {
                    arg_stored = 0;
                }
                break;

            default:
                assert(mode == x86_64_mode_integer);
                /* simple type */
                /* XXX: implicit cast ? */
                if (gen_reg > REGN) {
                    --gen_reg;
                    r = gv(RC_INT);
                    orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
                    args_size += size;
                } else {
                    arg_stored = 0;
                }
                break;
            }

            /* And swap the argument back to its original position. */
            tmp = vtop[0];
            vtop[0] = vtop[-i];
            vtop[-i] = tmp;

            if (arg_stored) {
                vrotb(i+1);
                assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
                vpop();
                --nb_args;
                --run_end;
            } else {
                ++i;
            }
        }

        /* handle 16 byte aligned arguments at end of run */
        run_start = i = run_end;
        while (i < nb_args) {
            /* Rotate argument to top since it will always be popped */
            mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
            if (align != 16)
                break;

            vrotb(i+1);

            if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
                gv(RC_ST0);
                oad(0xec8148, size); /* sub $xxx, %rsp */
                o(0x7cdb); /* fstpt 0(%rsp) */
                g(0x24);
                g(0x00);
                args_size += size;
            } else {
                //assert(mode == x86_64_mode_memory);

                /* allocate the necessary size on stack */
                o(0x48);
                oad(0xec81, size); /* sub $xxx, %rsp */
                /* generate structure store */
                r = get_reg(RC_INT);
                orex(1, r, 0, 0x89); /* mov %rsp, r */
                o(0xe0 + REG_VALUE(r));
                vset(&vtop->type, r | VT_LVAL, 0);
                vswap();
                vstore();
                args_size += size;
            }

            vpop();
            --nb_args;
        }
    }

    /* XXX This should be superfluous. */
    save_regs(0); /* save used temporary registers */

    /* then, we prepare register passing arguments.
       Note that we cannot set RDX and RCX in this loop because gv()
       may break these temporary registers. Let's use R10 and R11
       instead of them */
    assert(gen_reg <= REGN);
    assert(sse_reg <= 8);
    for(i = 0; i < nb_args; i++) {
        mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
        /* Alter stack entry type so that gv() knows how to treat it */
        vtop->type = type;
        if (mode == x86_64_mode_sse) {
            if (reg_count == 2) {
                sse_reg -= 2;
                gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
                if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
                    /* movaps %xmm0, %xmmN */
                    o(0x280f);
                    o(0xc0 + (sse_reg << 3));
                    /* movaps %xmm1, %xmmN */
                    o(0x280f);
                    o(0xc1 + ((sse_reg+1) << 3));
                }
            } else {
                assert(reg_count == 1);
                --sse_reg;
                /* Load directly to register */
                gv(RC_XMM0 << sse_reg);
            }
        } else if (mode == x86_64_mode_integer) {
            /* simple type */
            /* XXX: implicit cast ? */
            gen_reg -= reg_count;
            r = gv(RC_INT);
            int d = arg_prepare_reg(gen_reg);
            orex(1,d,r,0x89); /* mov */
            o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
            if (reg_count == 2) {
                d = arg_prepare_reg(gen_reg+1);
                orex(1,d,vtop->r2,0x89); /* mov */
                o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
            }
        }
        vtop--;
    }
    assert(gen_reg == 0);
    assert(sse_reg == 0);

    /* We shouldn't have many operands on the stack anymore, but the
       call address itself is still there, and it might be in %eax
       (or edx/ecx) currently, which the below writes would clobber.
       So evict all remaining operands here. */
    save_regs(0);

    /* Copy R10 and R11 into RDX and RCX, respectively */
    if (nb_reg_args > 2) {
        o(0xd2894c); /* mov %r10, %rdx */
        if (nb_reg_args > 3) {
            o(0xd9894c); /* mov %r11, %rcx */
        }
    }

    oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
    gcall_or_jmp(0);
    if (args_size)
        gadd_sp(args_size);
    vtop--;
}

#define FUNC_PROLOG_SIZE 11

static void push_arg_reg(int i) {
    loc -= 8;
    gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
}

/* generate function prolog of type 't' */
void gfunc_prolog(CType *func_type)
{
    X86_64_Mode mode;
    int i, addr, align, size, reg_count;
    int param_addr = 0, reg_param_index, sse_param_index;
    Sym *sym;
    CType *type;

    sym = func_type->ref;
    addr = PTR_SIZE * 2;
    loc = 0;
    ind += FUNC_PROLOG_SIZE;
    func_sub_sp_offset = ind;
    func_ret_sub = 0;

    if (func_type->ref->c == FUNC_ELLIPSIS) {
        int seen_reg_num, seen_sse_num, seen_stack_size;
        seen_reg_num = seen_sse_num = 0;
        /* frame pointer and return address */
        seen_stack_size = PTR_SIZE * 2;
        /* count the number of seen parameters */
        sym = func_type->ref;
        while ((sym = sym->next) != NULL) {
            type = &sym->type;
            mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
            switch (mode) {
            default:
            stack_arg:
                seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
                break;

            case x86_64_mode_integer:
                if (seen_reg_num + reg_count <= 8) {
                    seen_reg_num += reg_count;
                } else {
                    seen_reg_num = 8;
                    goto stack_arg;
                }
                break;

            case x86_64_mode_sse:
                if (seen_sse_num + reg_count <= 8) {
                    seen_sse_num += reg_count;
                } else {
                    seen_sse_num = 8;
                    goto stack_arg;
                }
                break;
            }
        }

        loc -= 16;
        /* movl $0x????????, -0x10(%rbp) */
        o(0xf045c7);
        gen_le32(seen_reg_num * 8);
        /* movl $0x????????, -0xc(%rbp) */
        o(0xf445c7);
        gen_le32(seen_sse_num * 16 + 48);
        /* movl $0x????????, -0x8(%rbp) */
        o(0xf845c7);
        gen_le32(seen_stack_size);

        /* save all register passing arguments */
        for (i = 0; i < 8; i++) {
            loc -= 16;
            o(0xd60f66); /* movq */
            gen_modrm(7 - i, VT_LOCAL, NULL, loc);
            /* movq $0, loc+8(%rbp) */
            o(0x85c748);
            gen_le32(loc + 8);
            gen_le32(0);
        }
        for (i = 0; i < REGN; i++) {
            push_arg_reg(REGN-1-i);
        }
    }

    sym = func_type->ref;
    reg_param_index = 0;
    sse_param_index = 0;

    /* if the function returns a structure, then add an
       implicit pointer parameter */
    func_vt = sym->type;
    mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
    if (mode == x86_64_mode_memory) {
        push_arg_reg(reg_param_index);
        func_vc = loc;
        reg_param_index++;
    }
    /* define parameters */
    while ((sym = sym->next) != NULL) {
        type = &sym->type;
        mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
        switch (mode) {
        case x86_64_mode_sse:
            if (sse_param_index + reg_count <= 8) {
                /* save arguments passed by register */
                loc -= reg_count * 8;
                param_addr = loc;
                for (i = 0; i < reg_count; ++i) {
                    o(0xd60f66); /* movq */
                    gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
                    ++sse_param_index;
                }
            } else {
                addr = (addr + align - 1) & -align;
                param_addr = addr;
                addr += size;
                sse_param_index += reg_count;
            }
            break;

        case x86_64_mode_memory:
        case x86_64_mode_x87:
            addr = (addr + align - 1) & -align;
            param_addr = addr;
            addr += size;
            break;

        case x86_64_mode_integer: {
            if (reg_param_index + reg_count <= REGN) {
                /* save arguments passed by register */
                loc -= reg_count * 8;
                param_addr = loc;
                for (i = 0; i < reg_count; ++i) {
                    gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
                    ++reg_param_index;
                }
            } else {
                addr = (addr + align - 1) & -align;
                param_addr = addr;
                addr += size;
                reg_param_index += reg_count;
            }
            break;
        }
        default: break; /* nothing to be done for x86_64_mode_none */
        }
        sym_push(sym->v & ~SYM_FIELD, type,
                 VT_LOCAL | VT_LVAL, param_addr);
    }
}

/* generate function epilog */
void gfunc_epilog(void)
{
    int v, saved_ind;

    o(0xc9); /* leave */
    if (func_ret_sub == 0) {
        o(0xc3); /* ret */
    } else {
        o(0xc2); /* ret n */
        g(func_ret_sub);
        g(func_ret_sub >> 8);
    }
    /* align local size to word & save local variables */
    v = (-loc + 15) & -16;
    saved_ind = ind;
    ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
    o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
    o(0xec8148);    /* sub rsp, stacksize */
    gen_le32(v);
    ind = saved_ind;
}

#endif /* not PE */

/* generate a jump to a label */
int gjmp(int t)
{
    return psym(0xe9, t);
}

/* generate a jump to a fixed address */
void gjmp_addr(int a)
{
    int r;
    r = a - ind - 2;
    if (r == (char)r) {
        g(0xeb);
        g(r);
    } else {
        oad(0xe9, a - ind - 5);
    }
}
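
/* Editor's note (illustrative): gjmp_addr() picks the 2-byte short form
   when the target is within a signed-byte displacement:

       eb XX            jmp rel8   (XX = a - ind - 2)
       e9 XX XX XX XX   jmp rel32  (rel32 = a - ind - 5 otherwise) */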

/* generate a test. set 'inv' to invert test. Stack entry is popped */
int gtst(int inv, int t)
{
    int v, *p;

    v = vtop->r & VT_VALMASK;
    if (v == VT_CMP) {
        /* fast case : can jump directly since flags are set */
        if (vtop->c.i & 0x100)
        {
            /* This was a float compare.  If the parity flag is set
               the result was unordered.  For anything except != this
               means false and we don't jump (anding both conditions).
               For != this means true (oring both).
               Take care about inverting the test.  We need to jump
               to our target if the result was unordered and test wasn't NE,
               otherwise if unordered we don't want to jump.  */
            vtop->c.i &= ~0x100;
            if (!inv == (vtop->c.i != TOK_NE))
                o(0x067a);  /* jp +6 */
            else
            {
                g(0x0f);
                t = psym(0x8a, t); /* jp t */
            }
        }
        g(0x0f);
        t = psym((vtop->c.i - 16) ^ inv, t);
    } else if (v == VT_JMP || v == VT_JMPI) {
        /* && or || optimization */
        if ((v & 1) == inv) {
            /* insert vtop->c jump list in t */
            p = &vtop->c.i;
            while (*p != 0)
                p = (int *)(cur_text_section->data + *p);
            *p = t;
            t = vtop->c.i;
        } else {
            t = gjmp(t);
            gsym(vtop->c.i);
        }
    } else {
        if (is_float(vtop->type.t) ||
            (vtop->type.t & VT_BTYPE) == VT_LLONG) {
            vpushi(0);
            gen_op(TOK_NE);
        }
        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
            /* constant jmp optimization */
            if ((vtop->c.i != 0) != inv)
                t = gjmp(t);
        } else {
            v = gv(RC_INT);
            orex(0,v,v,0x85);
            o(0xc0 + REG_VALUE(v) * 9);
            g(0x0f);
            t = psym(0x85 ^ inv, t);
        }
    }
    vtop--;
    return t;
}

/* generate an integer binary operation */
void gen_opi(int op)
{
    int r, fr, opc, fc, c, ll, uu, cc, tt2;

    fr = vtop[0].r;
    fc = vtop->c.ul;
    ll = is64_type(vtop[-1].type.t);
    cc = (fr & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
    tt2 = (fr & (VT_LVAL | VT_LVAL_TYPE)) == VT_LVAL;

    switch(op) {
    case '+':
    case TOK_ADDC1: /* add with carry generation */
        opc = 0;
    gen_op8:
        vswap();
        r = gv(RC_INT);
        vswap();
        if (cc && (!ll || (int)vtop->c.ll == vtop->c.ll)) {
            /* constant case */
            c = vtop->c.i;
            if (c == (char)c) {
                /* XXX: generate inc and dec for smaller code ? */
                orex(ll, r, 0, 0x83);
                o(0xc0 + REG_VALUE(r) + opc*8);
                g(c);
            } else {
                orex(ll, r, 0, 0x81);
                oad(0xc0 + REG_VALUE(r) + opc*8, c);
            }
        } else {
            if(!tt2)
                fr = gv(RC_INT);
            orex(ll, fr, r, 0x03 + opc*8);
            if(fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
        }
        vtop--;
        if (op >= TOK_ULT && op <= TOK_GT) {
            vtop->r = VT_CMP;
            vtop->c.i = op;
        }
        break;
    case '-':
    case TOK_SUBC1: /* sub with carry generation */
        opc = 5;
        goto gen_op8;
    case TOK_ADDC2: /* add with carry use */
        opc = 2;
        goto gen_op8;
    case TOK_SUBC2: /* sub with carry use */
        opc = 3;
        goto gen_op8;
    case '&':
        opc = 4;
        goto gen_op8;
    case '^':
        opc = 6;
        goto gen_op8;
    case '|':
        opc = 1;
        goto gen_op8;
    case '*':
        opc = 5;
        vswap();
        r = gv(RC_INT);
        vswap();
        if(!tt2)
            fr = gv(RC_INT);
        if(r == TREG_RAX){
            if(fr != TREG_RDX)
                save_reg(TREG_RDX);
            orex(ll, fr, r, 0xf7);
            if(fr >= VT_CONST)
                gen_modrm(opc, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + opc*8);
        }else{
            orex(ll, fr, r, 0xaf0f); /* imul fr, r */
            if(fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
        }
        vtop--;
        break;
    case TOK_SHL:
        opc = 4;
        goto gen_shift;
    case TOK_SHR:
        opc = 5;
        goto gen_shift;
    case TOK_SAR:
        opc = 7;
    gen_shift:
        if (cc) {
            /* constant case */
            vswap();
            r = gv(RC_INT);
            vswap();
            c = vtop->c.i;
            if(c == 1){
                orex(ll, r, 0, 0xd1);
                o(0xc0 + REG_VALUE(r) + opc*8);
            }else{
                orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
                o(0xc0 + REG_VALUE(r) + opc*8);
                g(c & (ll ? 0x3f : 0x1f));
            }
        } else {
            /* we generate the shift in ecx */
            gv2(RC_INT, RC_RCX);
            r = vtop[-1].r;
            orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
            o(0xc0 + REG_VALUE(r) + opc*8);
        }
        vtop--;
        break;
    case TOK_UDIV:
    case TOK_UMOD:
        opc = 6;
        uu = 1;
        goto divmod;
    case '/':
    case '%':
    case TOK_PDIV:
        opc = 7;
        uu = 0;
    divmod:
        /* first operand must be in eax */
        /* XXX: need better constraint for second operand */
        if(!tt2){
            gv2(RC_RAX, RC_INT2);
            fr = vtop[0].r;
        }else{
            vswap();
            gv(RC_RAX);
            vswap();
        }
        save_reg(TREG_RDX);
        orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cdq RDX:RAX <- sign-extend of RAX. */
        orex(ll, fr, 0, 0xf7); /* div fr, %eax */
        if(fr >= VT_CONST)
            gen_modrm(opc, fr, vtop->sym, fc);
        else
            o(0xc0 + REG_VALUE(fr) + opc*8);
        if (op == '%' || op == TOK_UMOD)
            r = TREG_RDX;
        else
            r = TREG_RAX;
        vtop--;
        vtop->r = r;
        break;
    default:
        opc = 7;
        goto gen_op8;
    }
}
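
/* Editor's sketch (illustrative): "add $12, %rax" on a 64-bit value
   takes the sign-extended imm8 path of gen_op8 above:

       orex(1, TREG_RAX, 0, 0x83);           // 48 83 (REX.W, grp1 imm8)
       o(0xc0 + REG_VALUE(TREG_RAX) + 0*8);  // c0    (modrm, /0 = add)
       g(12);                                // 0c -> "48 83 c0 0c" */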

void gen_opl(int op)
{
    gen_opi(op);
}

/* generate a floating point operation 'v = t1 op t2' instruction. The
   two operands are guaranteed to have the same floating point type */
/* XXX: need to use ST1 too */
void gen_opf(int op)
{
    int a, ft, fc, swapped, fr, r;
    int float_type = (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;

    /* convert constants to memory references */
    if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
        vswap();
        gv(float_type);
        vswap();
    }
    if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
        gv(float_type);

    swapped = 0;
    fc = vtop->c.ul;
    ft = vtop->type.t;

    if ((ft & VT_BTYPE) == VT_LDOUBLE) {
        /* swap the stack if needed so that t1 is the register and t2 is
           the memory reference */
        /* must put at least one value in the floating point register */
        if ((vtop[-1].r & VT_LVAL) && (vtop[0].r & VT_LVAL)) {
            vswap();
            gv(float_type);
            vswap();
        }
        if (vtop[-1].r & VT_LVAL) {
            vswap();
            swapped = 1;
        }
        if (op >= TOK_ULT && op <= TOK_GT) {
            /* load on stack second operand */
            load(TREG_ST0, vtop);
            save_reg(TREG_RAX); /* eax is used by FP comparison code */
            if (op == TOK_GE || op == TOK_GT)
                swapped = !swapped;
            else if (op == TOK_EQ || op == TOK_NE)
                swapped = 0;
            if (swapped)
                o(0xc9d9); /* fxch %st(1) */
            if (op == TOK_EQ || op == TOK_NE)
                o(0xe9da); /* fucompp */
            else
                o(0xd9de); /* fcompp */
            o(0xe0df); /* fnstsw %ax */
            if (op == TOK_EQ) {
                o(0x45e480); /* and $0x45, %ah */
                o(0x40fC80); /* cmp $0x40, %ah */
            } else if (op == TOK_NE) {
                o(0x45e480); /* and $0x45, %ah */
                o(0x40f480); /* xor $0x40, %ah */
                op = TOK_NE;
            } else if (op == TOK_GE || op == TOK_LE) {
                o(0x05c4f6); /* test $0x05, %ah */
                op = TOK_EQ;
            } else {
                o(0x45c4f6); /* test $0x45, %ah */
                op = TOK_EQ;
            }
            vtop--;
            vtop->r = VT_CMP;
            vtop->c.i = op;
        } else {
            /* no memory reference possible for long double operations */
            load(TREG_ST0, vtop);
            swapped = !swapped;
            switch(op) {
            default:
            case '+':
                a = 0;
                break;
            case '-':
                a = 4;
                if (swapped)
                    a++;
                break;
            case '*':
                a = 1;
                break;
            case '/':
                a = 6;
                if (swapped)
                    a++;
                break;
            }
            o(0xde); /* fxxxp %st, %st(1) */
            o(0xc1 + (a << 3));
            vtop--;
        }
    } else {
        vswap();
        gv(float_type);
        vswap();
        fr = vtop->r;
        r = vtop[-1].r;
        if (op >= TOK_ULT && op <= TOK_GT) {
            switch(op){
            case TOK_LE:
                op = TOK_ULE; /* setae */
                break;
            case TOK_LT:
                op = TOK_ULT;
                break;
            case TOK_GE:
                op = TOK_UGE;
                break;
            case TOK_GT:
                op = TOK_UGT; /* seta */
                break;
            }
            assert(!(vtop[-1].r & VT_LVAL));
            if ((ft & VT_BTYPE) == VT_DOUBLE)
                o(0x66);
            o(0x2e0f); /* ucomisd */
            if(fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
            vtop--;
            vtop->r = VT_CMP;
            vtop->c.i = op | 0x100;
        } else {
            assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
            /* no memory reference possible for long double operations */
            switch(op) {
            default:
            case '+':
                a = 0;
                break;
            case '-':
                a = 4;
                break;
            case '*':
                a = 1;
                break;
            case '/':
                a = 6;
                break;
            }
            assert((ft & VT_BTYPE) != VT_LDOUBLE);
            assert(!(vtop[-1].r & VT_LVAL));
            if ((ft & VT_BTYPE) == VT_DOUBLE) {
                o(0xf2);
            } else {
                o(0xf3);
            }
            o(0x0f);
            o(0x58 + a);
            if(fr >= VT_CONST)
                gen_modrm(r, fr, vtop->sym, fc);
            else
                o(0xc0 + REG_VALUE(fr) + REG_VALUE(r)*8);
            vtop--;
        }
    }
}

/* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
   and 'long long' cases. */
void gen_cvt_itof(int t)
{
    int ft, bt, tbt, r;

    ft = vtop->type.t;
    bt = ft & VT_BTYPE;
    tbt = t & VT_BTYPE;
    r = gv(RC_INT);

    if (tbt == VT_LDOUBLE) {
        save_reg(TREG_ST0);
        if ((ft & VT_BTYPE) == VT_LLONG) {
            /* signed long long to float/double/long double (unsigned case
               is handled generically) */
            o(0x50 + REG_VALUE(r)); /* push r */
            o(0x242cdf); /* fildll (%rsp) */
            o(0x08c48348); /* add $8, %rsp */
        } else if ((ft & (VT_BTYPE | VT_UNSIGNED)) == (VT_INT | VT_UNSIGNED)) {
            /* unsigned int to float/double/long double */
            o(0x6a); /* push $0 */
            g(0x00);
            o(0x50 + REG_VALUE(r)); /* push r */
            o(0x242cdf); /* fildll (%rsp) */
            o(0x10c48348); /* add $16, %rsp */
        } else {
            /* int to float/double/long double */
            o(0x50 + REG_VALUE(r)); /* push r */
            o(0x2404db); /* fildl (%rsp) */
            o(0x08c48348); /* add $8, %rsp */
        }
        vtop->r = TREG_ST0;
    } else {
        int r_xmm;
        r_xmm = get_reg(RC_FLOAT);
        o(0xf2 + (tbt == VT_FLOAT));
        if ((ft & (VT_BTYPE | VT_UNSIGNED)) == (VT_INT | VT_UNSIGNED) || bt == VT_LLONG) {
            o(0x48); /* REX */
        }
        o(0x2a0f);
        o(0xc0 + REG_VALUE(r) + REG_VALUE(r_xmm)*8); /* cvtsi2sd or cvtsi2ss */
        vtop->r = r_xmm;
    }
}
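
/* Editor's note (illustrative): converting a 'long long' in %rax to a
   double lands in the SSE path above and emits

       f2 48 0f 2a c0   cvtsi2sd %rax,%xmm0

   Plain 'int' drops the REX.W (f2 0f 2a c0); 'unsigned int' keeps it so
   the value is converted as a zero-extended 64-bit integer. */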

/* convert from one floating point type to another */
void gen_cvt_ftof(int t)
{
    int ft, bt, tbt, r;

    ft = vtop->type.t;
    bt = ft & VT_BTYPE;
    tbt = t & VT_BTYPE;

    if (bt == VT_LDOUBLE)
        r = get_reg(RC_FLOAT);
    else
        r = gv(RC_FLOAT);
    if (bt == VT_FLOAT) {
        if (tbt == VT_DOUBLE) {
            o(0x5a0f); /* cvtps2pd */
            o(0xc0 + REG_VALUE(r) + REG_VALUE(r) * 8);
        } else if (tbt == VT_LDOUBLE) {
            /* movss %xmm0-7,-0x10(%rsp) */
            o(0x110ff3);
            o(0xf02444 + REG_VALUE(r)*8);
            o(0xf02444d9); /* flds -0x10(%rsp) */
            vtop->r = TREG_ST0;
        }
    } else if (bt == VT_DOUBLE) {
        if (tbt == VT_FLOAT) {
            o(0x5a0f66); /* cvtpd2ps */
            o(0xc0 + REG_VALUE(r) + REG_VALUE(r) * 8);
        } else if (tbt == VT_LDOUBLE) {
            /* movsd %xmm0-7,-0x10(%rsp) */
            o(0x110ff2);
            o(0xf02444 + REG_VALUE(r)*8);
            o(0xf02444dd); /* fldl -0x10(%rsp) */
            vtop->r = TREG_ST0;
        }
    } else {
        gv(RC_ST0);
        if (tbt == VT_DOUBLE) {
            o(0xf0245cdd); /* fstpl -0x10(%rsp) */
            /* movsd -0x10(%rsp),%xmm0-7 */
            o(0x100ff2);
            o(0xf02444 + REG_VALUE(r)*8);
            vtop->r = r;
        } else if (tbt == VT_FLOAT) {
            o(0xf0245cd9); /* fstps -0x10(%rsp) */
            /* movss -0x10(%rsp),%xmm0-7 */
            o(0x100ff3);
            o(0xf02444 + REG_VALUE(r)*8);
            vtop->r = r;
        }
    }
}

/* convert fp to int 't' type */
void gen_cvt_ftoi(int t)
{
    int ft, bt, ll, r, r_xmm;

    ft = vtop->type.t;
    bt = ft & VT_BTYPE;

    if (bt == VT_LDOUBLE) {
        gen_cvt_ftof(VT_DOUBLE);
        bt = VT_DOUBLE;
    }
    r_xmm = gv(RC_FLOAT);
    if ((t & VT_BTYPE) == VT_INT)
        ll = 0;
    else
        ll = 1;
    r = get_reg(RC_INT);
    if (bt == VT_FLOAT) {
        o(0xf3);
    } else if (bt == VT_DOUBLE) {
        o(0xf2);
    } else {
        assert(0);
    }
    orex(ll, r, r_xmm, 0x2c0f); /* cvttss2si or cvttsd2si */
    o(0xc0 + REG_VALUE(r_xmm) + (REG_VALUE(r) << 3));
    vtop->r = r;
}

/* computed goto support */
void ggoto(void)
{
    gcall_or_jmp(1);
    vtop--;
}

/* Save the stack pointer onto the stack and return the location of its address */
ST_FUNC void gen_vla_sp_save(int addr) {
    /* mov %rsp,addr(%rbp)*/
    gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
}

/* Restore the SP from a location on the stack */
ST_FUNC void gen_vla_sp_restore(int addr) {
    gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr);
}

/* Subtract from the stack pointer, and push the resulting value onto the stack */
ST_FUNC void gen_vla_alloc(CType *type, int align) {
#ifdef TCC_TARGET_PE
    /* alloca does more than just adjust %rsp on Windows */
    vpush_global_sym(&func_old_type, TOK_alloca);
    vswap(); /* Move alloca ref past allocation size */
    gfunc_call(1);
    vset(type, REG_IRET, 0);
#else
    int r;
    r = gv(RC_INT); /* allocation size */
    /* sub r,%rsp */
    o(0x2b48);
    o(0xe0 | REG_VALUE(r));
    /* We align to 16 bytes rather than align */
    /* and ~15, %rsp */
    o(0xf0e48348);
    /* mov %rsp, r */
    o(0x8948);
    o(0xe0 | REG_VALUE(r));
    vpop();
    vset(type, r, 0);
#endif
}

/* end of x86-64 code generator */
/*************************************************************/
#endif /* ! TARGET_DEFS_ONLY */
/******************************************************/