x86_64-gen.c

   1 /*
   2  *  x86-64 code generator for TCC
   3  *
   4  *  Copyright (c) 2008 Shinichiro Hamaji
   5  *
   6  *  Based on i386-gen.c by Fabrice Bellard
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #ifdef TARGET_DEFS_ONLY
  24
  25 /* number of available registers: this is the size of reg_classes[],
        which is indexed by the TREG_* values 0 (TREG_RAX) .. 24 (TREG_ST0) */
  26 #define NB_REGS         25
  27 #define NB_ASM_REGS     8
  28
  29 /* a register can belong to several classes. The classes must be
  30    sorted from more general to more precise (see gv2() code which
  31    makes assumptions about it). Every class is a distinct bit, so a
        register's classes are OR-ed together in reg_classes[]. */
  32 #define RC_INT     0x0001 /* generic integer register */
  33 #define RC_FLOAT   0x0002 /* generic float register */
  34 #define RC_RAX     0x0004 /* exactly %rax */
  35 #define RC_RCX     0x0008 /* exactly %rcx */
  36 #define RC_RDX     0x0010 /* exactly %rdx */
  37 #define RC_ST0     0x0080 /* only for long double */
  38 #define RC_R8      0x0100 /* exactly %r8 (not part of RC_INT in reg_classes[]) */
  39 #define RC_R9      0x0200 /* exactly %r9 */
  40 #define RC_R10     0x0400 /* exactly %r10 */
  41 #define RC_R11     0x0800 /* exactly %r11 */
  42 #define RC_XMM0    0x1000 /* exactly %xmm0 */
  43 #define RC_XMM1    0x2000
  44 #define RC_XMM2    0x4000
  45 #define RC_XMM3    0x8000
  46 #define RC_XMM4    0x10000
  47 #define RC_XMM5    0x20000
  48 #define RC_XMM6    0x40000 /* not tagged RC_FLOAT in reg_classes[] */
  49 #define RC_XMM7    0x80000 /* not tagged RC_FLOAT in reg_classes[] */
  50 #define RC_IRET    RC_RAX /* function return: integer register */
  51 #define RC_LRET    RC_RDX /* function return: second integer register */
  52 #define RC_FRET    RC_XMM0 /* function return: float register */
  53 #define RC_QRET    RC_XMM1 /* function return: second float register */
  54
  55 /* pretty names for the registers */
  56 enum {
  57     TREG_RAX = 0,
  58     TREG_RCX = 1,
  59     TREG_RDX = 2,
  60     TREG_RSP = 4,
  61     TREG_RSI = 6,
  62     TREG_RDI = 7,
  63
  64     TREG_R8  = 8,
  65     TREG_R9  = 9,
  66     TREG_R10 = 10,
  67     TREG_R11 = 11,
  68
  69     TREG_XMM0 = 16,
  70     TREG_XMM1 = 17,
  71     TREG_XMM2 = 18,
  72     TREG_XMM3 = 19,
  73     TREG_XMM4 = 20,
  74     TREG_XMM5 = 21,
  75     TREG_XMM6 = 22,
  76     TREG_XMM7 = 23,
  77
  78     TREG_ST0 = 24,
  79
  80     TREG_MEM = 0x20
  81 };
  82
  83 #define REX_BASE(reg) (((reg) >> 3) & 1)
  84 #define REG_VALUE(reg) ((reg) & 7)
  85
  86 /* return registers for function (the same registers that the
        RC_IRET / RC_LRET / RC_FRET / RC_QRET classes select) */
  87 #define REG_IRET TREG_RAX /* single word int return register */
  88 #define REG_LRET TREG_RDX /* second word return register (for long long) */
  89 #define REG_FRET TREG_XMM0 /* float return register */
  90 #define REG_QRET TREG_XMM1 /* second float return register */
  91
  92 /* defined if function parameters must be evaluated in reverse order */
  93 #define INVERT_FUNC_PARAMS
  94
  95 /* pointer size, in bytes */
  96 #define PTR_SIZE 8
  97
  98 /* long double size and alignment, in bytes */
  99 #define LDOUBLE_SIZE  16
 100 #define LDOUBLE_ALIGN 16
 101 /* maximum alignment (for aligned attribute support) */
 102 #define MAX_ALIGN     16
 103
 104 /******************************************************/
 105 /* ELF defines */
 106
     /* ELF machine identifier of the generated objects */
 107 #define EM_TCC_TARGET EM_X86_64
 108
 109 /* relocation type for 32 bit data relocation */
 110 #define R_DATA_32   R_X86_64_32
     /* relocation type for pointer-sized (PTR_SIZE == 8) data */
 111 #define R_DATA_PTR  R_X86_64_64
     /* NOTE(review): presumably used by the linker part for PLT/GOT slots and
        copy relocations; their users are not in this file -- confirm */
 112 #define R_JMP_SLOT  R_X86_64_JUMP_SLOT
 113 #define R_COPY      R_X86_64_COPY
 114
     /* NOTE(review): presumably the default image base and the page size used
        when laying out ELF output; their users are not in this file -- confirm */
 115 #define ELF_START_ADDR 0x400000
 116 #define ELF_PAGE_SIZE  0x200000
 117
 118 /******************************************************/
 119 #else /* ! TARGET_DEFS_ONLY */
 120 /******************************************************/
 121 #include "tcc.h"
 122 #include <assert.h>
 123
 124 ST_DATA const int reg_classes[NB_REGS] = {
 125     /* eax */ RC_INT | RC_RAX,
 126     /* ecx */ RC_INT | RC_RCX,
 127     /* edx */ RC_INT | RC_RDX,
 128     0,
 129     0,
 130     0,
 131     0,
 132     0,
 133     RC_R8,
 134     RC_R9,
 135     RC_R10,
 136     RC_R11,
 137     0,
 138     0,
 139     0,
 140     0,
 141     /* xmm0 */ RC_FLOAT | RC_XMM0,
 142     /* xmm1 */ RC_FLOAT | RC_XMM1,
 143     /* xmm2 */ RC_FLOAT | RC_XMM2,
 144     /* xmm3 */ RC_FLOAT | RC_XMM3,
 145     /* xmm4 */ RC_FLOAT | RC_XMM4,
 146     /* xmm5 */ RC_FLOAT | RC_XMM5,
 147     /* xmm6 an xmm7 are included so gv() can be used on them,
 148        but they are not tagged with RC_FLOAT because they are
 149        callee saved on Windows */
 150     RC_XMM6,
 151     RC_XMM7,
 152     /* st0 */ RC_ST0
 153 };
 154
     /* value of 'ind' right after the FUNC_PROLOG_SIZE bytes reserved by
        gfunc_prolog(); gfunc_epilog() subtracts FUNC_PROLOG_SIZE from it to
        find where the real prolog has to be written */
 155 static unsigned long func_sub_sp_offset;
     /* when non-zero, gfunc_epilog() emits 'ret n' with this 16-bit value
        instead of a plain 'ret' */
 156 static int func_ret_sub;
 157
 158 /* XXX: make it faster ? */
 159 void g(int c)
 160 {
 161     int ind1;
 162     ind1 = ind + 1;
 163     if (ind1 > cur_text_section->data_allocated)
 164         section_realloc(cur_text_section, ind1);
 165     cur_text_section->data[ind] = c;
 166     ind = ind1;
 167 }
 168
 169 void o(unsigned int c)
 170 {
 171     while (c) {
 172         g(c);
 173         c = c >> 8;
 174     }
 175 }
 176
 177 void gen_le16(int v)
 178 {
 179     g(v);
 180     g(v >> 8);
 181 }
 182
 183 void gen_le32(int c)
 184 {
 185     g(c);
 186     g(c >> 8);
 187     g(c >> 16);
 188     g(c >> 24);
 189 }
 190
 191 void gen_le64(int64_t c)
 192 {
 193     g(c);
 194     g(c >> 8);
 195     g(c >> 16);
 196     g(c >> 24);
 197     g(c >> 32);
 198     g(c >> 40);
 199     g(c >> 48);
 200     g(c >> 56);
 201 }
 202
 203 void orex(int ll, int r, int r2, int b)
 204 {
 205     if ((r & VT_VALMASK) >= VT_CONST)
 206         r = 0;
 207     if ((r2 & VT_VALMASK) >= VT_CONST)
 208         r2 = 0;
 209     if (ll || REX_BASE(r) || REX_BASE(r2))
 210         o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
 211     o(b);
 212 }
 213
 214 /* output a symbol and patch all calls to it */
 215 void gsym_addr(int t, int a)
 216 {
 217     while (t) {
 218         unsigned char *ptr = cur_text_section->data + t;
 219         uint32_t n = read32le(ptr); /* next value */
 220         write32le(ptr, a - t - 4);
 221         t = n;
 222     }
 223 }
 224
 225 void gsym(int t)
 226 {
 227     gsym_addr(t, ind);
 228 }
 229
 230 /* psym is used to put an instruction with a data field which is a
 231    reference to a symbol. It is in fact the same as oad(): opcode bytes
        followed by a 32-bit field, returning the offset of that field. */
 232 #define psym oad
 233
 234 static int is64_type(int t)
 235 {
 236     return ((t & VT_BTYPE) == VT_PTR ||
 237             (t & VT_BTYPE) == VT_FUNC ||
 238             (t & VT_BTYPE) == VT_LLONG);
 239 }
 240
 241 /* instruction + 4 bytes data. Return the address of the data */
 242 ST_FUNC int oad(int c, int s)
 243 {
 244     int ind1;
 245
 246     o(c);
 247     ind1 = ind + 4;
 248     if (ind1 > cur_text_section->data_allocated)
 249         section_realloc(cur_text_section, ind1);
 250     write32le(cur_text_section->data + ind, s);
 251     s = ind;
 252     ind = ind1;
 253     return s;
 254 }
 255
 256 ST_FUNC void gen_addr32(int r, Sym *sym, int c)
 257 {
 258     if (r & VT_SYM)
 259         greloca(cur_text_section, sym, ind, R_X86_64_32, c), c=0;
 260     gen_le32(c);
 261 }
 262
 263 /* output constant with relocation if 'r & VT_SYM' is true */
 264 ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
 265 {
 266     if (r & VT_SYM)
 267         greloca(cur_text_section, sym, ind, R_X86_64_64, c), c=0;
 268     gen_le64(c);
 269 }
 270
 271 /* output constant with relocation if 'r & VT_SYM' is true */
 272 ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
 273 {
 274     if (r & VT_SYM)
 275         greloca(cur_text_section, sym, ind, R_X86_64_PC32, c-4), c=4;
 276     gen_le32(c-4);
 277 }
 278
 279 /* output got address with relocation */
 280 static void gen_gotpcrel(int r, Sym *sym, int c)
 281 {
 282 #ifndef TCC_TARGET_PE
 283     greloca(cur_text_section, sym, ind, R_X86_64_GOTPCREL, -4);
 284 #else
 285     tcc_error("internal error: no GOT on PE: %s %x %x | %02x %02x %02x\n",
 286         get_tok_str(sym->v, NULL), c, r,
 287         cur_text_section->data[ind-3],
 288         cur_text_section->data[ind-2],
 289         cur_text_section->data[ind-1]
 290         );
 291     greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 292 #endif
 293     gen_le32(0);
 294     if (c) {
 295         /* we use add c, %xxx for displacement */
 296         orex(1, r, 0, 0x81);
 297         o(0xc0 + REG_VALUE(r));
 298         gen_le32(c);
 299     }
 300 }
 301
 302 static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
 303 {
 304     op_reg = REG_VALUE(op_reg) << 3;
 305     if ((r & VT_VALMASK) == VT_CONST) {
 306         /* constant memory reference */
 307         o(0x05 | op_reg);
 308         if (is_got) {
 309             gen_gotpcrel(r, sym, c);
 310         } else {
 311             gen_addrpc32(r, sym, c);
 312         }
 313     } else if ((r & VT_VALMASK) == VT_LOCAL) {
 314         /* currently, we use only ebp as base */
 315         if (c == (char)c) {
 316             /* short reference */
 317             o(0x45 | op_reg);
 318             g(c);
 319         } else {
 320             oad(0x85 | op_reg, c);
 321         }
 322     } else if ((r & VT_VALMASK) >= TREG_MEM) {
 323         if (c) {
 324             g(0x80 | op_reg | REG_VALUE(r));
 325             gen_le32(c);
 326         } else {
 327             g(0x00 | op_reg | REG_VALUE(r));
 328         }
 329     } else {
 330         g(0x00 | op_reg | REG_VALUE(r));
 331     }
 332 }
 333
 334 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 335    opcode bits */
 336 static void gen_modrm(int op_reg, int r, Sym *sym, int c)
 337 {
 338     gen_modrm_impl(op_reg, r, sym, c, 0);
 339 }
 340
 341 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 342    opcode bits */
 343 static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
 344 {
 345     int is_got;
 346     is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
 347     orex(1, r, op_reg, opcode);
 348     gen_modrm_impl(op_reg, r, sym, c, is_got);
 349 }
 350
 351
 352 /* load 'r' from value 'sv' */
 353 void load(int r, SValue *sv)
 354 {
 355     int v, t, ft, fc, fr;
 356     SValue v1;
 357
 358 #ifdef TCC_TARGET_PE
 359     SValue v2;
 360     sv = pe_getimport(sv, &v2);
 361 #endif
 362
 363     fr = sv->r;
 364     ft = sv->type.t & ~VT_DEFSIGN;
 365     fc = sv->c.i;
 366
 367     ft &= ~(VT_VOLATILE | VT_CONSTANT);
 368
 369 #ifndef TCC_TARGET_PE
 370     /* we use indirect access via got */
 371     if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
 372         (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
 373         /* use the result register as a temporal register */
 374         int tr = r | TREG_MEM;
 375         if (is_float(ft)) {
 376             /* we cannot use float registers as a temporal register */
 377             tr = get_reg(RC_INT) | TREG_MEM;
 378         }
 379         gen_modrm64(0x8b, tr, fr, sv->sym, 0);
 380
 381         /* load from the temporal register */
 382         fr = tr | VT_LVAL;
 383     }
 384 #endif
 385
 386     v = fr & VT_VALMASK;
 387     if (fr & VT_LVAL) {
 388         int b, ll;
 389         if (v == VT_LLOCAL) {
 390             v1.type.t = VT_PTR;
 391             v1.r = VT_LOCAL | VT_LVAL;
 392             v1.c.i = fc;
 393             fr = r;
 394             if (!(reg_classes[fr] & (RC_INT|RC_R11)))
 395                 fr = get_reg(RC_INT);
 396             load(fr, &v1);
 397         }
 398         ll = 0;
 399         if ((ft & VT_BTYPE) == VT_FLOAT) {
 400             b = 0x6e0f66;
 401             r = REG_VALUE(r); /* movd */
 402         } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
 403             b = 0x7e0ff3; /* movq */
 404             r = REG_VALUE(r);
 405         } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
 406             b = 0xdb, r = 5; /* fldt */
 407         } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
 408             b = 0xbe0f;   /* movsbl */
 409         } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
 410             b = 0xb60f;   /* movzbl */
 411         } else if ((ft & VT_TYPE) == VT_SHORT) {
 412             b = 0xbf0f;   /* movswl */
 413         } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
 414             b = 0xb70f;   /* movzwl */
 415         } else {
 416             assert(((ft & VT_BTYPE) == VT_INT) || ((ft & VT_BTYPE) == VT_LLONG)
 417                    || ((ft & VT_BTYPE) == VT_PTR) || ((ft & VT_BTYPE) == VT_ENUM)
 418                    || ((ft & VT_BTYPE) == VT_FUNC));
 419             ll = is64_type(ft);
 420             b = 0x8b;
 421         }
 422         if (ll) {
 423             gen_modrm64(b, r, fr, sv->sym, fc);
 424         } else {
 425             orex(ll, fr, r, b);
 426             gen_modrm(r, fr, sv->sym, fc);
 427         }
 428     } else {
 429         if (v == VT_CONST) {
 430             if (fr & VT_SYM) {
 431 #ifdef TCC_TARGET_PE
 432                 orex(1,0,r,0x8d);
 433                 o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 434                 gen_addrpc32(fr, sv->sym, fc);
 435 #else
 436                 if (sv->sym->type.t & VT_STATIC) {
 437                     orex(1,0,r,0x8d);
 438                     o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 439                     gen_addrpc32(fr, sv->sym, fc);
 440                 } else {
 441                     orex(1,0,r,0x8b);
 442                     o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
 443                     gen_gotpcrel(r, sv->sym, fc);
 444                 }
 445 #endif
 446             } else if (is64_type(ft)) {
 447                 orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 448                 gen_le64(sv->c.i);
 449             } else {
 450                 orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 451                 gen_le32(fc);
 452             }
 453         } else if (v == VT_LOCAL) {
 454             orex(1,0,r,0x8d); /* lea xxx(%ebp), r */
 455             gen_modrm(r, VT_LOCAL, sv->sym, fc);
 456         } else if (v == VT_CMP) {
 457             orex(0,r,0,0);
 458             if ((fc & ~0x100) != TOK_NE)
 459               oad(0xb8 + REG_VALUE(r), 0); /* mov $0, r */
 460             else
 461               oad(0xb8 + REG_VALUE(r), 1); /* mov $1, r */
 462             if (fc & 0x100)
 463               {
 464                 /* This was a float compare.  If the parity bit is
 465                    set the result was unordered, meaning false for everything
 466                    except TOK_NE, and true for TOK_NE.  */
 467                 fc &= ~0x100;
 468                 o(0x037a + (REX_BASE(r) << 8));
 469               }
 470             orex(0,r,0, 0x0f); /* setxx %br */
 471             o(fc);
 472             o(0xc0 + REG_VALUE(r));
 473         } else if (v == VT_JMP || v == VT_JMPI) {
 474             t = v & 1;
 475             orex(0,r,0,0);
 476             oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
 477             o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
 478             gsym(fc);
 479             orex(0,r,0,0);
 480             oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
 481         } else if (v != r) {
 482             if ((r >= TREG_XMM0) && (r <= TREG_XMM7)) {
 483                 if (v == TREG_ST0) {
 484                     /* gen_cvt_ftof(VT_DOUBLE); */
 485                     o(0xf0245cdd); /* fstpl -0x10(%rsp) */
 486                     /* movsd -0x10(%rsp),%xmmN */
 487                     o(0x100ff2);
 488                     o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 489                     o(0xf024);
 490                 } else {
 491                     assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 492                     if ((ft & VT_BTYPE) == VT_FLOAT) {
 493                         o(0x100ff3);
 494                     } else {
 495                         assert((ft & VT_BTYPE) == VT_DOUBLE);
 496                         o(0x100ff2);
 497                     }
 498                     o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
 499                 }
 500             } else if (r == TREG_ST0) {
 501                 assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 502                 /* gen_cvt_ftof(VT_LDOUBLE); */
 503                 /* movsd %xmmN,-0x10(%rsp) */
 504                 o(0x110ff2);
 505                 o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 506                 o(0xf024);
 507                 o(0xf02444dd); /* fldl -0x10(%rsp) */
 508             } else {
 509                 orex(1,r,v, 0x89);
 510                 o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
 511             }
 512         }
 513     }
 514 }
 515
 516 /* store register 'r' in lvalue 'v'.
        The instruction is chosen from the basic type of 'v': movd for
        float, movq for double, fld/fstpt for long double, and a mov of the
        matching width for integer types.  On non-PE targets a
        symbol-relative constant destination is first resolved through the
        GOT into %r11, and the store then goes through (%r11). */
 517 void store(int r, SValue *v)
 518 {
 519     int fr, bt, ft, fc;
         /* opcode of a 64-bit integer store; its emission is deferred so
            that gen_modrm64() (or the pic path) can emit it after the REX */
 520     int op64 = 0;
 521     /* store the REX prefix in this variable when PIC is enabled */
 522     int pic = 0;
 523
 524 #ifdef TCC_TARGET_PE
 525     SValue v2;
 526     v = pe_getimport(v, &v2);
 527 #endif
 528
 529     ft = v->type.t;
 530     fc = v->c.i;
 531     fr = v->r & VT_VALMASK;
 532     ft &= ~(VT_VOLATILE | VT_CONSTANT);
 533     bt = ft & VT_BTYPE;
 534
 535 #ifndef TCC_TARGET_PE
 536     /* we need to access the variable via got */
 537     if (fr == VT_CONST && (v->r & VT_SYM)) {
 538         /* mov xx(%rip), %r11 */
 539         o(0x1d8b4c);
 540         gen_gotpcrel(TREG_R11, v->sym, v->c.i);
             /* 0x41 is a REX prefix with only the base-extension bit (needed
                for %r11); 0x49 additionally sets the 64-bit operand bit */
 541         pic = is64_type(bt) ? 0x49 : 0x41;
 542     }
 543 #endif
 544
         /* In every branch below o(pic) emits nothing when pic == 0,
            because o() stops on a zero value. */
 545     /* XXX: incorrect if float reg to reg */
 546     if (bt == VT_FLOAT) {
 547         o(0x66);
 548         o(pic);
 549         o(0x7e0f); /* movd */
 550         r = REG_VALUE(r);
 551     } else if (bt == VT_DOUBLE) {
 552         o(0x66);
 553         o(pic);
 554         o(0xd60f); /* movq */
 555         r = REG_VALUE(r);
 556     } else if (bt == VT_LDOUBLE) {
 557         o(0xc0d9); /* fld %st(0) */
 558         o(pic);
 559         o(0xdb); /* fstpt */
 560         r = 7; /* opcode extension placed in the ModRM reg field */
 561     } else {
 562         if (bt == VT_SHORT)
 563             o(0x66); /* 16-bit operand size prefix */
 564         o(pic);
 565         if (bt == VT_BYTE || bt == VT_BOOL)
 566             orex(0, 0, r, 0x88);
 567         else if (is64_type(bt))
 568             op64 = 0x89;
 569         else
 570             orex(0, 0, r, 0x89);
 571     }
 572     if (pic) {
 573         /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
 574         if (op64)
 575             o(op64);
             /* ModRM with mod == 0 and rm == 3, i.e. REG_VALUE(TREG_R11) */
 576         o(3 + (r << 3));
 577     } else if (op64) {
 578         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 579             gen_modrm64(op64, r, v->r, v->sym, fc);
 580         } else if (fr != r) {
 581             /* XXX: don't we really come here? */
 582             abort();
                 /* unreachable after abort(); kept as in the original */
 583             o(0xc0 + fr + r * 8); /* mov r, fr */
 584         }
 585     } else {
 586         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 587             gen_modrm(r, v->r, v->sym, fc);
 588         } else if (fr != r) {
 589             /* XXX: don't we really come here? */
 590             abort();
                 /* unreachable after abort(); kept as in the original */
 591             o(0xc0 + fr + r * 8); /* mov r, fr */
 592         }
 593     }
 594 }
 595
 596 /* 'is_jmp' is '1' if it is a jump */
 597 static void gcall_or_jmp(int is_jmp)
 598 {
 599     int r;
 600     if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST &&
 601         ((vtop->r & VT_SYM) || (vtop->c.i-4) == (int)(vtop->c.i-4))) {
 602         /* constant case */
 603         if (vtop->r & VT_SYM) {
 604             /* relocation case */
 605 #ifdef TCC_TARGET_PE
 606             greloca(cur_text_section, vtop->sym, ind + 1, R_X86_64_PC32, (int)(vtop->c.i-4));
 607 #else
 608             greloca(cur_text_section, vtop->sym, ind + 1, R_X86_64_PLT32, (int)(vtop->c.i-4));
 609 #endif
 610         } else {
 611             /* put an empty PC32 relocation */
 612             put_elf_reloca(symtab_section, cur_text_section,
 613                           ind + 1, R_X86_64_PC32, 0, (int)(vtop->c.i-4));
 614         }
 615         oad(0xe8 + is_jmp, 0); /* call/jmp im */
 616     } else {
 617         /* otherwise, indirect call */
 618         r = TREG_R11;
 619         load(r, vtop);
 620         o(0x41); /* REX */
 621         o(0xff); /* call/jmp *r */
 622         o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
 623     }
 624 }
 625
 626 #if defined(CONFIG_TCC_BCHECK)
     /* Bounds-checking bookkeeping, only for non-PE targets.
        NOTE(review): neither variable is referenced in the part of the file
        visible here; presumably they are used by the non-PE prolog/epilog
        further down -- confirm. */
 627 #ifndef TCC_TARGET_PE
 628 static addr_t func_bound_offset;
 629 static unsigned long func_bound_ind;
 630 #endif
 631
 632 static void gen_static_call(int v)
 633 {
 634     Sym *sym = external_global_sym(v, &func_old_type, 0);
 635     oad(0xe8, 0);
 636     greloca(cur_text_section, sym, ind-4, R_X86_64_PC32, -4);
 637 }
 638
 639 /* generate a bounded pointer addition */
 640 ST_FUNC void gen_bounded_ptr_add(void)
 641 {
 642     /* save all temporary registers */
 643     save_regs(0);
 644
 645     /* prepare fast x86_64 function call */
 646     gv(RC_RAX);
 647     o(0xc68948); // mov  %rax,%rsi ## second arg in %rsi, this must be size
 648     vtop--;
 649
 650     gv(RC_RAX);
 651     o(0xc78948); // mov  %rax,%rdi ## first arg in %rdi, this must be ptr
 652     vtop--;
 653
 654     /* do a fast function call */
 655     gen_static_call(TOK___bound_ptr_add);
 656
 657     /* returned pointer is in rax */
 658     vtop++;
 659     vtop->r = TREG_RAX | VT_BOUNDED;
 660
 661
 662     /* relocation offset of the bounding function call point */
 663     vtop->c.i = (cur_text_section->reloc->data_offset - sizeof(ElfW(Rela)));
 664 }
 665
 666 /* patch pointer addition in vtop so that pointer dereferencing is
 667    also tested */
 668 ST_FUNC void gen_bounded_ptr_deref(void)
 669 {
 670     addr_t func;
 671     int size, align;
 672     ElfW(Rela) *rel;
 673     Sym *sym;
 674
 675     size = 0;
 676     /* XXX: put that code in generic part of tcc */
 677     if (!is_float(vtop->type.t)) {
 678         if (vtop->r & VT_LVAL_BYTE)
 679             size = 1;
 680         else if (vtop->r & VT_LVAL_SHORT)
 681             size = 2;
 682     }
 683     if (!size)
 684     size = type_size(&vtop->type, &align);
 685     switch(size) {
 686     case  1: func = TOK___bound_ptr_indir1; break;
 687     case  2: func = TOK___bound_ptr_indir2; break;
 688     case  4: func = TOK___bound_ptr_indir4; break;
 689     case  8: func = TOK___bound_ptr_indir8; break;
 690     case 12: func = TOK___bound_ptr_indir12; break;
 691     case 16: func = TOK___bound_ptr_indir16; break;
 692     default:
 693         tcc_error("unhandled size when dereferencing bounded pointer");
 694         func = 0;
 695         break;
 696     }
 697
 698     sym = external_global_sym(func, &func_old_type, 0);
 699     if (!sym->c)
 700         put_extern_sym(sym, NULL, 0, 0);
 701
 702     /* patch relocation */
 703     /* XXX: find a better solution ? */
 704
 705     rel = (ElfW(Rela) *)(cur_text_section->reloc->data + vtop->c.i);
 706     rel->r_info = ELF64_R_INFO(sym->c, ELF64_R_TYPE(rel->r_info));
 707 }
 708 #endif
 709
 710 #ifdef TCC_TARGET_PE
 711
 712 #define REGN 4
 713 static const uint8_t arg_regs[REGN] = {
 714     TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
 715 };
 716
 717 /* Prepare arguments in R10 and R11 rather than RCX and RDX
 718    because gv() will not ever use these */
 719 static int arg_prepare_reg(int idx) {
 720   if (idx == 0 || idx == 1)
 721       /* idx=0: r10, idx=1: r11 */
 722       return idx + 10;
 723   else
 724       return arg_regs[idx];
 725 }
 726
     /* Size in bytes of the largest outgoing-argument area needed by any call
        in the current function: reset by gfunc_prolog(), raised by
        gfunc_call(), and added to the frame size in gfunc_epilog(). */
 727 static int func_scratch;
 728
 729 /* Generate function call (this describes gfunc_call() below). The
 730    function address is pushed first, then all the parameters in call
 731    order. This function pops all the parameters and the function address. */
 732
 733 void gen_offs_sp(int b, int r, int d)
 734 {
 735     orex(1,0,r & 0x100 ? 0 : r, b);
 736     if (d == (char)d) {
 737         o(0x2444 | (REG_VALUE(r) << 3));
 738         g(d);
 739     } else {
 740         o(0x2484 | (REG_VALUE(r) << 3));
 741         gen_le32(d);
 742     }
 743 }
 744
 745 /* Return the number of registers needed to return the struct, or 0 if
 746    returning via struct pointer. */
 747 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
 748 {
 749     int size, align;
 750     *regsize = 8;
 751     *ret_align = 1; // Never have to re-align return values for x86-64
 752     size = type_size(vt, &align);
 753     ret->ref = NULL;
 754     if (size > 8) {
 755         return 0;
 756     } else if (size > 4) {
 757         ret->t = VT_LLONG;
 758         return 1;
 759     } else if (size > 2) {
 760         ret->t = VT_INT;
 761         return 1;
 762     } else if (size > 1) {
 763         ret->t = VT_SHORT;
 764         return 1;
 765     } else {
 766         ret->t = VT_BYTE;
 767         return 1;
 768     }
 769 }
 770
 771 static int is_sse_float(int t) {
 772     int bt;
 773     bt = t & VT_BTYPE;
 774     return bt == VT_DOUBLE || bt == VT_FLOAT;
 775 }
 776
 777 int gfunc_arg_size(CType *type) {
 778     int align;
 779     if (type->t & (VT_ARRAY|VT_BITFIELD))
 780         return 8;
 781     return type_size(type, &align);
 782 }
 783
     /* Generate a call with 'nb_args' arguments (PE / Windows x64 variant).
        The arguments are the topmost 'nb_args' entries of the value stack,
        with the function itself just below them; all of them are popped.

        Two passes are made over the arguments:
          1. arguments larger than 8 bytes (structs, long double) are copied
             into a scratch area that starts right after the argument slots;
          2. every argument is placed in its register or in its stack slot at
             arg*8(%rsp); for the large ones, the address of their copy is
             passed instead.
        Arguments 0 and 1 are staged in %r10/%r11 (see arg_prepare_reg()) and
        moved to %rcx/%rdx just before the call. */
 784 void gfunc_call(int nb_args)
 785 {
 786     int size, r, args_size, i, d, bt, struct_size;
 787     int arg;
 788
         /* at least REGN slots are always reserved, even with fewer arguments */
 789     args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
 790     arg = nb_args;
 791
 792     /* for struct arguments, we need to call memcpy and the function
 793        call breaks register passing arguments we are preparing.
 794        So, we process arguments which will be passed by stack first. */
 795     struct_size = args_size;
 796     for(i = 0; i < nb_args; i++) {
 797         SValue *sv;
 798
             /* 'arg' is decremented here but not otherwise used in this loop */
 799         --arg;
             /* walk the arguments from the last one (vtop) to the first */
 800         sv = &vtop[-i];
 801         bt = (sv->type.t & VT_BTYPE);
 802         size = gfunc_arg_size(&sv->type);
 803
 804         if (size <= 8)
 805             continue; /* arguments smaller than 8 bytes passed in registers or on stack */
 806
 807         if (bt == VT_STRUCT) {
 808             /* align to stack align size */
 809             size = (size + 15) & ~15;
 810             /* generate structure store */
 811             r = get_reg(RC_INT);
                 /* lea struct_size(%rsp), r */
 812             gen_offs_sp(0x8d, r, struct_size);
 813             struct_size += size;
 814
 815             /* generate memcpy call */
 816             vset(&sv->type, r | VT_LVAL, 0);
 817             vpushv(sv);
 818             vstore();
 819             --vtop;
 820         } else if (bt == VT_LDOUBLE) {
 821             gv(RC_ST0);
                 /* opcode 0xdb with reg field 7 (from 0x107); the 0x100 bit
                    keeps gen_offs_sp() from treating it as a register */
 822             gen_offs_sp(0xdb, 0x107, struct_size);
 823             struct_size += 16;
 824         }
 825     }
 826
         /* remember the largest area needed by any call of this function */
 827     if (func_scratch < struct_size)
 828         func_scratch = struct_size;
 829
 830     arg = nb_args;
 831     struct_size = args_size;
 832
         /* second pass: consume the arguments from vtop downwards; 'arg' is
            the 0-based position of the argument being handled */
 833     for(i = 0; i < nb_args; i++) {
 834         --arg;
 835         bt = (vtop->type.t & VT_BTYPE);
 836
 837         size = gfunc_arg_size(&vtop->type);
 838         if (size > 8) {
 839             /* align to stack align size */
 840             size = (size + 15) & ~15;
 841             if (arg >= REGN) {
                     /* pass the address of the copy in the stack slot */
 842                 d = get_reg(RC_INT);
 843                 gen_offs_sp(0x8d, d, struct_size);
 844                 gen_offs_sp(0x89, d, arg*8);
 845             } else {
                     /* pass the address of the copy in the argument register */
 846                 d = arg_prepare_reg(arg);
 847                 gen_offs_sp(0x8d, d, struct_size);
 848             }
 849             struct_size += size;
 850         } else {
 851             if (is_sse_float(vtop->type.t)) {
 852                 gv(RC_XMM0); /* only use one float register */
 853                 if (arg >= REGN) {
 854                     /* movq %xmm0, j*8(%rsp) */
 855                     gen_offs_sp(0xd60f66, 0x100, arg*8);
 856                 } else {
 857                     /* movaps %xmm0, %xmmN */
 858                     o(0x280f);
 859                     o(0xc0 + (arg << 3));
                         /* the value is also copied to the integer register
                            of the same argument position */
 860                     d = arg_prepare_reg(arg);
 861                     /* mov %xmm0, %rxx */
 862                     o(0x66);
 863                     orex(1,d,0, 0x7e0f);
 864                     o(0xc0 + REG_VALUE(d));
 865                 }
 866             } else {
 867                 if (bt == VT_STRUCT) {
                         /* small struct: retype it as the smallest integer
                            type that is at least as large */
 868                     vtop->type.ref = NULL;
 869                     vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
 870                         : size > 1 ? VT_SHORT : VT_BYTE;
 871                 }
 872
 873                 r = gv(RC_INT);
 874                 if (arg >= REGN) {
 875                     gen_offs_sp(0x89, r, arg*8);
 876                 } else {
 877                     d = arg_prepare_reg(arg);
 878                     orex(1,d,r,0x89); /* mov */
 879                     o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
 880                 }
 881             }
 882         }
 883         vtop--;
 884     }
 885     save_regs(0);
 886
 887     /* Copy R10 and R11 into RCX and RDX, respectively */
 888     if (nb_args > 0) {
 889         o(0xd1894c); /* mov %r10, %rcx */
 890         if (nb_args > 1) {
 891             o(0xda894c); /* mov %r11, %rdx */
 892         }
 893     }
 894
         /* vtop is now the function value: emit the call and pop it */
 895     gcall_or_jmp(0);
 896     vtop--;
 897 }
 898
 899
     /* Number of bytes reserved at the start of every function for the real
        prolog, which gfunc_epilog() writes once the frame size is known.
        Both variants emitted there are exactly this long:
          push %rbp; mov %rsp,%rbp (4) + sub $imm32,%rsp (7), or
          mov $imm32,%eax (5) + call __chkstk (5) + nop (1). */
 900 #define FUNC_PROLOG_SIZE 11
 901
 902 /* generate function prolog of type 't' (PE / Windows x64 variant).
        'func_type' is the type of the function being compiled.  Space for
        the real prolog is only reserved here; this function then emits the
        stores that save the register-passed parameters into their slots at
        positive offsets from %rbp, and declares every parameter as a local
        symbol at that offset. */
 903 void gfunc_prolog(CType *func_type)
 904 {
 905     int addr, reg_param_index, bt, size;
 906     Sym *sym;
 907     CType *type;
 908
 909     func_ret_sub = 0;
 910     func_scratch = 0;
 911     loc = 0;
 912
         /* offset of the first parameter slot from %rbp.
            NOTE(review): presumably skips the saved %rbp and the return
            address -- confirm */
 913     addr = PTR_SIZE * 2;
         /* leave room for the prolog patched in by gfunc_epilog() */
 914     ind += FUNC_PROLOG_SIZE;
 915     func_sub_sp_offset = ind;
 916     reg_param_index = 0;
 917
 918     sym = func_type->ref;
 919
 920     /* if the function returns a structure, then add an
 921        implicit pointer parameter */
 922     func_vt = sym->type;
 923     func_var = (sym->c == FUNC_ELLIPSIS);
 924     size = gfunc_arg_size(&func_vt);
 925     if (size > 8) {
             /* save the hidden pointer (first argument register) and remember
                its slot in func_vc */
 926         gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 927         func_vc = addr;
 928         reg_param_index++;
 929         addr += 8;
 930     }
 931
 932     /* define parameters */
 933     while ((sym = sym->next) != NULL) {
 934         type = &sym->type;
 935         bt = type->t & VT_BTYPE;
 936         size = gfunc_arg_size(type);
 937         if (size > 8) {
                 /* large parameter: its slot holds a reference, hence VT_REF */
 938             if (reg_param_index < REGN) {
 939                 gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 940             }
 941             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL | VT_REF, addr);
 942         } else {
 943             if (reg_param_index < REGN) {
 944                 /* save arguments passed by register */
 945                 if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
                         /* the reg field selects the xmm register whose
                            number equals the parameter position */
 946                     o(0xd60f66); /* movq */
 947                     gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
 948                 } else {
 949                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 950                 }
 951             }
 952             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL, addr);
 953         }
             /* every parameter occupies one 8-byte slot */
 954         addr += 8;
 955         reg_param_index++;
 956     }
 957
         /* for variadic functions, also save the remaining argument registers
            into the slots that follow the named parameters */
 958     while (reg_param_index < REGN) {
 959         if (func_type->ref->c == FUNC_ELLIPSIS) {
 960             gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 961             addr += 8;
 962         }
 963         reg_param_index++;
 964     }
 965 }
 966
 967 /* generate function epilog: emit leave/ret, then go back and fill in
        the FUNC_PROLOG_SIZE bytes reserved at function entry, now that
        the final frame size is known */
 968 void gfunc_epilog(void)
 969 {
 970     int frame_size, body_end;
 971
 972     o(0xc9); /* leave */
 973     if (func_ret_sub != 0) {
 974         o(0xc2); /* ret n */
 975         g(func_ret_sub);
 976         g(func_ret_sub >> 8);
 977     } else {
 978         o(0xc3); /* ret */
 979     }
 980
 981     /* func_scratch plus the locals (-loc), rounded up to a multiple of 16 */
 982     frame_size = (func_scratch + -loc + 15) & -16;
 983     /* remember where the body ends and rewind to the reserved prolog */
 984     body_end = ind;
 985     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
 986     if (frame_size < 4096) {
 987         o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
 988         o(0xec8148);  /* sub rsp, stacksize */
 989         gen_le32(frame_size);
 990     } else {
 991         Sym *chkstk = external_global_sym(TOK___chkstk, &func_old_type, 0);
 992         oad(0xb8, frame_size); /* mov stacksize, %eax */
 993         oad(0xe8, 0); /* call __chkstk, (does the stackframe too) */
 994         greloca(cur_text_section, chkstk, ind-4, R_X86_64_PC32, -4);
 995         o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
 996     }
 997
 998     cur_text_section->data_offset = body_end;
 999     pe_add_unwind_data(ind, body_end, frame_size);
1000     ind = cur_text_section->data_offset;
1001 }
1002
1003 #else
1004
1005 static void gadd_sp(int val)
1006 {
         /* emit `add $val, %rsp`, using the one-byte immediate form when
            'val' survives a round trip through char */
1007     if (val != (char)val) {
1008         oad(0xc48148, val); /* add $imm32, %rsp */
1009     } else {
1010         o(0xc48348); /* add $imm8, %rsp */
1011         g(val);
1012     }
1013 }
1014
1015 typedef enum X86_64_Mode { /* how a value is passed or returned */
1016   x86_64_mode_none, /* nothing to classify (void, or a struct with no members) */
1017   x86_64_mode_memory, /* in memory; the callers below place it on the stack */
1018   x86_64_mode_integer, /* integer, pointer, function, enum types */
1019   x86_64_mode_sse, /* float and double */
1020   x86_64_mode_x87 /* long double */
1021 } X86_64_Mode;
1022
1023 static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b)
1024 {
1025     /* identical classes, or nothing on the other side: keep a */
1026     if (a == b || b == x86_64_mode_none)
1027         return a;
1028     if (a == x86_64_mode_none)
1029         return b;
1030     /* from here on a and b are two different, non-empty classes */
1031     if (a == x86_64_mode_memory || b == x86_64_mode_memory)
1032         return x86_64_mode_memory;
1033     if (a == x86_64_mode_integer || b == x86_64_mode_integer)
1034         return x86_64_mode_integer;
1035     /* x87 combined with a different class ends up in memory */
1036     if (a == x86_64_mode_x87 || b == x86_64_mode_x87)
1037         return x86_64_mode_memory;
1038     return x86_64_mode_sse;
1039 }
1040
1041 static X86_64_Mode classify_x86_64_inner(CType *ty)
1042 {
         /* Classify 'ty' from its basic type alone (no size limit is applied
            here): scalars map directly to a class, a struct gets the merge of
            the classes of all its members, or x86_64_mode_none if the member
            list is empty. */
1043     X86_64_Mode mode;
1044     Sym *f;
1045
1046     switch (ty->t & VT_BTYPE) {
1047     case VT_VOID: return x86_64_mode_none;
1048
1049     case VT_INT:
1050     case VT_BYTE:
1051     case VT_SHORT:
1052     case VT_LLONG:
1053     case VT_BOOL:
1054     case VT_PTR:
1055     case VT_FUNC:
1056     case VT_ENUM: return x86_64_mode_integer;
1057
1058     case VT_FLOAT:
1059     case VT_DOUBLE: return x86_64_mode_sse;
1060
1061     case VT_LDOUBLE: return x86_64_mode_x87;
1062
1063     case VT_STRUCT:
1064         f = ty->ref;
1065
1066         mode = x86_64_mode_none;
             /* the first Sym is skipped (presumably the struct's own symbol);
                the members are the ones reached through ->next */
1067         for (f = f->next; f; f = f->next)
1068             mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
1069
1070         return mode;
1071     }
1072
         /* Unhandled basic type.  The explicit return keeps the function
            well defined when assert() is compiled out (NDEBUG); without it
            control would fall off the end of a non-void function. */
1073     assert(0);
         return x86_64_mode_none;
1074 }
1075
1076 static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
1077 {
         /* Classify how a value of type 'ty' is passed or returned.
              *psize      size rounded up to a multiple of 8
              *palign     alignment rounded up to a multiple of 8
              *reg_count  number of registers needed; written only for the
                          integer, sse and x87 classes, left untouched for
                          memory and none
              ret         if non-NULL, receives the register-level type
                          (ref cleared; t stays 0 for memory and none)
            Returns the class.  Anything larger than 16 bytes is memory. */
1078     X86_64_Mode mode;
1079     int size, align, ret_t = 0;
1080
         /* bit-fields and arrays are treated as one 8-byte integer slot and
            keep their original type flags */
1081     if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
1082         *psize = 8;
1083         *palign = 8;
1084         *reg_count = 1;
1085         ret_t = ty->t;
1086         mode = x86_64_mode_integer;
1087     } else {
1088         size = type_size(ty, &align);
1089         *psize = (size + 7) & ~7;
1090         *palign = (align + 7) & ~7;
1091
1092         if (size > 16) {
1093             mode = x86_64_mode_memory;
1094         } else {
1095             mode = classify_x86_64_inner(ty);
                 /* pick the register count and the type used to load the
                    value: one register up to 8 bytes, a pair above that */
1096             switch (mode) {
1097             case x86_64_mode_integer:
1098                 if (size > 8) {
1099                     *reg_count = 2;
1100                     ret_t = VT_QLONG;
1101                 } else {
1102                     *reg_count = 1;
1103                     ret_t = (size > 4) ? VT_LLONG : VT_INT;
1104                 }
1105                 break;
1106
1107             case x86_64_mode_x87:
1108                 *reg_count = 1;
1109                 ret_t = VT_LDOUBLE;
1110                 break;
1111
1112             case x86_64_mode_sse:
1113                 if (size > 8) {
1114                     *reg_count = 2;
1115                     ret_t = VT_QFLOAT;
1116                 } else {
1117                     *reg_count = 1;
1118                     ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
1119                 }
1120                 break;
1121             default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none*/
1122             }
1123         }
1124     }
1125
1126     if (ret) {
1127         ret->ref = NULL;
1128         ret->t = ret_t;
1129     }
1130
1131     return mode;
1132 }
1133
1134 ST_FUNC int classify_x86_64_va_arg(CType *ty)
1135 {
1136     /* This definition must be synced with stdarg.h */
1137     enum __va_arg_type {
1138         __va_gen_reg, __va_float_reg, __va_stack
1139     };
1140     int arg_size, arg_align, nb_regs;
1141     X86_64_Mode cls;
1142
1143     cls = classify_x86_64_arg(ty, NULL, &arg_size, &arg_align, &nb_regs);
1144     if (cls == x86_64_mode_integer)
1145         return __va_gen_reg;
1146     return (cls == x86_64_mode_sse) ? __va_float_reg : __va_stack;
1147 }
1148
1149 /* Tell how a value of type 'vt' is returned: 0 when it goes through a
1150    hidden struct pointer (memory class), 1 when it comes back in
        register(s), in which case 'ret' receives the register-level type.
        *ret_align is always set to 1 and *regsize to 8; 'variadic' is not
        used. */
1151 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
1152 {
1153     int sz, al, nregs;
1154     *ret_align = 1; /* Never have to re-align return values for x86-64 */
1155     *regsize = 8;
1156     return classify_x86_64_arg(vt, ret, &sz, &al, &nregs) == x86_64_mode_memory ? 0 : 1;
1157 }
1158
1159 #define REGN 6 /* number of integer registers used for arguments */
     /* integer argument registers, indexed by argument register number */
1160 static const uint8_t arg_regs[REGN] = {
1161     TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
1162 };
1163
1164 static int arg_prepare_reg(int idx) {
1165   /* register in which argument slot 'idx' is prepared: slots 2 and 3
1166      (RDX, RCX) are staged in r10 and r11 (idx + 8), every other slot
1167      is loaded straight into its real argument register */
1168   int staged = (idx == 2) | (idx == 3);
1169   return staged ? idx + 8 : arg_regs[idx];
1170 }
1171
1172 /* Generate function call. The function address is pushed first, then
1173    all the parameters in call order. This function pops all the
1174    parameters and the function address.

        vtop[0] is therefore the last argument.  Arguments that have to go
        on the stack are stored first, starting from the last argument;
        the remaining ones are then loaded into registers.  Before the
        call %eax is loaded with the number of SSE registers used (at most
        8), and afterwards the stack space taken by arguments is released. */
1175 void gfunc_call(int nb_args)
1176 {
1177     X86_64_Mode mode;
1178     CType type;
1179     int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
1180     int nb_reg_args = 0;
1181     int nb_sse_args = 0;
1182     int sse_reg, gen_reg;
1183
1184     /* calculate the number of integer/float register arguments */
1185     for(i = 0; i < nb_args; i++) {
1186         mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1187         if (mode == x86_64_mode_sse)
1188             nb_sse_args += reg_count;
1189         else if (mode == x86_64_mode_integer)
1190             nb_reg_args += reg_count;
1191     }
1192
1193     /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
1194        and ended by a 16-byte aligned argument. This is because, from the point of view of
1195        the callee, argument alignment is computed from the bottom up. */
1196     /* for struct arguments, we need to call memcpy and the function
1197        call breaks register passing arguments we are preparing.
1198        So, we process arguments which will be passed by stack first. */
         /* gen_reg/sse_reg count down while arguments are visited from the
            last one to the first; an argument whose running count is still
            above the register limit is passed on the stack */
1199     gen_reg = nb_reg_args;
1200     sse_reg = nb_sse_args;
1201     run_start = 0;
1202     args_size = 0;
1203     while (run_start != nb_args) {
1204         int run_gen_reg = gen_reg, run_sse_reg = sse_reg;
1205
             /* dry run: find where this run ends and how many bytes its
                stack arguments take; the counters are restored afterwards */
1206         run_end = nb_args;
1207         stack_adjust = 0;
1208         for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
1209             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1210             switch (mode) {
1211             case x86_64_mode_memory:
1212             case x86_64_mode_x87:
1213             stack_arg:
1214                 if (align == 16)
1215                     run_end = i;
1216                 else
1217                     stack_adjust += size;
1218                 break;
1219
1220             case x86_64_mode_sse:
1221                 sse_reg -= reg_count;
1222                 if (sse_reg + reg_count > 8) goto stack_arg;
1223                 break;
1224
1225             case x86_64_mode_integer:
1226                 gen_reg -= reg_count;
1227                 if (gen_reg + reg_count > REGN) goto stack_arg;
1228                 break;
1229             default: break; /* nothing to be done for x86_64_mode_none */
1230             }
1231         }
1232
1233         gen_reg = run_gen_reg;
1234         sse_reg = run_sse_reg;
1235
1236         /* adjust stack to align SSE boundary */
             /* only the misalignment (mod 16) of the bytes about to be pushed
                matters; the assignment inside the condition is intentional */
1237         if (stack_adjust &= 15) {
1238             /* fetch cpu flag before the following sub will change the value */
1239             if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
1240                 gv(RC_INT);
1241
1242             stack_adjust = 16 - stack_adjust;
1243             o(0x48);
1244             oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
1245             args_size += stack_adjust;
1246         }
1247
1248         for(i = run_start; i < run_end;) {
1249             /* Swap argument to top, it will possibly be changed here,
1250               and might use more temps. At the end of the loop we keep
1251               it on the stack and swap it back to its original position
1252               if it is a register. */
1253             SValue tmp = vtop[0];
1254             int arg_stored = 1;
1255
1256             vtop[0] = vtop[-i];
1257             vtop[-i] = tmp;
1258             mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);
1259
1260             switch (vtop->type.t & VT_BTYPE) {
1261             case VT_STRUCT:
1262                 if (mode == x86_64_mode_sse) {
1263                     if (sse_reg > 8)
1264                         sse_reg -= reg_count;
1265                     else
1266                         arg_stored = 0;
1267                 } else if (mode == x86_64_mode_integer) {
1268                     if (gen_reg > REGN)
1269                         gen_reg -= reg_count;
1270                     else
1271                         arg_stored = 0;
1272                 }
1273
1274                 if (arg_stored) {
1275                     /* allocate the necessary size on stack */
1276                     o(0x48);
1277                     oad(0xec81, size); /* sub $xxx, %rsp */
1278                     /* generate structure store */
1279                     r = get_reg(RC_INT);
1280                     orex(1, r, 0, 0x89); /* mov %rsp, r */
1281                     o(0xe0 + REG_VALUE(r));
1282                     vset(&vtop->type, r | VT_LVAL, 0);
1283                     vswap();
1284                     vstore();
1285                     args_size += size;
1286                 }
1287                 break;
1288
1289             case VT_LDOUBLE:
                     /* not expected inside a run; long double arguments are
                        stored by the 16-byte aligned loop further down */
1290                 assert(0);
1291                 break;
1292
1293             case VT_FLOAT:
1294             case VT_DOUBLE:
1295                 assert(mode == x86_64_mode_sse);
1296                 if (sse_reg > 8) {
1297                     --sse_reg;
1298                     r = gv(RC_FLOAT);
1299                     o(0x50); /* push %rax (reserves the 8-byte slot written below) */
1300                     /* movq %xmmN, (%rsp) */
1301                     o(0xd60f66);
1302                     o(0x04 + REG_VALUE(r)*8);
1303                     o(0x24);
1304                     args_size += size;
1305                 } else {
1306                     arg_stored = 0;
1307                 }
1308                 break;
1309
1310             default:
1311                 assert(mode == x86_64_mode_integer);
1312                 /* simple type */
1313                 /* XXX: implicit cast ? */
1314                 if (gen_reg > REGN) {
1315                     --gen_reg;
1316                     r = gv(RC_INT);
1317                     orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
1318                     args_size += size;
1319                 } else {
1320                     arg_stored = 0;
1321                 }
1322                 break;
1323             }
1324
1325             /* And swap the argument back to its original position.  */
1326             tmp = vtop[0];
1327             vtop[0] = vtop[-i];
1328             vtop[-i] = tmp;
1329
                 /* a stored argument is removed from the value stack, so the
                    same index i now refers to the next argument */
1330             if (arg_stored) {
1331               vrotb(i+1);
1332               assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
1333               vpop();
1334               --nb_args;
1335               --run_end;
1336             } else {
1337               ++i;
1338             }
1339         }
1340
1341         /* handle 16 byte aligned arguments at end of run */
1342         run_start = i = run_end;
1343         while (i < nb_args) {
1344             /* Rotate argument to top since it will always be popped */
1345             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1346             if (align != 16)
1347               break;
1348
1349             vrotb(i+1);
1350
1351             if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1352                 gv(RC_ST0);
1353                 oad(0xec8148, size); /* sub $xxx, %rsp */
1354                 o(0x7cdb); /* fstpt 0(%rsp) */
1355                 g(0x24);
1356                 g(0x00);
1357                 args_size += size;
1358             } else {
1359                 assert(mode == x86_64_mode_memory);
1360
1361                 /* allocate the necessary size on stack */
1362                 o(0x48);
1363                 oad(0xec81, size); /* sub $xxx, %rsp */
1364                 /* generate structure store */
1365                 r = get_reg(RC_INT);
1366                 orex(1, r, 0, 0x89); /* mov %rsp, r */
1367                 o(0xe0 + REG_VALUE(r));
1368                 vset(&vtop->type, r | VT_LVAL, 0);
1369                 vswap();
1370                 vstore();
1371                 args_size += size;
1372             }
1373
1374             vpop();
1375             --nb_args;
1376         }
1377     }
1378
1379     /* XXX This should be superfluous.  */
1380     save_regs(0); /* save used temporary registers */
1381
1382     /* then, we prepare register passing arguments.
1383        Note that we cannot set RDX and RCX in this loop because gv()
1384        may break these temporary registers. Let's use R10 and R11
1385        instead of them */
1386     assert(gen_reg <= REGN);
1387     assert(sse_reg <= 8);
         /* only register arguments are left on the value stack; each one is
            loaded from vtop and then dropped */
1388     for(i = 0; i < nb_args; i++) {
1389         mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
1390         /* Alter stack entry type so that gv() knows how to treat it */
1391         vtop->type = type;
1392         if (mode == x86_64_mode_sse) {
1393             if (reg_count == 2) {
1394                 sse_reg -= 2;
1395                 gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
1396                 if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
1397                     /* movaps %xmm0, %xmmN */
1398                     o(0x280f);
1399                     o(0xc0 + (sse_reg << 3));
1400                     /* movaps %xmm1, %xmmN */
1401                     o(0x280f);
1402                     o(0xc1 + ((sse_reg+1) << 3));
1403                 }
1404             } else {
1405                 assert(reg_count == 1);
1406                 --sse_reg;
1407                 /* Load directly to register */
1408                 gv(RC_XMM0 << sse_reg);
1409             }
1410         } else if (mode == x86_64_mode_integer) {
1411             /* simple type */
1412             /* XXX: implicit cast ? */
1413             int d;
1414             gen_reg -= reg_count;
1415             r = gv(RC_INT);
1416             d = arg_prepare_reg(gen_reg);
1417             orex(1,d,r,0x89); /* mov */
1418             o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
1419             if (reg_count == 2) {
                     /* second half of a two-register value lives in r2 */
1420                 d = arg_prepare_reg(gen_reg+1);
1421                 orex(1,d,vtop->r2,0x89); /* mov */
1422                 o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
1423             }
1424         }
1425         vtop--;
1426     }
1427     assert(gen_reg == 0);
1428     assert(sse_reg == 0);
1429
1430     /* We shouldn't have many operands on the stack anymore, but the
1431        call address itself is still there, and it might be in %eax
1432        (or edx/ecx) currently, which the below writes would clobber.
1433        So evict all remaining operands here.  */
1434     save_regs(0);
1435
1436     /* Copy R10 and R11 into RDX and RCX, respectively */
1437     if (nb_reg_args > 2) {
1438         o(0xd2894c); /* mov %r10, %rdx */
1439         if (nb_reg_args > 3) {
1440             o(0xd9894c); /* mov %r11, %rcx */
1441         }
1442     }
1443
1444     oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
1445     gcall_or_jmp(0);
1446     if (args_size)
1447         gadd_sp(args_size);
         /* drop the function address */
1448     vtop--;
1449 }
1450
1451
1452 #define FUNC_PROLOG_SIZE 11 /* bytes reserved at function entry and filled in by gfunc_epilog(): push %rbp; mov %rsp,%rbp (4) + sub $imm32,%rsp (7) */
1453
1454 static void push_arg_reg(int i) {
         /* reserve a new 8-byte local slot (loc moves down) and store the
            i-th integer argument register into it */
1455     loc -= 8;
1456     gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
1457 }
1458
1459 /* generate function prolog of type 't'.
        Reserves FUNC_PROLOG_SIZE bytes for the frame set-up (filled in by
        gfunc_epilog()), saves the register arguments into local slots and
        declares every parameter as a local lvalue.  For variadic functions
        it first records how many argument registers and how much stack the
        named parameters use, and dumps all argument registers to the frame. */
1460 void gfunc_prolog(CType *func_type)
1461 {
1462     X86_64_Mode mode;
1463     int i, addr, align, size, reg_count;
1464     int param_addr = 0, reg_param_index, sse_param_index;
1465     Sym *sym;
1466     CType *type;
1467
1468     sym = func_type->ref;
         /* stack parameters start above the saved frame pointer and the
            return address */
1469     addr = PTR_SIZE * 2;
1470     loc = 0;
1471     ind += FUNC_PROLOG_SIZE;
1472     func_sub_sp_offset = ind;
1473     func_ret_sub = 0;
1474
1475     if (func_type->ref->c == FUNC_ELLIPSIS) {
1476         int seen_reg_num, seen_sse_num, seen_stack_size;
1477         seen_reg_num = seen_sse_num = 0;
1478         /* frame pointer and return address */
1479         seen_stack_size = PTR_SIZE * 2;
1480         /* count the number of seen parameters */
             /* this must agree with the "define parameters" loop below: a
                parameter uses registers only if all of its registers still
                fit (REGN integer, 8 SSE); otherwise it lives on the stack
                and the register counters are left unchanged */
1481         sym = func_type->ref;
1482         while ((sym = sym->next) != NULL) {
1483             type = &sym->type;
1484             mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1485             switch (mode) {
1486             default:
1487             stack_arg:
1488                 seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
1489                 break;
1490
1491             case x86_64_mode_integer:
1492                 if (seen_reg_num + reg_count <= REGN) {
1493                     seen_reg_num += reg_count;
1494                 } else {
1495                     /* too big for the registers left: goes on the stack */
1496                     goto stack_arg;
1497                 }
1498                 break;
1499
1500             case x86_64_mode_sse:
1501                 if (seen_sse_num + reg_count <= 8) {
1502                     seen_sse_num += reg_count;
1503                 } else {
1504                     /* too big for the registers left: goes on the stack */
1505                     goto stack_arg;
1506                 }
1507                 break;
1508             }
1509         }
1510
             /* three 32-bit values at the top of the frame: bytes of integer
                registers used, offset of the first unused SSE slot (the 48
                skips the six 8-byte integer slots), and offset of the first
                unnamed stack argument (presumably read by the va_start
                implementation in stdarg.h -- confirm there) */
1511         loc -= 16;
1512         /* movl $0x????????, -0x10(%rbp) */
1513         o(0xf045c7);
1514         gen_le32(seen_reg_num * 8);
1515         /* movl $0x????????, -0xc(%rbp) */
1516         o(0xf445c7);
1517         gen_le32(seen_sse_num * 16 + 48);
1518         /* movl $0x????????, -0x8(%rbp) */
1519         o(0xf845c7);
1520         gen_le32(seen_stack_size);
1521
1522         /* save all register passing arguments */
             /* xmm7 down to xmm0, 16 bytes each with the upper half zeroed,
                followed by the integer registers from the last to the first */
1523         for (i = 0; i < 8; i++) {
1524             loc -= 16;
1525             o(0xd60f66); /* movq */
1526             gen_modrm(7 - i, VT_LOCAL, NULL, loc);
1527             /* movq $0, loc+8(%rbp) */
1528             o(0x85c748);
1529             gen_le32(loc + 8);
1530             gen_le32(0);
1531         }
1532         for (i = 0; i < REGN; i++) {
1533             push_arg_reg(REGN-1-i);
1534         }
1535     }
1536
1537     sym = func_type->ref;
1538     reg_param_index = 0;
1539     sse_param_index = 0;
1540
1541     /* if the function returns a structure, then add an
1542        implicit pointer parameter */
1543     func_vt = sym->type;
1544     mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
1545     if (mode == x86_64_mode_memory) {
1546         push_arg_reg(reg_param_index);
1547         func_vc = loc;
1548         reg_param_index++;
1549     }
1550     /* define parameters */
1551     while ((sym = sym->next) != NULL) {
1552         type = &sym->type;
1553         mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1554         switch (mode) {
1555         case x86_64_mode_sse:
1556             if (sse_param_index + reg_count <= 8) {
1557                 /* save arguments passed by register */
1558                 loc -= reg_count * 8;
1559                 param_addr = loc;
1560                 for (i = 0; i < reg_count; ++i) {
1561                     o(0xd60f66); /* movq */
1562                     gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
1563                     ++sse_param_index;
1564                 }
1565             } else {
1566                 addr = (addr + align - 1) & -align;
1567                 param_addr = addr;
1568                 addr += size;
1569             }
1570             break;
1571
1572         case x86_64_mode_memory:
1573         case x86_64_mode_x87:
                 /* already on the caller's stack: just compute its address */
1574             addr = (addr + align - 1) & -align;
1575             param_addr = addr;
1576             addr += size;
1577             break;
1578
1579         case x86_64_mode_integer: {
1580             if (reg_param_index + reg_count <= REGN) {
1581                 /* save arguments passed by register */
1582                 loc -= reg_count * 8;
1583                 param_addr = loc;
1584                 for (i = 0; i < reg_count; ++i) {
1585                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
1586                     ++reg_param_index;
1587                 }
1588             } else {
1589                 addr = (addr + align - 1) & -align;
1590                 param_addr = addr;
1591                 addr += size;
1592             }
1593             break;
1594         }
1595         default: break; /* nothing to be done for x86_64_mode_none */
1596         }
1597         sym_push(sym->v & ~SYM_FIELD, type,
1598                  VT_LOCAL | VT_LVAL, param_addr);
1599     }
1600
1601 #ifdef CONFIG_TCC_BCHECK
1602     /* leave some room for bound checking code */
1603     if (tcc_state->do_bounds_check) {
1604         func_bound_offset = lbounds_section->data_offset;
1605         func_bound_ind = ind;
1606         oad(0xb8, 0); /* lbound section pointer */
1607         o(0xc78948);  /* mov  %rax,%rdi ## first arg in %rdi, this must be ptr */
1608         oad(0xb8, 0); /* call to function */
1609     }
1610 #endif
1611 }
1612
1613 /* generate function epilog: emit the optional bound-checking calls and
        leave/ret, then go back and write the frame set-up into the
        FUNC_PROLOG_SIZE bytes reserved by gfunc_prolog() */
1614 void gfunc_epilog(void)
1615 {
1616     int v, saved_ind;
1617
1618 #ifdef CONFIG_TCC_BCHECK
         /* only when this function added entries to the local bounds section */
1619     if (tcc_state->do_bounds_check
1620         && func_bound_offset != lbounds_section->data_offset)
1621     {
             /* NOTE(review): this saved_ind shadows the outer one declared
                above; the outer variable is only assigned further down */
1622         addr_t saved_ind;
1623         addr_t *bounds_ptr;
1624         Sym *sym_data;
1625
1626         /* add end of table info */
1627         bounds_ptr = section_ptr_add(lbounds_section, sizeof(addr_t));
1628         *bounds_ptr = 0;
1629
1630         /* generate bound local allocation */
1631         sym_data = get_sym_ref(&char_pointer_type, lbounds_section,
1632                                func_bound_offset, lbounds_section->data_offset);
             /* patch the room left by gfunc_prolog(): the relocation targets
                the immediate of its first mov, and the call is placed after
                the 5 + 3 bytes of the two instructions before it */
             /* NOTE(review): R_386_32 is an i386 relocation name used in
                x86-64 code here and below -- confirm it is the intended type */
1633         saved_ind = ind;
1634         ind = func_bound_ind;
1635         greloc(cur_text_section, sym_data, ind + 1, R_386_32);
1636         ind = ind + 5 + 3;
1637         gen_static_call(TOK___bound_local_new);
1638         ind = saved_ind;
1639
1640         /* generate bound check local freeing */
1641         o(0x5250); /* save returned value, if any */
1642         greloc(cur_text_section, sym_data, ind + 1, R_386_32);
1643         oad(0xb8, 0); /* mov xxx, %rax */
1644         o(0xc78948);  /* mov  %rax,%rdi ## first arg in %rdi, this must be ptr */
1645         gen_static_call(TOK___bound_local_delete);
1646         o(0x585a); /* restore returned value, if any */
1647     }
1648 #endif
1649     o(0xc9); /* leave */
1650     if (func_ret_sub == 0) {
1651         o(0xc3); /* ret */
1652     } else {
1653         o(0xc2); /* ret n */
1654         g(func_ret_sub);
1655         g(func_ret_sub >> 8);
1656     }
1657     /* align local size to word & save local variables */
         /* i.e. round the space used by locals (-loc) up to a multiple of 16 */
1658     v = (-loc + 15) & -16;
         /* rewind to the reserved prolog bytes, emit them, then continue
            after the function body */
1659     saved_ind = ind;
1660     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
1661     o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
1662     o(0xec8148);  /* sub rsp, stacksize */
1663     gen_le32(v);
1664     ind = saved_ind;
1665 }
1666
1667 #endif /* not PE */
1668
1669 /* generate a jump to a label: emits opcode 0xe9 (jmp rel32) through
        psym() with 't' as the symbol/jump-list argument and returns
        whatever psym() returns */
1670 int gjmp(int t)
1671 {
1672     return psym(0xe9, t);
1673 }
1674
1675 /* generate a jump to the fixed code offset 'a', using the 2-byte short
        form when the displacement fits in a signed byte */
1676 void gjmp_addr(int a)
1677 {
1678     int short_disp = a - ind - 2; /* relative to the end of a 2-byte jmp */
1679
1680     if (short_disp != (char)short_disp) {
1681         oad(0xe9, a - ind - 5); /* jmp rel32, 5 bytes long */
1682     } else {
1683         g(0xeb); /* jmp rel8 */
1684         g(short_disp);
1685     }
1686 }
1687
1688 /* generate a test. set 'inv' to invert test. Stack entry is popped.
        't' is the jump list to extend; the updated list is returned.
        Only VT_CMP and VT_JMP/VT_JMPI entries generate code here; any
        other kind of entry is simply popped and 't' returned unchanged. */
1689 int gtst(int inv, int t)
1690 {
1691     int v = vtop->r & VT_VALMASK;
1692     if (v == VT_CMP) {
1693         /* fast case : can jump directly since flags are set */
1694         if (vtop->c.i & 0x100)
1695           {
1696             /* This was a float compare.  If the parity flag is set
1697                the result was unordered.  For anything except != this
1698                means false and we don't jump (anding both conditions).
1699                For != this means true (oring both).
1700                Take care about inverting the test.  We need to jump
1701                to our target if the result was unordered and test wasn't NE,
1702                otherwise if unordered we don't want to jump.  */
1703             vtop->c.i &= ~0x100;
1704             if (inv == (vtop->c.i == TOK_NE))
                   /* skip over the 6-byte conditional jump emitted below */
1705               o(0x067a);  /* jp +6 */
1706             else
1707               {
1708                 g(0x0f);
1709                 t = psym(0x8a, t); /* jp t */
1710               }
1711           }
             /* 0x0f-prefixed conditional jump; the opcode byte is derived
                from the comparison token, and xor with 'inv' flips the
                condition */
1712         g(0x0f);
1713         t = psym((vtop->c.i - 16) ^ inv, t);
1714     } else if (v == VT_JMP || v == VT_JMPI) {
1715         /* && or || optimization */
1716         if ((v & 1) == inv) {
1717             /* insert vtop->c jump list in t */
                 /* walk to the end of the list stored in the code (each entry
                    holds the offset of the next, 0 terminates) and chain t */
1718             uint32_t n1, n = vtop->c.i;
1719             if (n) {
1720                 while ((n1 = read32le(cur_text_section->data + n)))
1721                     n = n1;
1722                 write32le(cur_text_section->data + n, t);
1723                 t = vtop->c.i;
1724             }
1725         } else {
1726             t = gjmp(t);
1727             gsym(vtop->c.i);
1728         }
1729     }
1730     vtop--;
1731     return t;
1732 }
1733
1734 /* generate an integer binary operation 'vtop[-1] op vtop[0]'; both
        operands are replaced by one result entry.  Comparison operators
        emit a cmp and leave a VT_CMP entry holding the operator. */
1735 void gen_opi(int op)
1736 {
1737     int r, fr, opc, c;
1738     int ll, uu, cc;
1739
         /* ll: 64-bit operation, uu: unsigned operand,
            cc: second operand is a plain constant (no lvalue, no symbol) */
1740     ll = is64_type(vtop[-1].type.t);
1741     uu = (vtop[-1].type.t & VT_UNSIGNED) != 0;
1742     cc = (vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
1743
1744     switch(op) {
1745     case '+':
1746     case TOK_ADDC1: /* add with carry generation */
1747         opc = 0;
1748     gen_op8:
             /* shared by add/or/adc/sbb/and/sub/xor/cmp: opc selects the
                operation.  The immediate form is used only when the constant
                fits in a sign-extended 32-bit immediate. */
1749         if (cc && (!ll || (int)vtop->c.i == vtop->c.i)) {
1750             /* constant case */
1751             vswap();
1752             r = gv(RC_INT);
1753             vswap();
1754             c = vtop->c.i;
1755             if (c == (char)c) {
1756                 /* XXX: generate inc and dec for smaller code ? */
1757                 orex(ll, r, 0, 0x83);
1758                 o(0xc0 | (opc << 3) | REG_VALUE(r));
1759                 g(c);
1760             } else {
1761                 orex(ll, r, 0, 0x81);
1762                 oad(0xc0 | (opc << 3) | REG_VALUE(r), c);
1763             }
1764         } else {
1765             gv2(RC_INT, RC_INT);
1766             r = vtop[-1].r;
1767             fr = vtop[0].r;
1768             orex(ll, r, fr, (opc << 3) | 0x01);
1769             o(0xc0 + REG_VALUE(r) + REG_VALUE(fr) * 8);
1770         }
1771         vtop--;
             /* a comparison leaves its result in the flags */
1772         if (op >= TOK_ULT && op <= TOK_GT) {
1773             vtop->r = VT_CMP;
1774             vtop->c.i = op;
1775         }
1776         break;
1777     case '-':
1778     case TOK_SUBC1: /* sub with carry generation */
1779         opc = 5;
1780         goto gen_op8;
1781     case TOK_ADDC2: /* add with carry use */
1782         opc = 2;
1783         goto gen_op8;
1784     case TOK_SUBC2: /* sub with carry use */
1785         opc = 3;
1786         goto gen_op8;
1787     case '&':
1788         opc = 4;
1789         goto gen_op8;
1790     case '^':
1791         opc = 6;
1792         goto gen_op8;
1793     case '|':
1794         opc = 1;
1795         goto gen_op8;
1796     case '*':
1797         gv2(RC_INT, RC_INT);
1798         r = vtop[-1].r;
1799         fr = vtop[0].r;
1800         orex(ll, fr, r, 0xaf0f); /* imul fr, r */
1801         o(0xc0 + REG_VALUE(fr) + REG_VALUE(r) * 8);
1802         vtop--;
1803         break;
1804     case TOK_SHL:
1805         opc = 4;
1806         goto gen_shift;
1807     case TOK_SHR:
1808         opc = 5;
1809         goto gen_shift;
1810     case TOK_SAR:
1811         opc = 7;
1812     gen_shift:
             /* from here on opc is the complete ModRM byte without the
                register number */
1813         opc = 0xc0 | (opc << 3);
1814         if (cc) {
1815             /* constant case */
1816             vswap();
1817             r = gv(RC_INT);
1818             vswap();
1819             orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
1820             o(opc | REG_VALUE(r));
                 /* the count is masked to the operand width */
1821             g(vtop->c.i & (ll ? 63 : 31));
1822         } else {
1823             /* we generate the shift in ecx */
1824             gv2(RC_INT, RC_RCX);
1825             r = vtop[-1].r;
1826             orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
1827             o(opc | REG_VALUE(r));
1828         }
1829         vtop--;
1830         break;
1831     case TOK_UDIV:
1832     case TOK_UMOD:
1833         uu = 1;
1834         goto divmod;
1835     case '/':
1836     case '%':
1837     case TOK_PDIV:
1838         uu = 0;
1839     divmod:
1840         /* first operand must be in eax */
1841         /* XXX: need better constraint for second operand */
1842         gv2(RC_RAX, RC_RCX);
1843         r = vtop[-1].r;
1844         fr = vtop[0].r;
1845         vtop--;
             /* rdx is overwritten below, so spill whatever it holds */
1846         save_reg(TREG_RDX);
1847         orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cqto */
1848         orex(ll, fr, 0, 0xf7); /* div fr, %eax */
1849         o((uu ? 0xf0 : 0xf8) + REG_VALUE(fr));
             /* the remainder is taken from rdx, the quotient from rax */
1850         if (op == '%' || op == TOK_UMOD)
1851             r = TREG_RDX;
1852         else
1853             r = TREG_RAX;
1854         vtop->r = r;
1855         break;
1856     default:
             /* every remaining operator is handled as a compare */
1857         opc = 7;
1858         goto gen_op8;
1859     }
1860 }
1861
1862 void gen_opl(int op)
1863 {
         /* no separate code path here: forwards to gen_opi(), which picks
            the operand width itself */
1864     gen_opi(op);
1865 }
1866
1867 /* generate a floating point operation 'v = t1 op t2' instruction. The
1868    two operands are guaranteed to have the same floating point type */
1869 /* XXX: need to use ST1 too */
1870 void gen_opf(int op)
1871 {
1872     int a, ft, fc, swapped, r;
1873     int float_type =
1874         (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;
1875
1876     /* convert constants to memory references */
1877     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
1878         vswap();
1879         gv(float_type);
1880         vswap();
1881     }
1882     if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
1883         gv(float_type);
1884
1885     /* must put at least one value in the floating point register */
1886     if ((vtop[-1].r & VT_LVAL) &&
1887         (vtop[0].r & VT_LVAL)) {
1888         vswap();
1889         gv(float_type);
1890         vswap();
1891     }
1892     swapped = 0;
1893     /* swap the stack if needed so that t1 is the register and t2 is
1894        the memory reference */
1895     if (vtop[-1].r & VT_LVAL) {
1896         vswap();
1897         swapped = 1;
1898     }
1899     if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1900         if (op >= TOK_ULT && op <= TOK_GT) {
1901             /* load on stack second operand */
1902             load(TREG_ST0, vtop);
1903             save_reg(TREG_RAX); /* eax is used by FP comparison code */
1904             if (op == TOK_GE || op == TOK_GT)
1905                 swapped = !swapped;
1906             else if (op == TOK_EQ || op == TOK_NE)
1907                 swapped = 0;
1908             if (swapped)
1909                 o(0xc9d9); /* fxch %st(1) */
1910             if (op == TOK_EQ || op == TOK_NE)
1911                 o(0xe9da); /* fucompp */
1912             else
1913                 o(0xd9de); /* fcompp */
1914             o(0xe0df); /* fnstsw %ax */
1915             if (op == TOK_EQ) {
1916                 o(0x45e480); /* and $0x45, %ah */
1917                 o(0x40fC80); /* cmp $0x40, %ah */
1918             } else if (op == TOK_NE) {
1919                 o(0x45e480); /* and $0x45, %ah */
1920                 o(0x40f480); /* xor $0x40, %ah */
1921                 op = TOK_NE;
1922             } else if (op == TOK_GE || op == TOK_LE) {
1923                 o(0x05c4f6); /* test $0x05, %ah */
1924                 op = TOK_EQ;
1925             } else {
1926                 o(0x45c4f6); /* test $0x45, %ah */
1927                 op = TOK_EQ;
1928             }
1929             vtop--;
1930             vtop->r = VT_CMP;
1931             vtop->c.i = op;
1932         } else {
1933             /* no memory reference possible for long double operations */
1934             load(TREG_ST0, vtop);
1935             swapped = !swapped;
1936
1937             switch(op) {
1938             default:
1939             case '+':
1940                 a = 0;
1941                 break;
1942             case '-':
1943                 a = 4;
1944                 if (swapped)
1945                     a++;
1946                 break;
1947             case '*':
1948                 a = 1;
1949                 break;
1950             case '/':
1951                 a = 6;
1952                 if (swapped)
1953                     a++;
1954                 break;
1955             }
1956             ft = vtop->type.t;
1957             fc = vtop->c.i;
1958             o(0xde); /* fxxxp %st, %st(1) */
1959             o(0xc1 + (a << 3));
1960             vtop--;
1961         }
1962     } else {
1963         if (op >= TOK_ULT && op <= TOK_GT) {
1964             /* if saved lvalue, then we must reload it */
1965             r = vtop->r;
1966             fc = vtop->c.i;
1967             if ((r & VT_VALMASK) == VT_LLOCAL) {
1968                 SValue v1;
1969                 r = get_reg(RC_INT);
1970                 v1.type.t = VT_PTR;
1971                 v1.r = VT_LOCAL | VT_LVAL;
1972                 v1.c.i = fc;
1973                 load(r, &v1);
1974                 fc = 0;
1975             }
1976
1977             if (op == TOK_EQ || op == TOK_NE) {
1978                 swapped = 0;
1979             } else {
1980                 if (op == TOK_LE || op == TOK_LT)
1981                     swapped = !swapped;
1982                 if (op == TOK_LE || op == TOK_GE) {
1983                     op = 0x93; /* setae */
1984                 } else {
1985                     op = 0x97; /* seta */
1986                 }
1987             }
1988
1989             if (swapped) {
1990                 gv(RC_FLOAT);
1991                 vswap();
1992             }
1993             assert(!(vtop[-1].r & VT_LVAL));
1994
1995             if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
1996                 o(0x66);
1997             if (op == TOK_EQ || op == TOK_NE)
1998                 o(0x2e0f); /* ucomisd */
1999             else
2000                 o(0x2f0f); /* comisd */
2001
2002             if (vtop->r & VT_LVAL) {
2003                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
2004             } else {
2005                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
2006             }
2007
2008             vtop--;
2009             vtop->r = VT_CMP;
2010             vtop->c.i = op | 0x100;
2011         } else {
2012             assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
2013             switch(op) {
2014             default:
2015             case '+':
2016                 a = 0;
2017                 break;
2018             case '-':
2019                 a = 4;
2020                 break;
2021             case '*':
2022                 a = 1;
2023                 break;
2024             case '/':
2025                 a = 6;
2026                 break;
2027             }
2028             ft = vtop->type.t;
2029             fc = vtop->c.i;
2030             assert((ft & VT_BTYPE) != VT_LDOUBLE);
2031
2032             r = vtop->r;
2033             /* if saved lvalue, then we must reload it */
2034             if ((vtop->r & VT_VALMASK) == VT_LLOCAL) {
2035                 SValue v1;
2036                 r = get_reg(RC_INT);
2037                 v1.type.t = VT_PTR;
2038                 v1.r = VT_LOCAL | VT_LVAL;
2039                 v1.c.i = fc;
2040                 load(r, &v1);
2041                 fc = 0;
2042             }
2043
2044             assert(!(vtop[-1].r & VT_LVAL));
2045             if (swapped) {
2046                 assert(vtop->r & VT_LVAL);
2047                 gv(RC_FLOAT);
2048                 vswap();
2049             }
2050
2051             if ((ft & VT_BTYPE) == VT_DOUBLE) {
2052                 o(0xf2);
2053             } else {
2054                 o(0xf3);
2055             }
2056             o(0x0f);
2057             o(0x58 + a);
2058
2059             if (vtop->r & VT_LVAL) {
2060                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
2061             } else {
2062                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
2063             }
2064
2065             vtop--;
2066         }
2067     }
2068 }
2069
2070 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
2071    and 'long long' cases.
        The integer on top of the value stack is converted in place: on
        return vtop->r names the register holding the result (TREG_ST0 when
        't' is long double, otherwise a register obtained from
        get_reg(RC_FLOAT)). */
2072 void gen_cvt_itof(int t)
2073 {
2074     if ((t & VT_BTYPE) == VT_LDOUBLE) {
             /* x87 path: the integer is pushed on the machine stack, loaded
                from there with fild, and %rsp is restored afterwards */
2075         save_reg(TREG_ST0);
2076         gv(RC_INT);
2077         if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
2078             /* signed long long to float/double/long double (unsigned case
2079                is handled generically) */
2080             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2081             o(0x242cdf); /* fildll (%rsp) */
2082             o(0x08c48348); /* add $8, %rsp */
2083         } else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2084                    (VT_INT | VT_UNSIGNED)) {
2085             /* unsigned int to float/double/long double */
                 /* a zero is pushed before the register and the pair is
                    loaded with the 64-bit fildll; both pushes are undone by
                    the add $16 */
2086             o(0x6a); /* push $0 */
2087             g(0x00);
2088             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2089             o(0x242cdf); /* fildll (%rsp) */
2090             o(0x10c48348); /* add $16, %rsp */
2091         } else {
2092             /* int to float/double/long double */
2093             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2094             o(0x2404db); /* fildl (%rsp) */
2095             o(0x08c48348); /* add $8, %rsp */
2096         }
2097         vtop->r = TREG_ST0;
2098     } else {
             /* SSE path: the destination float register is reserved before
                the integer operand is loaded */
2099         int r = get_reg(RC_FLOAT);
2100         gv(RC_INT);
             /* prefix byte: 0xf2, or 0xf3 when the target type is float */
2101         o(0xf2 + ((t & VT_BTYPE) == VT_FLOAT?1:0));
             /* the REX byte 0x48 (64-bit source operand) is emitted for
                unsigned int and long long sources */
2102         if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2103             (VT_INT | VT_UNSIGNED) ||
2104             (vtop->type.t & VT_BTYPE) == VT_LLONG) {
2105             o(0x48); /* REX */
2106         }
2107         o(0x2a0f);
2108         o(0xc0 + (vtop->r & VT_VALMASK) + REG_VALUE(r)*8); /* cvtsi2sd */
2109         vtop->r = r;
2110     }
2111 }
2112
2113 /* convert from one floating point type to another */
2114 void gen_cvt_ftof(int t)
2115 {
         /* Converts the value on top of the value stack from its current
            floating point type (bt) to the basic type of 't' (tbt). When a
            conversion is emitted, vtop->r is updated to the register that
            holds the result; combinations not listed below emit nothing.
            Transfers between an SSE register and the x87 stack go through
            the scratch slot at -0x10(%rsp); %rsp itself is not adjusted. */
2116     int ft, bt, tbt;
2117
2118     ft = vtop->type.t;
2119     bt = ft & VT_BTYPE;
2120     tbt = t & VT_BTYPE;
2121
2122     if (bt == VT_FLOAT) {
2123         gv(RC_FLOAT);
2124         if (tbt == VT_DOUBLE) {
                 /* float -> double, source and destination are the same
                    register (ModRM 0xc0 + reg*9) */
2125             o(0x140f); /* unpcklps */
2126             o(0xc0 + REG_VALUE(vtop->r)*9);
2127             o(0x5a0f); /* cvtps2pd */
2128             o(0xc0 + REG_VALUE(vtop->r)*9);
2129         } else if (tbt == VT_LDOUBLE) {
                 /* float -> long double: spill whatever currently lives in
                    ST0 first. save_reg() takes a register index (as in the
                    other callers in this file), not a register class, hence
                    TREG_ST0 rather than RC_ST0. */
2130             save_reg(TREG_ST0);
2131             /* movss %xmm0,-0x10(%rsp) */
2132             o(0x110ff3);
2133             o(0x44 + REG_VALUE(vtop->r)*8);
2134             o(0xf024);
2135             o(0xf02444d9); /* flds -0x10(%rsp) */
2136             vtop->r = TREG_ST0;
2137         }
2138     } else if (bt == VT_DOUBLE) {
2139         gv(RC_FLOAT);
2140         if (tbt == VT_FLOAT) {
                 /* double -> float, in place in the same register */
2141             o(0x140f66); /* unpcklpd */
2142             o(0xc0 + REG_VALUE(vtop->r)*9);
2143             o(0x5a0f66); /* cvtpd2ps */
2144             o(0xc0 + REG_VALUE(vtop->r)*9);
2145         } else if (tbt == VT_LDOUBLE) {
                 /* double -> long double: spill ST0 (register index, see
                    the float case), then go through memory */
2146             save_reg(TREG_ST0);
2147             /* movsd %xmm0,-0x10(%rsp) */
2148             o(0x110ff2);
2149             o(0x44 + REG_VALUE(vtop->r)*8);
2150             o(0xf024);
2151             o(0xf02444dd); /* fldl -0x10(%rsp) */
2152             vtop->r = TREG_ST0;
2153         }
2154     } else {
             /* any other source type is loaded as a long double into ST0;
                the destination float register is reserved even when no
                conversion ends up being emitted */
2155         int r;
2156         gv(RC_ST0);
2157         r = get_reg(RC_FLOAT);
2158         if (tbt == VT_DOUBLE) {
2159             o(0xf0245cdd); /* fstpl -0x10(%rsp) */
2160             /* movsd -0x10(%rsp),%xmm0 */
2161             o(0x100ff2);
2162             o(0x44 + REG_VALUE(r)*8);
2163             o(0xf024);
2164             vtop->r = r;
2165         } else if (tbt == VT_FLOAT) {
2166             o(0xf0245cd9); /* fstps -0x10(%rsp) */
2167             /* movss -0x10(%rsp),%xmm0 */
2168             o(0x100ff3);
2169             o(0x44 + REG_VALUE(r)*8);
2170             o(0xf024);
2171             vtop->r = r;
2172         }
2173     }
2174 }
2175
2176 /* convert fp to int 't' type */
2177 void gen_cvt_ftoi(int t)
2178 {
         /* The floating point value on top of the value stack is replaced
            by its truncated integer value; on return vtop->r is the integer
            register obtained from get_reg(RC_INT). */
2179     int ft, bt, size, r;
2180     ft = vtop->type.t;
2181     bt = ft & VT_BTYPE;
         /* long double has no direct conversion here: narrow it to double
            first and continue as a double */
2182     if (bt == VT_LDOUBLE) {
2183         gen_cvt_ftof(VT_DOUBLE);
2184         bt = VT_DOUBLE;
2185     }
2186
2187     gv(RC_FLOAT);
         /* only a target of exactly VT_INT uses the 32-bit form; every
            other 't' (any extra type bit included) gets the 64-bit form */
2188     if (t != VT_INT)
2189         size = 8;
2190     else
2191         size = 4;
2192
2193     r = get_reg(RC_INT);
         /* prefix selects the source width: 0xf3 for float, 0xf2 for double */
2194     if (bt == VT_FLOAT) {
2195         o(0xf3);
2196     } else if (bt == VT_DOUBLE) {
2197         o(0xf2);
2198     } else {
             /* not a floating point source type */
2199         assert(0);
2200     }
2201     orex(size == 8, r, 0, 0x2c0f); /* cvttss2si or cvttsd2si */
2202     o(0xc0 + REG_VALUE(vtop->r) + REG_VALUE(r)*8);
2203     vtop->r = r;
2204 }
2205
2206 /* computed goto support */
2207 void ggoto(void)
2208 {
         /* emit the transfer through gcall_or_jmp() and drop the top value
            stack entry afterwards. NOTE(review): the argument 1 presumably
            selects a jump rather than a call, and the target is presumably
            the entry on top of the value stack -- confirm in
            gcall_or_jmp() */
2209     gcall_or_jmp(1);
2210     vtop--;
2211 }
2212
2213 /* Save the stack pointer into the local stack slot at offset 'addr' (nothing is returned) */
2214 ST_FUNC void gen_vla_sp_save(int addr) {
2215     /* mov %rsp,addr(%rbp)*/
2216     gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
2217 }
2218
2219 /* Restore the SP from a location on the stack (the local slot at offset 'addr') */
2220 ST_FUNC void gen_vla_sp_restore(int addr) {
         /* same operands as gen_vla_sp_save() with opcode 0x8b instead of
            0x89, i.e. the load direction: mov addr(%rbp),%rsp */
2221     gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr);
2222 }
2223
2224 /* Subtract from the stack pointer, and push the resulting value onto the stack.
        The allocation size is taken from the top of the value stack.
        Only the TCC_TARGET_PE branch leaves a value (of 'type', in
        REG_IRET) on the value stack; the other branch just pops the size.
        'align' is not used by either branch, and 'type' only by the PE
        one. */
2225 ST_FUNC void gen_vla_alloc(CType *type, int align) {
2226 #ifdef TCC_TARGET_PE
2227     /* alloca does more than just adjust %rsp on Windows */
2228     vpush_global_sym(&func_old_type, TOK_alloca);
2229     vswap(); /* Move alloca ref past allocation size */
2230     gfunc_call(1);
         /* the value returned by the call becomes the new top entry */
2231     vset(type, REG_IRET, 0);
2232 #else
2233     int r;
2234     r = gv(RC_INT); /* allocation size */
2235     /* sub r,%rsp */
2236     o(0x2b48);
2237     o(0xe0 | REG_VALUE(r));
2238     /* We align to 16 bytes rather than align */
2239     /* and ~15, %rsp */
2240     o(0xf0e48348);
         /* drop the size operand; nothing is pushed in this branch */
2241     vpop();
2242 #endif
2243 }
2244
2245
2246 /* end of x86-64 code generator */
2247 /*************************************************************/
2248 #endif /* ! TARGET_DEFS_ONLY */
2249 /******************************************************/