x86_64-gen.c

   1 /*
   2  *  x86-64 code generator for TCC
   3  *
   4  *  Copyright (c) 2008 Shinichiro Hamaji
   5  *
   6  *  Based on i386-gen.c by Fabrice Bellard
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #ifdef TARGET_DEFS_ONLY
  24
  25 /* number of available registers */
  26 #define NB_REGS         25
  27 #define NB_ASM_REGS     8
  28
  29 /* a register can belong to several classes. The classes must be
  30    sorted from more general to more precise (see gv2() code which does
  31    assumptions on it). */
  32 #define RC_INT     0x0001 /* generic integer register */
  33 #define RC_FLOAT   0x0002 /* generic float register */
  34 #define RC_RAX     0x0004
  35 #define RC_RCX     0x0008
  36 #define RC_RDX     0x0010
  37 #define RC_ST0     0x0080 /* only for long double */
  38 #define RC_R8      0x0100
  39 #define RC_R9      0x0200
  40 #define RC_R10     0x0400
  41 #define RC_R11     0x0800
  42 #define RC_XMM0    0x1000
  43 #define RC_XMM1    0x2000
  44 #define RC_XMM2    0x4000
  45 #define RC_XMM3    0x8000
  46 #define RC_XMM4    0x10000
  47 #define RC_XMM5    0x20000
  48 #define RC_XMM6    0x40000
  49 #define RC_XMM7    0x80000
  50 #define RC_IRET    RC_RAX /* function return: integer register */
  51 #define RC_LRET    RC_RDX /* function return: second integer register */
  52 #define RC_FRET    RC_XMM0 /* function return: float register */
  53 #define RC_QRET    RC_XMM1 /* function return: second float register */
  54
  55 /* pretty names for the registers */
  56 enum {
  57     TREG_RAX = 0,
  58     TREG_RCX = 1,
  59     TREG_RDX = 2,
  60     TREG_RSP = 4,
  61     TREG_RSI = 6,
  62     TREG_RDI = 7,
  63
  64     TREG_R8  = 8,
  65     TREG_R9  = 9,
  66     TREG_R10 = 10,
  67     TREG_R11 = 11,
  68
  69     TREG_XMM0 = 16,
  70     TREG_XMM1 = 17,
  71     TREG_XMM2 = 18,
  72     TREG_XMM3 = 19,
  73     TREG_XMM4 = 20,
  74     TREG_XMM5 = 21,
  75     TREG_XMM6 = 22,
  76     TREG_XMM7 = 23,
  77
  78     TREG_ST0 = 24,
  79
  80     TREG_MEM = 0x20,
  81 };
  82
  83 #define REX_BASE(reg) (((reg) >> 3) & 1)
  84 #define REG_VALUE(reg) ((reg) & 7)
  85
  86 /* return registers for function */
  87 #define REG_IRET TREG_RAX /* single word int return register */
  88 #define REG_LRET TREG_RDX /* second word return register (for long long) */
  89 #define REG_FRET TREG_XMM0 /* float return register */
  90 #define REG_QRET TREG_XMM1 /* second float return register */
  91
  92 /* defined if function parameters must be evaluated in reverse order */
  93 #define INVERT_FUNC_PARAMS
  94
  95 /* pointer size, in bytes */
  96 #define PTR_SIZE 8
  97
  98 /* long double size and alignment, in bytes */
  99 #define LDOUBLE_SIZE  16
 100 #define LDOUBLE_ALIGN 16
 101 /* maximum alignment (for aligned attribute support) */
 102 #define MAX_ALIGN     16
 103
 104 /******************************************************/
 105 /* ELF defines */
 106
 107 #define EM_TCC_TARGET EM_X86_64
 108
 109 /* relocation type for 32 bit data relocation */
 110 #define R_DATA_32   R_X86_64_32
 111 #define R_DATA_PTR  R_X86_64_64
 112 #define R_JMP_SLOT  R_X86_64_JUMP_SLOT
 113 #define R_COPY      R_X86_64_COPY
 114
 115 #define ELF_START_ADDR 0x400000
 116 #define ELF_PAGE_SIZE  0x200000
 117
 118 /******************************************************/
 119 #else /* ! TARGET_DEFS_ONLY */
 120 /******************************************************/
 121 #include "tcc.h"
 122 #include <assert.h>
 123
 124 ST_DATA const int reg_classes[NB_REGS] = {
 125     /* eax */ RC_INT | RC_RAX,
 126     /* ecx */ RC_INT | RC_RCX,
 127     /* edx */ RC_INT | RC_RDX,
 128     0,
 129     0,
 130     0,
 131     0,
 132     0,
 133     RC_R8,
 134     RC_R9,
 135     RC_R10,
 136     RC_R11,
 137     0,
 138     0,
 139     0,
 140     0,
 141     /* xmm0 */ RC_FLOAT | RC_XMM0,
 142     /* xmm1 */ RC_FLOAT | RC_XMM1,
 143     /* xmm2 */ RC_FLOAT | RC_XMM2,
 144     /* xmm3 */ RC_FLOAT | RC_XMM3,
 145     /* xmm4 */ RC_FLOAT | RC_XMM4,
 146     /* xmm5 */ RC_FLOAT | RC_XMM5,
 147     /* xmm6 an xmm7 are included so gv() can be used on them,
 148        but they are not tagged with RC_FLOAT because they are
 149        callee saved on Windows */
 150     RC_XMM6,
 151     RC_XMM7,
 152     /* st0 */ RC_ST0
 153 };
 154
 155 static unsigned long func_sub_sp_offset;
 156 static int func_ret_sub;
 157
 158 /* XXX: make it faster ? */
 159 void g(int c)
 160 {
 161     int ind1;
 162     ind1 = ind + 1;
 163     if (ind1 > cur_text_section->data_allocated)
 164         section_realloc(cur_text_section, ind1);
 165     cur_text_section->data[ind] = c;
 166     ind = ind1;
 167 }
 168
 169 void o(unsigned int c)
 170 {
 171     while (c) {
 172         g(c);
 173         c = c >> 8;
 174     }
 175 }
 176
 177 void gen_le16(int v)
 178 {
 179     g(v);
 180     g(v >> 8);
 181 }
 182
 183 void gen_le32(int c)
 184 {
 185     g(c);
 186     g(c >> 8);
 187     g(c >> 16);
 188     g(c >> 24);
 189 }
 190
 191 void gen_le64(int64_t c)
 192 {
 193     g(c);
 194     g(c >> 8);
 195     g(c >> 16);
 196     g(c >> 24);
 197     g(c >> 32);
 198     g(c >> 40);
 199     g(c >> 48);
 200     g(c >> 56);
 201 }
 202
 203 void orex(int ll, int r, int r2, int b)
 204 {
 205     if ((r & VT_VALMASK) >= VT_CONST)
 206         r = 0;
 207     if ((r2 & VT_VALMASK) >= VT_CONST)
 208         r2 = 0;
 209     if (ll || REX_BASE(r) || REX_BASE(r2))
 210         o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
 211     o(b);
 212 }
 213
 214 /* output a symbol and patch all calls to it */
 215 void gsym_addr(int t, int a)
 216 {
 217     int n, *ptr;
 218     while (t) {
 219         ptr = (int *)(cur_text_section->data + t);
 220         n = *ptr; /* next value */
 221         *ptr = a - t - 4;
 222         t = n;
 223     }
 224 }
 225
 226 void gsym(int t)
 227 {
 228     gsym_addr(t, ind);
 229 }
 230
 231 /* psym is used to put an instruction with a data field which is a
 232    reference to a symbol. It is in fact the same as oad ! */
 233 #define psym oad
 234
 235 static int is64_type(int t)
 236 {
 237     return ((t & VT_BTYPE) == VT_PTR ||
 238             (t & VT_BTYPE) == VT_FUNC ||
 239             (t & VT_BTYPE) == VT_LLONG);
 240 }
 241
 242 /* instruction + 4 bytes data. Return the address of the data */
 243 ST_FUNC int oad(int c, int s)
 244 {
 245     int ind1;
 246
 247     o(c);
 248     ind1 = ind + 4;
 249     if (ind1 > cur_text_section->data_allocated)
 250         section_realloc(cur_text_section, ind1);
 251     *(int *)(cur_text_section->data + ind) = s;
 252     s = ind;
 253     ind = ind1;
 254     return s;
 255 }
 256
 257 ST_FUNC void gen_addr32(int r, Sym *sym, int c)
 258 {
 259     if (r & VT_SYM)
 260         greloc(cur_text_section, sym, ind, R_X86_64_32);
 261     gen_le32(c);
 262 }
 263
 264 /* output constant with relocation if 'r & VT_SYM' is true */
 265 ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
 266 {
 267     if (r & VT_SYM)
 268         greloc(cur_text_section, sym, ind, R_X86_64_64);
 269     gen_le64(c);
 270 }
 271
 272 /* output constant with relocation if 'r & VT_SYM' is true */
 273 ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
 274 {
 275     if (r & VT_SYM)
 276         greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 277     gen_le32(c-4);
 278 }
 279
 280 /* output got address with relocation */
 281 static void gen_gotpcrel(int r, Sym *sym, int c)
 282 {
 283 #ifndef TCC_TARGET_PE
 284     Section *sr;
 285     ElfW(Rela) *rel;
 286     greloc(cur_text_section, sym, ind, R_X86_64_GOTPCREL);
 287     sr = cur_text_section->reloc;
 288     rel = (ElfW(Rela) *)(sr->data + sr->data_offset - sizeof(ElfW(Rela)));
 289     rel->r_addend = -4;
 290 #else
 291     tcc_error("internal error: no GOT on PE: %s %x %x | %02x %02x %02x\n",
 292         get_tok_str(sym->v, NULL), c, r,
 293         cur_text_section->data[ind-3],
 294         cur_text_section->data[ind-2],
 295         cur_text_section->data[ind-1]
 296         );
 297     greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 298 #endif
 299     gen_le32(0);
 300     if (c) {
 301         /* we use add c, %xxx for displacement */
 302         orex(1, r, 0, 0x81);
 303         o(0xc0 + REG_VALUE(r));
 304         gen_le32(c);
 305     }
 306 }
 307
 308 static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
 309 {
 310     op_reg = REG_VALUE(op_reg) << 3;
 311     if ((r & VT_VALMASK) == VT_CONST) {
 312         /* constant memory reference */
 313         o(0x05 | op_reg);
 314         if (is_got) {
 315             gen_gotpcrel(r, sym, c);
 316         } else {
 317             gen_addrpc32(r, sym, c);
 318         }
 319     } else if ((r & VT_VALMASK) == VT_LOCAL) {
 320         /* currently, we use only ebp as base */
 321         if (c == (char)c) {
 322             /* short reference */
 323             o(0x45 | op_reg);
 324             g(c);
 325         } else {
 326             oad(0x85 | op_reg, c);
 327         }
 328     } else if ((r & VT_VALMASK) >= TREG_MEM) {
 329         if (c) {
 330             g(0x80 | op_reg | REG_VALUE(r));
 331             gen_le32(c);
 332         } else {
 333             g(0x00 | op_reg | REG_VALUE(r));
 334         }
 335     } else {
 336         g(0x00 | op_reg | REG_VALUE(r));
 337     }
 338 }
 339
 340 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 341    opcode bits */
 342 static void gen_modrm(int op_reg, int r, Sym *sym, int c)
 343 {
 344     gen_modrm_impl(op_reg, r, sym, c, 0);
 345 }
 346
 347 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 348    opcode bits */
 349 static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
 350 {
 351     int is_got;
 352     is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
 353     orex(1, r, op_reg, opcode);
 354     gen_modrm_impl(op_reg, r, sym, c, is_got);
 355 }
 356
 357
 358 /* load 'r' from value 'sv' */
 359 void load(int r, SValue *sv)
 360 {
 361     int v, t, ft, fc, fr;
 362     SValue v1;
 363
 364 #ifdef TCC_TARGET_PE
 365     SValue v2;
 366     sv = pe_getimport(sv, &v2);
 367 #endif
 368
 369     fr = sv->r;
 370     ft = sv->type.t & ~VT_DEFSIGN;
 371     fc = sv->c.ul;
 372
 373 #ifndef TCC_TARGET_PE
 374     /* we use indirect access via got */
 375     if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
 376         (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
 377         /* use the result register as a temporal register */
 378         int tr = r | TREG_MEM;
 379         if (is_float(ft)) {
 380             /* we cannot use float registers as a temporal register */
 381             tr = get_reg(RC_INT) | TREG_MEM;
 382         }
 383         gen_modrm64(0x8b, tr, fr, sv->sym, 0);
 384
 385         /* load from the temporal register */
 386         fr = tr | VT_LVAL;
 387     }
 388 #endif
 389
 390     v = fr & VT_VALMASK;
 391     if (fr & VT_LVAL) {
 392         int b, ll;
 393         if (v == VT_LLOCAL) {
 394             v1.type.t = VT_PTR;
 395             v1.r = VT_LOCAL | VT_LVAL;
 396             v1.c.ul = fc;
 397             fr = r;
 398             if (!(reg_classes[fr] & (RC_INT|RC_R11)))
 399                 fr = get_reg(RC_INT);
 400             load(fr, &v1);
 401         }
 402         ll = 0;
 403         if ((ft & VT_BTYPE) == VT_FLOAT) {
 404             b = 0x6e0f66;
 405             r = REG_VALUE(r); /* movd */
 406         } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
 407             b = 0x7e0ff3; /* movq */
 408             r = REG_VALUE(r);
 409         } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
 410             b = 0xdb, r = 5; /* fldt */
 411         } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
 412             b = 0xbe0f;   /* movsbl */
 413         } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
 414             b = 0xb60f;   /* movzbl */
 415         } else if ((ft & VT_TYPE) == VT_SHORT) {
 416             b = 0xbf0f;   /* movswl */
 417         } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
 418             b = 0xb70f;   /* movzwl */
 419         } else {
 420             assert(((ft & VT_BTYPE) == VT_INT) || ((ft & VT_BTYPE) == VT_LLONG)
 421                    || ((ft & VT_BTYPE) == VT_PTR) || ((ft & VT_BTYPE) == VT_ENUM)
 422                    || ((ft & VT_BTYPE) == VT_FUNC));
 423             ll = is64_type(ft);
 424             b = 0x8b;
 425         }
 426         if (ll) {
 427             gen_modrm64(b, r, fr, sv->sym, fc);
 428         } else {
 429             orex(ll, fr, r, b);
 430             gen_modrm(r, fr, sv->sym, fc);
 431         }
 432     } else {
 433         if (v == VT_CONST) {
 434             if (fr & VT_SYM) {
 435 #ifdef TCC_TARGET_PE
 436                 orex(1,0,r,0x8d);
 437                 o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 438                 gen_addrpc32(fr, sv->sym, fc);
 439 #else
 440                 if (sv->sym->type.t & VT_STATIC) {
 441                     orex(1,0,r,0x8d);
 442                     o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 443                     gen_addrpc32(fr, sv->sym, fc);
 444                 } else {
 445                     orex(1,0,r,0x8b);
 446                     o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
 447                     gen_gotpcrel(r, sv->sym, fc);
 448                 }
 449 #endif
 450             } else if (is64_type(ft)) {
 451                 orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 452                 gen_le64(sv->c.ull);
 453             } else {
 454                 orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 455                 gen_le32(fc);
 456             }
 457         } else if (v == VT_LOCAL) {
 458             orex(1,0,r,0x8d); /* lea xxx(%ebp), r */
 459             gen_modrm(r, VT_LOCAL, sv->sym, fc);
 460         } else if (v == VT_CMP) {
 461             orex(0,r,0,0);
 462             if ((fc & ~0x100) != TOK_NE)
 463               oad(0xb8 + REG_VALUE(r), 0); /* mov $0, r */
 464             else
 465               oad(0xb8 + REG_VALUE(r), 1); /* mov $1, r */
 466             if (fc & 0x100)
 467               {
 468                 /* This was a float compare.  If the parity bit is
 469                    set the result was unordered, meaning false for everything
 470                    except TOK_NE, and true for TOK_NE.  */
 471                 fc &= ~0x100;
 472                 o(0x037a + (REX_BASE(r) << 8));
 473               }
 474             orex(0,r,0, 0x0f); /* setxx %br */
 475             o(fc);
 476             o(0xc0 + REG_VALUE(r));
 477         } else if (v == VT_JMP || v == VT_JMPI) {
 478             t = v & 1;
 479             orex(0,r,0,0);
 480             oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
 481             o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
 482             gsym(fc);
 483             orex(0,r,0,0);
 484             oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
 485         } else if (v != r) {
 486             if ((r >= TREG_XMM0) && (r <= TREG_XMM7)) {
 487                 if (v == TREG_ST0) {
 488                     /* gen_cvt_ftof(VT_DOUBLE); */
 489                     o(0xf0245cdd); /* fstpl -0x10(%rsp) */
 490                     /* movsd -0x10(%rsp),%xmmN */
 491                     o(0x100ff2);
 492                     o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 493                     o(0xf024);
 494                 } else {
 495                     assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 496                     if ((ft & VT_BTYPE) == VT_FLOAT) {
 497                         o(0x100ff3);
 498                     } else {
 499                         assert((ft & VT_BTYPE) == VT_DOUBLE);
 500                         o(0x100ff2);
 501                     }
 502                     o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
 503                 }
 504             } else if (r == TREG_ST0) {
 505                 assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 506                 /* gen_cvt_ftof(VT_LDOUBLE); */
 507                 /* movsd %xmmN,-0x10(%rsp) */
 508                 o(0x110ff2);
 509                 o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 510                 o(0xf024);
 511                 o(0xf02444dd); /* fldl -0x10(%rsp) */
 512             } else {
 513                 orex(1,r,v, 0x89);
 514                 o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
 515             }
 516         }
 517     }
 518 }
 519
 520 /* store register 'r' in lvalue 'v' */
 521 void store(int r, SValue *v)
 522 {
 523     int fr, bt, ft, fc;
 524     int op64 = 0;
 525     /* store the REX prefix in this variable when PIC is enabled */
 526     int pic = 0;
 527
 528 #ifdef TCC_TARGET_PE
 529     SValue v2;
 530     v = pe_getimport(v, &v2);
 531 #endif
 532
 533     ft = v->type.t;
 534     fc = v->c.ul;
 535     fr = v->r & VT_VALMASK;
 536     bt = ft & VT_BTYPE;
 537
 538 #ifndef TCC_TARGET_PE
 539     /* we need to access the variable via got */
 540     if (fr == VT_CONST && (v->r & VT_SYM)) {
 541         /* mov xx(%rip), %r11 */
 542         o(0x1d8b4c);
 543         gen_gotpcrel(TREG_R11, v->sym, v->c.ul);
 544         pic = is64_type(bt) ? 0x49 : 0x41;
 545     }
 546 #endif
 547
 548     /* XXX: incorrect if float reg to reg */
 549     if (bt == VT_FLOAT) {
 550         o(0x66);
 551         o(pic);
 552         o(0x7e0f); /* movd */
 553         r = REG_VALUE(r);
 554     } else if (bt == VT_DOUBLE) {
 555         o(0x66);
 556         o(pic);
 557         o(0xd60f); /* movq */
 558         r = REG_VALUE(r);
 559     } else if (bt == VT_LDOUBLE) {
 560         o(0xc0d9); /* fld %st(0) */
 561         o(pic);
 562         o(0xdb); /* fstpt */
 563         r = 7;
 564     } else {
 565         if (bt == VT_SHORT)
 566             o(0x66);
 567         o(pic);
 568         if (bt == VT_BYTE || bt == VT_BOOL)
 569             orex(0, 0, r, 0x88);
 570         else if (is64_type(bt))
 571             op64 = 0x89;
 572         else
 573             orex(0, 0, r, 0x89);
 574     }
 575     if (pic) {
 576         /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
 577         if (op64)
 578             o(op64);
 579         o(3 + (r << 3));
 580     } else if (op64) {
 581         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 582             gen_modrm64(op64, r, v->r, v->sym, fc);
 583         } else if (fr != r) {
 584             /* XXX: don't we really come here? */
 585             abort();
 586             o(0xc0 + fr + r * 8); /* mov r, fr */
 587         }
 588     } else {
 589         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 590             gen_modrm(r, v->r, v->sym, fc);
 591         } else if (fr != r) {
 592             /* XXX: don't we really come here? */
 593             abort();
 594             o(0xc0 + fr + r * 8); /* mov r, fr */
 595         }
 596     }
 597 }
 598
 599 /* 'is_jmp' is '1' if it is a jump */
 600 static void gcall_or_jmp(int is_jmp)
 601 {
 602     int r;
 603     if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST &&
 604         ((vtop->r & VT_SYM) || (vtop->c.ll-4) == (int)(vtop->c.ll-4))) {
 605         /* constant case */
 606         if (vtop->r & VT_SYM) {
 607             /* relocation case */
 608 #ifdef TCC_TARGET_PE
 609             greloc(cur_text_section, vtop->sym, ind + 1, R_X86_64_PC32);
 610 #else
 611             greloc(cur_text_section, vtop->sym, ind + 1, R_X86_64_PLT32);
 612 #endif
 613         } else {
 614             /* put an empty PC32 relocation */
 615             put_elf_reloc(symtab_section, cur_text_section,
 616                           ind + 1, R_X86_64_PC32, 0);
 617         }
 618         oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */
 619     } else {
 620         /* otherwise, indirect call */
 621         r = TREG_R11;
 622         load(r, vtop);
 623         o(0x41); /* REX */
 624         o(0xff); /* call/jmp *r */
 625         o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
 626     }
 627 }
 628
 629 #if defined(CONFIG_TCC_BCHECK)
 630 #ifndef TCC_TARGET_PE
 631 static addr_t func_bound_offset;
 632 static unsigned long func_bound_ind;
 633 #endif
 634
 635 static void gen_static_call(int v)
 636 {
 637     Sym *sym = external_global_sym(v, &func_old_type, 0);
 638     oad(0xe8, -4);
 639     greloc(cur_text_section, sym, ind-4, R_X86_64_PC32);
 640 }
 641
 642 /* generate a bounded pointer addition */
 643 ST_FUNC void gen_bounded_ptr_add(void)
 644 {
 645     /* save all temporary registers */
 646     save_regs(0);
 647
 648     /* prepare fast x86_64 function call */
 649     gv(RC_RAX);
 650     o(0xc68948); // mov  %rax,%rsi ## second arg in %rsi, this must be size
 651     vtop--;
 652
 653     gv(RC_RAX);
 654     o(0xc78948); // mov  %rax,%rdi ## first arg in %rdi, this must be ptr
 655     vtop--;
 656
 657     /* do a fast function call */
 658     gen_static_call(TOK___bound_ptr_add);
 659
 660     /* returned pointer is in rax */
 661     vtop++;
 662     vtop->r = TREG_RAX | VT_BOUNDED;
 663
 664
 665     /* relocation offset of the bounding function call point */
 666     vtop->c.ull = (cur_text_section->reloc->data_offset - sizeof(ElfW(Rela)));
 667 }
 668
 669 /* patch pointer addition in vtop so that pointer dereferencing is
 670    also tested */
 671 ST_FUNC void gen_bounded_ptr_deref(void)
 672 {
 673     addr_t func;
 674     int size, align;
 675     ElfW(Rela) *rel;
 676     Sym *sym;
 677
 678     size = 0;
 679     /* XXX: put that code in generic part of tcc */
 680     if (!is_float(vtop->type.t)) {
 681         if (vtop->r & VT_LVAL_BYTE)
 682             size = 1;
 683         else if (vtop->r & VT_LVAL_SHORT)
 684             size = 2;
 685     }
 686     if (!size)
 687     size = type_size(&vtop->type, &align);
 688     switch(size) {
 689     case  1: func = TOK___bound_ptr_indir1; break;
 690     case  2: func = TOK___bound_ptr_indir2; break;
 691     case  4: func = TOK___bound_ptr_indir4; break;
 692     case  8: func = TOK___bound_ptr_indir8; break;
 693     case 12: func = TOK___bound_ptr_indir12; break;
 694     case 16: func = TOK___bound_ptr_indir16; break;
 695     default:
 696         tcc_error("unhandled size when dereferencing bounded pointer");
 697         func = 0;
 698         break;
 699     }
 700
 701     sym = external_global_sym(func, &func_old_type, 0);
 702     if (!sym->c)
 703         put_extern_sym(sym, NULL, 0, 0);
 704
 705     /* patch relocation */
 706     /* XXX: find a better solution ? */
 707
 708     rel = (ElfW(Rela) *)(cur_text_section->reloc->data + vtop->c.ull);
 709     rel->r_info = ELF64_R_INFO(sym->c, ELF64_R_TYPE(rel->r_info));
 710 }
 711 #endif
 712
 713 #ifdef TCC_TARGET_PE
 714
 715 #define REGN 4
 716 static const uint8_t arg_regs[REGN] = {
 717     TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
 718 };
 719
 720 /* Prepare arguments in R10 and R11 rather than RCX and RDX
 721    because gv() will not ever use these */
 722 static int arg_prepare_reg(int idx) {
 723   if (idx == 0 || idx == 1)
 724       /* idx=0: r10, idx=1: r11 */
 725       return idx + 10;
 726   else
 727       return arg_regs[idx];
 728 }
 729
 730 static int func_scratch;
 731
/* Note: this comment describes gfunc_call(), defined further below.
   Generate function call. The function address is pushed first, then
   all the parameters in call order. This function pops all the
   parameters and the function address. */
 735
 736 void gen_offs_sp(int b, int r, int d)
 737 {
 738     orex(1,0,r & 0x100 ? 0 : r, b);
 739     if (d == (char)d) {
 740         o(0x2444 | (REG_VALUE(r) << 3));
 741         g(d);
 742     } else {
 743         o(0x2484 | (REG_VALUE(r) << 3));
 744         gen_le32(d);
 745     }
 746 }
 747
 748 /* Return the number of registers needed to return the struct, or 0 if
 749    returning via struct pointer. */
 750 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
 751 {
 752     int size, align;
 753     *regsize = 8;
 754     *ret_align = 1; // Never have to re-align return values for x86-64
 755     size = type_size(vt, &align);
 756     ret->ref = NULL;
 757     if (size > 8) {
 758         return 0;
 759     } else if (size > 4) {
 760         ret->t = VT_LLONG;
 761         return 1;
 762     } else if (size > 2) {
 763         ret->t = VT_INT;
 764         return 1;
 765     } else if (size > 1) {
 766         ret->t = VT_SHORT;
 767         return 1;
 768     } else {
 769         ret->t = VT_BYTE;
 770         return 1;
 771     }
 772 }
 773
 774 static int is_sse_float(int t) {
 775     int bt;
 776     bt = t & VT_BTYPE;
 777     return bt == VT_DOUBLE || bt == VT_FLOAT;
 778 }
 779
 780 int gfunc_arg_size(CType *type) {
 781     int align;
 782     if (type->t & (VT_ARRAY|VT_BITFIELD))
 783         return 8;
 784     return type_size(type, &align);
 785 }
 786
 787 void gfunc_call(int nb_args)
 788 {
 789     int size, r, args_size, i, d, bt, struct_size;
 790     int arg;
 791
 792     args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
 793     arg = nb_args;
 794
 795     /* for struct arguments, we need to call memcpy and the function
 796        call breaks register passing arguments we are preparing.
 797        So, we process arguments which will be passed by stack first. */
 798     struct_size = args_size;
 799     for(i = 0; i < nb_args; i++) {
 800         SValue *sv;
 801
 802         --arg;
 803         sv = &vtop[-i];
 804         bt = (sv->type.t & VT_BTYPE);
 805         size = gfunc_arg_size(&sv->type);
 806
 807         if (size <= 8)
 808             continue; /* arguments smaller than 8 bytes passed in registers or on stack */
 809
 810         if (bt == VT_STRUCT) {
 811             /* align to stack align size */
 812             size = (size + 15) & ~15;
 813             /* generate structure store */
 814             r = get_reg(RC_INT);
 815             gen_offs_sp(0x8d, r, struct_size);
 816             struct_size += size;
 817
 818             /* generate memcpy call */
 819             vset(&sv->type, r | VT_LVAL, 0);
 820             vpushv(sv);
 821             vstore();
 822             --vtop;
 823         } else if (bt == VT_LDOUBLE) {
 824             gv(RC_ST0);
 825             gen_offs_sp(0xdb, 0x107, struct_size);
 826             struct_size += 16;
 827         }
 828     }
 829
 830     if (func_scratch < struct_size)
 831         func_scratch = struct_size;
 832
 833     arg = nb_args;
 834     struct_size = args_size;
 835
 836     for(i = 0; i < nb_args; i++) {
 837         --arg;
 838         bt = (vtop->type.t & VT_BTYPE);
 839
 840         size = gfunc_arg_size(&vtop->type);
 841         if (size > 8) {
 842             /* align to stack align size */
 843             size = (size + 15) & ~15;
 844             if (arg >= REGN) {
 845                 d = get_reg(RC_INT);
 846                 gen_offs_sp(0x8d, d, struct_size);
 847                 gen_offs_sp(0x89, d, arg*8);
 848             } else {
 849                 d = arg_prepare_reg(arg);
 850                 gen_offs_sp(0x8d, d, struct_size);
 851             }
 852             struct_size += size;
 853         } else {
 854             if (is_sse_float(vtop->type.t)) {
 855                 gv(RC_XMM0); /* only use one float register */
 856                 if (arg >= REGN) {
 857                     /* movq %xmm0, j*8(%rsp) */
 858                     gen_offs_sp(0xd60f66, 0x100, arg*8);
 859                 } else {
 860                     /* movaps %xmm0, %xmmN */
 861                     o(0x280f);
 862                     o(0xc0 + (arg << 3));
 863                     d = arg_prepare_reg(arg);
 864                     /* mov %xmm0, %rxx */
 865                     o(0x66);
 866                     orex(1,d,0, 0x7e0f);
 867                     o(0xc0 + REG_VALUE(d));
 868                 }
 869             } else {
 870                 if (bt == VT_STRUCT) {
 871                     vtop->type.ref = NULL;
 872                     vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
 873                         : size > 1 ? VT_SHORT : VT_BYTE;
 874                 }
 875
 876                 r = gv(RC_INT);
 877                 if (arg >= REGN) {
 878                     gen_offs_sp(0x89, r, arg*8);
 879                 } else {
 880                     d = arg_prepare_reg(arg);
 881                     orex(1,d,r,0x89); /* mov */
 882                     o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
 883                 }
 884             }
 885         }
 886         vtop--;
 887     }
 888     save_regs(0);
 889
 890     /* Copy R10 and R11 into RCX and RDX, respectively */
 891     if (nb_args > 0) {
 892         o(0xd1894c); /* mov %r10, %rcx */
 893         if (nb_args > 1) {
 894             o(0xda894c); /* mov %r11, %rdx */
 895         }
 896     }
 897
 898     gcall_or_jmp(0);
 899     vtop--;
 900 }
 901
 902
 903 #define FUNC_PROLOG_SIZE 11
 904
 905 /* generate function prolog of type 't' */
 906 void gfunc_prolog(CType *func_type)
 907 {
 908     int addr, reg_param_index, bt, size;
 909     Sym *sym;
 910     CType *type;
 911
 912     func_ret_sub = 0;
 913     func_scratch = 0;
 914     loc = 0;
 915
 916     addr = PTR_SIZE * 2;
 917     ind += FUNC_PROLOG_SIZE;
 918     func_sub_sp_offset = ind;
 919     reg_param_index = 0;
 920
 921     sym = func_type->ref;
 922
 923     /* if the function returns a structure, then add an
 924        implicit pointer parameter */
 925     func_vt = sym->type;
 926     func_var = (sym->c == FUNC_ELLIPSIS);
 927     size = gfunc_arg_size(&func_vt);
 928     if (size > 8) {
 929         gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 930         func_vc = addr;
 931         reg_param_index++;
 932         addr += 8;
 933     }
 934
 935     /* define parameters */
 936     while ((sym = sym->next) != NULL) {
 937         type = &sym->type;
 938         bt = type->t & VT_BTYPE;
 939         size = gfunc_arg_size(type);
 940         if (size > 8) {
 941             if (reg_param_index < REGN) {
 942                 gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 943             }
 944             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL | VT_REF, addr);
 945         } else {
 946             if (reg_param_index < REGN) {
 947                 /* save arguments passed by register */
 948                 if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
 949                     o(0xd60f66); /* movq */
 950                     gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
 951                 } else {
 952                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 953                 }
 954             }
 955             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL, addr);
 956         }
 957         addr += 8;
 958         reg_param_index++;
 959     }
 960
 961     while (reg_param_index < REGN) {
 962         if (func_type->ref->c == FUNC_ELLIPSIS) {
 963             gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 964             addr += 8;
 965         }
 966         reg_param_index++;
 967     }
 968 }
 969
 970 /* generate function epilog */
     /* Emits 'leave' followed by 'ret' (or 'ret n' when func_ret_sub is
        non-zero), then goes back to the FUNC_PROLOG_SIZE bytes that precede
        func_sub_sp_offset and writes the function prolog there, now that the
        frame size can be computed from func_scratch and loc. */
 971 void gfunc_epilog(void)
 972 {
 973     int v, saved_ind;
 974
 975     o(0xc9); /* leave */
 976     if (func_ret_sub == 0) {
 977         o(0xc3); /* ret */
 978     } else {
 979         o(0xc2); /* ret n */
 980         g(func_ret_sub);
 981         g(func_ret_sub >> 8);
 982     }
 983
         /* remember where the function body ends, then move the output
            position back to the space reserved for the prolog */
 984     saved_ind = ind;
 985     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
 986     /* frame size: scratch area plus locals, rounded up to a multiple of 16 */
 987     v = (func_scratch + -loc + 15) & -16;
 988
         /* frames of 4096 bytes or more are set up through __chkstk; both
            branches emit exactly FUNC_PROLOG_SIZE bytes */
 989     if (v >= 4096) {
 990         Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0);
 991         oad(0xb8, v); /* mov stacksize, %eax */
 992         oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
 993         greloc(cur_text_section, sym, ind-4, R_X86_64_PC32);
 994         o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
 995     } else {
 996         o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
 997         o(0xec8148);  /* sub rsp, stacksize */
 998         gen_le32(v);
 999     }
1000
         /* NOTE(review): ind is reloaded from the section rather than from
            saved_ind - presumably pe_add_unwind_data() may change
            data_offset; confirm against its definition */
1001     cur_text_section->data_offset = saved_ind;
1002     pe_add_unwind_data(ind, saved_ind, v);
1003     ind = cur_text_section->data_offset;
1004 }
1005
1006 #else
1007
1008 static void gadd_sp(int val)
1009 {
1010     if (val != (char)val) {
1011         oad(0xc48148, val); /* add $imm32, %rsp */
1012     } else {
1013         o(0xc48348); /* add $imm8, %rsp: val survives truncation to char */
1014         g(val);
1015     }
1016 }
1017
1018 typedef enum X86_64_Mode {
     /* Classes assigned to a type by the classify_x86_64_*() functions;
        gfunc_call() and gfunc_prolog() use them to decide between integer
        registers, SSE registers and the stack. */
1019   x86_64_mode_none,    /* VT_VOID; neutral element of classify_x86_64_merge() */
1020   x86_64_mode_memory,  /* larger than 16 bytes, or an unmergeable mix of classes */
1021   x86_64_mode_integer, /* integer, pointer, function, enum and bool types */
1022   x86_64_mode_sse,     /* float and double */
1023   x86_64_mode_x87      /* long double */
1024 } X86_64_Mode;
1025
1026 static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b)
1027 {
1028     /* identical classes and the neutral 'none' class merge trivially */
1029     if (a == b || b == x86_64_mode_none)
1030         return a;
1031     if (a == x86_64_mode_none)
1032         return b;
1033     /* from here on a and b differ and neither of them is 'none' */
1034     if (a == x86_64_mode_memory || b == x86_64_mode_memory)
1035         return x86_64_mode_memory;
1036     if (a == x86_64_mode_integer || b == x86_64_mode_integer)
1037         return x86_64_mode_integer;
1038     /* x87 combined with anything that is left is classified as memory */
1039     if (a == x86_64_mode_x87 || b == x86_64_mode_x87)
1040         return x86_64_mode_memory;
1041     return x86_64_mode_sse;
1042 }
1043
1044 static X86_64_Mode classify_x86_64_inner(CType *ty)
1045 {
     /* Classify a single type from its basic type (ty->t & VT_BTYPE).
        A struct is classified by merging the classes of all of its members
        with classify_x86_64_merge(), recursing into nested structs.
        Returns the resulting X86_64_Mode. */
1046     X86_64_Mode mode;
1047     Sym *f;
1048
1049     switch (ty->t & VT_BTYPE) {
1050     case VT_VOID: return x86_64_mode_none;
1051
1052     case VT_INT:
1053     case VT_BYTE:
1054     case VT_SHORT:
1055     case VT_LLONG:
1056     case VT_BOOL:
1057     case VT_PTR:
1058     case VT_FUNC:
1059     case VT_ENUM: return x86_64_mode_integer;
1060
1061     case VT_FLOAT:
1062     case VT_DOUBLE: return x86_64_mode_sse;
1063
1064     case VT_LDOUBLE: return x86_64_mode_x87;
1065
1066     case VT_STRUCT:
1067         f = ty->ref;
1068
             /* start from the neutral class and fold in every member,
                beginning with the entry after the struct's own symbol */
1069         mode = x86_64_mode_none;
1070         for (f = f->next; f; f = f->next)
1071             mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
1072
1073         return mode;
1074     }
1075
         /* no other basic type is expected here */
1076     assert(0);
         /* The explicit return keeps the function well-defined when assert()
            is compiled out (NDEBUG); without it control would fall off the
            end of a non-void function whose value the callers use. */
         return x86_64_mode_none;
1077 }
1078
1079 static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
1080 {
     /* Classify 'ty' for argument passing / value return.
          ty         type to classify
          ret        if not NULL, receives a type (ref == NULL) describing how
                     the value is held in registers; its t field stays 0 for
                     the memory and none classes
          psize      receives the size rounded up to a multiple of 8
          palign     receives the alignment rounded up to a multiple of 8
          reg_count  receives the number of registers needed; it is written
                     only for the integer, sse and x87 classes and left
                     untouched otherwise
        Returns the class of the type. */
1081     X86_64_Mode mode;
1082     int size, align, ret_t = 0;
1083
         /* bit-fields and arrays are always handled as one 8-byte integer */
1084     if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
1085         *psize = 8;
1086         *palign = 8;
1087         *reg_count = 1;
1088         ret_t = ty->t;
1089         mode = x86_64_mode_integer;
1090     } else {
1091         size = type_size(ty, &align);
1092         *psize = (size + 7) & ~7;
1093         *palign = (align + 7) & ~7;
1094
             /* anything larger than two 8-byte registers goes to memory */
1095         if (size > 16) {
1096             mode = x86_64_mode_memory;
1097         } else {
1098             mode = classify_x86_64_inner(ty);
1099             switch (mode) {
1100             case x86_64_mode_integer:
1101                 if (size > 8) {
1102                     *reg_count = 2;
1103                     ret_t = VT_QLONG;
1104                 } else {
1105                     *reg_count = 1;
1106                     ret_t = (size > 4) ? VT_LLONG : VT_INT;
1107                 }
1108                 break;
1109
1110             case x86_64_mode_x87:
1111                 *reg_count = 1;
1112                 ret_t = VT_LDOUBLE;
1113                 break;
1114
1115             case x86_64_mode_sse:
1116                 if (size > 8) {
1117                     *reg_count = 2;
1118                     ret_t = VT_QFLOAT;
1119                 } else {
1120                     *reg_count = 1;
1121                     ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
1122                 }
1123                 break;
1124             default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none*/
1125             }
1126         }
1127     }
1128
1129     if (ret) {
1130         ret->ref = NULL;
1131         ret->t = ret_t;
1132     }
1133
1134     return mode;
1135 }
1136
1137 ST_FUNC int classify_x86_64_va_arg(CType *ty)
1138 {
1139     /* Keep these values in step with the matching definition in stdarg.h */
1140     enum __va_arg_type {
1141         __va_gen_reg, __va_float_reg, __va_stack
1142     };
1143     int sz, al, nregs; /* required by the classifier, not used here */
1144
1145     switch (classify_x86_64_arg(ty, NULL, &sz, &al, &nregs)) {
1146     case x86_64_mode_integer: return __va_gen_reg;
1147     case x86_64_mode_sse:     return __va_float_reg;
1148     default:                  return __va_stack;
1149     }
1150 }
1151
1152 /* Return 0 if the value is returned via struct pointer (it is classified
1153    as x86_64_mode_memory) and 1 otherwise. The result is that of a
        comparison, not an actual register count. 'ret' receives the type
        computed by classify_x86_64_arg(), *ret_align is set to 1 and
        *regsize to 8. 'variadic' is not used. */
1154 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
1155 {
1156     int size, align, reg_count;
1157     *ret_align = 1; // Never have to re-align return values for x86-64
1158     *regsize = 8;
1159     return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
1160 }
1161
1162 #define REGN 6 /* number of integer registers that carry arguments */
     /* the integer argument registers, indexed by argument slot */
1163 static const uint8_t arg_regs[REGN] = {
1164     TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
1165 };
1166
1167 static int arg_prepare_reg(int idx) {
1168   /* Slots 2 and 3 are staged in register numbers 10 and 11 (r10, r11)
1169      instead of arg_regs[2] and arg_regs[3] (rdx, rcx); gfunc_call()
1170      copies them into place just before the call. */
1171   const int staged = (idx == 2 || idx == 3);
1172   return staged ? idx + 8 : arg_regs[idx];
1173 }
1174
1175 /* Generate function call. The function address is pushed first, then
1176    all the parameters in call order. This function pops all the
1177    parameters and the function address. */
     /* nb_args is the number of arguments on the value stack above the
        function address. Arguments that have to travel on the machine stack
        are stored first (in runs, see below); the remaining ones are then
        loaded into the integer and SSE argument registers, %eax is set to
        the number of SSE registers used (at most 8), and the call is
        emitted. Stack space taken by arguments is released after the call. */
1178 void gfunc_call(int nb_args)
1179 {
1180     X86_64_Mode mode;
1181     CType type;
1182     int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
1183     int nb_reg_args = 0;
1184     int nb_sse_args = 0;
1185     int sse_reg, gen_reg;
1186
1187     /* calculate the number of integer/float register arguments */
1188     for(i = 0; i < nb_args; i++) {
1189         mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1190         if (mode == x86_64_mode_sse)
1191             nb_sse_args += reg_count;
1192         else if (mode == x86_64_mode_integer)
1193             nb_reg_args += reg_count;
1194     }
1195
1196     /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
1197        and ended by a 16-byte aligned argument. This is because, from the point of view of
1198        the callee, argument alignment is computed from the bottom up. */
1199     /* for struct arguments, we need to call memcpy and the function
1200        call breaks register passing arguments we are preparing.
1201        So, we process arguments which will be passed by stack first. */
         /* gen_reg and sse_reg count down from the totals while the arguments
            are visited from the last one (vtop[0]) towards the first; an
            argument whose count before the decrement exceeds the number of
            available registers is passed on the stack */
1202     gen_reg = nb_reg_args;
1203     sse_reg = nb_sse_args;
1204     run_start = 0;
1205     args_size = 0;
1206     while (run_start != nb_args) {
1207         int run_gen_reg = gen_reg, run_sse_reg = sse_reg;
1208
             /* dry run: find where the run ends and how many bytes it puts
                on the stack; the register counters are restored afterwards */
1209         run_end = nb_args;
1210         stack_adjust = 0;
1211         for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
1212             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1213             switch (mode) {
1214             case x86_64_mode_memory:
1215             case x86_64_mode_x87:
1216             stack_arg:
1217                 if (align == 16)
1218                     run_end = i;
1219                 else
1220                     stack_adjust += size;
1221                 break;
1222
1223             case x86_64_mode_sse:
1224                 sse_reg -= reg_count;
1225                 if (sse_reg + reg_count > 8) goto stack_arg;
1226                 break;
1227
1228             case x86_64_mode_integer:
1229                 gen_reg -= reg_count;
1230                 if (gen_reg + reg_count > REGN) goto stack_arg;
1231                 break;
1232             default: break; /* nothing to be done for x86_64_mode_none */
1233             }
1234         }
1235
1236         gen_reg = run_gen_reg;
1237         sse_reg = run_sse_reg;
1238
1239         /* adjust stack to align SSE boundary */
1240         if (stack_adjust &= 15) {
1241             /* fetch cpu flag before the following sub will change the value */
1242             if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
1243                 gv(RC_INT);
1244
1245             stack_adjust = 16 - stack_adjust;
1246             o(0x48);
1247             oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
1248             args_size += stack_adjust;
1249         }
1250
             /* store the stack-passed arguments of this run; arguments that
                stay in registers are skipped (arg_stored == 0) and remain on
                the value stack */
1251         for(i = run_start; i < run_end;) {
1252             /* Swap argument to top, it will possibly be changed here,
1253               and might use more temps. At the end of the loop we keep
1254               it on the stack and swap it back to its original position
1255               if it is a register. */
1256             SValue tmp = vtop[0];
1257             vtop[0] = vtop[-i];
1258             vtop[-i] = tmp;
1259
1260             mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);
1261
1262             int arg_stored = 1;
1263             switch (vtop->type.t & VT_BTYPE) {
1264             case VT_STRUCT:
1265                 if (mode == x86_64_mode_sse) {
1266                     if (sse_reg > 8)
1267                         sse_reg -= reg_count;
1268                     else
1269                         arg_stored = 0;
1270                 } else if (mode == x86_64_mode_integer) {
1271                     if (gen_reg > REGN)
1272                         gen_reg -= reg_count;
1273                     else
1274                         arg_stored = 0;
1275                 }
1276
1277                 if (arg_stored) {
1278                     /* allocate the necessary size on stack */
1279                     o(0x48);
1280                     oad(0xec81, size); /* sub $xxx, %rsp */
1281                     /* generate structure store */
1282                     r = get_reg(RC_INT);
1283                     orex(1, r, 0, 0x89); /* mov %rsp, r */
1284                     o(0xe0 + REG_VALUE(r));
1285                     vset(&vtop->type, r | VT_LVAL, 0);
1286                     vswap();
1287                     vstore();
1288                     args_size += size;
1289                 }
1290                 break;
1291
1292             case VT_LDOUBLE:
                     /* long double has 16-byte alignment here and therefore
                        always ends a run; it is handled further below */
1293                 assert(0);
1294                 break;
1295
1296             case VT_FLOAT:
1297             case VT_DOUBLE:
1298                 assert(mode == x86_64_mode_sse);
1299                 if (sse_reg > 8) {
1300                     --sse_reg;
1301                     r = gv(RC_FLOAT);
1302                     o(0x50); /* push $rax */
1303                     /* movq %xmmN, (%rsp) */
1304                     o(0xd60f66);
1305                     o(0x04 + REG_VALUE(r)*8);
1306                     o(0x24);
1307                     args_size += size;
1308                 } else {
1309                     arg_stored = 0;
1310                 }
1311                 break;
1312
1313             default:
1314                 assert(mode == x86_64_mode_integer);
1315                 /* simple type */
1316                 /* XXX: implicit cast ? */
1317                 if (gen_reg > REGN) {
1318                     --gen_reg;
1319                     r = gv(RC_INT);
1320                     orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
1321                     args_size += size;
1322                 } else {
1323                     arg_stored = 0;
1324                 }
1325                 break;
1326             }
1327
1328             /* And swap the argument back to its original position.  */
1329             tmp = vtop[0];
1330             vtop[0] = vtop[-i];
1331             vtop[-i] = tmp;
1332
                 /* a stored argument is removed from the value stack, which
                    shortens the run; otherwise move on to the next one */
1333             if (arg_stored) {
1334               vrotb(i+1);
1335               assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
1336               vpop();
1337               --nb_args;
1338               --run_end;
1339             } else {
1340               ++i;
1341             }
1342         }
1343
1344         /* handle 16 byte aligned arguments at end of run */
1345         run_start = i = run_end;
1346         while (i < nb_args) {
1347             /* Rotate argument to top since it will always be popped */
1348             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1349             if (align != 16)
1350               break;
1351
1352             vrotb(i+1);
1353
1354             if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1355                 gv(RC_ST0);
1356                 oad(0xec8148, size); /* sub $xxx, %rsp */
1357                 o(0x7cdb); /* fstpt 0(%rsp) */
1358                 g(0x24);
1359                 g(0x00);
1360                 args_size += size;
1361             } else {
1362                 assert(mode == x86_64_mode_memory);
1363
1364                 /* allocate the necessary size on stack */
1365                 o(0x48);
1366                 oad(0xec81, size); /* sub $xxx, %rsp */
1367                 /* generate structure store */
1368                 r = get_reg(RC_INT);
1369                 orex(1, r, 0, 0x89); /* mov %rsp, r */
1370                 o(0xe0 + REG_VALUE(r));
1371                 vset(&vtop->type, r | VT_LVAL, 0);
1372                 vswap();
1373                 vstore();
1374                 args_size += size;
1375             }
1376
1377             vpop();
1378             --nb_args;
1379         }
1380     }
1381
1382     /* XXX This should be superfluous.  */
1383     save_regs(0); /* save used temporary registers */
1384
1385     /* then, we prepare register passing arguments.
1386        Note that we cannot set RDX and RCX in this loop because gv()
1387        may break these temporary registers. Let's use R10 and R11
1388        instead of them */
1389     assert(gen_reg <= REGN);
1390     assert(sse_reg <= 8);
1391     for(i = 0; i < nb_args; i++) {
1392         mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
1393         /* Alter stack entry type so that gv() knows how to treat it */
1394         vtop->type = type;
1395         if (mode == x86_64_mode_sse) {
1396             if (reg_count == 2) {
1397                 sse_reg -= 2;
1398                 gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
1399                 if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
1400                     /* movaps %xmm0, %xmmN */
1401                     o(0x280f);
1402                     o(0xc0 + (sse_reg << 3));
1403                     /* movaps %xmm1, %xmmN */
1404                     o(0x280f);
1405                     o(0xc1 + ((sse_reg+1) << 3));
1406                 }
1407             } else {
1408                 assert(reg_count == 1);
1409                 --sse_reg;
1410                 /* Load directly to register */
1411                 gv(RC_XMM0 << sse_reg);
1412             }
1413         } else if (mode == x86_64_mode_integer) {
1414             /* simple type */
1415             /* XXX: implicit cast ? */
1416             gen_reg -= reg_count;
1417             r = gv(RC_INT);
1418             int d = arg_prepare_reg(gen_reg);
1419             orex(1,d,r,0x89); /* mov */
1420             o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
                 /* a two-register value has its second half in vtop->r2 */
1421             if (reg_count == 2) {
1422                 d = arg_prepare_reg(gen_reg+1);
1423                 orex(1,d,vtop->r2,0x89); /* mov */
1424                 o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
1425             }
1426         }
1427         vtop--;
1428     }
1429     assert(gen_reg == 0);
1430     assert(sse_reg == 0);
1431
1432     /* We shouldn't have many operands on the stack anymore, but the
1433        call address itself is still there, and it might be in %eax
1434        (or edx/ecx) currently, which the below writes would clobber.
1435        So evict all remaining operands here.  */
1436     save_regs(0);
1437
1438     /* Copy R10 and R11 into RDX and RCX, respectively */
1439     if (nb_reg_args > 2) {
1440         o(0xd2894c); /* mov %r10, %rdx */
1441         if (nb_reg_args > 3) {
1442             o(0xd9894c); /* mov %r11, %rcx */
1443         }
1444     }
1445
1446     oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
1447     gcall_or_jmp(0);
         /* release the stack space taken by stack-passed arguments */
1448     if (args_size)
1449         gadd_sp(args_size);
         /* pop the function address */
1450     vtop--;
1451 }
1452
1453
1454 #define FUNC_PROLOG_SIZE 11 /* bytes skipped by gfunc_prolog() and filled in by gfunc_epilog() */
1455
1456 static void push_arg_reg(int i) {
         /* Reserve 8 more bytes of frame space and store integer argument
            register arg_regs[i] there; the offset of the slot is left in
            'loc'. */
1457     loc -= 8;
1458     gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
1459 }
1460
1461 /* generate function prolog of type 't' */
     /* func_type is the type of the function being compiled; its parameter
        symbols are reached through func_type->ref->next. FUNC_PROLOG_SIZE
        bytes are skipped at the current output position so that
        gfunc_epilog() can write the real prolog once the frame size is
        known. For a variadic function the data needed by va_arg is stored
        and every argument register is saved. Each parameter is then given a
        home: register parameters are spilled into fresh frame slots, stack
        parameters are addressed at positive offsets, and sym_push()
        declares each of them as a local lvalue. */
1462 void gfunc_prolog(CType *func_type)
1463 {
1464     X86_64_Mode mode;
1465     int i, addr, align, size, reg_count;
1466     int param_addr = 0, reg_param_index, sse_param_index;
1467     Sym *sym;
1468     CType *type;
1469
1470     sym = func_type->ref;
         /* stack parameters start above the saved frame pointer and the
            return address */
1471     addr = PTR_SIZE * 2;
1472     loc = 0;
1473     ind += FUNC_PROLOG_SIZE;
1474     func_sub_sp_offset = ind;
1475     func_ret_sub = 0;
1476
1477     if (func_type->ref->c == FUNC_ELLIPSIS) {
1478         int seen_reg_num, seen_sse_num, seen_stack_size;
1479         seen_reg_num = seen_sse_num = 0;
1480         /* frame pointer and return address */
1481         seen_stack_size = PTR_SIZE * 2;
1482         /* count the number of seen parameters */
1483         sym = func_type->ref;
1484         while ((sym = sym->next) != NULL) {
1485             type = &sym->type;
1486             mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1487             switch (mode) {
1488             default:
1489             stack_arg:
1490                 seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
1491                 break;
1492
1493             case x86_64_mode_integer:
                     /* Only REGN integer registers carry arguments (see
                        arg_regs) and only REGN are saved below. A larger
                        limit would count named parameters that really live
                        on the stack as register parameters and leave
                        seen_stack_size too small. */
1494                 if (seen_reg_num + reg_count <= REGN) {
1495                     seen_reg_num += reg_count;
1496                 } else {
1497                     seen_reg_num = REGN;
1498                     goto stack_arg;
1499                 }
1500                 break;
1501
1502             case x86_64_mode_sse:
1503                 if (seen_sse_num + reg_count <= 8) {
1504                     seen_sse_num += reg_count;
1505                 } else {
1506                     seen_sse_num = 8;
1507                     goto stack_arg;
1508                 }
1509                 break;
1510             }
1511         }
1512
             /* Three 32-bit values at -16, -12 and -8 from %rbp: the byte
                offset of the next integer register slot, of the next SSE
                slot (the SSE slots follow the 48 bytes of integer slots),
                and of the first variadic stack argument.
                NOTE(review): presumably these are the fields stdarg.h reads
                for va_start - confirm against stdarg.h. */
1513         loc -= 16;
1514         /* movl $0x????????, -0x10(%rbp) */
1515         o(0xf045c7);
1516         gen_le32(seen_reg_num * 8);
1517         /* movl $0x????????, -0xc(%rbp) */
1518         o(0xf445c7);
1519         gen_le32(seen_sse_num * 16 + 48);
1520         /* movl $0x????????, -0x8(%rbp) */
1521         o(0xf845c7);
1522         gen_le32(seen_stack_size);
1523
1524         /* save all register passing arguments */
             /* SSE registers 7 down to 0, 16 bytes per slot: the low 8 bytes
                hold the register's low quadword, the high 8 bytes are zeroed */
1525         for (i = 0; i < 8; i++) {
1526             loc -= 16;
1527             o(0xd60f66); /* movq */
1528             gen_modrm(7 - i, VT_LOCAL, NULL, loc);
1529             /* movq $0, loc+8(%rbp) */
1530             o(0x85c748);
1531             gen_le32(loc + 8);
1532             gen_le32(0);
1533         }
             /* then the integer registers, last one first, so that they end
                up in argument order at increasing addresses */
1534         for (i = 0; i < REGN; i++) {
1535             push_arg_reg(REGN-1-i);
1536         }
1537     }
1538
1539     sym = func_type->ref;
1540     reg_param_index = 0;
1541     sse_param_index = 0;
1542
1543     /* if the function returns a structure, then add an
1544        implicit pointer parameter */
1545     func_vt = sym->type;
1546     mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
1547     if (mode == x86_64_mode_memory) {
1548         push_arg_reg(reg_param_index);
1549         func_vc = loc;
1550         reg_param_index++;
1551     }
1552     /* define parameters */
1553     while ((sym = sym->next) != NULL) {
1554         type = &sym->type;
1555         mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1556         switch (mode) {
1557         case x86_64_mode_sse:
1558             if (sse_param_index + reg_count <= 8) {
1559                 /* save arguments passed by register */
1560                 loc -= reg_count * 8;
1561                 param_addr = loc;
1562                 for (i = 0; i < reg_count; ++i) {
1563                     o(0xd60f66); /* movq */
1564                     gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
1565                     ++sse_param_index;
1566                 }
1567             } else {
                     /* no SSE registers left: the caller put it on the stack */
1568                 addr = (addr + align - 1) & -align;
1569                 param_addr = addr;
1570                 addr += size;
1571             }
1572             break;
1573
1574         case x86_64_mode_memory:
1575         case x86_64_mode_x87:
1576             addr = (addr + align - 1) & -align;
1577             param_addr = addr;
1578             addr += size;
1579             break;
1580
1581         case x86_64_mode_integer: {
1582             if (reg_param_index + reg_count <= REGN) {
1583                 /* save arguments passed by register */
1584                 loc -= reg_count * 8;
1585                 param_addr = loc;
1586                 for (i = 0; i < reg_count; ++i) {
1587                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
1588                     ++reg_param_index;
1589                 }
1590             } else {
                     /* no integer registers left: the caller put it on the
                        stack */
1591                 addr = (addr + align - 1) & -align;
1592                 param_addr = addr;
1593                 addr += size;
1594             }
1595             break;
1596         }
1597         default: break; /* nothing to be done for x86_64_mode_none */
1598         }
1599         sym_push(sym->v & ~SYM_FIELD, type,
1600                  VT_LOCAL | VT_LVAL, param_addr);
1601     }
1602
1603 #ifdef CONFIG_TCC_BCHECK
1604     /* leave some room for bound checking code */
1605     if (tcc_state->do_bounds_check) {
1606         func_bound_offset = lbounds_section->data_offset;
1607         func_bound_ind = ind;
1608         oad(0xb8, 0); /* lbound section pointer */
1609         o(0xc78948);  /* mov  %rax,%rdi ## first arg in %rdi, this must be ptr */
1610         oad(0xb8, 0); /* call to function */
1611     }
1612 #endif
1613 }
1614
1615 /* generate function epilog */
     /* Emits 'leave' and 'ret' (or 'ret n' when func_ret_sub is non-zero)
        and then writes the prolog into the FUNC_PROLOG_SIZE bytes that
        gfunc_prolog() skipped, using the frame size now known from 'loc'.
        With bound checking enabled and local bounds recorded, the calls to
        __bound_local_new / __bound_local_delete are generated as well. */
1616 void gfunc_epilog(void)
1617 {
1618     int v, saved_ind;
1619
1620 #ifdef CONFIG_TCC_BCHECK
1621     if (tcc_state->do_bounds_check
1622         && func_bound_offset != lbounds_section->data_offset)
1623     {
             /* note: this saved_ind shadows the function-level one */
1624         addr_t saved_ind;
1625         addr_t *bounds_ptr;
1626         Sym *sym_data;
1627
1628         /* add end of table info */
1629         bounds_ptr = section_ptr_add(lbounds_section, sizeof(addr_t));
1630         *bounds_ptr = 0;
1631
1632         /* generate bound local allocation */
             /* this fills in the room left by gfunc_prolog() at
                func_bound_ind.
                NOTE(review): relocation type R_386_32 is used in this x86-64
                generator - confirm that it is the intended type */
1633         sym_data = get_sym_ref(&char_pointer_type, lbounds_section,
1634                                func_bound_offset, lbounds_section->data_offset);
1635         saved_ind = ind;
1636         ind = func_bound_ind;
1637         greloc(cur_text_section, sym_data, ind + 1, R_386_32);
1638         ind = ind + 5 + 3;
1639         gen_static_call(TOK___bound_local_new);
1640         ind = saved_ind;
1641
1642         /* generate bound check local freeing */
1643         o(0x5250); /* save returned value, if any */
1644         greloc(cur_text_section, sym_data, ind + 1, R_386_32);
1645         oad(0xb8, 0); /* mov xxx, %rax */
1646         o(0xc78948);  /* mov  %rax,%rdi ## first arg in %rdi, this must be ptr */
1647         gen_static_call(TOK___bound_local_delete);
1648         o(0x585a); /* restore returned value, if any */
1649     }
1650 #endif
1651     o(0xc9); /* leave */
1652     if (func_ret_sub == 0) {
1653         o(0xc3); /* ret */
1654     } else {
1655         o(0xc2); /* ret n */
1656         g(func_ret_sub);
1657         g(func_ret_sub >> 8);
1658     }
1659     /* frame size: the locals, rounded up to a multiple of 16 */
1660     v = (-loc + 15) & -16;
         /* go back and write the prolog (4 + 3 + 4 = FUNC_PROLOG_SIZE bytes),
            then resume at the end of the function */
1661     saved_ind = ind;
1662     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
1663     o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
1664     o(0xec8148);  /* sub rsp, stacksize */
1665     gen_le32(v);
1666     ind = saved_ind;
1667 }
1668
1669 #endif /* not PE */
1670
1671 /* generate a jump to a label */
     /* Emits opcode 0xe9 through psym() with 't' as the label argument and
        returns psym()'s result. NOTE(review): presumably 't' is the head of
        a chain of not yet resolved jumps and the return value the new head -
        confirm against psym(). */
1672 int gjmp(int t)
1673 {
1674     return psym(0xe9, t);
1675 }
1676
1677 /* Jump to the known address 'a', using the short form when it reaches. */
1678 void gjmp_addr(int a)
1679 {
1680     const int rel8 = a - ind - 2; /* measured from the end of a 2-byte jump */
1681
1682     if (rel8 != (char)rel8) {
1683         oad(0xe9, a - ind - 5); /* 5-byte jump: opcode plus 32-bit offset */
1684         return;
1685     }
1686     g(0xeb); /* 2-byte jump: opcode plus 8-bit offset */
1687     g(rel8);
1688 }
1689
1690 /* generate a test. set 'inv' to invert test. Stack entry is popped */
     /* 't' is passed to, and replaced by the result of, every jump emitted
        here; the final value is returned. A stack entry that is neither
        VT_CMP nor VT_JMP/VT_JMPI is simply popped and 't' is returned
        unchanged. */
1691 int gtst(int inv, int t)
1692 {
1693     int v, *p;
1694
1695     v = vtop->r & VT_VALMASK;
1696     if (v == VT_CMP) {
1697         /* fast case : can jump directly since flags are set */
1698         if (vtop->c.i & 0x100)
1699           {
1700             /* This was a float compare.  If the parity flag is set
1701                the result was unordered.  For anything except != this
1702                means false and we don't jump (anding both conditions).
1703                For != this means true (oring both).
1704                Take care about inverting the test.  We need to jump
1705                to our target if the result was unordered and test wasn't NE,
1706                otherwise if unordered we don't want to jump.  */
1707             vtop->c.i &= ~0x100;
1708             if (!inv == (vtop->c.i != TOK_NE))
1709               o(0x067a);  /* jp +6 */
1710             else
1711               {
1712                 g(0x0f);
1713                 t = psym(0x8a, t); /* jp t */
1714               }
1715           }
             /* conditional jump: the second opcode byte is derived from the
                comparison token, and xor-ing with 'inv' flips its lowest bit.
                NOTE(review): presumably the token values are laid out so that
                (token - 16) is the matching 0x8x jcc opcode - confirm */
1716         g(0x0f);
1717         t = psym((vtop->c.i - 16) ^ inv, t);
1718     } else if (v == VT_JMP || v == VT_JMPI) {
1719         /* && or || optimization */
1720         if ((v & 1) == inv) {
1721             /* insert vtop->c jump list in t */
                 /* walk the list stored in the code itself until its
                    zero terminator, then append t */
1722             p = &vtop->c.i;
1723             while (*p != 0)
1724                 p = (int *)(cur_text_section->data + *p);
1725             *p = t;
1726             t = vtop->c.i;
1727         } else {
1728             t = gjmp(t);
1729             gsym(vtop->c.i);
1730         }
1731     }
1732     vtop--;
1733     return t;
1734 }
1735
1736 /* generate an integer binary operation */
     /* The operands are vtop[-1] (left) and vtop[0] (right). One stack entry
        is popped and the remaining top entry describes the result. For the
        comparison tokens TOK_ULT..TOK_GT a cmp is emitted and the result
        becomes a VT_CMP entry carrying the token. */
1737 void gen_opi(int op)
1738 {
1739     int r, fr, opc, c;
1740     int ll, uu, cc;
1741
         /* ll: 64-bit operation, uu: left operand is unsigned,
            cc: right operand is a plain constant (no lvalue, no symbol) */
1742     ll = is64_type(vtop[-1].type.t);
1743     uu = (vtop[-1].type.t & VT_UNSIGNED) != 0;
1744     cc = (vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST;
1745
1746     switch(op) {
1747     case '+':
1748     case TOK_ADDC1: /* add with carry generation */
1749         opc = 0;
1750     gen_op8:
             /* shared by the operations that differ only in 'opc';
                the immediate form is used only when a 64-bit constant
                survives truncation to int */
1751         if (cc && (!ll || (int)vtop->c.ll == vtop->c.ll)) {
1752             /* constant case */
1753             vswap();
1754             r = gv(RC_INT);
1755             vswap();
1756             c = vtop->c.i;
                 /* opcode 0x83 takes an 8-bit immediate, 0x81 a 32-bit one */
1757             if (c == (char)c) {
1758                 /* XXX: generate inc and dec for smaller code ? */
1759                 orex(ll, r, 0, 0x83);
1760                 o(0xc0 | (opc << 3) | REG_VALUE(r));
1761                 g(c);
1762             } else {
1763                 orex(ll, r, 0, 0x81);
1764                 oad(0xc0 | (opc << 3) | REG_VALUE(r), c);
1765             }
1766         } else {
1767             gv2(RC_INT, RC_INT);
1768             r = vtop[-1].r;
1769             fr = vtop[0].r;
1770             orex(ll, r, fr, (opc << 3) | 0x01);
1771             o(0xc0 + REG_VALUE(r) + REG_VALUE(fr) * 8);
1772         }
1773         vtop--;
             /* a comparison leaves its result in the flags */
1774         if (op >= TOK_ULT && op <= TOK_GT) {
1775             vtop->r = VT_CMP;
1776             vtop->c.i = op;
1777         }
1778         break;
1779     case '-':
1780     case TOK_SUBC1: /* sub with carry generation */
1781         opc = 5;
1782         goto gen_op8;
1783     case TOK_ADDC2: /* add with carry use */
1784         opc = 2;
1785         goto gen_op8;
1786     case TOK_SUBC2: /* sub with carry use */
1787         opc = 3;
1788         goto gen_op8;
1789     case '&':
1790         opc = 4;
1791         goto gen_op8;
1792     case '^':
1793         opc = 6;
1794         goto gen_op8;
1795     case '|':
1796         opc = 1;
1797         goto gen_op8;
1798     case '*':
1799         gv2(RC_INT, RC_INT);
1800         r = vtop[-1].r;
1801         fr = vtop[0].r;
1802         orex(ll, fr, r, 0xaf0f); /* imul fr, r */
1803         o(0xc0 + REG_VALUE(fr) + REG_VALUE(r) * 8);
1804         vtop--;
1805         break;
1806     case TOK_SHL:
1807         opc = 4;
1808         goto gen_shift;
1809     case TOK_SHR:
1810         opc = 5;
1811         goto gen_shift;
1812     case TOK_SAR:
1813         opc = 7;
1814     gen_shift:
1815         opc = 0xc0 | (opc << 3);
1816         if (cc) {
1817             /* constant case */
1818             vswap();
1819             r = gv(RC_INT);
1820             vswap();
1821             orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
1822             o(opc | REG_VALUE(r));
                 /* the count is reduced modulo the operand width */
1823             g(vtop->c.i & (ll ? 63 : 31));
1824         } else {
1825             /* we generate the shift in ecx */
1826             gv2(RC_INT, RC_RCX);
1827             r = vtop[-1].r;
1828             orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
1829             o(opc | REG_VALUE(r));
1830         }
1831         vtop--;
1832         break;
1833     case TOK_UDIV:
1834     case TOK_UMOD:
1835         uu = 1;
1836         goto divmod;
1837     case '/':
1838     case '%':
1839     case TOK_PDIV:
1840         uu = 0;
1841     divmod:
1842         /* first operand must be in eax */
1843         /* XXX: need better constraint for second operand */
1844         gv2(RC_RAX, RC_RCX);
1845         r = vtop[-1].r;
1846         fr = vtop[0].r;
1847         vtop--;
             /* rdx is overwritten below, so its current content is saved */
1848         save_reg(TREG_RDX);
1849         orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cqto */
1850         orex(ll, fr, 0, 0xf7); /* div fr, %eax */
1851         o((uu ? 0xf0 : 0xf8) + REG_VALUE(fr));
             /* the remainder is taken from rdx, the quotient from rax */
1852         if (op == '%' || op == TOK_UMOD)
1853             r = TREG_RDX;
1854         else
1855             r = TREG_RAX;
1856         vtop->r = r;
1857         break;
1858     default:
             /* every remaining operator is emitted as a cmp (opc 7) */
1859         opc = 7;
1860         goto gen_op8;
1861     }
1862 }
1863
1864 void gen_opl(int op)
1865 {
         /* 64-bit operations need no separate path here: gen_opi() checks
            the operand width itself (its 'll' flag). */
1866     gen_opi(op);
1867 }
1868
1869 /* generate a floating point operation 'v = t1 op t2' instruction. The
1870    two operands are guaranted to have the same floating point type */
1871 /* XXX: need to use ST1 too */
1872 void gen_opf(int op)
1873 {
1874     int a, ft, fc, swapped, r;
1875     int float_type =
1876         (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;
1877
1878     /* convert constants to memory references */
1879     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
1880         vswap();
1881         gv(float_type);
1882         vswap();
1883     }
1884     if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
1885         gv(float_type);
1886
1887     /* must put at least one value in the floating point register */
1888     if ((vtop[-1].r & VT_LVAL) &&
1889         (vtop[0].r & VT_LVAL)) {
1890         vswap();
1891         gv(float_type);
1892         vswap();
1893     }
1894     swapped = 0;
1895     /* swap the stack if needed so that t1 is the register and t2 is
1896        the memory reference */
1897     if (vtop[-1].r & VT_LVAL) {
1898         vswap();
1899         swapped = 1;
1900     }
1901     if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1902         if (op >= TOK_ULT && op <= TOK_GT) {
1903             /* load on stack second operand */
1904             load(TREG_ST0, vtop);
1905             save_reg(TREG_RAX); /* eax is used by FP comparison code */
1906             if (op == TOK_GE || op == TOK_GT)
1907                 swapped = !swapped;
1908             else if (op == TOK_EQ || op == TOK_NE)
1909                 swapped = 0;
1910             if (swapped)
1911                 o(0xc9d9); /* fxch %st(1) */
1912             if (op == TOK_EQ || op == TOK_NE)
1913                 o(0xe9da); /* fucompp */
1914             else
1915                 o(0xd9de); /* fcompp */
1916             o(0xe0df); /* fnstsw %ax */
1917             if (op == TOK_EQ) {
1918                 o(0x45e480); /* and $0x45, %ah */
1919                 o(0x40fC80); /* cmp $0x40, %ah */
1920             } else if (op == TOK_NE) {
1921                 o(0x45e480); /* and $0x45, %ah */
1922                 o(0x40f480); /* xor $0x40, %ah */
1923                 op = TOK_NE;
1924             } else if (op == TOK_GE || op == TOK_LE) {
1925                 o(0x05c4f6); /* test $0x05, %ah */
1926                 op = TOK_EQ;
1927             } else {
1928                 o(0x45c4f6); /* test $0x45, %ah */
1929                 op = TOK_EQ;
1930             }
1931             vtop--;
1932             vtop->r = VT_CMP;
1933             vtop->c.i = op;
1934         } else {
1935             /* no memory reference possible for long double operations */
1936             load(TREG_ST0, vtop);
1937             swapped = !swapped;
1938
1939             switch(op) {
1940             default:
1941             case '+':
1942                 a = 0;
1943                 break;
1944             case '-':
1945                 a = 4;
1946                 if (swapped)
1947                     a++;
1948                 break;
1949             case '*':
1950                 a = 1;
1951                 break;
1952             case '/':
1953                 a = 6;
1954                 if (swapped)
1955                     a++;
1956                 break;
1957             }
1958             ft = vtop->type.t;
1959             fc = vtop->c.ul;
1960             o(0xde); /* fxxxp %st, %st(1) */
1961             o(0xc1 + (a << 3));
1962             vtop--;
1963         }
1964     } else {
1965         if (op >= TOK_ULT && op <= TOK_GT) {
1966             /* if saved lvalue, then we must reload it */
1967             r = vtop->r;
1968             fc = vtop->c.ul;
1969             if ((r & VT_VALMASK) == VT_LLOCAL) {
1970                 SValue v1;
1971                 r = get_reg(RC_INT);
1972                 v1.type.t = VT_PTR;
1973                 v1.r = VT_LOCAL | VT_LVAL;
1974                 v1.c.ul = fc;
1975                 load(r, &v1);
1976                 fc = 0;
1977             }
1978
1979             if (op == TOK_EQ || op == TOK_NE) {
1980                 swapped = 0;
1981             } else {
1982                 if (op == TOK_LE || op == TOK_LT)
1983                     swapped = !swapped;
1984                 if (op == TOK_LE || op == TOK_GE) {
1985                     op = 0x93; /* setae */
1986                 } else {
1987                     op = 0x97; /* seta */
1988                 }
1989             }
1990
1991             if (swapped) {
1992                 gv(RC_FLOAT);
1993                 vswap();
1994             }
1995             assert(!(vtop[-1].r & VT_LVAL));
1996
1997             if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
1998                 o(0x66);
1999             if (op == TOK_EQ || op == TOK_NE)
2000                 o(0x2e0f); /* ucomisd */
2001             else
2002                 o(0x2f0f); /* comisd */
2003
2004             if (vtop->r & VT_LVAL) {
2005                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
2006             } else {
2007                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
2008             }
2009
2010             vtop--;
2011             vtop->r = VT_CMP;
2012             vtop->c.i = op | 0x100;
2013         } else {
2014             assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
2015             switch(op) {
2016             default:
2017             case '+':
2018                 a = 0;
2019                 break;
2020             case '-':
2021                 a = 4;
2022                 break;
2023             case '*':
2024                 a = 1;
2025                 break;
2026             case '/':
2027                 a = 6;
2028                 break;
2029             }
2030             ft = vtop->type.t;
2031             fc = vtop->c.ul;
2032             assert((ft & VT_BTYPE) != VT_LDOUBLE);
2033
2034             r = vtop->r;
2035             /* if saved lvalue, then we must reload it */
2036             if ((vtop->r & VT_VALMASK) == VT_LLOCAL) {
2037                 SValue v1;
2038                 r = get_reg(RC_INT);
2039                 v1.type.t = VT_PTR;
2040                 v1.r = VT_LOCAL | VT_LVAL;
2041                 v1.c.ul = fc;
2042                 load(r, &v1);
2043                 fc = 0;
2044             }
2045
2046             assert(!(vtop[-1].r & VT_LVAL));
2047             if (swapped) {
2048                 assert(vtop->r & VT_LVAL);
2049                 gv(RC_FLOAT);
2050                 vswap();
2051             }
2052
2053             if ((ft & VT_BTYPE) == VT_DOUBLE) {
2054                 o(0xf2);
2055             } else {
2056                 o(0xf3);
2057             }
2058             o(0x0f);
2059             o(0x58 + a);
2060
2061             if (vtop->r & VT_LVAL) {
2062                 gen_modrm(vtop[-1].r, r, vtop->sym, fc);
2063             } else {
2064                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8);
2065             }
2066
2067             vtop--;
2068         }
2069     }
2070 }
2071
2072 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
2073    and 'long long' cases. */
2074 void gen_cvt_itof(int t)
2075 {
2076     if ((t & VT_BTYPE) == VT_LDOUBLE) {
2077         save_reg(TREG_ST0);
2078         gv(RC_INT);
2079         if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
2080             /* signed long long to float/double/long double (unsigned case
2081                is handled generically) */
2082             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2083             o(0x242cdf); /* fildll (%rsp) */
2084             o(0x08c48348); /* add $8, %rsp */
2085         } else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2086                    (VT_INT | VT_UNSIGNED)) {
2087             /* unsigned int to float/double/long double */
2088             o(0x6a); /* push $0 */
2089             g(0x00);
2090             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2091             o(0x242cdf); /* fildll (%rsp) */
2092             o(0x10c48348); /* add $16, %rsp */
2093         } else {
2094             /* int to float/double/long double */
2095             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2096             o(0x2404db); /* fildl (%rsp) */
2097             o(0x08c48348); /* add $8, %rsp */
2098         }
2099         vtop->r = TREG_ST0;
2100     } else {
2101         int r = get_reg(RC_FLOAT);
2102         gv(RC_INT);
2103         o(0xf2 + ((t & VT_BTYPE) == VT_FLOAT?1:0));
2104         if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2105             (VT_INT | VT_UNSIGNED) ||
2106             (vtop->type.t & VT_BTYPE) == VT_LLONG) {
2107             o(0x48); /* REX */
2108         }
2109         o(0x2a0f);
2110         o(0xc0 + (vtop->r & VT_VALMASK) + REG_VALUE(r)*8); /* cvtsi2sd */
2111         vtop->r = r;
2112     }
2113 }
2114
2115 /* convert from one floating point type to another */
2116 void gen_cvt_ftof(int t)
2117 {
2118     int ft, bt, tbt;
2119
2120     ft = vtop->type.t;
2121     bt = ft & VT_BTYPE;
2122     tbt = t & VT_BTYPE;
2123
2124     if (bt == VT_FLOAT) {
2125         gv(RC_FLOAT);
2126         if (tbt == VT_DOUBLE) {
2127             o(0x140f); /* unpcklps */
2128             o(0xc0 + REG_VALUE(vtop->r)*9);
2129             o(0x5a0f); /* cvtps2pd */
2130             o(0xc0 + REG_VALUE(vtop->r)*9);
2131         } else if (tbt == VT_LDOUBLE) {
2132             save_reg(RC_ST0);
2133             /* movss %xmm0,-0x10(%rsp) */
2134             o(0x110ff3);
2135             o(0x44 + REG_VALUE(vtop->r)*8);
2136             o(0xf024);
2137             o(0xf02444d9); /* flds -0x10(%rsp) */
2138             vtop->r = TREG_ST0;
2139         }
2140     } else if (bt == VT_DOUBLE) {
2141         gv(RC_FLOAT);
2142         if (tbt == VT_FLOAT) {
2143             o(0x140f66); /* unpcklpd */
2144             o(0xc0 + REG_VALUE(vtop->r)*9);
2145             o(0x5a0f66); /* cvtpd2ps */
2146             o(0xc0 + REG_VALUE(vtop->r)*9);
2147         } else if (tbt == VT_LDOUBLE) {
2148             save_reg(RC_ST0);
2149             /* movsd %xmm0,-0x10(%rsp) */
2150             o(0x110ff2);
2151             o(0x44 + REG_VALUE(vtop->r)*8);
2152             o(0xf024);
2153             o(0xf02444dd); /* fldl -0x10(%rsp) */
2154             vtop->r = TREG_ST0;
2155         }
2156     } else {
2157         int r;
2158         gv(RC_ST0);
2159         r = get_reg(RC_FLOAT);
2160         if (tbt == VT_DOUBLE) {
2161             o(0xf0245cdd); /* fstpl -0x10(%rsp) */
2162             /* movsd -0x10(%rsp),%xmm0 */
2163             o(0x100ff2);
2164             o(0x44 + REG_VALUE(r)*8);
2165             o(0xf024);
2166             vtop->r = r;
2167         } else if (tbt == VT_FLOAT) {
2168             o(0xf0245cd9); /* fstps -0x10(%rsp) */
2169             /* movss -0x10(%rsp),%xmm0 */
2170             o(0x100ff3);
2171             o(0x44 + REG_VALUE(r)*8);
2172             o(0xf024);
2173             vtop->r = r;
2174         }
2175     }
2176 }
2177
2178 /* convert fp to int 't' type */
2179 void gen_cvt_ftoi(int t)
2180 {
2181     int ft, bt, size, r;
2182     ft = vtop->type.t;
2183     bt = ft & VT_BTYPE;
2184     if (bt == VT_LDOUBLE) {
2185         gen_cvt_ftof(VT_DOUBLE);
2186         bt = VT_DOUBLE;
2187     }
2188
2189     gv(RC_FLOAT);
2190     if (t != VT_INT)
2191         size = 8;
2192     else
2193         size = 4;
2194
2195     r = get_reg(RC_INT);
2196     if (bt == VT_FLOAT) {
2197         o(0xf3);
2198     } else if (bt == VT_DOUBLE) {
2199         o(0xf2);
2200     } else {
2201         assert(0);
2202     }
2203     orex(size == 8, r, 0, 0x2c0f); /* cvttss2si or cvttsd2si */
2204     o(0xc0 + REG_VALUE(vtop->r) + REG_VALUE(r)*8);
2205     vtop->r = r;
2206 }
2207
2208 /* computed goto support */
2209 void ggoto(void)
2210 {
2211     gcall_or_jmp(1);
2212     vtop--;
2213 }
2214
2215 /* Save the stack pointer onto the stack and return the location of its address */
2216 ST_FUNC void gen_vla_sp_save(int addr) {
2217     /* mov %rsp,addr(%rbp)*/
2218     gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
2219 }
2220
2221 /* Restore the SP from a location on the stack */
2222 ST_FUNC void gen_vla_sp_restore(int addr) {
2223     gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr);
2224 }
2225
2226 /* Subtract from the stack pointer, and push the resulting value onto the stack */
2227 ST_FUNC void gen_vla_alloc(CType *type, int align) {
2228 #ifdef TCC_TARGET_PE
2229     /* alloca does more than just adjust %rsp on Windows */
2230     vpush_global_sym(&func_old_type, TOK_alloca);
2231     vswap(); /* Move alloca ref past allocation size */
2232     gfunc_call(1);
2233     vset(type, REG_IRET, 0);
2234 #else
2235     int r;
2236     r = gv(RC_INT); /* allocation size */
2237     /* sub r,%rsp */
2238     o(0x2b48);
2239     o(0xe0 | REG_VALUE(r));
2240     /* We align to 16 bytes rather than align */
2241     /* and ~15, %rsp */
2242     o(0xf0e48348);
2243     /* mov %rsp, r */
2244     o(0x8948);
2245     o(0xe0 | REG_VALUE(r));
2246     vpop();
2247     vset(type, r, 0);
2248 #endif
2249 }
2250
2251
2252 /* end of x86-64 code generator */
2253 /*************************************************************/
2254 #endif /* ! TARGET_DEFS_ONLY */
2255 /******************************************************/