x86_64-gen.c

   1 /*
   2  *  x86-64 code generator for TCC
   3  *
   4  *  Copyright (c) 2008 Shinichiro Hamaji
   5  *
   6  *  Based on i386-gen.c by Fabrice Bellard
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #ifdef TARGET_DEFS_ONLY
  24
  25 /* number of available registers: this is the size of reg_classes[],
        whose indices are the TREG_xxx values (0 .. TREG_ST0) */
  26 #define NB_REGS         25
  27 #define NB_ASM_REGS     8
  28
  29 /* a register can belong to several classes. The classes must be
  30    sorted from more general to more precise (see gv2() code which does
  31    assumptions on it).
        Every RC_xxx value below is a distinct bit, so the classes of one
        register are OR-ed together in reg_classes[]. */
  32 #define RC_INT     0x0001 /* generic integer register */
  33 #define RC_FLOAT   0x0002 /* generic float register */
  34 #define RC_RAX     0x0004
  35 #define RC_RCX     0x0008
  36 #define RC_RDX     0x0010
  37 #define RC_ST0     0x0080 /* only for long double */
  38 #define RC_R8      0x0100
  39 #define RC_R9      0x0200
  40 #define RC_R10     0x0400
  41 #define RC_R11     0x0800
  42 #define RC_XMM0    0x1000
  43 #define RC_XMM1    0x2000
  44 #define RC_XMM2    0x4000
  45 #define RC_XMM3    0x8000
  46 #define RC_XMM4    0x10000
  47 #define RC_XMM5    0x20000
  48 #define RC_XMM6    0x40000
  49 #define RC_XMM7    0x80000
     /* aliases naming the classes of the registers that carry return values */
  50 #define RC_IRET    RC_RAX /* function return: integer register */
  51 #define RC_LRET    RC_RDX /* function return: second integer register */
  52 #define RC_FRET    RC_XMM0 /* function return: float register */
  53 #define RC_QRET    RC_XMM1 /* function return: second float register */
  54
  55 /* pretty names for the registers.
        The values index reg_classes[].  REG_VALUE() keeps the low three
        bits for the ModRM/opcode register field and REX_BASE() takes bit 3
        for the REX prefix, so the numbering is chosen to make both work. */
  56 enum {
  57     TREG_RAX = 0,
  58     TREG_RCX = 1,
  59     TREG_RDX = 2,
  60     TREG_RSP = 4,
  61     TREG_RSI = 6,
  62     TREG_RDI = 7,
  63
  64     TREG_R8  = 8,
  65     TREG_R9  = 9,
  66     TREG_R10 = 10,
  67     TREG_R11 = 11,
  68
         /* xmm registers start at 16, so REG_VALUE() yields 0..7 for them */
  69     TREG_XMM0 = 16,
  70     TREG_XMM1 = 17,
  71     TREG_XMM2 = 18,
  72     TREG_XMM3 = 19,
  73     TREG_XMM4 = 20,
  74     TREG_XMM5 = 21,
  75     TREG_XMM6 = 22,
  76     TREG_XMM7 = 23,
  77
  78     TREG_ST0 = 24, /* last index of reg_classes[] (NB_REGS - 1) */
  79
         /* flag OR-ed onto a register number (see load()): the register then
            holds an address to be dereferenced (see gen_modrm_impl()) */
  80     TREG_MEM = 0x20,
  81 };
  82
  83 #define REX_BASE(reg) (((reg) >> 3) & 1) /* bit 3 of the register number: goes into the REX prefix */
  84 #define REG_VALUE(reg) ((reg) & 7) /* low 3 bits of the register number: goes into ModRM/opcode */
  85
  86 /* return registers for function */
  87 #define REG_IRET TREG_RAX /* single word int return register */
  88 #define REG_LRET TREG_RDX /* second word return register (for long long) */
  89 #define REG_FRET TREG_XMM0 /* float return register */
  90 #define REG_QRET TREG_XMM1 /* second float return register */
  91
  92 /* defined if function parameters must be evaluated in reverse order */
  93 #define INVERT_FUNC_PARAMS
  94
  95 /* pointer size, in bytes */
  96 #define PTR_SIZE 8
  97
  98 /* long double size and alignment, in bytes */
  99 #define LDOUBLE_SIZE  16
 100 #define LDOUBLE_ALIGN 16
 101 /* maximum alignment (for aligned attribute support) */
 102 #define MAX_ALIGN     16
 103
 104 /******************************************************/
 105 /* ELF defines */
 106
     /* ELF machine type of the generated objects */
 107 #define EM_TCC_TARGET EM_X86_64
 108
 109 /* relocation type for 32 bit data relocation */
 110 #define R_DATA_32   R_X86_64_32
     /* the remaining generic relocation names map onto their x86-64 types */
 111 #define R_DATA_PTR  R_X86_64_64
 112 #define R_JMP_SLOT  R_X86_64_JUMP_SLOT
 113 #define R_COPY      R_X86_64_COPY
 114
     /* NOTE(review): presumably the load address and page size used when
        laying out executables -- confirm against the ELF output code */
 115 #define ELF_START_ADDR 0x400000
 116 #define ELF_PAGE_SIZE  0x200000
 117
 118 /******************************************************/
 119 #else /* ! TARGET_DEFS_ONLY */
 120 /******************************************************/
 121 #include "tcc.h"
 122 #include <assert.h>
 123
     /* Register class mask (RC_xxx bits) of every register, indexed by its
        TREG_xxx number.  Entries that are 0 belong to no class.  r8..r11
        only carry their own specific class bit, not RC_INT. */
 124 ST_DATA const int reg_classes[NB_REGS] = {
 125     /* eax */ RC_INT | RC_RAX,
 126     /* ecx */ RC_INT | RC_RCX,
 127     /* edx */ RC_INT | RC_RDX,
 128     0,
 129     0,
 130     0,
 131     0,
 132     0,
 133     RC_R8,
 134     RC_R9,
 135     RC_R10,
 136     RC_R11,
 137     0,
 138     0,
 139     0,
 140     0,
 141     /* xmm0 */ RC_FLOAT | RC_XMM0,
 142     /* xmm1 */ RC_FLOAT | RC_XMM1,
 143     /* xmm2 */ RC_FLOAT | RC_XMM2,
 144     /* xmm3 */ RC_FLOAT | RC_XMM3,
 145     /* xmm4 */ RC_FLOAT | RC_XMM4,
 146     /* xmm5 */ RC_FLOAT | RC_XMM5,
 147     /* xmm6 and xmm7 are included so gv() can be used on them,
 148        but they are not tagged with RC_FLOAT because they are
 149        callee saved on Windows */
 150     RC_XMM6,
 151     RC_XMM7,
 152     /* st0 */ RC_ST0
 153 };
 154
 155 static unsigned long func_sub_sp_offset; /* value of 'ind' just after the space reserved for the prolog (set in gfunc_prolog, used by gfunc_epilog to go back and emit it) */
 156 static int func_ret_sub; /* when non zero, gfunc_epilog emits 'ret n' with this 16 bit operand instead of a plain 'ret' */
 157
 158 /* XXX: make it faster ? */
 159 void g(int c)
 160 {
 161     int ind1;
 162     ind1 = ind + 1;
 163     if (ind1 > cur_text_section->data_allocated)
 164         section_realloc(cur_text_section, ind1);
 165     cur_text_section->data[ind] = c;
 166     ind = ind1;
 167 }
 168
 169 void o(unsigned int c)
 170 {
 171     while (c) {
 172         g(c);
 173         c = c >> 8;
 174     }
 175 }
 176
 177 void gen_le16(int v)
 178 {
 179     g(v);
 180     g(v >> 8);
 181 }
 182
 183 void gen_le32(int c)
 184 {
 185     g(c);
 186     g(c >> 8);
 187     g(c >> 16);
 188     g(c >> 24);
 189 }
 190
 191 void gen_le64(int64_t c)
 192 {
 193     g(c);
 194     g(c >> 8);
 195     g(c >> 16);
 196     g(c >> 24);
 197     g(c >> 32);
 198     g(c >> 40);
 199     g(c >> 48);
 200     g(c >> 56);
 201 }
 202
 203 void orex(int ll, int r, int r2, int b)
 204 {
 205     if ((r & VT_VALMASK) >= VT_CONST)
 206         r = 0;
 207     if ((r2 & VT_VALMASK) >= VT_CONST)
 208         r2 = 0;
 209     if (ll || REX_BASE(r) || REX_BASE(r2))
 210         o(0x40 | REX_BASE(r) | (REX_BASE(r2) << 2) | (ll << 3));
 211     o(b);
 212 }
 213
 214 /* output a symbol and patch all calls to it */
 215 void gsym_addr(int t, int a)
 216 {
 217     int n, *ptr;
 218     while (t) {
 219         ptr = (int *)(cur_text_section->data + t);
 220         n = *ptr; /* next value */
 221         *ptr = a - t - 4;
 222         t = n;
 223     }
 224 }
 225
 226 void gsym(int t)
 227 {
 228     gsym_addr(t, ind);
 229 }
 230
 231 /* psym is used to put an instruction with a data field which is a
 232    reference to a symbol. It is in fact the same as oad !
        Like oad(), psym(c, s) emits opcode 'c' followed by the 32 bit
        value 's' and returns the text offset of that value. */
 233 #define psym oad
 234
 235 static int is64_type(int t)
 236 {
 237     return ((t & VT_BTYPE) == VT_PTR ||
 238             (t & VT_BTYPE) == VT_FUNC ||
 239             (t & VT_BTYPE) == VT_LLONG);
 240 }
 241
 242 /* instruction + 4 bytes data. Return the address of the data */
 243 ST_FUNC int oad(int c, int s)
 244 {
 245     int ind1;
 246
 247     o(c);
 248     ind1 = ind + 4;
 249     if (ind1 > cur_text_section->data_allocated)
 250         section_realloc(cur_text_section, ind1);
 251     *(int *)(cur_text_section->data + ind) = s;
 252     s = ind;
 253     ind = ind1;
 254     return s;
 255 }
 256
 257 ST_FUNC void gen_addr32(int r, Sym *sym, int c)
 258 {
 259     if (r & VT_SYM)
 260         greloc(cur_text_section, sym, ind, R_X86_64_32);
 261     gen_le32(c);
 262 }
 263
 264 /* output constant with relocation if 'r & VT_SYM' is true */
 265 ST_FUNC void gen_addr64(int r, Sym *sym, int64_t c)
 266 {
 267     if (r & VT_SYM)
 268         greloc(cur_text_section, sym, ind, R_X86_64_64);
 269     gen_le64(c);
 270 }
 271
 272 /* output constant with relocation if 'r & VT_SYM' is true */
 273 ST_FUNC void gen_addrpc32(int r, Sym *sym, int c)
 274 {
 275     if (r & VT_SYM)
 276         greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 277     gen_le32(c-4);
 278 }
 279
 280 /* output got address with relocation.
        Emits the 32 bit displacement field of a %rip relative access to the
        GOT entry of 'sym' (the opcode and ModRM byte must already have been
        output by the caller).  When 'c' is non zero an extra
        'add $c, r' is emitted afterwards to apply the offset to the
        address now held in register 'r'. */
 281 static void gen_gotpcrel(int r, Sym *sym, int c)
 282 {
 283 #ifndef TCC_TARGET_PE
 284     Section *sr;
 285     ElfW(Rela) *rel;
 286     greloc(cur_text_section, sym, ind, R_X86_64_GOTPCREL);
         /* patch the addend of the last entry of the relocation section,
            presumably the one greloc() has just appended -- confirm */
 287     sr = cur_text_section->reloc;
 288     rel = (ElfW(Rela) *)(sr->data + sr->data_offset - sizeof(ElfW(Rela)));
 289     rel->r_addend = -4;
 290 #else
         /* there is no GOT on PE: report the operands and the three bytes
            emitted just before the current position */
 291     tcc_error("internal error: no GOT on PE: %s %x %x | %02x %02x %02x\n",
 292         get_tok_str(sym->v, NULL), c, r,
 293         cur_text_section->data[ind-3],
 294         cur_text_section->data[ind-2],
 295         cur_text_section->data[ind-1]
 296         );
 297     greloc(cur_text_section, sym, ind, R_X86_64_PC32);
 298 #endif
         /* placeholder for the displacement, filled in through the relocation */
 299     gen_le32(0);
 300     if (c) {
 301         /* we use add c, %xxx for displacement */
 302         orex(1, r, 0, 0x81);
 303         o(0xc0 + REG_VALUE(r));
 304         gen_le32(c);
 305     }
 306 }
 307
 308 static void gen_modrm_impl(int op_reg, int r, Sym *sym, int c, int is_got)
 309 {
 310     op_reg = REG_VALUE(op_reg) << 3;
 311     if ((r & VT_VALMASK) == VT_CONST) {
 312         /* constant memory reference */
 313         o(0x05 | op_reg);
 314         if (is_got) {
 315             gen_gotpcrel(r, sym, c);
 316         } else {
 317             gen_addrpc32(r, sym, c);
 318         }
 319     } else if ((r & VT_VALMASK) == VT_LOCAL) {
 320         /* currently, we use only ebp as base */
 321         if (c == (char)c) {
 322             /* short reference */
 323             o(0x45 | op_reg);
 324             g(c);
 325         } else {
 326             oad(0x85 | op_reg, c);
 327         }
 328     } else if ((r & VT_VALMASK) >= TREG_MEM) {
 329         if (c) {
 330             g(0x80 | op_reg | REG_VALUE(r));
 331             gen_le32(c);
 332         } else {
 333             g(0x00 | op_reg | REG_VALUE(r));
 334         }
 335     } else {
 336         g(0x00 | op_reg | REG_VALUE(r));
 337     }
 338 }
 339
 340 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 341    opcode bits */
 342 static void gen_modrm(int op_reg, int r, Sym *sym, int c)
 343 {
 344     gen_modrm_impl(op_reg, r, sym, c, 0);
 345 }
 346
 347 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
 348    opcode bits */
 349 static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c)
 350 {
 351     int is_got;
 352     is_got = (op_reg & TREG_MEM) && !(sym->type.t & VT_STATIC);
 353     orex(1, r, op_reg, opcode);
 354     gen_modrm_impl(op_reg, r, sym, c, is_got);
 355 }
 356
 357
 358 /* load 'r' from value 'sv' */
 359 void load(int r, SValue *sv)
 360 {
 361     int v, t, ft, fc, fr;
 362     SValue v1;
 363
 364 #ifdef TCC_TARGET_PE
 365     SValue v2;
 366     sv = pe_getimport(sv, &v2);
 367 #endif
 368
 369     fr = sv->r;
 370     ft = sv->type.t & ~VT_DEFSIGN;
 371     fc = sv->c.ul;
 372
 373 #ifndef TCC_TARGET_PE
 374     /* we use indirect access via got */
 375     if ((fr & VT_VALMASK) == VT_CONST && (fr & VT_SYM) &&
 376         (fr & VT_LVAL) && !(sv->sym->type.t & VT_STATIC)) {
 377         /* use the result register as a temporal register */
 378         int tr = r | TREG_MEM;
 379         if (is_float(ft)) {
 380             /* we cannot use float registers as a temporal register */
 381             tr = get_reg(RC_INT) | TREG_MEM;
 382         }
 383         gen_modrm64(0x8b, tr, fr, sv->sym, 0);
 384
 385         /* load from the temporal register */
 386         fr = tr | VT_LVAL;
 387     }
 388 #endif
 389
 390     v = fr & VT_VALMASK;
 391     if (fr & VT_LVAL) {
 392         int b, ll;
 393         if (v == VT_LLOCAL) {
 394             v1.type.t = VT_PTR;
 395             v1.r = VT_LOCAL | VT_LVAL;
 396             v1.c.ul = fc;
 397             fr = r;
 398             if (!(reg_classes[fr] & (RC_INT|RC_R11)))
 399                 fr = get_reg(RC_INT);
 400             load(fr, &v1);
 401         }
 402         ll = 0;
 403         if ((ft & VT_BTYPE) == VT_FLOAT) {
 404             b = 0x6e0f66;
 405             r = REG_VALUE(r); /* movd */
 406         } else if ((ft & VT_BTYPE) == VT_DOUBLE) {
 407             b = 0x7e0ff3; /* movq */
 408             r = REG_VALUE(r);
 409         } else if ((ft & VT_BTYPE) == VT_LDOUBLE) {
 410             b = 0xdb, r = 5; /* fldt */
 411         } else if ((ft & VT_TYPE) == VT_BYTE || (ft & VT_TYPE) == VT_BOOL) {
 412             b = 0xbe0f;   /* movsbl */
 413         } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) {
 414             b = 0xb60f;   /* movzbl */
 415         } else if ((ft & VT_TYPE) == VT_SHORT) {
 416             b = 0xbf0f;   /* movswl */
 417         } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) {
 418             b = 0xb70f;   /* movzwl */
 419         } else {
 420             assert(((ft & VT_BTYPE) == VT_INT) || ((ft & VT_BTYPE) == VT_LLONG)
 421                    || ((ft & VT_BTYPE) == VT_PTR) || ((ft & VT_BTYPE) == VT_ENUM)
 422                    || ((ft & VT_BTYPE) == VT_FUNC));
 423             ll = is64_type(ft);
 424             b = 0x8b;
 425         }
 426         if (ll) {
 427             gen_modrm64(b, r, fr, sv->sym, fc);
 428         } else {
 429             orex(ll, fr, r, b);
 430             gen_modrm(r, fr, sv->sym, fc);
 431         }
 432     } else {
 433         if (v == VT_CONST) {
 434             if (fr & VT_SYM) {
 435 #ifdef TCC_TARGET_PE
 436                 orex(1,0,r,0x8d);
 437                 o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 438                 gen_addrpc32(fr, sv->sym, fc);
 439 #else
 440                 if (sv->sym->type.t & VT_STATIC) {
 441                     orex(1,0,r,0x8d);
 442                     o(0x05 + REG_VALUE(r) * 8); /* lea xx(%rip), r */
 443                     gen_addrpc32(fr, sv->sym, fc);
 444                 } else {
 445                     orex(1,0,r,0x8b);
 446                     o(0x05 + REG_VALUE(r) * 8); /* mov xx(%rip), r */
 447                     gen_gotpcrel(r, sv->sym, fc);
 448                 }
 449 #endif
 450             } else if (is64_type(ft)) {
 451                 orex(1,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 452                 gen_le64(sv->c.ull);
 453             } else {
 454                 orex(0,r,0, 0xb8 + REG_VALUE(r)); /* mov $xx, r */
 455                 gen_le32(fc);
 456             }
 457         } else if (v == VT_LOCAL) {
 458             orex(1,0,r,0x8d); /* lea xxx(%ebp), r */
 459             gen_modrm(r, VT_LOCAL, sv->sym, fc);
 460         } else if (v == VT_CMP) {
 461             orex(0,r,0,0);
 462             if ((fc & ~0x100) != TOK_NE)
 463               oad(0xb8 + REG_VALUE(r), 0); /* mov $0, r */
 464             else
 465               oad(0xb8 + REG_VALUE(r), 1); /* mov $1, r */
 466             if (fc & 0x100)
 467               {
 468                 /* This was a float compare.  If the parity bit is
 469                    set the result was unordered, meaning false for everything
 470                    except TOK_NE, and true for TOK_NE.  */
 471                 fc &= ~0x100;
 472                 o(0x037a + (REX_BASE(r) << 8));
 473               }
 474             orex(0,r,0, 0x0f); /* setxx %br */
 475             o(fc);
 476             o(0xc0 + REG_VALUE(r));
 477         } else if (v == VT_JMP || v == VT_JMPI) {
 478             t = v & 1;
 479             orex(0,r,0,0);
 480             oad(0xb8 + REG_VALUE(r), t); /* mov $1, r */
 481             o(0x05eb + (REX_BASE(r) << 8)); /* jmp after */
 482             gsym(fc);
 483             orex(0,r,0,0);
 484             oad(0xb8 + REG_VALUE(r), t ^ 1); /* mov $0, r */
 485         } else if (v != r) {
 486             if ((r >= TREG_XMM0) && (r <= TREG_XMM7)) {
 487                 if (v == TREG_ST0) {
 488                     /* gen_cvt_ftof(VT_DOUBLE); */
 489                     o(0xf0245cdd); /* fstpl -0x10(%rsp) */
 490                     /* movsd -0x10(%rsp),%xmmN */
 491                     o(0x100ff2);
 492                     o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 493                     o(0xf024);
 494                 } else {
 495                     assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 496                     if ((ft & VT_BTYPE) == VT_FLOAT) {
 497                         o(0x100ff3);
 498                     } else {
 499                         assert((ft & VT_BTYPE) == VT_DOUBLE);
 500                         o(0x100ff2);
 501                     }
 502                     o(0xc0 + REG_VALUE(v) + REG_VALUE(r)*8);
 503                 }
 504             } else if (r == TREG_ST0) {
 505                 assert((v >= TREG_XMM0) && (v <= TREG_XMM7));
 506                 /* gen_cvt_ftof(VT_LDOUBLE); */
 507                 /* movsd %xmmN,-0x10(%rsp) */
 508                 o(0x110ff2);
 509                 o(0x44 + REG_VALUE(r)*8); /* %xmmN */
 510                 o(0xf024);
 511                 o(0xf02444dd); /* fldl -0x10(%rsp) */
 512             } else {
 513                 orex(1,r,v, 0x89);
 514                 o(0xc0 + REG_VALUE(r) + REG_VALUE(v) * 8); /* mov v, r */
 515             }
 516         }
 517     }
 518 }
 519
 520 /* store register 'r' in lvalue 'v'.
        The opcode is chosen from the basic type of 'v'.  On non PE targets a
        destination designated by a symbol is reached through the GOT: its
        address is first loaded into %r11 and the store is then done to
        (%r11). */
 521 void store(int r, SValue *v)
 522 {
 523     int fr, bt, ft, fc;
         /* opcode of a 64 bit integer store, emitted later (0: none) */
 524     int op64 = 0;
 525     /* store the REX prefix in this variable when PIC is enabled */
 526     int pic = 0;
 527
 528 #ifdef TCC_TARGET_PE
 529     SValue v2;
 530     v = pe_getimport(v, &v2);
 531 #endif
 532
 533     ft = v->type.t;
 534     fc = v->c.ul;
 535     fr = v->r & VT_VALMASK;
 536     bt = ft & VT_BTYPE;
 537
 538 #ifndef TCC_TARGET_PE
 539     /* we need to access the variable via got */
         /* NOTE(review): unlike load(), this path does not test VT_STATIC,
            so every symbol is accessed through the GOT here */
 540     if (fr == VT_CONST && (v->r & VT_SYM)) {
 541         /* mov xx(%rip), %r11 */
 542         o(0x1d8b4c);
 543         gen_gotpcrel(TREG_R11, v->sym, v->c.ul);
             /* REX prefix selecting %r11 as base, with the 64 bit operand
                bit added for 64 bit types */
 544         pic = is64_type(bt) ? 0x49 : 0x41;
 545     }
 546 #endif
 547
         /* in all branches below, o(pic) outputs nothing when pic is 0 */
 548     /* XXX: incorrect if float reg to reg */
 549     if (bt == VT_FLOAT) {
 550         o(0x66);
 551         o(pic);
 552         o(0x7e0f); /* movd */
 553         r = REG_VALUE(r);
 554     } else if (bt == VT_DOUBLE) {
 555         o(0x66);
 556         o(pic);
 557         o(0xd60f); /* movq */
 558         r = REG_VALUE(r);
 559     } else if (bt == VT_LDOUBLE) {
 560         o(0xc0d9); /* fld %st(0) */
 561         o(pic);
 562         o(0xdb); /* fstpt */
             /* 7 is the opcode extension placed in the ModRM reg field */
 563         r = 7;
 564     } else {
 565         if (bt == VT_SHORT)
 566             o(0x66);
 567         o(pic);
 568         if (bt == VT_BYTE || bt == VT_BOOL)
 569             orex(0, 0, r, 0x88);
 570         else if (is64_type(bt))
 571             op64 = 0x89;
 572         else
 573             orex(0, 0, r, 0x89);
 574     }
 575     if (pic) {
 576         /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
 577         if (op64)
 578             o(op64);
             /* ModRM byte: base field 3 = REG_VALUE(TREG_R11), no displacement */
 579         o(3 + (r << 3));
 580     } else if (op64) {
 581         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 582             gen_modrm64(op64, r, v->r, v->sym, fc);
 583         } else if (fr != r) {
 584             /* XXX: don't we really come here? */
                 /* the statement after abort() is never reached */
 585             abort();
 586             o(0xc0 + fr + r * 8); /* mov r, fr */
 587         }
 588     } else {
 589         if (fr == VT_CONST || fr == VT_LOCAL || (v->r & VT_LVAL)) {
 590             gen_modrm(r, v->r, v->sym, fc);
 591         } else if (fr != r) {
 592             /* XXX: don't we really come here? */
                 /* the statement after abort() is never reached */
 593             abort();
 594             o(0xc0 + fr + r * 8); /* mov r, fr */
 595         }
 596     }
 597 }
 598
 599 /* 'is_jmp' is '1' if it is a jump.
        Emits a call (is_jmp == 0) or jump (is_jmp == 1) to the target
        described by 'vtop'.  A symbol, or a constant that fits a 32 bit
        field, gets the direct form with a relocation; anything else is
        loaded into %r11 and called/jumped to indirectly.  'vtop' is not
        popped here. */
 600 static void gcall_or_jmp(int is_jmp)
 601 {
 602     int r;
 603     if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST &&
 604         ((vtop->r & VT_SYM) || (vtop->c.ll-4) == (int)(vtop->c.ll-4))) {
 605         /* constant case */
             /* relocations are placed at ind + 1: the 32 bit field follows
                the one byte opcode emitted by oad() below */
 606         if (vtop->r & VT_SYM) {
 607             /* relocation case */
 608 #ifdef TCC_TARGET_PE
 609             greloc(cur_text_section, vtop->sym, ind + 1, R_X86_64_PC32);
 610 #else
 611             greloc(cur_text_section, vtop->sym, ind + 1, R_X86_64_PLT32);
 612 #endif
 613         } else {
 614             /* put an empty PC32 relocation */
 615             put_elf_reloc(symtab_section, cur_text_section,
 616                           ind + 1, R_X86_64_PC32, 0);
 617         }
             /* 0xe8 is call, 0xe9 (0xe8 + 1) is jmp */
 618         oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */
 619     } else {
 620         /* otherwise, indirect call */
 621         r = TREG_R11;
 622         load(r, vtop);
 623         o(0x41); /* REX */
 624         o(0xff); /* call/jmp *r */
             /* is_jmp << 4 turns the call form (0xd0 + reg) into the jmp form (0xe0 + reg) */
 625         o(0xd0 + REG_VALUE(r) + (is_jmp << 4));
 626     }
 627 }
 628
 629 #if defined(CONFIG_TCC_BCHECK)
 630 #ifndef TCC_TARGET_PE
     /* bound checking state for non PE targets; not referenced in the part
        of the file shown here -- NOTE(review): presumably used by the non PE
        prolog/epilog, confirm */
 631 static addr_t func_bound_offset;
 632 static unsigned long func_bound_ind;
 633 #endif
 634
 635 static void gen_static_call(int v)
 636 {
 637     Sym *sym = external_global_sym(v, &func_old_type, 0);
 638     oad(0xe8, -4);
 639     greloc(cur_text_section, sym, ind-4, R_X86_64_PC32);
 640 }
 641
 642 /* generate a bounded pointer addition */
 643 ST_FUNC void gen_bounded_ptr_add(void)
 644 {
 645     /* save all temporary registers */
 646     save_regs(0);
 647
 648     /* prepare fast x86_64 function call */
 649     gv(RC_RAX);
 650     o(0xc68948); // mov  %rax,%rsi ## second arg in %rsi, this must be size
 651     vtop--;
 652
 653     gv(RC_RAX);
 654     o(0xc78948); // mov  %rax,%rdi ## first arg in %rdi, this must be ptr
 655     vtop--;
 656
 657     /* do a fast function call */
 658     gen_static_call(TOK___bound_ptr_add);
 659
 660     /* returned pointer is in rax */
 661     vtop++;
 662     vtop->r = TREG_RAX | VT_BOUNDED;
 663
 664
 665     /* relocation offset of the bounding function call point */
 666     vtop->c.ull = (cur_text_section->reloc->data_offset - sizeof(ElfW(Rela)));
 667 }
 668
 669 /* patch pointer addition in vtop so that pointer dereferencing is
 670    also tested */
 671 ST_FUNC void gen_bounded_ptr_deref(void)
 672 {
 673     addr_t func;
 674     int size, align;
 675     ElfW(Rela) *rel;
 676     Sym *sym;
 677
 678     size = 0;
 679     /* XXX: put that code in generic part of tcc */
 680     if (!is_float(vtop->type.t)) {
 681         if (vtop->r & VT_LVAL_BYTE)
 682             size = 1;
 683         else if (vtop->r & VT_LVAL_SHORT)
 684             size = 2;
 685     }
 686     if (!size)
 687     size = type_size(&vtop->type, &align);
 688     switch(size) {
 689     case  1: func = TOK___bound_ptr_indir1; break;
 690     case  2: func = TOK___bound_ptr_indir2; break;
 691     case  4: func = TOK___bound_ptr_indir4; break;
 692     case  8: func = TOK___bound_ptr_indir8; break;
 693     case 12: func = TOK___bound_ptr_indir12; break;
 694     case 16: func = TOK___bound_ptr_indir16; break;
 695     default:
 696         tcc_error("unhandled size when dereferencing bounded pointer");
 697         func = 0;
 698         break;
 699     }
 700
 701     sym = external_global_sym(func, &func_old_type, 0);
 702     if (!sym->c)
 703         put_extern_sym(sym, NULL, 0, 0);
 704
 705     /* patch relocation */
 706     /* XXX: find a better solution ? */
 707
 708     rel = (ElfW(Rela) *)(cur_text_section->reloc->data + vtop->c.ull);
 709     rel->r_info = ELF64_R_INFO(sym->c, ELF64_R_TYPE(rel->r_info));
 710 }
 711 #endif
 712
 713 #ifdef TCC_TARGET_PE
 714
 715 #define REGN 4
 716 static const uint8_t arg_regs[REGN] = {
 717     TREG_RCX, TREG_RDX, TREG_R8, TREG_R9
 718 };
 719
 720 /* Prepare arguments in R10 and R11 rather than RCX and RDX
 721    because gv() will not ever use these */
 722 static int arg_prepare_reg(int idx) {
 723   if (idx == 0 || idx == 1)
 724       /* idx=0: r10, idx=1: r11 */
 725       return idx + 10;
 726   else
 727       return arg_regs[idx];
 728 }
 729
 730 static int func_scratch; /* largest outgoing argument area needed by any call of the current function (reset in gfunc_prolog, raised in gfunc_call, added to the frame size in gfunc_epilog) */
 731
 732 /* Generate function call. The function address is pushed first, then
 733    all the parameters in call order. This functions pops all the
 734    parameters and the function address. */
 735
 736 void gen_offs_sp(int b, int r, int d)
 737 {
 738     orex(1,0,r & 0x100 ? 0 : r, b);
 739     if (d == (char)d) {
 740         o(0x2444 | (REG_VALUE(r) << 3));
 741         g(d);
 742     } else {
 743         o(0x2484 | (REG_VALUE(r) << 3));
 744         gen_le32(d);
 745     }
 746 }
 747
 748 /* Return the number of registers needed to return the struct, or 0 if
 749    returning via struct pointer. */
 750 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
 751 {
 752     int size, align;
 753     *regsize = 8;
 754     *ret_align = 1; // Never have to re-align return values for x86-64
 755     size = type_size(vt, &align);
 756     ret->ref = NULL;
 757     if (size > 8) {
 758         return 0;
 759     } else if (size > 4) {
 760         ret->t = VT_LLONG;
 761         return 1;
 762     } else if (size > 2) {
 763         ret->t = VT_INT;
 764         return 1;
 765     } else if (size > 1) {
 766         ret->t = VT_SHORT;
 767         return 1;
 768     } else {
 769         ret->t = VT_BYTE;
 770         return 1;
 771     }
 772 }
 773
 774 static int is_sse_float(int t) {
 775     int bt;
 776     bt = t & VT_BTYPE;
 777     return bt == VT_DOUBLE || bt == VT_FLOAT;
 778 }
 779
 780 int gfunc_arg_size(CType *type) {
 781     int align;
 782     if (type->t & (VT_ARRAY|VT_BITFIELD))
 783         return 8;
 784     return type_size(type, &align);
 785 }
 786
     /* Generate a function call with 'nb_args' arguments (PE variant).
        The arguments are on the value stack above the function address;
        all of them and the function address are popped.  Arguments larger
        than 8 bytes are copied to a scratch area above the argument slots
        and passed by address. */
 787 void gfunc_call(int nb_args)
 788 {
 789     int size, r, args_size, i, d, bt, struct_size;
 790     int arg;
 791
         /* at least REGN argument slots are always reserved */
 792     args_size = (nb_args < REGN ? REGN : nb_args) * PTR_SIZE;
 793     arg = nb_args;
 794
 795     /* for struct arguments, we need to call memcpy and the function
 796        call breaks register passing arguments we are preparing.
 797        So, we process arguments which will be passed by stack first. */
         /* the copies are laid out right after the argument slots */
 798     struct_size = args_size;
 799     for(i = 0; i < nb_args; i++) {
 800         SValue *sv;
 801
 802         --arg;
 803         sv = &vtop[-i];
 804         bt = (sv->type.t & VT_BTYPE);
 805         size = gfunc_arg_size(&sv->type);
 806
 807         if (size <= 8)
 808             continue; /* arguments smaller than 8 bytes passed in registers or on stack */
 809
 810         if (bt == VT_STRUCT) {
 811             /* align to stack align size */
 812             size = (size + 15) & ~15;
 813             /* generate structure store */
 814             r = get_reg(RC_INT);
                 /* lea struct_size(%rsp), r */
 815             gen_offs_sp(0x8d, r, struct_size);
 816             struct_size += size;
 817
 818             /* generate memcpy call */
 819             vset(&sv->type, r | VT_LVAL, 0);
 820             vpushv(sv);
 821             vstore();
 822             --vtop;
 823         } else if (bt == VT_LDOUBLE) {
 824             gv(RC_ST0);
                 /* opcode 0xdb with extension 7; bit 0x100 keeps gen_offs_sp()
                    from treating the 7 as a register for the REX prefix */
 825             gen_offs_sp(0xdb, 0x107, struct_size);
 826             struct_size += 16;
 827         }
 828     }
 829
         /* remember the largest area needed, for the frame size in gfunc_epilog */
 830     if (func_scratch < struct_size)
 831         func_scratch = struct_size;
 832
         /* second pass: pop the arguments, last one first ('arg' is the
            index of the argument being processed) */
 833     arg = nb_args;
 834     struct_size = args_size;
 835
 836     for(i = 0; i < nb_args; i++) {
 837         --arg;
 838         bt = (vtop->type.t & VT_BTYPE);
 839
 840         size = gfunc_arg_size(&vtop->type);
 841         if (size > 8) {
                 /* pass the address of the copy made in the first pass */
 842             /* align to stack align size */
 843             size = (size + 15) & ~15;
 844             if (arg >= REGN) {
 845                 d = get_reg(RC_INT);
 846                 gen_offs_sp(0x8d, d, struct_size);
 847                 gen_offs_sp(0x89, d, arg*8);
 848             } else {
 849                 d = arg_prepare_reg(arg);
 850                 gen_offs_sp(0x8d, d, struct_size);
 851             }
 852             struct_size += size;
 853         } else {
 854             if (is_sse_float(vtop->type.t)) {
 855                 gv(RC_XMM0); /* only use one float register */
 856                 if (arg >= REGN) {
 857                     /* movq %xmm0, j*8(%rsp) */
 858                     gen_offs_sp(0xd60f66, 0x100, arg*8);
 859                 } else {
 860                     /* movaps %xmm0, %xmmN */
 861                     o(0x280f);
 862                     o(0xc0 + (arg << 3));
                         /* the value is also copied to the integer register
                            prepared for this argument position */
 863                     d = arg_prepare_reg(arg);
 864                     /* mov %xmm0, %rxx */
 865                     o(0x66);
 866                     orex(1,d,0, 0x7e0f);
 867                     o(0xc0 + REG_VALUE(d));
 868                 }
 869             } else {
 870                 if (bt == VT_STRUCT) {
                         /* small struct: handle it as an integer of the same size */
 871                     vtop->type.ref = NULL;
 872                     vtop->type.t = size > 4 ? VT_LLONG : size > 2 ? VT_INT
 873                         : size > 1 ? VT_SHORT : VT_BYTE;
 874                 }
 875
 876                 r = gv(RC_INT);
 877                 if (arg >= REGN) {
 878                     gen_offs_sp(0x89, r, arg*8);
 879                 } else {
 880                     d = arg_prepare_reg(arg);
 881                     orex(1,d,r,0x89); /* mov */
 882                     o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
 883                 }
 884             }
 885         }
 886         vtop--;
 887     }
 888     save_regs(0);
 889
 890     /* Copy R10 and R11 into RCX and RDX, respectively */
 891     if (nb_args > 0) {
 892         o(0xd1894c); /* mov %r10, %rcx */
 893         if (nb_args > 1) {
 894             o(0xda894c); /* mov %r11, %rdx */
 895         }
 896     }
 897
 898     gcall_or_jmp(0);
         /* pop the function address */
 899     vtop--;
 900 }
 901
 902
 903 #define FUNC_PROLOG_SIZE 11 /* bytes reserved at function entry; gfunc_epilog() fills them in once the frame size is known */
 904
 905 /* generate function prolog of type 't'.
        (PE variant.)  Space for the prolog itself is only reserved here.
        The register arguments are then spilled to stack slots addressed from
        the frame pointer and every parameter is declared as a local at its
        slot. */
 906 void gfunc_prolog(CType *func_type)
 907 {
 908     int addr, reg_param_index, bt, size;
 909     Sym *sym;
 910     CType *type;
 911
 912     func_ret_sub = 0;
 913     func_scratch = 0;
 914     loc = 0;
 915
         /* offset of the first parameter slot -- NOTE(review): presumably
            skips the saved frame pointer and the return address, confirm */
 916     addr = PTR_SIZE * 2;
         /* leave room for the prolog and remember where the body starts */
 917     ind += FUNC_PROLOG_SIZE;
 918     func_sub_sp_offset = ind;
 919     reg_param_index = 0;
 920
 921     sym = func_type->ref;
 922
 923     /* if the function returns a structure, then add an
 924        implicit pointer parameter */
 925     func_vt = sym->type;
 926     func_var = (sym->c == FUNC_ELLIPSIS);
 927     size = gfunc_arg_size(&func_vt);
 928     if (size > 8) {
 929         gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 930         func_vc = addr;
 931         reg_param_index++;
 932         addr += 8;
 933     }
 934
 935     /* define parameters */
 936     while ((sym = sym->next) != NULL) {
 937         type = &sym->type;
 938         bt = type->t & VT_BTYPE;
 939         size = gfunc_arg_size(type);
 940         if (size > 8) {
                 /* the slot holds the address of the argument, hence VT_REF */
 941             if (reg_param_index < REGN) {
 942                 gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 943             }
 944             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL | VT_REF, addr);
 945         } else {
 946             if (reg_param_index < REGN) {
 947                 /* save arguments passed by register */
 948                 if ((bt == VT_FLOAT) || (bt == VT_DOUBLE)) {
                         /* the xmm register number is the parameter index */
 949                     o(0xd60f66); /* movq */
 950                     gen_modrm(reg_param_index, VT_LOCAL, NULL, addr);
 951                 } else {
 952                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 953                 }
 954             }
 955             sym_push(sym->v & ~SYM_FIELD, type, VT_LOCAL | VT_LVAL, addr);
 956         }
             /* every parameter takes one 8 byte slot */
 957         addr += 8;
 958         reg_param_index++;
 959     }
 960
         /* variadic function: also spill the remaining argument registers */
 961     while (reg_param_index < REGN) {
 962         if (func_type->ref->c == FUNC_ELLIPSIS) {
 963             gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, addr);
 964             addr += 8;
 965         }
 966         reg_param_index++;
 967     }
 968 }
 969
 970 /* generate function epilog (PE): emit leave/ret, then go back and fill in the reserved prolog */
 971 void gfunc_epilog(void)
 972 {
 973     int v, saved_ind;
 974
 975     o(0xc9); /* leave */
 976     if (func_ret_sub == 0) {
 977         o(0xc3); /* ret */
 978     } else {
 979         o(0xc2); /* ret n */
 980         g(func_ret_sub); /* n is emitted as two bytes, low byte first */
 981         g(func_ret_sub >> 8);
 982     }
 983     /* remember the end of the body, then rewind to the FUNC_PROLOG_SIZE bytes that precede func_sub_sp_offset */
 984     saved_ind = ind;
 985     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE;
 986     /* frame size: func_scratch plus the locals (-loc), rounded up to a multiple of 16 */
 987     v = (func_scratch + -loc + 15) & -16;
 988     /* frames of 4096 bytes or more are set up through __chkstk, smaller ones inline */
 989     if (v >= 4096) {
 990         Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0);
 991         oad(0xb8, v); /* mov stacksize, %eax */
 992         oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
 993         greloc(cur_text_section, sym, ind-4, R_X86_64_PC32); /* relocation covers the call's 32-bit operand just emitted */
 994         o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
 995     } else {
 996         o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
 997         o(0xec8148);  /* sub rsp, stacksize */
 998         gen_le32(v);
 999     }
1000     /* pass the position after the prolog, the end of the function and the frame size to the unwind-data helper */
1001     cur_text_section->data_offset = saved_ind;
1002     pe_add_unwind_data(ind, saved_ind, v);
1003     ind = cur_text_section->data_offset; /* resume output at the section's current data_offset */
1004 }
1005
1006 #else
1007 /* Emit "add $val, %rsp", using the one-byte immediate form when val fits in a char */
1008 static void gadd_sp(int val)
1009 {
1010     if (val != (char)val) {
1011         oad(0xc48148, val); /* add $xxx, %rsp (32-bit immediate) */
1012         return;
1013     }
1014     o(0xc48348); /* add $xx, %rsp (8-bit immediate follows) */
1015     g(val);
1016 }
1017 /* How a value is passed or returned, as computed by the classify_x86_64_* functions below */
1018 typedef enum X86_64_Mode {
1019   x86_64_mode_none,    /* nothing classified yet (void; initial state while merging struct members) */
1020   x86_64_mode_memory,  /* goes through memory: on the stack as an argument, via struct pointer as a return value */
1021   x86_64_mode_integer, /* integer, pointer and similar types: general registers */
1022   x86_64_mode_sse,     /* float and double: xmm registers */
1023   x86_64_mode_x87      /* long double */
1024 } X86_64_Mode;
1025 /* Combine the classes of two members of one aggregate into the class of both together */
1026 static X86_64_Mode classify_x86_64_merge(X86_64_Mode a, X86_64_Mode b)
1027 {
1028     /* identical classes, or an empty right-hand side: the left one stands */
1029     if (a == b || b == x86_64_mode_none)
1030         return a;
1031     if (a == x86_64_mode_none)
1032         return b;
1033     /* memory wins over everything else, integer wins over the float classes */
1034     if ((a == x86_64_mode_memory) || (b == x86_64_mode_memory))
1035         return x86_64_mode_memory;
1036     if ((a == x86_64_mode_integer) || (b == x86_64_mode_integer))
1037         return x86_64_mode_integer;
1038     /* x87 combined with a different class ends up in memory */
1039     if ((a == x86_64_mode_x87) || (b == x86_64_mode_x87))
1040         return x86_64_mode_memory;
1041     return x86_64_mode_sse;
1042 }
1043 /* Classify a type by its basic type alone (no size limit applied); structs merge the classes of their members */
1044 static X86_64_Mode classify_x86_64_inner(CType *ty)
1045 {
1046     X86_64_Mode mode;
1047     Sym *f;
1048
1049     switch (ty->t & VT_BTYPE) {
1050     case VT_VOID: return x86_64_mode_none;
1051
1052     case VT_INT:
1053     case VT_BYTE:
1054     case VT_SHORT:
1055     case VT_LLONG:
1056     case VT_BOOL:
1057     case VT_PTR:
1058     case VT_FUNC:
1059     case VT_ENUM: return x86_64_mode_integer;
1060
1061     case VT_FLOAT:
1062     case VT_DOUBLE: return x86_64_mode_sse;
1063
1064     case VT_LDOUBLE: return x86_64_mode_x87;
1065
1066     case VT_STRUCT:
1067         f = ty->ref;
1068         /* walk the symbols chained after the struct symbol and fold their classes together */
1069         mode = x86_64_mode_none;
1070         for (f = f->next; f; f = f->next)
1071             mode = classify_x86_64_merge(mode, classify_x86_64_inner(&f->type));
1072
1073         return mode;
1074     }
1075     assert(0); /* a basic type that is not handled above */
1076     return x86_64_mode_none; /* only reached when assert() is compiled out; avoids falling off the end */
1077 }
1078 /* Classify 'ty' for passing/returning; fills *psize, *palign, *reg_count and, when ret is non-NULL, the register-level type */
1079 static X86_64_Mode classify_x86_64_arg(CType *ty, CType *ret, int *psize, int *palign, int *reg_count)
1080 {
1081     X86_64_Mode mode;
1082     int size, align, ret_t = 0;
1083     /* bitfields and arrays are handled as one 8-byte integer-class slot that keeps the original type flags */
1084     if (ty->t & (VT_BITFIELD|VT_ARRAY)) {
1085         *psize = 8;
1086         *palign = 8;
1087         *reg_count = 1;
1088         ret_t = ty->t;
1089         mode = x86_64_mode_integer;
1090     } else {
1091         size = type_size(ty, &align);
1092         *psize = (size + 7) & ~7; /* size rounded up to a multiple of 8 */
1093         *palign = (align + 7) & ~7; /* alignment rounded up to a multiple of 8 */
1094         /* anything larger than 16 bytes is memory class; *reg_count is left untouched in that case */
1095         if (size > 16) {
1096             mode = x86_64_mode_memory;
1097         } else {
1098             mode = classify_x86_64_inner(ty);
1099             switch (mode) {
1100             case x86_64_mode_integer:
1101                 if (size > 8) {
1102                     *reg_count = 2; /* 9..16 bytes: two registers */
1103                     ret_t = VT_QLONG;
1104                 } else {
1105                     *reg_count = 1;
1106                     ret_t = (size > 4) ? VT_LLONG : VT_INT;
1107                 }
1108                 break;
1109
1110             case x86_64_mode_x87:
1111                 *reg_count = 1;
1112                 ret_t = VT_LDOUBLE;
1113                 break;
1114
1115             case x86_64_mode_sse:
1116                 if (size > 8) {
1117                     *reg_count = 2; /* 9..16 bytes: two registers */
1118                     ret_t = VT_QFLOAT;
1119                 } else {
1120                     *reg_count = 1;
1121                     ret_t = (size > 4) ? VT_DOUBLE : VT_FLOAT;
1122                 }
1123                 break;
1124             default: break; /* nothing to be done for x86_64_mode_memory and x86_64_mode_none */
1125             }
1126         }
1127     }
1128     /* ret_t stays 0 for the memory and none classes */
1129     if (ret) {
1130         ret->ref = NULL;
1131         ret->t = ret_t;
1132     }
1133
1134     return mode;
1135 }
1136 /* Map the class of 'ty' to the __va_arg_type value used by stdarg.h: general register, float register or stack */
1137 ST_FUNC int classify_x86_64_va_arg(CType *ty)
1138 {
1139     /* This definition must be synced with stdarg.h */
1140     enum __va_arg_type {
1141         __va_gen_reg, __va_float_reg, __va_stack
1142     };
1143     int sz, al, nregs; /* outputs of the classifier that are not needed here */
1144     X86_64_Mode m = classify_x86_64_arg(ty, NULL, &sz, &al, &nregs);
1145     if (m == x86_64_mode_integer)
1146         return __va_gen_reg;
1147     if (m == x86_64_mode_sse)
1148         return __va_float_reg;
1149     return __va_stack; /* every other class */
1150 }
1151
1152 /* Return 1 if a value of type 'vt' is returned in registers (its register-level type is stored in *ret),
1153    or 0 if returning via struct pointer. 'variadic' is not used; *ret_align is set to 1, *regsize to 8. */
1154 ST_FUNC int gfunc_sret(CType *vt, int variadic, CType *ret, int *ret_align, int *regsize)
1155 {
1156     int size, align, reg_count;
1157     *ret_align = 1; // Never have to re-align return values for x86-64
1158     *regsize = 8;
1159     return (classify_x86_64_arg(vt, ret, &size, &align, &reg_count) != x86_64_mode_memory);
1160 }
1161 /* REGN: number of integer argument registers; arg_regs lists them in argument order */
1162 #define REGN 6
1163 static const uint8_t arg_regs[REGN] = {
1164     TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9
1165 };
1166 /* Register in which gfunc_call stages integer argument 'idx': R10/R11 stand in for RDX/RCX */
1167 static int arg_prepare_reg(int idx) {
1168   if (idx != 2 && idx != 3)
1169       return arg_regs[idx];
1170
1171   /* idx=2: r10, idx=3: r11 */
1172   return idx + 8;
1173 }
1174
1175 /* Generate function call. The function address is pushed first, then
1176    all the parameters in call order (so vtop[0] is the last argument). This
1177    function pops all the parameters and the function address. */
1178 void gfunc_call(int nb_args)
1179 {
1180     X86_64_Mode mode;
1181     CType type;
1182     int size, align, r, args_size, stack_adjust, run_start, run_end, i, reg_count;
1183     int nb_reg_args = 0;
1184     int nb_sse_args = 0;
1185     int sse_reg, gen_reg;
1186
1187     /* calculate the number of integer/float register arguments */
1188     for(i = 0; i < nb_args; i++) {
1189         mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1190         if (mode == x86_64_mode_sse)
1191             nb_sse_args += reg_count;
1192         else if (mode == x86_64_mode_integer)
1193             nb_reg_args += reg_count;
1194     }
1195
1196     /* arguments are collected in runs. Each run is a collection of 8-byte aligned arguments
1197        and ended by a 16-byte aligned argument. This is because, from the point of view of
1198        the callee, argument alignment is computed from the bottom up. */
1199     /* for struct arguments, we need to call memcpy and the function
1200        call breaks register passing arguments we are preparing.
1201        So, we process arguments which will be passed by stack first. */
1202     gen_reg = nb_reg_args;
1203     sse_reg = nb_sse_args;
1204     run_start = 0;
1205     args_size = 0; /* total bytes pushed for this call, released after the call */
1206     while (run_start != nb_args) {
1207         int run_gen_reg = gen_reg, run_sse_reg = sse_reg;
1208         /* measuring pass: find the end of the run and the stack bytes it needs */
1209         run_end = nb_args;
1210         stack_adjust = 0;
1211         for(i = run_start; (i < nb_args) && (run_end == nb_args); i++) {
1212             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1213             switch (mode) {
1214             case x86_64_mode_memory:
1215             case x86_64_mode_x87:
1216             stack_arg:
1217                 if (align == 16)
1218                     run_end = i; /* a 16-byte aligned stack argument ends the run */
1219                 else
1220                     stack_adjust += size;
1221                 break;
1222
1223             case x86_64_mode_sse:
1224                 sse_reg -= reg_count;
1225                 if (sse_reg + reg_count > 8) goto stack_arg; /* beyond the 8 sse registers */
1226                 break;
1227
1228             case x86_64_mode_integer:
1229                 gen_reg -= reg_count;
1230                 if (gen_reg + reg_count > REGN) goto stack_arg; /* beyond the REGN integer registers */
1231                 break;
1232             default: break; /* nothing to be done for x86_64_mode_none */
1233             }
1234         }
1235         /* the loop above only measured: restore the counters for the pass that emits code */
1236         gen_reg = run_gen_reg;
1237         sse_reg = run_sse_reg;
1238
1239         /* adjust stack to align SSE boundary */
1240         if (stack_adjust &= 15) {
1241             /* fetch cpu flag before the following sub will change the value */
1242             if (vtop >= vstack && (vtop->r & VT_VALMASK) == VT_CMP)
1243                 gv(RC_INT);
1244
1245             stack_adjust = 16 - stack_adjust;
1246             o(0x48);
1247             oad(0xec81, stack_adjust); /* sub $xxx, %rsp */
1248             args_size += stack_adjust;
1249         }
1250         /* emitting pass: store the stack-passed arguments of this run, leave register ones in place */
1251         for(i = run_start; i < run_end;) {
1252             /* Swap argument to top, it will possibly be changed here,
1253               and might use more temps. At the end of the loop we keep
1254               it on the stack and swap it back to its original position
1255               if it is a register. */
1256             SValue tmp = vtop[0];
1257             int arg_stored = 1; /* cleared when the argument stays for the register pass */
1258
1259             vtop[0] = vtop[-i];
1260             vtop[-i] = tmp;
1261             mode = classify_x86_64_arg(&vtop->type, NULL, &size, &align, &reg_count);
1262
1263             switch (vtop->type.t & VT_BTYPE) {
1264             case VT_STRUCT:
1265                 if (mode == x86_64_mode_sse) {
1266                     if (sse_reg > 8)
1267                         sse_reg -= reg_count;
1268                     else
1269                         arg_stored = 0;
1270                 } else if (mode == x86_64_mode_integer) {
1271                     if (gen_reg > REGN)
1272                         gen_reg -= reg_count;
1273                     else
1274                         arg_stored = 0;
1275                 }
1276
1277                 if (arg_stored) {
1278                     /* allocate the necessary size on stack */
1279                     o(0x48);
1280                     oad(0xec81, size); /* sub $xxx, %rsp */
1281                     /* generate structure store */
1282                     r = get_reg(RC_INT);
1283                     orex(1, r, 0, 0x89); /* mov %rsp, r */
1284                     o(0xe0 + REG_VALUE(r));
1285                     vset(&vtop->type, r | VT_LVAL, 0);
1286                     vswap();
1287                     vstore();
1288                     args_size += size;
1289                 }
1290                 break;
1291
1292             case VT_LDOUBLE:
1293                 assert(0); /* long double has align 16 and therefore always ends a run */
1294                 break;
1295
1296             case VT_FLOAT:
1297             case VT_DOUBLE:
1298                 assert(mode == x86_64_mode_sse);
1299                 if (sse_reg > 8) {
1300                     --sse_reg;
1301                     r = gv(RC_FLOAT);
1302                     o(0x50); /* push %rax (only to make room for 8 bytes) */
1303                     /* movq %xmmN, (%rsp) */
1304                     o(0xd60f66);
1305                     o(0x04 + REG_VALUE(r)*8);
1306                     o(0x24);
1307                     args_size += size;
1308                 } else {
1309                     arg_stored = 0;
1310                 }
1311                 break;
1312
1313             default:
1314                 assert(mode == x86_64_mode_integer);
1315                 /* simple type */
1316                 /* XXX: implicit cast ? */
1317                 if (gen_reg > REGN) {
1318                     --gen_reg;
1319                     r = gv(RC_INT);
1320                     orex(0,r,0,0x50 + REG_VALUE(r)); /* push r */
1321                     args_size += size;
1322                 } else {
1323                     arg_stored = 0;
1324                 }
1325                 break;
1326             }
1327
1328             /* And swap the argument back to its original position.  */
1329             tmp = vtop[0];
1330             vtop[0] = vtop[-i];
1331             vtop[-i] = tmp;
1332             /* a stored argument is rotated to the top and dropped, so index i now names the next one */
1333             if (arg_stored) {
1334               vrotb(i+1);
1335               assert((vtop->type.t == tmp.type.t) && (vtop->r == tmp.r));
1336               vpop();
1337               --nb_args;
1338               --run_end;
1339             } else {
1340               ++i;
1341             }
1342         }
1343
1344         /* handle 16 byte aligned arguments at end of run */
1345         run_start = i = run_end;
1346         while (i < nb_args) {
1347             /* Rotate argument to top since it will always be popped */
1348             mode = classify_x86_64_arg(&vtop[-i].type, NULL, &size, &align, &reg_count);
1349             if (align != 16)
1350               break;
1351
1352             vrotb(i+1);
1353
1354             if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
1355                 gv(RC_ST0);
1356                 oad(0xec8148, size); /* sub $xxx, %rsp */
1357                 o(0x7cdb); /* fstpt 0(%rsp) */
1358                 g(0x24);
1359                 g(0x00);
1360                 args_size += size;
1361             } else {
1362                 assert(mode == x86_64_mode_memory);
1363
1364                 /* allocate the necessary size on stack */
1365                 o(0x48);
1366                 oad(0xec81, size); /* sub $xxx, %rsp */
1367                 /* generate structure store */
1368                 r = get_reg(RC_INT);
1369                 orex(1, r, 0, 0x89); /* mov %rsp, r */
1370                 o(0xe0 + REG_VALUE(r));
1371                 vset(&vtop->type, r | VT_LVAL, 0);
1372                 vswap();
1373                 vstore();
1374                 args_size += size;
1375             }
1376
1377             vpop();
1378             --nb_args;
1379         }
1380     }
1381
1382     /* XXX This should be superfluous.  */
1383     save_regs(0); /* save used temporary registers */
1384
1385     /* then, we prepare register passing arguments.
1386        Note that we cannot set RDX and RCX in this loop because gv()
1387        may break these temporary registers. Let's use R10 and R11
1388        instead of them */
1389     assert(gen_reg <= REGN);
1390     assert(sse_reg <= 8);
1391     for(i = 0; i < nb_args; i++) {
1392         mode = classify_x86_64_arg(&vtop->type, &type, &size, &align, &reg_count);
1393         /* Alter stack entry type so that gv() knows how to treat it */
1394         vtop->type = type;
1395         if (mode == x86_64_mode_sse) {
1396             if (reg_count == 2) {
1397                 sse_reg -= 2;
1398                 gv(RC_FRET); /* Use pair load into xmm0 & xmm1 */
1399                 if (sse_reg) { /* avoid redundant movaps %xmm0, %xmm0 */
1400                     /* movaps %xmm1, %xmm(N+1) -- must come first: for N == 1 the copy below overwrites %xmm1 */
1401                     o(0x280f);
1402                     o(0xc1 + ((sse_reg+1) << 3));
1403                     /* movaps %xmm0, %xmmN */
1404                     o(0x280f);
1405                     o(0xc0 + (sse_reg << 3));
1406                 }
1407             } else {
1408                 assert(reg_count == 1);
1409                 --sse_reg;
1410                 /* Load directly to register */
1411                 gv(RC_XMM0 << sse_reg);
1412             }
1413         } else if (mode == x86_64_mode_integer) {
1414             /* simple type */
1415             /* XXX: implicit cast ? */
1416             int d;
1417             gen_reg -= reg_count;
1418             r = gv(RC_INT);
1419             d = arg_prepare_reg(gen_reg);
1420             orex(1,d,r,0x89); /* mov */
1421             o(0xc0 + REG_VALUE(r) * 8 + REG_VALUE(d));
1422             if (reg_count == 2) {
1423                 d = arg_prepare_reg(gen_reg+1); /* second half of the value is in vtop->r2 */
1424                 orex(1,d,vtop->r2,0x89); /* mov */
1425                 o(0xc0 + REG_VALUE(vtop->r2) * 8 + REG_VALUE(d));
1426             }
1427         }
1428         vtop--;
1429     }
1430     assert(gen_reg == 0);
1431     assert(sse_reg == 0);
1432
1433     /* We shouldn't have many operands on the stack anymore, but the
1434        call address itself is still there, and it might be in %eax
1435        (or edx/ecx) currently, which the below writes would clobber.
1436        So evict all remaining operands here.  */
1437     save_regs(0);
1438
1439     /* Copy R10 and R11 into RDX and RCX, respectively */
1440     if (nb_reg_args > 2) {
1441         o(0xd2894c); /* mov %r10, %rdx */
1442         if (nb_reg_args > 3) {
1443             o(0xd9894c); /* mov %r11, %rcx */
1444         }
1445     }
1446     /* %eax receives the number of sse registers used, capped at 8 */
1447     oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */
1448     gcall_or_jmp(0);
1449     if (args_size)
1450         gadd_sp(args_size); /* release the stack space taken by the arguments */
1451     vtop--; /* pop the function address */
1452 }
1453
1454 /* size in bytes of the prolog that gfunc_epilog writes back: push %rbp; mov %rsp,%rbp; sub $imm32,%rsp */
1455 #define FUNC_PROLOG_SIZE 11
1456 /* reserve 8 more bytes of locals and store integer argument register arg_regs[i] there */
1457 static void push_arg_reg(int i) {
1458     loc -= 8;
1459     gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc);
1460 }
1461
1462 /* generate function prolog for a function of type 'func_type': spill register parameters and declare all parameters */
1463 void gfunc_prolog(CType *func_type)
1464 {
1465     X86_64_Mode mode;
1466     int i, addr, align, size, reg_count;
1467     int param_addr = 0, reg_param_index, sse_param_index;
1468     Sym *sym;
1469     CType *type;
1470
1471     sym = func_type->ref;
1472     addr = PTR_SIZE * 2; /* first stack parameter: above the saved frame pointer and the return address */
1473     loc = 0;
1474     ind += FUNC_PROLOG_SIZE; /* leave room for the prolog, written later by gfunc_epilog */
1475     func_sub_sp_offset = ind;
1476     func_ret_sub = 0;
1477     /* variadic function: record what the named parameters consume and save all argument registers */
1478     if (func_type->ref->c == FUNC_ELLIPSIS) {
1479         int seen_reg_num, seen_sse_num, seen_stack_size;
1480         seen_reg_num = seen_sse_num = 0;
1481         /* frame pointer and return address */
1482         seen_stack_size = PTR_SIZE * 2;
1483         /* count the number of seen parameters */
1484         sym = func_type->ref;
1485         while ((sym = sym->next) != NULL) {
1486             type = &sym->type;
1487             mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1488             switch (mode) {
1489             default:
1490             stack_arg:
1491                 seen_stack_size = ((seen_stack_size + align - 1) & -align) + size;
1492                 break;
1493
1494             case x86_64_mode_integer:
1495                 if (seen_reg_num + reg_count <= REGN) { /* only REGN integer argument registers exist */
1496                     seen_reg_num += reg_count;
1497                 } else {
1498                     /* do not saturate the counter: as in the parameter loop below, a later smaller argument may still fit */
1499                     goto stack_arg;
1500                 }
1501                 break;
1502
1503             case x86_64_mode_sse:
1504                 if (seen_sse_num + reg_count <= 8) {
1505                     seen_sse_num += reg_count;
1506                 } else {
1507                     /* do not saturate the counter: as in the parameter loop below, a later smaller argument may still fit */
1508                     goto stack_arg;
1509                 }
1510                 break;
1511             }
1512         }
1513         /* three 32-bit values at -16, -12 and -8 from %rbp; the fixed offsets rely on loc having been 0 */
1514         loc -= 16;
1515         /* movl $0x????????, -0x10(%rbp) */
1516         o(0xf045c7);
1517         gen_le32(seen_reg_num * 8); /* bytes of integer registers used by named parameters */
1518         /* movl $0x????????, -0xc(%rbp) */
1519         o(0xf445c7);
1520         gen_le32(seen_sse_num * 16 + 48); /* 16 bytes per sse register, after 48 bytes (presumably the 6 integer slots) */
1521         /* movl $0x????????, -0x8(%rbp) */
1522         o(0xf845c7);
1523         gen_le32(seen_stack_size); /* stack bytes used by named parameters, counted from %rbp */
1524
1525         /* save all register passing arguments */
1526         for (i = 0; i < 8; i++) { /* sse registers 7 down to 0, one 16-byte slot each */
1527             loc -= 16;
1528             o(0xd60f66); /* movq */
1529             gen_modrm(7 - i, VT_LOCAL, NULL, loc);
1530             /* movq $0, loc+8(%rbp) */
1531             o(0x85c748);
1532             gen_le32(loc + 8);
1533             gen_le32(0);
1534         }
1535         for (i = 0; i < REGN; i++) { /* integer registers, last one first, so arg_regs[0] ends at the lowest address */
1536             push_arg_reg(REGN-1-i);
1537         }
1538     }
1539
1540     sym = func_type->ref;
1541     reg_param_index = 0;
1542     sse_param_index = 0;
1543
1544     /* if the function returns a structure, then add an
1545        implicit pointer parameter */
1546     func_vt = sym->type;
1547     mode = classify_x86_64_arg(&func_vt, NULL, &size, &align, &reg_count);
1548     if (mode == x86_64_mode_memory) {
1549         push_arg_reg(reg_param_index);
1550         func_vc = loc; /* frame offset where the implicit pointer was saved */
1551         reg_param_index++;
1552     }
1553     /* define parameters */
1554     while ((sym = sym->next) != NULL) {
1555         type = &sym->type;
1556         mode = classify_x86_64_arg(type, NULL, &size, &align, &reg_count);
1557         switch (mode) {
1558         case x86_64_mode_sse:
1559             if (sse_param_index + reg_count <= 8) {
1560                 /* save arguments passed by register */
1561                 loc -= reg_count * 8;
1562                 param_addr = loc;
1563                 for (i = 0; i < reg_count; ++i) {
1564                     o(0xd60f66); /* movq */
1565                     gen_modrm(sse_param_index, VT_LOCAL, NULL, param_addr + i*8);
1566                     ++sse_param_index;
1567                 }
1568             } else {
1569                 addr = (addr + align - 1) & -align; /* passed on the stack: use it in place */
1570                 param_addr = addr;
1571                 addr += size;
1572             }
1573             break;
1574
1575         case x86_64_mode_memory:
1576         case x86_64_mode_x87:
1577             addr = (addr + align - 1) & -align; /* always on the stack */
1578             param_addr = addr;
1579             addr += size;
1580             break;
1581
1582         case x86_64_mode_integer: {
1583             if (reg_param_index + reg_count <= REGN) {
1584                 /* save arguments passed by register */
1585                 loc -= reg_count * 8;
1586                 param_addr = loc;
1587                 for (i = 0; i < reg_count; ++i) {
1588                     gen_modrm64(0x89, arg_regs[reg_param_index], VT_LOCAL, NULL, param_addr + i*8);
1589                     ++reg_param_index;
1590                 }
1591             } else {
1592                 addr = (addr + align - 1) & -align; /* passed on the stack: use it in place */
1593                 param_addr = addr;
1594                 addr += size;
1595             }
1596             break;
1597         }
1598         default: break; /* nothing to be done for x86_64_mode_none */
1599         }
1600         sym_push(sym->v & ~SYM_FIELD, type,
1601                  VT_LOCAL | VT_LVAL, param_addr);
1602     }
1603
1604 #ifdef CONFIG_TCC_BCHECK
1605     /* leave some room for bound checking code */
1606     if (tcc_state->do_bounds_check) {
1607         func_bound_offset = lbounds_section->data_offset;
1608         func_bound_ind = ind; /* gfunc_epilog comes back to this position */
1609         oad(0xb8, 0); /* lbound section pointer */
1610         o(0xc78948);  /* mov  %rax,%rdi ## first arg in %rdi, this must be ptr */
1611         oad(0xb8, 0); /* call to function */
1612     }
1613 #endif
1614 }
1615
1616 /* generate function epilog: optional bound-checking calls, leave/ret, then fill in the reserved prolog */
1617 void gfunc_epilog(void)
1618 {
1619     int v, saved_ind;
1620
1621 #ifdef CONFIG_TCC_BCHECK
1622     if (tcc_state->do_bounds_check
1623         && func_bound_offset != lbounds_section->data_offset)
1624     {
1625         addr_t saved_ind; /* shadows the outer saved_ind inside this block */
1626         addr_t *bounds_ptr;
1627         Sym *sym_data;
1628
1629         /* add end of table info */
1630         bounds_ptr = section_ptr_add(lbounds_section, sizeof(addr_t));
1631         *bounds_ptr = 0;
1632
1633         /* generate bound local allocation */
1634         sym_data = get_sym_ref(&char_pointer_type, lbounds_section,
1635                                func_bound_offset, lbounds_section->data_offset);
1636         saved_ind = ind;
1637         ind = func_bound_ind; /* back to the room reserved by gfunc_prolog */
1638         greloc(cur_text_section, sym_data, ind + 1, R_386_32); /* NOTE(review): i386 relocation name in x86-64 code - confirm it is intended */
1639         ind = ind + 5 + 3; /* skip the 5-byte mov and the 3-byte mov %rax,%rdi already emitted there */
1640         gen_static_call(TOK___bound_local_new);
1641         ind = saved_ind;
1642
1643         /* generate bound check local freeing */
1644         o(0x5250); /* save returned value, if any */
1645         greloc(cur_text_section, sym_data, ind + 1, R_386_32);
1646         oad(0xb8, 0); /* mov xxx, %rax */
1647         o(0xc78948);  /* mov  %rax,%rdi ## first arg in %rdi, this must be ptr */
1648         gen_static_call(TOK___bound_local_delete);
1649         o(0x585a); /* restore returned value, if any */
1650     }
1651 #endif
1652     o(0xc9); /* leave */
1653     if (func_ret_sub == 0) {
1654         o(0xc3); /* ret */
1655     } else {
1656         o(0xc2); /* ret n */
1657         g(func_ret_sub); /* n is emitted as two bytes, low byte first */
1658         g(func_ret_sub >> 8);
1659     }
1660     /* round the size of the locals (-loc) up to a multiple of 16 */
1661     v = (-loc + 15) & -16;
1662     saved_ind = ind;
1663     ind = func_sub_sp_offset - FUNC_PROLOG_SIZE; /* rewind to the room reserved by gfunc_prolog */
1664     o(0xe5894855);  /* push %rbp, mov %rsp, %rbp */
1665     o(0xec8148);  /* sub rsp, stacksize */
1666     gen_le32(v);
1667     ind = saved_ind; /* continue emitting after the epilog */
1668 }
1669
1670 #endif /* not PE */
1671
1672 /* generate a jump (opcode 0xe9) to a label; 't' is handed to psym() and psym()'s result is returned */
1673 int gjmp(int t)
1674 {
1675     return psym(0xe9, t);
1676 }
1677
1678 /* generate a jump to the fixed code offset 'a', in the two-byte form when the distance fits in a char */
1679 void gjmp_addr(int a)
1680 {
1681     int disp8 = a - ind - 2; /* distance measured from the end of the two-byte form */
1682
1683     if (disp8 != (char)disp8) {
1684         oad(0xe9, a - ind - 5); /* five-byte form: opcode plus 32-bit distance */
1685         return;
1686     }
1687     g(0xeb);
1688     g(disp8);
1689 }
1690
1691 /* generate a test. set 'inv' to invert test. Stack entry is popped. Returns the updated jump list 't' */
1692 int gtst(int inv, int t)
1693 {
1694     int v, *p;
1695
1696     v = vtop->r & VT_VALMASK;
1697     if (v == VT_CMP) {
1698         /* fast case : can jump directly since flags are set */
1699         if (vtop->c.i & 0x100)
1700           {
1701             /* This was a float compare.  If the parity flag is set
1702                the result was unordered.  For anything except != this
1703                means false and we don't jump (anding both conditions).
1704                For != this means true (oring both).
1705                Take care about inverting the test.  We need to jump
1706                to our target if the result was unordered and test wasn't NE,
1707                otherwise if unordered we don't want to jump.  */
1708             vtop->c.i &= ~0x100; /* drop the float-compare marker, leaving the comparison token */
1709             if (!inv == (vtop->c.i != TOK_NE))
1710               o(0x067a);  /* jp +6 */
1711             else
1712               {
1713                 g(0x0f);
1714                 t = psym(0x8a, t); /* jp t */
1715               }
1716           }
1717         g(0x0f); /* conditional jump: second opcode byte comes from the comparison token, inverted through 'inv' */
1718         t = psym((vtop->c.i - 16) ^ inv, t);
1719     } else if (v == VT_JMP || v == VT_JMPI) {
1720         /* && or || optimization */
1721         if ((v & 1) == inv) {
1722             /* insert vtop->c jump list in t */
1723             p = &vtop->c.i;
1724             while (*p != 0) /* follow the list through the code bytes until its terminating 0 */
1725                 p = (int *)(cur_text_section->data + *p);
1726             *p = t;
1727             t = vtop->c.i;
1728         } else {
1729             t = gjmp(t);
1730             gsym(vtop->c.i);
1731         }
1732     }
1733     vtop--;
1734     return t;
1735 }
1736
1737 /* generate an integer binary operation 'op' on vtop[-1] and vtop[0]; one entry is left on the value stack */
1738 void gen_opi(int op)
1739 {
1740     int r, fr, opc, c;
1741     int ll, uu, cc;
1742
1743     ll = is64_type(vtop[-1].type.t); /* 64-bit operation */
1744     uu = (vtop[-1].type.t & VT_UNSIGNED) != 0; /* unsigned operation */
1745     cc = (vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST; /* second operand is a plain constant */
1746
1747     switch(op) {
1748     case '+':
1749     case TOK_ADDC1: /* add with carry generation */
1750         opc = 0;
1751     gen_op8:
1752         if (cc && (!ll || (int)vtop->c.ll == vtop->c.ll)) {
1753             /* constant case */
1754             vswap();
1755             r = gv(RC_INT);
1756             vswap();
1757             c = vtop->c.i;
1758             if (c == (char)c) {
1759                 /* XXX: generate inc and dec for smaller code ? */
1760                 orex(ll, r, 0, 0x83);
1761                 o(0xc0 | (opc << 3) | REG_VALUE(r));
1762                 g(c);
1763             } else {
1764                 orex(ll, r, 0, 0x81);
1765                 oad(0xc0 | (opc << 3) | REG_VALUE(r), c);
1766             }
1767         } else {
1768             gv2(RC_INT, RC_INT);
1769             r = vtop[-1].r;
1770             fr = vtop[0].r;
1771             orex(ll, r, fr, (opc << 3) | 0x01);
1772             o(0xc0 + REG_VALUE(r) + REG_VALUE(fr) * 8);
1773         }
1774         vtop--;
1775         if (op >= TOK_ULT && op <= TOK_GT) {
1776             vtop->r = VT_CMP; /* comparison: the result lives in the cpu flags */
1777             vtop->c.i = op;
1778         }
1779         break;
1780     case '-':
1781     case TOK_SUBC1: /* sub with carry generation */
1782         opc = 5;
1783         goto gen_op8;
1784     case TOK_ADDC2: /* add with carry use */
1785         opc = 2;
1786         goto gen_op8;
1787     case TOK_SUBC2: /* sub with carry use */
1788         opc = 3;
1789         goto gen_op8;
1790     case '&':
1791         opc = 4;
1792         goto gen_op8;
1793     case '^':
1794         opc = 6;
1795         goto gen_op8;
1796     case '|':
1797         opc = 1;
1798         goto gen_op8;
1799     case '*':
1800         gv2(RC_INT, RC_INT);
1801         r = vtop[-1].r;
1802         fr = vtop[0].r;
1803         orex(ll, fr, r, 0xaf0f); /* imul fr, r */
1804         o(0xc0 + REG_VALUE(fr) + REG_VALUE(r) * 8);
1805         vtop--;
1806         break;
1807     case TOK_SHL:
1808         opc = 4;
1809         goto gen_shift;
1810     case TOK_SHR:
1811         opc = 5;
1812         goto gen_shift;
1813     case TOK_SAR:
1814         opc = 7;
1815     gen_shift:
1816         opc = 0xc0 | (opc << 3);
1817         if (cc) {
1818             /* constant case */
1819             vswap();
1820             r = gv(RC_INT);
1821             vswap();
1822             orex(ll, r, 0, 0xc1); /* shl/shr/sar $xxx, r */
1823             o(opc | REG_VALUE(r));
1824             g(vtop->c.i & (ll ? 63 : 31)); /* shift count masked to the operand width */
1825         } else {
1826             /* we generate the shift in ecx */
1827             gv2(RC_INT, RC_RCX);
1828             r = vtop[-1].r;
1829             orex(ll, r, 0, 0xd3); /* shl/shr/sar %cl, r */
1830             o(opc | REG_VALUE(r));
1831         }
1832         vtop--;
1833         break;
1834     case TOK_UDIV:
1835     case TOK_UMOD:
1836         uu = 1;
1837         goto divmod;
1838     case '/':
1839     case '%':
1840     case TOK_PDIV:
1841         uu = 0;
1842     divmod:
1843         /* first operand must be in eax */
1844         /* XXX: need better constraint for second operand */
1845         gv2(RC_RAX, RC_RCX);
1846         r = vtop[-1].r;
1847         fr = vtop[0].r;
1848         vtop--;
1849         save_reg(TREG_RDX); /* rdx is overwritten by the instructions below */
1850         orex(ll, 0, 0, uu ? 0xd231 : 0x99); /* xor %edx,%edx : cqto */
1851         orex(ll, fr, 0, 0xf7); /* div fr, %eax */
1852         o((uu ? 0xf0 : 0xf8) + REG_VALUE(fr));
1853         if (op == '%' || op == TOK_UMOD)
1854             r = TREG_RDX; /* remainder */
1855         else
1856             r = TREG_RAX; /* quotient */
1857         vtop->r = r;
1858         break;
1859     default: /* every other operator, in particular the comparisons handled after gen_op8 */
1860         opc = 7;
1861         goto gen_op8;
1862     }
1863 }
1864 /* binary operation entry point that simply forwards to gen_opi(), which checks for 64-bit operands itself */
1865 void gen_opl(int op)
1866 {
1867     gen_opi(op);
1868 }
1869
1870 /* generate a floating point operation 'v = t1 op t2' instruction (t1 is vtop[-1], t2 is vtop[0]). The
1871    two operands are guaranteed to have the same floating point type. The pair is replaced by one result entry. */
1872 /* XXX: need to use ST1 too */
1873 void gen_opf(int op)
1874 {
1875     int a, ft, fc, swapped, r;
1876     int float_type = /* register class for the operands: RC_ST0 for long double, RC_FLOAT otherwise */
1877         (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? RC_ST0 : RC_FLOAT;
1878
1879     /* convert constants to memory references */
1880     if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
1881         vswap(); /* temporarily bring the first operand to the top */
1882         gv(float_type);
1883         vswap(); /* restore the original operand order */
1884     }
1885     if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
1886         gv(float_type);
1887
1888     /* must put at least one value in the floating point register */
1889     if ((vtop[-1].r & VT_LVAL) &&
1890         (vtop[0].r & VT_LVAL)) {
1891         vswap(); /* both operands are lvalues: load the first one */
1892         gv(float_type);
1893         vswap();
1894     }
1895     swapped = 0; /* becomes 1 when the two stack entries have been exchanged */
1896     /* swap the stack if needed so that t1 is the register and t2 is
1897        the memory reference */
1898     if (vtop[-1].r & VT_LVAL) {
1899         vswap();
1900         swapped = 1;
1901     }
1902     if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { /* long double: x87 instructions */
1903         if (op >= TOK_ULT && op <= TOK_GT) { /* comparison operators */
1904             /* load on stack second operand */
1905             load(TREG_ST0, vtop);
1906             save_reg(TREG_RAX); /* eax is used by FP comparison code */
1907             if (op == TOK_GE || op == TOK_GT)
1908                 swapped = !swapped; /* '>' and '>=' are evaluated with the operands exchanged */
1909             else if (op == TOK_EQ || op == TOK_NE)
1910                 swapped = 0; /* operand order does not matter for '==' and '!=' */
1911             if (swapped)
1912                 o(0xc9d9); /* fxch %st(1) */
1913             if (op == TOK_EQ || op == TOK_NE)
1914                 o(0xe9da); /* fucompp */
1915             else
1916                 o(0xd9de); /* fcompp */
1917             o(0xe0df); /* fnstsw %ax */
1918             if (op == TOK_EQ) { /* the branches below test x87 condition bits in %ah and pick the condition stored in the VT_CMP entry */
1919                 o(0x45e480); /* and $0x45, %ah */
1920                 o(0x40fC80); /* cmp $0x40, %ah */
1921             } else if (op == TOK_NE) {
1922                 o(0x45e480); /* and $0x45, %ah */
1923                 o(0x40f480); /* xor $0x40, %ah */
1924                 op = TOK_NE; /* no-op: op is already TOK_NE in this branch */
1925             } else if (op == TOK_GE || op == TOK_LE) {
1926                 o(0x05c4f6); /* test $0x05, %ah */
1927                 op = TOK_EQ; /* condition holds when the tested bits are all zero */
1928             } else {
1929                 o(0x45c4f6); /* test $0x45, %ah */
1930                 op = TOK_EQ; /* condition holds when the tested bits are all zero */
1931             }
1932             vtop--; /* both operands are consumed; one entry remains for the result */
1933             vtop->r = VT_CMP; /* the result is a pending comparison ... */
1934             vtop->c.i = op; /* ... whose condition is 'op' */
1935         } else {
1936             /* no memory reference possible for long double operations */
1937             load(TREG_ST0, vtop);
1938             swapped = !swapped;
1939
1940             switch(op) { /* 'a' selects the x87 operation; it goes into bits 3..5 of the second opcode byte below */
1941             default:
1942             case '+':
1943                 a = 0;
1944                 break;
1945             case '-':
1946                 a = 4;
1947                 if (swapped)
1948                     a++; /* non-commutative: pick the variant with the opposite operand order */
1949                 break;
1950             case '*':
1951                 a = 1;
1952                 break;
1953             case '/':
1954                 a = 6;
1955                 if (swapped)
1956                     a++; /* non-commutative: pick the variant with the opposite operand order */
1957                 break;
1958             }
1959             ft = vtop->type.t; /* NOTE(review): ft and fc are assigned here but never read in this branch */
1960             fc = vtop->c.ul;
1961             o(0xde); /* fxxxp %st, %st(1) */
1962             o(0xc1 + (a << 3));
1963             vtop--; /* drop the second operand; the remaining entry stands for the result */
1964         }
1965     } else { /* float / double: SSE instructions */
1966         if (op >= TOK_ULT && op <= TOK_GT) { /* comparison operators */
1967             /* if saved lvalue, then we must reload it */
1968             r = vtop->r;
1969             fc = vtop->c.ul;
1970             if ((r & VT_VALMASK) == VT_LLOCAL) {
1971                 SValue v1; /* describes the local slot (offset fc) that holds the saved pointer */
1972                 r = get_reg(RC_INT);
1973                 v1.type.t = VT_PTR;
1974                 v1.r = VT_LOCAL | VT_LVAL;
1975                 v1.c.ul = fc;
1976                 load(r, &v1); /* r now holds the pointer read from that slot */
1977                 fc = 0; /* the operand is addressed through r with no extra offset */
1978             }
1979
1980             if (op == TOK_EQ || op == TOK_NE) {
1981                 swapped = 0; /* operand order does not matter for '==' and '!=' */
1982             } else {
1983                 if (op == TOK_LE || op == TOK_LT)
1984                     swapped = !swapped; /* only seta/setae are used, so '<' and '<=' are done with the operands exchanged */
1985                 if (op == TOK_LE || op == TOK_GE) {
1986                     op = 0x93; /* setae */
1987                 } else {
1988                     op = 0x97; /* seta */
1989                 }
1990             }
1991
1992             if (swapped) {
1993                 gv(RC_FLOAT); /* load the top operand into a register, then exchange the two entries */
1994                 vswap();
1995             }
1996             assert(!(vtop[-1].r & VT_LVAL)); /* the first operand of the compare must be in a register */
1997
1998             if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
1999                 o(0x66); /* prefix selecting the double precision form */
2000             if (op == TOK_EQ || op == TOK_NE)
2001                 o(0x2e0f); /* ucomiss/ucomisd */
2002             else
2003                 o(0x2f0f); /* comiss/comisd */
2004
2005             if (vtop->r & VT_LVAL) {
2006                 gen_modrm(vtop[-1].r, r, vtop->sym, fc); /* second operand read from memory */
2007             } else {
2008                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8); /* register-to-register ModRM byte */
2009             }
2010
2011             vtop--; /* both operands are consumed; one entry remains for the result */
2012             vtop->r = VT_CMP;
2013             vtop->c.i = op | 0x100; /* NOTE(review): the 0x100 bit presumably tells the VT_CMP consumer this is an SSE float compare -- confirm */
2014         } else {
2015             assert((vtop->type.t & VT_BTYPE) != VT_LDOUBLE);
2016             switch(op) { /* 'a' is added to 0x58 below: 0x58 add, 0x59 mul, 0x5c sub, 0x5e div */
2017             default:
2018             case '+':
2019                 a = 0;
2020                 break;
2021             case '-':
2022                 a = 4;
2023                 break;
2024             case '*':
2025                 a = 1;
2026                 break;
2027             case '/':
2028                 a = 6;
2029                 break;
2030             }
2031             ft = vtop->type.t;
2032             fc = vtop->c.ul;
2033             assert((ft & VT_BTYPE) != VT_LDOUBLE);
2034
2035             r = vtop->r;
2036             /* if saved lvalue, then we must reload it */
2037             if ((vtop->r & VT_VALMASK) == VT_LLOCAL) {
2038                 SValue v1; /* describes the local slot (offset fc) that holds the saved pointer */
2039                 r = get_reg(RC_INT);
2040                 v1.type.t = VT_PTR;
2041                 v1.r = VT_LOCAL | VT_LVAL;
2042                 v1.c.ul = fc;
2043                 load(r, &v1); /* r now holds the pointer read from that slot */
2044                 fc = 0; /* the operand is addressed through r with no extra offset */
2045             }
2046
2047             assert(!(vtop[-1].r & VT_LVAL));
2048             if (swapped) {
2049                 assert(vtop->r & VT_LVAL);
2050                 gv(RC_FLOAT); /* undo the earlier swap: load the memory operand and put t1/t2 back in source order */
2051                 vswap();
2052             }
2053
2054             if ((ft & VT_BTYPE) == VT_DOUBLE) {
2055                 o(0xf2); /* prefix for the scalar double form */
2056             } else {
2057                 o(0xf3); /* prefix for the scalar single form */
2058             }
2059             o(0x0f);
2060             o(0x58 + a); /* addsX / mulsX / subsX / divsX depending on 'a' */
2061
2062             if (vtop->r & VT_LVAL) {
2063                 gen_modrm(vtop[-1].r, r, vtop->sym, fc); /* second operand read from memory */
2064             } else {
2065                 o(0xc0 + REG_VALUE(vtop[0].r) + REG_VALUE(vtop[-1].r)*8); /* register-to-register ModRM byte */
2066             }
2067
2068             vtop--; /* drop the second operand; the first operand's register now holds the result */
2069         }
2070     }
2071 }
2072
2073 /* convert the integer at vtop to fp 't' type. Must handle 'int', 'unsigned int'
2074    and 'long long' cases. Only vtop->r is updated here (ST0 for long double, else an RC_FLOAT register). */
2075 void gen_cvt_itof(int t)
2076 {
2077     if ((t & VT_BTYPE) == VT_LDOUBLE) { /* long double target: go through memory and the x87 fild instruction */
2078         save_reg(TREG_ST0); /* ST0 is about to receive the result */
2079         gv(RC_INT);
2080         if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
2081             /* signed long long to float/double/long double (unsigned case
2082                is handled generically) */
2083             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2084             o(0x242cdf); /* fildll (%rsp) */
2085             o(0x08c48348); /* add $8, %rsp */
2086         } else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2087                    (VT_INT | VT_UNSIGNED)) {
2088             /* unsigned int to float/double/long double */
2089             o(0x6a); /* push $0 */
2090             g(0x00);
2091             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2092             o(0x242cdf); /* fildll (%rsp) -- NOTE(review): this reads the 8 bytes of the pushed register, so it appears to rely on the upper 32 bits of r being zero -- confirm */
2093             o(0x10c48348); /* add $16, %rsp */
2094         } else {
2095             /* int to float/double/long double */
2096             o(0x50 + (vtop->r & VT_VALMASK)); /* push r */
2097             o(0x2404db); /* fildl (%rsp) */
2098             o(0x08c48348); /* add $8, %rsp */
2099         }
2100         vtop->r = TREG_ST0; /* the converted value now lives in ST0 */
2101     } else { /* float / double target: SSE cvtsi2ss / cvtsi2sd */
2102         int r = get_reg(RC_FLOAT); /* destination register, reserved before the source is loaded */
2103         gv(RC_INT);
2104         o(0xf2 + ((t & VT_BTYPE) == VT_FLOAT?1:0)); /* 0xf3 prefix for float, 0xf2 for double */
2105         if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) ==
2106             (VT_INT | VT_UNSIGNED) ||
2107             (vtop->type.t & VT_BTYPE) == VT_LLONG) {
2108             o(0x48); /* REX: 64-bit source. NOTE(review): for unsigned int this presumably relies on the upper 32 bits being zero -- confirm */
2109         }
2110         o(0x2a0f);
2111         o(0xc0 + (vtop->r & VT_VALMASK) + REG_VALUE(r)*8); /* cvtsi2ss/cvtsi2sd */
2112         vtop->r = r; /* the converted value now lives in r */
2113     }
2114 }
2115
2116 /* convert the floating point value at vtop from its current type to type 't' (only vtop->r is updated here) */
2117 void gen_cvt_ftof(int t)
2118 {
2119     int ft, bt, tbt;
2120
2121     ft = vtop->type.t;
2122     bt = ft & VT_BTYPE; /* source basic type */
2123     tbt = t & VT_BTYPE; /* target basic type */
2124
2125     if (bt == VT_FLOAT) {
2126         gv(RC_FLOAT);
2127         if (tbt == VT_DOUBLE) { /* float -> double, in place in the same SSE register */
2128             o(0x140f); /* unpcklps */
2129             o(0xc0 + REG_VALUE(vtop->r)*9); /* *9: same register in both ModRM fields */
2130             o(0x5a0f); /* cvtps2pd */
2131             o(0xc0 + REG_VALUE(vtop->r)*9);
2132         } else if (tbt == VT_LDOUBLE) { /* float -> long double, through a scratch slot at -0x10(%rsp) */
2133             save_reg(TREG_ST0); /* save_reg() takes a register number, not a register class; ST0 is overwritten by flds below */
2134             /* movss %xmm0,-0x10(%rsp) */
2135             o(0x110ff3);
2136             o(0x44 + REG_VALUE(vtop->r)*8);
2137             o(0xf024);
2138             o(0xf02444d9); /* flds -0x10(%rsp) */
2139             vtop->r = TREG_ST0;
2140         }
2141     } else if (bt == VT_DOUBLE) {
2142         gv(RC_FLOAT);
2143         if (tbt == VT_FLOAT) { /* double -> float, in place in the same SSE register */
2144             o(0x140f66); /* unpcklpd */
2145             o(0xc0 + REG_VALUE(vtop->r)*9); /* *9: same register in both ModRM fields */
2146             o(0x5a0f66); /* cvtpd2ps */
2147             o(0xc0 + REG_VALUE(vtop->r)*9);
2148         } else if (tbt == VT_LDOUBLE) { /* double -> long double, through a scratch slot at -0x10(%rsp) */
2149             save_reg(TREG_ST0); /* save_reg() takes a register number, not a register class; ST0 is overwritten by fldl below */
2150             /* movsd %xmm0,-0x10(%rsp) */
2151             o(0x110ff2);
2152             o(0x44 + REG_VALUE(vtop->r)*8);
2153             o(0xf024);
2154             o(0xf02444dd); /* fldl -0x10(%rsp) */
2155             vtop->r = TREG_ST0;
2156         }
2157     } else { /* source is long double (in ST0) */
2158         int r;
2159         gv(RC_ST0);
2160         r = get_reg(RC_FLOAT); /* SSE register that receives the converted value */
2161         if (tbt == VT_DOUBLE) {
2162             o(0xf0245cdd); /* fstpl -0x10(%rsp) */
2163             /* movsd -0x10(%rsp),%xmm0 */
2164             o(0x100ff2);
2165             o(0x44 + REG_VALUE(r)*8);
2166             o(0xf024);
2167             vtop->r = r;
2168         } else if (tbt == VT_FLOAT) {
2169             o(0xf0245cd9); /* fstps -0x10(%rsp) */
2170             /* movss -0x10(%rsp),%xmm0 */
2171             o(0x100ff3);
2172             o(0x44 + REG_VALUE(r)*8);
2173             o(0xf024);
2174             vtop->r = r;
2175         } /* NOTE(review): the scratch slot lies below %rsp in every long double path; presumably safe only where that area is not clobbered -- confirm for PE targets */
2176     }
2177 }
2178
2179 /* convert the fp value at vtop to int 't' type (truncating); only vtop->r is updated, to an RC_INT register */
2180 void gen_cvt_ftoi(int t)
2181 {
2182     int ft, bt, size, r;
2183     ft = vtop->type.t;
2184     bt = ft & VT_BTYPE;
2185     if (bt == VT_LDOUBLE) { /* long double is first narrowed to double so the SSE path below can be used */
2186         gen_cvt_ftof(VT_DOUBLE);
2187         bt = VT_DOUBLE;
2188     }
2189
2190     gv(RC_FLOAT);
2191     if (t != VT_INT) /* any target other than exactly VT_INT uses the 64-bit form */
2192         size = 8;
2193     else
2194         size = 4;
2195
2196     r = get_reg(RC_INT); /* destination integer register */
2197     if (bt == VT_FLOAT) {
2198         o(0xf3); /* prefix for the single precision form */
2199     } else if (bt == VT_DOUBLE) {
2200         o(0xf2); /* prefix for the double precision form */
2201     } else {
2202         assert(0); /* only float and double can reach this point */
2203     }
2204     orex(size == 8, r, 0, 0x2c0f); /* cvttss2si or cvttsd2si */
2205     o(0xc0 + REG_VALUE(vtop->r) + REG_VALUE(r)*8); /* ModRM: source fp register -> destination r */
2206     vtop->r = r;
2207 }
2208
2209 /* computed goto support: emit the transfer via gcall_or_jmp() and pop one value stack entry */
2210 void ggoto(void)
2211 {
2212     gcall_or_jmp(1); /* NOTE(review): argument 1 presumably selects a jump rather than a call -- confirm */
2213     vtop--; /* drop the entry consumed by the jump */
2214 }
2215
2216 /* Save the stack pointer into the local stack slot at offset 'addr' from %rbp (nothing is returned) */
2217 ST_FUNC void gen_vla_sp_save(int addr) {
2218     /* mov %rsp,addr(%rbp)*/
2219     gen_modrm64(0x89, TREG_RSP, VT_LOCAL, NULL, addr);
2220 }
2221
2222 /* Restore the stack pointer from the local stack slot at offset 'addr' from %rbp (counterpart of gen_vla_sp_save) */
2223 ST_FUNC void gen_vla_sp_restore(int addr) {
2224     gen_modrm64(0x8b, TREG_RSP, VT_LOCAL, NULL, addr); /* mov addr(%rbp),%rsp */
2225 }
2226
2227 /* Allocate stack space for a VLA whose byte size is on top of the value stack. 'align' is not used by either path. */
2228 ST_FUNC void gen_vla_alloc(CType *type, int align) {
2229 #ifdef TCC_TARGET_PE
2230     /* alloca does more than just adjust %rsp on Windows */
2231     vpush_global_sym(&func_old_type, TOK_alloca);
2232     vswap(); /* Move alloca ref past allocation size */
2233     gfunc_call(1); /* call alloca with the size as its single argument */
2234     vset(type, REG_IRET, 0); /* the call result (REG_IRET) is pushed on the value stack with type 'type' */
2235 #else
2236     int r;
2237     r = gv(RC_INT); /* allocation size */
2238     /* sub r,%rsp */
2239     o(0x2b48);
2240     o(0xe0 | REG_VALUE(r));
2241     /* We align to 16 bytes rather than align */
2242     /* and ~15, %rsp */
2243     o(0xf0e48348);
2244     vpop(); /* drop the size; nothing is pushed in this path, the new block starts at %rsp */
2245 #endif
2246 }
2247
2248
2249 /* end of x86-64 code generator */
2250 /*************************************************************/
2251 #endif /* ! TARGET_DEFS_ONLY */
2252 /******************************************************/