2 * x86-64 code generator for TCC
4 * Copyright (c) 2008 Shinichiro Hamaji
6 * Based on i386-gen.c by Fabrice Bellard
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 /* number of available registers */
28 /* a register can belong to several classes. The classes must be
29 sorted from more general to more precise (see gv2() code which does
30 assumptions on it). */
31 #define RC_INT 0x0001 /* generic integer register */
32 #define RC_FLOAT 0x0002 /* generic float register */
36 #define RC_XMM0 0x0020
37 #define RC_ST0 0x0040 /* only for long double */
38 #define RC_IRET RC_RAX /* function return: integer register */
39 #define RC_LRET RC_RDX /* function return: second integer register */
40 #define RC_FRET RC_XMM0 /* function return: float register */
42 /* pretty names for the registers */
60 #define REX_BASE(reg) (((reg) >> 3) & 1)
61 #define REG_VALUE(reg) ((reg) & 7)
63 int reg_classes
[NB_REGS
] = {
64 /* eax */ RC_INT
| RC_RAX
,
65 /* ecx */ RC_INT
| RC_RCX
,
66 /* edx */ RC_INT
| RC_RDX
,
67 /* xmm0 */ RC_FLOAT
| RC_XMM0
,
71 /* return registers for function */
72 #define REG_IRET TREG_RAX /* single word int return register */
73 #define REG_LRET TREG_RDX /* second word return register (for long long) */
74 #define REG_FRET TREG_XMM0 /* float return register */
76 /* defined if function parameters must be evaluated in reverse order */
77 #define INVERT_FUNC_PARAMS
79 /* pointer size, in bytes */
82 /* long double size and alignment, in bytes */
83 #define LDOUBLE_SIZE 16
84 #define LDOUBLE_ALIGN 8
85 /* maximum alignment (for aligned attribute support) */
88 /******************************************************/
91 #define EM_TCC_TARGET EM_X86_64
93 /* relocation type for 32 bit data relocation */
94 #define R_DATA_32 R_X86_64_64
95 #define R_JMP_SLOT R_X86_64_JUMP_SLOT
96 #define R_COPY R_X86_64_COPY
98 #define ELF_START_ADDR 0x08048000
99 #define ELF_PAGE_SIZE 0x1000
101 /******************************************************/
103 static unsigned long func_sub_sp_offset
;
104 static int func_ret_sub
;
106 /* XXX: make it faster ? */
111 if (ind1
> cur_text_section
->data_allocated
)
112 section_realloc(cur_text_section
, ind1
);
113 cur_text_section
->data
[ind
] = c
;
/* Emit an opcode given as a little-endian packed integer: bytes are
   written low byte first until the remaining value is zero (so a
   value of 0 emits nothing). */
void o(unsigned int c)
{
    while (c) {
        g(c);
        c = c >> 8;
    }
}
/* Emit a 64-bit immediate in little-endian byte order. */
void gen_le64(int64_t c)
{
    int i;
    /* low byte first, eight bytes total */
    for (i = 0; i < 8; i++)
        g(c >> (i * 8));
}
145 /* output a symbol and patch all calls to it */
146 void gsym_addr(int t
, int a
)
150 ptr
= (int *)(cur_text_section
->data
+ t
);
151 n
= *ptr
; /* next value */
162 /* psym is used to put an instruction with a data field which is a
163 reference to a symbol. It is in fact the same as oad ! */
166 static int is64_type(int t
)
168 return ((t
& VT_BTYPE
) == VT_PTR
||
169 (t
& VT_BTYPE
) == VT_FUNC
||
170 (t
& VT_BTYPE
) == VT_LLONG
);
173 static int is_sse_float(int t
) {
176 return bt
== VT_DOUBLE
|| bt
== VT_FLOAT
;
179 /* instruction + 4 bytes data. Return the address of the data */
180 static int oad(int c
, int s
)
186 if (ind1
> cur_text_section
->data_allocated
)
187 section_realloc(cur_text_section
, ind1
);
188 *(int *)(cur_text_section
->data
+ ind
) = s
;
194 /* output constant with relocation if 'r & VT_SYM' is true */
195 static void gen_addr64(int r
, Sym
*sym
, int64_t c
)
198 greloc(cur_text_section
, sym
, ind
, R_X86_64_64
);
202 /* output constant with relocation if 'r & VT_SYM' is true */
203 static void gen_addrpc32(int r
, Sym
*sym
, int c
)
206 greloc(cur_text_section
, sym
, ind
, R_X86_64_PC32
);
210 /* output got address with relocation */
211 static void gen_gotpcrel(int r
, Sym
*sym
, int c
)
213 #ifndef TCC_TARGET_PE
216 greloc(cur_text_section
, sym
, ind
, R_X86_64_GOTPCREL
);
217 sr
= cur_text_section
->reloc
;
218 rel
= (ElfW(Rela
) *)(sr
->data
+ sr
->data_offset
- sizeof(ElfW(Rela
)));
221 printf("picpic: %s %x %x | %02x %02x %02x\n", get_tok_str(sym
->v
, NULL
), c
, r
,
222 cur_text_section
->data
[ind
-3],
223 cur_text_section
->data
[ind
-2],
224 cur_text_section
->data
[ind
-1]
226 greloc(cur_text_section
, sym
, ind
, R_X86_64_PC32
);
231 /* we use add c, %xxx for displacement */
232 o(0x48 + REX_BASE(r
));
234 o(0xc0 + REG_VALUE(r
));
239 static void gen_modrm_impl(int op_reg
, int r
, Sym
*sym
, int c
, int is_got
)
241 op_reg
= REG_VALUE(op_reg
) << 3;
242 if ((r
& VT_VALMASK
) == VT_CONST
) {
243 /* constant memory reference */
246 gen_gotpcrel(r
, sym
, c
);
248 gen_addrpc32(r
, sym
, c
);
250 } else if ((r
& VT_VALMASK
) == VT_LOCAL
) {
251 /* currently, we use only ebp as base */
253 /* short reference */
257 oad(0x85 | op_reg
, c
);
259 } else if ((r
& VT_VALMASK
) >= TREG_MEM
) {
261 g(0x80 | op_reg
| REG_VALUE(r
));
264 g(0x00 | op_reg
| REG_VALUE(r
));
267 g(0x00 | op_reg
| (r
& VT_VALMASK
));
271 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
273 static void gen_modrm(int op_reg
, int r
, Sym
*sym
, int c
)
275 gen_modrm_impl(op_reg
, r
, sym
, c
, 0);
278 /* generate a modrm reference. 'op_reg' contains the addtionnal 3
280 static void gen_modrm64(int opcode
, int op_reg
, int r
, Sym
*sym
, int c
)
283 int rex
= 0x48 | (REX_BASE(op_reg
) << 2);
284 if ((r
& VT_VALMASK
) != VT_CONST
&&
285 (r
& VT_VALMASK
) != VT_LOCAL
) {
286 rex
|= REX_BASE(VT_VALMASK
& r
);
290 is_got
= (op_reg
& TREG_MEM
) && !(sym
->type
.t
& VT_STATIC
);
291 gen_modrm_impl(op_reg
, r
, sym
, c
, is_got
);
295 /* load 'r' from value 'sv' */
296 void load(int r
, SValue
*sv
)
298 int v
, t
, ft
, fc
, fr
;
305 #ifndef TCC_TARGET_PE
306 /* we use indirect access via got */
307 if ((fr
& VT_VALMASK
) == VT_CONST
&& (fr
& VT_SYM
) &&
308 (fr
& VT_LVAL
) && !(sv
->sym
->type
.t
& VT_STATIC
)) {
309 /* use the result register as a temporal register */
310 int tr
= r
| TREG_MEM
;
312 /* we cannot use float registers as a temporal register */
313 tr
= get_reg(RC_INT
) | TREG_MEM
;
315 gen_modrm64(0x8b, tr
, fr
, sv
->sym
, 0);
317 /* load from the temporal register */
324 if (v
== VT_LLOCAL
) {
326 v1
.r
= VT_LOCAL
| VT_LVAL
;
331 if ((ft
& VT_BTYPE
) == VT_FLOAT
) {
332 o(0x6e0f66); /* movd */
334 } else if ((ft
& VT_BTYPE
) == VT_DOUBLE
) {
335 o(0x7e0ff3); /* movq */
337 } else if ((ft
& VT_BTYPE
) == VT_LDOUBLE
) {
340 } else if ((ft
& VT_TYPE
) == VT_BYTE
) {
341 o(0xbe0f); /* movsbl */
342 } else if ((ft
& VT_TYPE
) == (VT_BYTE
| VT_UNSIGNED
)) {
343 o(0xb60f); /* movzbl */
344 } else if ((ft
& VT_TYPE
) == VT_SHORT
) {
345 o(0xbf0f); /* movswl */
346 } else if ((ft
& VT_TYPE
) == (VT_SHORT
| VT_UNSIGNED
)) {
347 o(0xb70f); /* movzwl */
348 } else if (is64_type(ft
)) {
349 gen_modrm64(0x8b, r
, fr
, sv
->sym
, fc
);
354 gen_modrm(r
, fr
, sv
->sym
, fc
);
357 if ((ft
& VT_BTYPE
) == VT_LLONG
) {
358 assert(!(fr
& VT_SYM
));
360 o(0xb8 + REG_VALUE(r
)); /* mov $xx, r */
361 gen_addr64(fr
, sv
->sym
, sv
->c
.ull
);
364 #ifndef TCC_TARGET_PE
365 if (sv
->sym
->type
.t
& VT_STATIC
) {
368 o(0x05 + REG_VALUE(r
) * 8); /* lea xx(%rip), r */
369 gen_addrpc32(fr
, sv
->sym
, fc
);
370 #ifndef TCC_TARGET_PE
373 o(0x05 + REG_VALUE(r
) * 8); /* mov xx(%rip), r */
374 gen_gotpcrel(r
, sv
->sym
, fc
);
378 o(0xb8 + REG_VALUE(r
)); /* mov $xx, r */
382 } else if (v
== VT_LOCAL
) {
383 o(0x48 | REX_BASE(r
));
384 o(0x8d); /* lea xxx(%ebp), r */
385 gen_modrm(r
, VT_LOCAL
, sv
->sym
, fc
);
386 } else if (v
== VT_CMP
) {
387 oad(0xb8 + r
, 0); /* mov $0, r */
388 o(0x0f); /* setxx %br */
391 } else if (v
== VT_JMP
|| v
== VT_JMPI
) {
393 oad(0xb8 + r
, t
); /* mov $1, r */
394 o(0x05eb); /* jmp after */
396 oad(0xb8 + r
, t
^ 1); /* mov $0, r */
398 if (r
== TREG_XMM0
) {
399 assert(v
== TREG_ST0
);
400 /* gen_cvt_ftof(VT_DOUBLE); */
401 o(0xf0245cdd); /* fstpl -0x10(%rsp) */
402 /* movsd -0x10(%rsp),%xmm0 */
405 } else if (r
== TREG_ST0
) {
406 assert(v
== TREG_XMM0
);
407 /* gen_cvt_ftof(VT_LDOUBLE); */
408 /* movsd %xmm0,-0x10(%rsp) */
411 o(0xf02444dd); /* fldl -0x10(%rsp) */
413 o(0x48 | REX_BASE(r
) | (REX_BASE(v
) << 2));
415 o(0xc0 + r
+ v
* 8); /* mov v, r */
421 /* store register 'r' in lvalue 'v' */
422 void store(int r
, SValue
*v
)
426 /* store the REX prefix in this variable when PIC is enabled */
431 fr
= v
->r
& VT_VALMASK
;
434 #ifndef TCC_TARGET_PE
435 /* we need to access the variable via got */
436 if (fr
== VT_CONST
&& (v
->r
& VT_SYM
)) {
437 /* mov xx(%rip), %r11 */
439 gen_gotpcrel(TREG_R11
, v
->sym
, v
->c
.ul
);
440 pic
= is64_type(bt
) ? 0x49 : 0x41;
444 /* XXX: incorrect if float reg to reg */
445 if (bt
== VT_FLOAT
) {
448 o(0x7e0f); /* movd */
450 } else if (bt
== VT_DOUBLE
) {
453 o(0xd60f); /* movq */
455 } else if (bt
== VT_LDOUBLE
) {
456 o(0xc0d9); /* fld %st(0) */
464 if (bt
== VT_BYTE
|| bt
== VT_BOOL
)
466 else if (is64_type(bt
))
472 /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
477 if (fr
== VT_CONST
||
480 gen_modrm64(op64
, r
, v
->r
, v
->sym
, fc
);
481 } else if (fr
!= r
) {
482 /* XXX: don't we really come here? */
484 o(0xc0 + fr
+ r
* 8); /* mov r, fr */
487 if (fr
== VT_CONST
||
490 gen_modrm(r
, v
->r
, v
->sym
, fc
);
491 } else if (fr
!= r
) {
492 /* XXX: don't we really come here? */
494 o(0xc0 + fr
+ r
* 8); /* mov r, fr */
/* add 'val' to %rsp, using the imm8 encoding when it fits in a byte */
static void gadd_sp(int val)
{
    if (val == (char)val) {
        o(0xc48348);            /* add $imm8, %rsp */
        g(val);
    } else {
        oad(0xc48148, val);     /* add $xxx, %rsp */
    }
}
509 /* 'is_jmp' is '1' if it is a jump */
510 static void gcall_or_jmp(int is_jmp
)
513 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
)) == VT_CONST
) {
515 if (vtop
->r
& VT_SYM
) {
516 /* relocation case */
517 greloc(cur_text_section
, vtop
->sym
,
518 ind
+ 1, R_X86_64_PC32
);
520 /* put an empty PC32 relocation */
521 put_elf_reloc(symtab_section
, cur_text_section
,
522 ind
+ 1, R_X86_64_PC32
, 0);
524 oad(0xe8 + is_jmp
, vtop
->c
.ul
- 4); /* call/jmp im */
526 /* otherwise, indirect call */
530 o(0xff); /* call/jmp *r */
531 o(0xd0 + REG_VALUE(r
) + (is_jmp
<< 4));
537 static uint8_t arg_regs
[] = {
538 TREG_RCX
, TREG_RDX
, TREG_R8
, TREG_R9
542 static uint8_t arg_regs
[REGN
] = {
543 TREG_RDI
, TREG_RSI
, TREG_RDX
, TREG_RCX
, TREG_R8
, TREG_R9
547 /* Generate function call. The function address is pushed first, then
548 all the parameters in call order. This functions pops all the
549 parameters and the function address. */
550 void gfunc_call(int nb_args
)
552 int size
, align
, r
, args_size
, i
;
556 int sse_reg
, gen_reg
;
558 /* calculate the number of integer/float arguments */
560 for(i
= 0; i
< nb_args
; i
++) {
561 if ((vtop
[-i
].type
.t
& VT_BTYPE
) == VT_STRUCT
) {
562 args_size
+= type_size(&vtop
->type
, &align
);
563 } else if ((vtop
[-i
].type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
565 #ifndef TCC_TARGET_PE
566 } else if (is_sse_float(vtop
[-i
].type
.t
)) {
568 if (nb_sse_args
> 8) args_size
+= 8;
572 if (nb_reg_args
> REGN
) args_size
+= 8;
576 /* for struct arguments, we need to call memcpy and the function
577 call breaks register passing arguments we are preparing.
578 So, we process arguments which will be passed by stack first. */
580 gen_reg
= nb_reg_args
;
581 sse_reg
= nb_sse_args
;
584 save_regs(0); /* save used temporary registers */
587 /* adjust stack to align SSE boundary */
588 if (args_size
&= 8) {
589 o(0x50); /* push $rax */
591 for(i
= 0; i
< nb_args
; i
++) {
592 if ((vtop
->type
.t
& VT_BTYPE
) == VT_STRUCT
) {
593 size
= type_size(&vtop
->type
, &align
);
594 /* align to stack align size */
595 size
= (size
+ 3) & ~3;
596 /* allocate the necessary size on stack */
598 oad(0xec81, size
); /* sub $xxx, %rsp */
599 /* generate structure store */
601 o(0x48 + REX_BASE(r
));
602 o(0x89); /* mov %rsp, r */
605 /* following code breaks vtop[1] */
606 SValue tmp
= vtop
[1];
607 vset(&vtop
->type
, r
| VT_LVAL
, 0);
613 } else if ((vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
616 oad(0xec8148, size
); /* sub $xxx, %rsp */
617 o(0x7cdb); /* fstpt 0(%rsp) */
621 } else if (is_sse_float(vtop
->type
.t
)) {
630 o(0x50); /* push $rax */
631 /* movq %xmm0, (%rsp) */
639 /* XXX: implicit cast ? */
642 o(0x50 + r
); /* push r */
650 /* then, we prepare register passing arguments.
651 Note that we cannot set RDX and RCX in this loop because gv()
652 may break these temporary registers. Let's use R10 and R11
654 gen_reg
= nb_reg_args
;
655 sse_reg
= nb_sse_args
;
656 for(i
= 0; i
< nb_args
; i
++) {
657 if ((vtop
->type
.t
& VT_BTYPE
) == VT_STRUCT
||
658 (vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
659 } else if (is_sse_float(vtop
->type
.t
)) {
664 gv(RC_FLOAT
); /* only one float register */
665 /* movaps %xmm0, %xmmN */
669 o(0xd60f66); /* movq %xmm0, (%rsp) */
670 o(0x2404 + (j
<< 3));
672 o(0x58 + d
); /* pop d */
681 /* XXX: implicit cast ? */
691 o(0xc0 + r
* 8 + d
- 8);
698 gv(RC_FLOAT
); /* only one float register */
699 /* movaps %xmm0, %xmmN */
701 o(0xc0 + (sse_reg
<< 3));
706 /* XXX: implicit cast ? */
711 o(0xc0 + r
* 8 + arg_regs
[j
]);
714 /* j=2: r10, j=3: r11 */
718 /* j=4: r8, j=5: r9 */
719 o(0xc0 + r
* 8 + j
- 4);
728 /* allocate scratch space */
732 save_regs(0); /* save used temporary registers */
734 /* Copy R10 and R11 into RDX and RCX, respectively */
735 if (nb_reg_args
> 2) {
736 o(0xd2894c); /* mov %r10, %rdx */
737 if (nb_reg_args
> 3) {
738 o(0xd9894c); /* mov %r11, %rcx */
742 oad(0xb8, nb_sse_args
< 8 ? nb_sse_args
: 8); /* mov nb_sse_args, %eax */
750 #define FUNC_PROLOG_SIZE 11
752 static void push_arg_reg(int i
) {
754 gen_modrm64(0x89, arg_regs
[i
], VT_LOCAL
, NULL
, loc
);
757 /* generate function prolog of type 't' */
758 void gfunc_prolog(CType
*func_type
)
760 int i
, addr
, align
, size
;
761 int param_index
, param_addr
, reg_param_index
, sse_param_index
;
767 sym
= func_type
->ref
;
770 ind
+= FUNC_PROLOG_SIZE
;
771 func_sub_sp_offset
= ind
;
773 #ifndef TCC_TARGET_PE
774 if (func_type
->ref
->c
== FUNC_ELLIPSIS
) {
775 int seen_reg_num
, seen_sse_num
, seen_stack_size
;
776 seen_reg_num
= seen_sse_num
= 0;
777 /* frame pointer and return address */
778 seen_stack_size
= PTR_SIZE
* 2;
779 /* count the number of seen parameters */
780 sym
= func_type
->ref
;
781 while ((sym
= sym
->next
) != NULL
) {
783 if (is_sse_float(type
->t
)) {
784 if (seen_sse_num
< 8) {
787 seen_stack_size
+= 8;
789 } else if ((type
->t
& VT_BTYPE
) == VT_STRUCT
) {
790 size
= type_size(type
, &align
);
791 size
= (size
+ 3) & ~3;
792 seen_stack_size
+= size
;
793 } else if ((type
->t
& VT_BTYPE
) == VT_LDOUBLE
) {
794 seen_stack_size
+= LDOUBLE_SIZE
;
796 if (seen_reg_num
< REGN
) {
799 seen_stack_size
+= 8;
805 /* movl $0x????????, -0x10(%rbp) */
807 gen_le32(seen_reg_num
* 8);
808 /* movl $0x????????, -0xc(%rbp) */
810 gen_le32(seen_sse_num
* 16 + 48);
811 /* movl $0x????????, -0x8(%rbp) */
813 gen_le32(seen_stack_size
);
815 /* save all register passing arguments */
816 for (i
= 0; i
< 8; i
++) {
818 o(0xd60f66); /* movq */
819 gen_modrm(7 - i
, VT_LOCAL
, NULL
, loc
);
820 /* movq $0, loc+8(%rbp) */
825 for (i
= 0; i
< REGN
; i
++) {
826 push_arg_reg(REGN
-1-i
);
831 sym
= func_type
->ref
;
836 /* if the function returns a structure, then add an
837 implicit pointer parameter */
839 if ((func_vt
.t
& VT_BTYPE
) == VT_STRUCT
) {
840 push_arg_reg(reg_param_index
);
847 /* define parameters */
848 while ((sym
= sym
->next
) != NULL
) {
850 size
= type_size(type
, &align
);
851 size
= (size
+ 3) & ~3;
852 #ifndef TCC_TARGET_PE
853 if (is_sse_float(type
->t
)) {
854 if (sse_param_index
< 8) {
855 /* save arguments passed by register */
857 o(0xd60f66); /* movq */
858 gen_modrm(sse_param_index
, VT_LOCAL
, NULL
, loc
);
867 if ((type
->t
& VT_BTYPE
) == VT_STRUCT
||
868 (type
->t
& VT_BTYPE
) == VT_LDOUBLE
) {
873 if (reg_param_index
< REGN
) {
874 /* save arguments passed by register */
875 gen_modrm64(0x89, arg_regs
[reg_param_index
], VT_LOCAL
, NULL
, addr
);
880 if (reg_param_index
< REGN
) {
881 /* save arguments passed by register */
882 push_arg_reg(reg_param_index
);
891 sym_push(sym
->v
& ~SYM_FIELD
, type
,
892 VT_LOCAL
| VT_LVAL
, param_addr
);
896 if (func_type
->ref
->c
== FUNC_ELLIPSIS
) {
897 for (i
= reg_param_index
; i
< REGN
; ++i
) {
898 gen_modrm64(0x89, arg_regs
[i
], VT_LOCAL
, NULL
, addr
);
905 /* generate function epilog */
906 void gfunc_epilog(void)
911 if (func_ret_sub
== 0) {
916 g(func_ret_sub
>> 8);
918 /* align local size to word & save local variables */
919 v
= (-loc
+ 15) & -16;
921 ind
= func_sub_sp_offset
- FUNC_PROLOG_SIZE
;
922 #if 0 // def TCC_TARGET_PE - don't have __chkstk yet, because assembler does not work
924 Sym
*sym
= external_global_sym(TOK___chkstk
, &func_old_type
, 0);
925 oad(0xb8, v
); /* mov stacksize, %eax */
926 oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
927 greloc(cur_text_section
, sym
, ind
-4, R_X86_64_PC32
);
931 o(0xe5894855); /* push %rbp, mov %rsp, %rbp */
932 o(0xec8148); /* sub rsp, stacksize */
/* generate a jump to a label */
int gjmp(int t)
{
    /* 0xe9: jmp rel32; psym threads the patch site into chain 't' */
    return psym(0xe9, t);
}
944 /* generate a jump to a fixed address */
945 void gjmp_addr(int a
)
953 oad(0xe9, a
- ind
- 5);
957 /* generate a test. set 'inv' to invert test. Stack entry is popped */
958 int gtst(int inv
, int t
)
962 v
= vtop
->r
& VT_VALMASK
;
964 /* fast case : can jump directly since flags are set */
966 t
= psym((vtop
->c
.i
- 16) ^ inv
, t
);
967 } else if (v
== VT_JMP
|| v
== VT_JMPI
) {
968 /* && or || optimization */
969 if ((v
& 1) == inv
) {
970 /* insert vtop->c jump list in t */
973 p
= (int *)(cur_text_section
->data
+ *p
);
981 if (is_float(vtop
->type
.t
) ||
982 (vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
986 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
| VT_SYM
)) == VT_CONST
) {
987 /* constant jmp optimization */
988 if ((vtop
->c
.i
!= 0) != inv
)
995 t
= psym(0x85 ^ inv
, t
);
1002 /* generate an integer binary operation */
1003 void gen_opi(int op
)
1009 case TOK_ADDC1
: /* add with carry generation */
1012 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
| VT_SYM
)) == VT_CONST
&&
1013 !is64_type(vtop
->type
.t
)) {
1017 if (is64_type(vtop
->type
.t
)) {
1018 o(0x48 | REX_BASE(r
));
1023 /* XXX: generate inc and dec for smaller code ? */
1025 o(0xc0 | (opc
<< 3) | REG_VALUE(r
));
1029 oad(0xc0 | (opc
<< 3) | REG_VALUE(r
), c
);
1032 gv2(RC_INT
, RC_INT
);
1036 is64_type(vtop
[0].type
.t
) || (vtop
[0].type
.t
& VT_UNSIGNED
) ||
1037 is64_type(vtop
[-1].type
.t
) || (vtop
[-1].type
.t
& VT_UNSIGNED
)) {
1038 o(0x48 | REX_BASE(r
) | (REX_BASE(fr
) << 2));
1040 o((opc
<< 3) | 0x01);
1041 o(0xc0 + REG_VALUE(r
) + REG_VALUE(fr
) * 8);
1044 if (op
>= TOK_ULT
&& op
<= TOK_GT
) {
1050 case TOK_SUBC1
: /* sub with carry generation */
1053 case TOK_ADDC2
: /* add with carry use */
1056 case TOK_SUBC2
: /* sub with carry use */
1069 gv2(RC_INT
, RC_INT
);
1072 if (is64_type(vtop
[0].type
.t
) || (vtop
[0].type
.t
& VT_UNSIGNED
) ||
1073 is64_type(vtop
[-1].type
.t
) || (vtop
[-1].type
.t
& VT_UNSIGNED
)) {
1074 o(0x48 | REX_BASE(fr
) | (REX_BASE(r
) << 2));
1077 o(0xaf0f); /* imul fr, r */
1078 o(0xc0 + fr
+ r
* 8);
1089 opc
= 0xc0 | (opc
<< 3);
1090 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
| VT_SYM
)) == VT_CONST
) {
1094 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
1095 o(0x48 | REX_BASE(r
));
1102 o(0xc1); /* shl/shr/sar $xxx, r */
1106 /* we generate the shift in ecx */
1107 gv2(RC_INT
, RC_RCX
);
1109 if ((vtop
[-1].type
.t
& VT_BTYPE
) == VT_LLONG
) {
1110 o(0x48 | REX_BASE(r
));
1112 o(0xd3); /* shl/shr/sar %cl, r */
1123 /* first operand must be in eax */
1124 /* XXX: need better constraint for second operand */
1125 gv2(RC_RAX
, RC_RCX
);
1130 if (op
== TOK_UMULL
) {
1131 o(0xf7); /* mul fr */
1133 vtop
->r2
= TREG_RDX
;
1136 if (op
== TOK_UDIV
|| op
== TOK_UMOD
) {
1137 o(0xf7d231); /* xor %edx, %edx, div fr, %eax */
1140 if ((vtop
->type
.t
& VT_BTYPE
) & VT_LLONG
) {
1141 o(0x9948); /* cqto */
1142 o(0x48 + REX_BASE(fr
));
1146 o(0xf7); /* idiv fr, %eax */
1149 if (op
== '%' || op
== TOK_UMOD
)
1162 void gen_opl(int op
)
1167 /* generate a floating point operation 'v = t1 op t2' instruction. The
1168 two operands are guaranted to have the same floating point type */
1169 /* XXX: need to use ST1 too */
1170 void gen_opf(int op
)
1172 int a
, ft
, fc
, swapped
, r
;
1174 (vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
? RC_ST0
: RC_FLOAT
;
1176 /* convert constants to memory references */
1177 if ((vtop
[-1].r
& (VT_VALMASK
| VT_LVAL
)) == VT_CONST
) {
1182 if ((vtop
[0].r
& (VT_VALMASK
| VT_LVAL
)) == VT_CONST
)
1185 /* must put at least one value in the floating point register */
1186 if ((vtop
[-1].r
& VT_LVAL
) &&
1187 (vtop
[0].r
& VT_LVAL
)) {
1193 /* swap the stack if needed so that t1 is the register and t2 is
1194 the memory reference */
1195 if (vtop
[-1].r
& VT_LVAL
) {
1199 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
1200 if (op
>= TOK_ULT
&& op
<= TOK_GT
) {
1201 /* load on stack second operand */
1202 load(TREG_ST0
, vtop
);
1203 save_reg(TREG_RAX
); /* eax is used by FP comparison code */
1204 if (op
== TOK_GE
|| op
== TOK_GT
)
1206 else if (op
== TOK_EQ
|| op
== TOK_NE
)
1209 o(0xc9d9); /* fxch %st(1) */
1210 o(0xe9da); /* fucompp */
1211 o(0xe0df); /* fnstsw %ax */
1213 o(0x45e480); /* and $0x45, %ah */
1214 o(0x40fC80); /* cmp $0x40, %ah */
1215 } else if (op
== TOK_NE
) {
1216 o(0x45e480); /* and $0x45, %ah */
1217 o(0x40f480); /* xor $0x40, %ah */
1219 } else if (op
== TOK_GE
|| op
== TOK_LE
) {
1220 o(0x05c4f6); /* test $0x05, %ah */
1223 o(0x45c4f6); /* test $0x45, %ah */
1230 /* no memory reference possible for long double operations */
1231 load(TREG_ST0
, vtop
);
1255 o(0xde); /* fxxxp %st, %st(1) */
1260 if (op
>= TOK_ULT
&& op
<= TOK_GT
) {
1261 /* if saved lvalue, then we must reload it */
1264 if ((r
& VT_VALMASK
) == VT_LLOCAL
) {
1266 r
= get_reg(RC_INT
);
1268 v1
.r
= VT_LOCAL
| VT_LVAL
;
1274 if (op
== TOK_EQ
|| op
== TOK_NE
) {
1277 if (op
== TOK_LE
|| op
== TOK_LT
)
1279 if (op
== TOK_LE
|| op
== TOK_GE
) {
1280 op
= 0x93; /* setae */
1282 op
= 0x97; /* seta */
1287 o(0x7e0ff3); /* movq */
1288 gen_modrm(1, r
, vtop
->sym
, fc
);
1290 if ((vtop
->type
.t
& VT_BTYPE
) == VT_DOUBLE
) {
1293 o(0x2e0f); /* ucomisd %xmm0, %xmm1 */
1296 if ((vtop
->type
.t
& VT_BTYPE
) == VT_DOUBLE
) {
1299 o(0x2e0f); /* ucomisd */
1300 gen_modrm(0, r
, vtop
->sym
, fc
);
1307 /* no memory reference possible for long double operations */
1308 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
1309 load(TREG_XMM0
, vtop
);
1329 if ((ft
& VT_BTYPE
) == VT_LDOUBLE
) {
1330 o(0xde); /* fxxxp %st, %st(1) */
1333 /* if saved lvalue, then we must reload it */
1335 if ((r
& VT_VALMASK
) == VT_LLOCAL
) {
1337 r
= get_reg(RC_INT
);
1339 v1
.r
= VT_LOCAL
| VT_LVAL
;
1345 /* movq %xmm0,%xmm1 */
1348 load(TREG_XMM0
, vtop
);
1349 /* subsd %xmm1,%xmm0 (f2 0f 5c c1) */
1350 if ((ft
& VT_BTYPE
) == VT_DOUBLE
) {
1359 if ((ft
& VT_BTYPE
) == VT_DOUBLE
) {
1366 gen_modrm(0, r
, vtop
->sym
, fc
);
1374 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
1375 and 'long long' cases. */
1376 void gen_cvt_itof(int t
)
1378 if ((t
& VT_BTYPE
) == VT_LDOUBLE
) {
1381 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
1382 /* signed long long to float/double/long double (unsigned case
1383 is handled generically) */
1384 o(0x50 + (vtop
->r
& VT_VALMASK
)); /* push r */
1385 o(0x242cdf); /* fildll (%rsp) */
1386 o(0x08c48348); /* add $8, %rsp */
1387 } else if ((vtop
->type
.t
& (VT_BTYPE
| VT_UNSIGNED
)) ==
1388 (VT_INT
| VT_UNSIGNED
)) {
1389 /* unsigned int to float/double/long double */
1390 o(0x6a); /* push $0 */
1392 o(0x50 + (vtop
->r
& VT_VALMASK
)); /* push r */
1393 o(0x242cdf); /* fildll (%rsp) */
1394 o(0x10c48348); /* add $16, %rsp */
1396 /* int to float/double/long double */
1397 o(0x50 + (vtop
->r
& VT_VALMASK
)); /* push r */
1398 o(0x2404db); /* fildl (%rsp) */
1399 o(0x08c48348); /* add $8, %rsp */
1403 save_reg(TREG_XMM0
);
1405 o(0xf2 + ((t
& VT_BTYPE
) == VT_FLOAT
));
1406 if ((vtop
->type
.t
& (VT_BTYPE
| VT_UNSIGNED
)) ==
1407 (VT_INT
| VT_UNSIGNED
) ||
1408 (vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
1412 o(0xc0 + (vtop
->r
& VT_VALMASK
)); /* cvtsi2sd */
1413 vtop
->r
= TREG_XMM0
;
1417 /* convert from one floating point type to another */
1418 void gen_cvt_ftof(int t
)
1426 if (bt
== VT_FLOAT
) {
1428 if (tbt
== VT_DOUBLE
) {
1429 o(0xc0140f); /* unpcklps */
1430 o(0xc05a0f); /* cvtps2pd */
1431 } else if (tbt
== VT_LDOUBLE
) {
1432 /* movss %xmm0,-0x10(%rsp) */
1435 o(0xf02444d9); /* flds -0x10(%rsp) */
1438 } else if (bt
== VT_DOUBLE
) {
1440 if (tbt
== VT_FLOAT
) {
1441 o(0xc0140f66); /* unpcklpd */
1442 o(0xc05a0f66); /* cvtpd2ps */
1443 } else if (tbt
== VT_LDOUBLE
) {
1444 /* movsd %xmm0,-0x10(%rsp) */
1447 o(0xf02444dd); /* fldl -0x10(%rsp) */
1452 if (tbt
== VT_DOUBLE
) {
1453 o(0xf0245cdd); /* fstpl -0x10(%rsp) */
1454 /* movsd -0x10(%rsp),%xmm0 */
1457 vtop
->r
= TREG_XMM0
;
1458 } else if (tbt
== VT_FLOAT
) {
1459 o(0xf0245cd9); /* fstps -0x10(%rsp) */
1460 /* movss -0x10(%rsp),%xmm0 */
1463 vtop
->r
= TREG_XMM0
;
1468 /* convert fp to int 't' type */
1469 void gen_cvt_ftoi(int t
)
1471 int ft
, bt
, size
, r
;
1474 if (bt
== VT_LDOUBLE
) {
1475 gen_cvt_ftof(VT_DOUBLE
);
1485 r
= get_reg(RC_INT
);
1486 if (bt
== VT_FLOAT
) {
1488 } else if (bt
== VT_DOUBLE
) {
1494 o(0x48 + REX_BASE(r
));
1496 o(0x2c0f); /* cvttss2si or cvttsd2si */
1497 o(0xc0 + (REG_VALUE(r
) << 3));
1501 /* computed goto support */
1508 /* end of x86-64 code generator */
1509 /*************************************************************/