/*
 *  x86-64 code generator for TCC
 *
 *  Copyright (c) 2008 Shinichiro Hamaji
 *
 *  Based on i386-gen.c by Fabrice Bellard
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
25 /* number of available registers */
28 /* a register can belong to several classes. The classes must be
29 sorted from more general to more precise (see gv2() code which does
30 assumptions on it). */
31 #define RC_INT 0x0001 /* generic integer register */
32 #define RC_FLOAT 0x0002 /* generic float register */
36 #define RC_XMM0 0x0020
37 #define RC_ST0 0x0040 /* only for long double */
38 #define RC_IRET RC_RAX /* function return: integer register */
39 #define RC_LRET RC_RDX /* function return: second integer register */
40 #define RC_FRET RC_XMM0 /* function return: float register */
42 /* pretty names for the registers */
60 #define REX_BASE(reg) (((reg) >> 3) & 1)
61 #define REG_VALUE(reg) ((reg) & 7)
/* Register class table, indexed by register number: each entry ORs the
   generic class (RC_INT / RC_FLOAT) with the register's own single-reg
   class so gv() can request either a general class or a specific
   register.
   NOTE(review): entries after xmm0 (e.g. st0) and the closing "};" are
   on lines missing from this chunk. */
63 int reg_classes
[NB_REGS
] = {
64 /* eax */ RC_INT
| RC_RAX
,
65 /* ecx */ RC_INT
| RC_RCX
,
66 /* edx */ RC_INT
| RC_RDX
,
67 /* xmm0 */ RC_FLOAT
| RC_XMM0
,
71 /* return registers for function */
72 #define REG_IRET TREG_RAX /* single word int return register */
73 #define REG_LRET TREG_RDX /* second word return register (for long long) */
74 #define REG_FRET TREG_XMM0 /* float return register */
76 /* defined if function parameters must be evaluated in reverse order */
77 #define INVERT_FUNC_PARAMS
79 /* pointer size, in bytes */
82 /* long double size and alignment, in bytes */
83 #define LDOUBLE_SIZE 16
84 #define LDOUBLE_ALIGN 8
85 /* maximum alignment (for aligned attribute support) */
88 /******************************************************/
91 #define EM_TCC_TARGET EM_X86_64
93 /* relocation type for 32 bit data relocation */
94 #define R_DATA_32 R_X86_64_64
95 #define R_JMP_SLOT R_X86_64_JUMP_SLOT
96 #define R_COPY R_X86_64_COPY
98 #define ELF_START_ADDR 0x08048000
99 #define ELF_PAGE_SIZE 0x1000
101 /******************************************************/
/* code offset recorded by gfunc_prolog just past the reserved prolog
   bytes; gfunc_epilog rewinds 'ind' to it (minus FUNC_PROLOG_SIZE) to
   patch the stack-frame setup */
103 static unsigned long func_sub_sp_offset
;
/* byte count popped by the function's return sequence; 0 apparently
   means a plain ret (see the func_ret_sub uses in gfunc_epilog) */
104 static int func_ret_sub
;
/* Byte-output primitive of the code generator: appends one byte 'c'
   to cur_text_section, growing the buffer first when needed.
   NOTE(review): the signature line is missing from this chunk -- this
   is presumably the body of g(int c); confirm against the full file. */
106 /* XXX: make it faster ? */
/* grow the section buffer when ind1 (computed on a missing line,
   presumably ind + 1) would run past the allocated size */
111 if (ind1
> cur_text_section
->data_allocated
)
112 section_realloc(cur_text_section
, ind1
);
/* store the byte at the current output index */
113 cur_text_section
->data
[ind
] = c
;
117 void o(unsigned int c
)
133 void gen_le64(int64_t c
)
145 /* output a symbol and patch all calls to it */
/* Resolve label 't' to address 'a': walk the chain of 32-bit forward
   references threaded through the code buffer and patch each one.
   NOTE(review): the loop header, the patch store and the declarations
   of 'ptr'/'n' are on lines missing from this chunk. */
146 void gsym_addr(int t
, int a
)
/* read the next link of the forward-reference chain stored at
   offset t in the code buffer */
150 ptr
= (int *)(cur_text_section
->data
+ t
);
151 n
= *ptr
; /* next value */
162 /* psym is used to put an instruction with a data field which is a
163 reference to a symbol. It is in fact the same as oad ! */
166 static int is64_type(int t
)
168 return ((t
& VT_BTYPE
) == VT_PTR
||
169 (t
& VT_BTYPE
) == VT_FUNC
||
170 (t
& VT_BTYPE
) == VT_LLONG
);
/* Return nonzero when type 't' is a floating type passed in SSE
   registers (VT_FLOAT or VT_DOUBLE, but not long double).
   NOTE(review): 'bt' is assigned on a line missing from this chunk --
   presumably bt = t & VT_BTYPE; confirm against the full source. */
173 static int is_sse_float(int t
) {
176 return bt
== VT_DOUBLE
|| bt
== VT_FLOAT
;
179 /* instruction + 4 bytes data. Return the address of the data */
180 static int oad(int c
, int s
)
/* NOTE(review): the opening brace, the emission of opcode 'c' and the
   computation of ind1 (presumably ind + 4) are on lines missing from
   this chunk; only the buffer-growth check and the 32-bit store are
   visible below. */
/* grow the code buffer if the 4 data bytes would overflow it */
186 if (ind1
> cur_text_section
->data_allocated
)
187 section_realloc(cur_text_section
, ind1
);
/* store the 32-bit operand 's' at the current output position */
188 *(int *)(cur_text_section
->data
+ ind
) = s
;
195 /* output constant with relocation if 'r & VT_SYM' is true */
/* Emit a 64-bit absolute address: for a symbolic value, record an
   R_X86_64_64 relocation at the current output position.
   NOTE(review): the '(r & VT_SYM)' guard and the emission of the
   64-bit constant itself are on lines missing from this chunk. */
196 static void gen_addr64(int r
, Sym
*sym
, int64_t c
)
199 greloc(cur_text_section
, sym
, ind
, R_X86_64_64
);
204 /* output constant with relocation if 'r & VT_SYM' is true */
/* Emit a 32-bit PC-relative value: for a symbolic value, record an
   R_X86_64_PC32 relocation at the current output position.
   NOTE(review): the '(r & VT_SYM)' guard and the emission of the
   32-bit displacement itself are on lines missing from this chunk. */
205 static void gen_addrpc32(int r
, Sym
*sym
, int c
)
208 greloc(cur_text_section
, sym
, ind
, R_X86_64_PC32
);
212 /* output got address with relocation */
/* Load a symbol's address through its GOT entry (PIC code): on ELF
   targets an R_X86_64_GOTPCREL relocation is recorded; under
   TCC_TARGET_PE a plain PC32 relocation is used instead.
   NOTE(review): several interior lines (local declarations, opcode
   bytes, #else/#endif, the close of the debug printf and the 'if (c)'
   around the displacement tail) are missing from this chunk. */
213 static void gen_gotpcrel(int r
, Sym
*sym
, int c
)
215 #ifndef TCC_TARGET_PE
/* record the GOT-relative relocation at the current position */
218 greloc(cur_text_section
, sym
, ind
, R_X86_64_GOTPCREL
);
/* peek at the relocation record just appended to the reloc section */
219 sr
= cur_text_section
->reloc
;
220 rel
= (ElfW(Rela
) *)(sr
->data
+ sr
->data_offset
- sizeof(ElfW(Rela
)));
/* debug trace of the bytes just emitted around 'ind' */
223 printf("picpic: %s %x %x | %02x %02x %02x\n", get_tok_str(sym
->v
, NULL
), c
, r
,
224 cur_text_section
->data
[ind
-3],
225 cur_text_section
->data
[ind
-2],
226 cur_text_section
->data
[ind
-1]
/* PE path: no GOT, use a direct PC-relative relocation */
228 greloc(cur_text_section
, sym
, ind
, R_X86_64_PC32
);
/* apply any extra displacement 'c' with an add-immediate to the
   loaded register */
233 /* we use add c, %xxx for displacement */
234 o(0x48 + REX_BASE(r
));
236 o(0xc0 + REG_VALUE(r
));
/* Emit the ModRM byte (plus displacement) addressing 'r' with reg
   field 'op_reg'; 'is_got' selects GOT-indirect addressing for
   symbolic constants.
   NOTE(review): the opening brace, the is_got/else split, the short
   8-bit-displacement local case and the surrounding if/else opcode
   lines are missing from this chunk. */
241 static void gen_modrm_impl(int op_reg
, int r
, Sym
*sym
, int c
, int is_got
)
/* position the reg field in bits 5..3 of the ModRM byte */
243 op_reg
= REG_VALUE(op_reg
) << 3;
244 if ((r
& VT_VALMASK
) == VT_CONST
) {
245 /* constant memory reference */
248 gen_gotpcrel(r
, sym
, c
);
250 gen_addrpc32(r
, sym
, c
);
252 } else if ((r
& VT_VALMASK
) == VT_LOCAL
) {
253 /* currently, we use only ebp as base */
255 /* short reference */
/* mod=10: 32-bit displacement off the frame pointer */
259 oad(0x85 | op_reg
, c
);
261 } else if ((r
& VT_VALMASK
) >= TREG_MEM
) {
/* mod=10 with 32-bit displacement off a general register */
263 g(0x80 | op_reg
| REG_VALUE(r
));
/* mod=00: register-indirect, no displacement */
266 g(0x00 | op_reg
| REG_VALUE(r
));
269 g(0x00 | op_reg
| (r
& VT_VALMASK
));
273 /* generate a modrm reference. 'op_reg' contains the additional 3
/* Non-GOT wrapper around gen_modrm_impl (is_got = 0).
   NOTE(review): the tail of the comment above (original line 274) and
   the function braces are on lines missing from this chunk. */
275 static void gen_modrm(int op_reg
, int r
, Sym
*sym
, int c
)
277 gen_modrm_impl(op_reg
, r
, sym
, c
, 0);
280 /* generate a modrm reference. 'op_reg' contains the additional 3
/* 64-bit variant: emits a REX.W prefix (REX.R when op_reg is an
   extended register, REX.B when the base register is), then the
   opcode and ModRM.
   NOTE(review): the function braces, the emission of rex/opcode and
   any NULL check around the 'sym->type.t' access are on lines missing
   from this chunk. */
282 static void gen_modrm64(int opcode
, int op_reg
, int r
, Sym
*sym
, int c
)
/* REX.W plus REX.R when op_reg is an extended register */
285 int rex
= 0x48 | (REX_BASE(op_reg
) << 2);
286 if ((r
& VT_VALMASK
) != VT_CONST
&&
287 (r
& VT_VALMASK
) != VT_LOCAL
) {
/* base register may also need REX.B */
288 rex
|= REX_BASE(VT_VALMASK
& r
);
/* use GOT-indirect addressing for non-static symbols accessed
   through a memory register (PIC) */
292 is_got
= (op_reg
& TREG_MEM
) && !(sym
->type
.t
& VT_STATIC
);
293 gen_modrm_impl(op_reg
, r
, sym
, c
, is_got
);
297 /* load 'r' from value 'sv' */
298 void load(int r
, SValue
*sv
)
300 int v
, t
, ft
, fc
, fr
;
307 #ifndef TCC_TARGET_PE
308 /* we use indirect access via got */
309 if ((fr
& VT_VALMASK
) == VT_CONST
&& (fr
& VT_SYM
) &&
310 (fr
& VT_LVAL
) && !(sv
->sym
->type
.t
& VT_STATIC
)) {
311 /* use the result register as a temporary register */
312 int tr
= r
| TREG_MEM
;
314 /* we cannot use float registers as a temporary register */
315 tr
= get_reg(RC_INT
) | TREG_MEM
;
317 gen_modrm64(0x8b, tr
, fr
, sv
->sym
, 0);
319 /* load from the temporal register */
326 if (v
== VT_LLOCAL
) {
328 v1
.r
= VT_LOCAL
| VT_LVAL
;
333 if ((ft
& VT_BTYPE
) == VT_FLOAT
) {
334 o(0x6e0f66); /* movd */
336 } else if ((ft
& VT_BTYPE
) == VT_DOUBLE
) {
337 o(0x7e0ff3); /* movq */
339 } else if ((ft
& VT_BTYPE
) == VT_LDOUBLE
) {
342 } else if ((ft
& VT_TYPE
) == VT_BYTE
) {
343 o(0xbe0f); /* movsbl */
344 } else if ((ft
& VT_TYPE
) == (VT_BYTE
| VT_UNSIGNED
)) {
345 o(0xb60f); /* movzbl */
346 } else if ((ft
& VT_TYPE
) == VT_SHORT
) {
347 o(0xbf0f); /* movswl */
348 } else if ((ft
& VT_TYPE
) == (VT_SHORT
| VT_UNSIGNED
)) {
349 o(0xb70f); /* movzwl */
350 } else if (is64_type(ft
)) {
351 gen_modrm64(0x8b, r
, fr
, sv
->sym
, fc
);
356 gen_modrm(r
, fr
, sv
->sym
, fc
);
362 o(0x05 + REG_VALUE(r
) * 8); /* lea xx(%rip), r */
363 gen_addrpc32(fr
, sv
->sym
, fc
);
365 if (sv
->sym
->type
.t
& VT_STATIC
) {
367 o(0x05 + REG_VALUE(r
) * 8); /* lea xx(%rip), r */
368 gen_addrpc32(fr
, sv
->sym
, fc
);
371 o(0x05 + REG_VALUE(r
) * 8); /* mov xx(%rip), r */
372 gen_gotpcrel(r
, sv
->sym
, fc
);
375 } else if (is64_type(ft
)) {
377 o(0xb8 + REG_VALUE(r
)); /* mov $xx, r */
380 o(0xb8 + REG_VALUE(r
)); /* mov $xx, r */
383 } else if (v
== VT_LOCAL
) {
384 o(0x48 | REX_BASE(r
));
385 o(0x8d); /* lea xxx(%ebp), r */
386 gen_modrm(r
, VT_LOCAL
, sv
->sym
, fc
);
387 } else if (v
== VT_CMP
) {
388 oad(0xb8 + r
, 0); /* mov $0, r */
389 o(0x0f); /* setxx %br */
392 } else if (v
== VT_JMP
|| v
== VT_JMPI
) {
394 oad(0xb8 + r
, t
); /* mov $1, r */
395 o(0x05eb); /* jmp after */
397 oad(0xb8 + r
, t
^ 1); /* mov $0, r */
399 if (r
== TREG_XMM0
) {
400 assert(v
== TREG_ST0
);
401 /* gen_cvt_ftof(VT_DOUBLE); */
402 o(0xf0245cdd); /* fstpl -0x10(%rsp) */
403 /* movsd -0x10(%rsp),%xmm0 */
406 } else if (r
== TREG_ST0
) {
407 assert(v
== TREG_XMM0
);
408 /* gen_cvt_ftof(VT_LDOUBLE); */
409 /* movsd %xmm0,-0x10(%rsp) */
412 o(0xf02444dd); /* fldl -0x10(%rsp) */
414 o(0x48 | REX_BASE(r
) | (REX_BASE(v
) << 2));
416 o(0xc0 + r
+ v
* 8); /* mov v, r */
422 /* store register 'r' in lvalue 'v' */
423 void store(int r
, SValue
*v
)
427 /* store the REX prefix in this variable when PIC is enabled */
432 fr
= v
->r
& VT_VALMASK
;
435 #ifndef TCC_TARGET_PE
436 /* we need to access the variable via got */
437 if (fr
== VT_CONST
&& (v
->r
& VT_SYM
)) {
438 /* mov xx(%rip), %r11 */
440 gen_gotpcrel(TREG_R11
, v
->sym
, v
->c
.ul
);
441 pic
= is64_type(bt
) ? 0x49 : 0x41;
445 /* XXX: incorrect if float reg to reg */
446 if (bt
== VT_FLOAT
) {
449 o(0x7e0f); /* movd */
451 } else if (bt
== VT_DOUBLE
) {
454 o(0xd60f); /* movq */
456 } else if (bt
== VT_LDOUBLE
) {
457 o(0xc0d9); /* fld %st(0) */
465 if (bt
== VT_BYTE
|| bt
== VT_BOOL
)
467 else if (is64_type(bt
))
473 /* xxx r, (%r11) where xxx is mov, movq, fld, or etc */
478 if (fr
== VT_CONST
||
481 gen_modrm64(op64
, r
, v
->r
, v
->sym
, fc
);
482 } else if (fr
!= r
) {
483 /* XXX: is this case ever actually reached? */
485 o(0xc0 + fr
+ r
* 8); /* mov r, fr */
488 if (fr
== VT_CONST
||
491 gen_modrm(r
, v
->r
, v
->sym
, fc
);
492 } else if (fr
!= r
) {
493 /* XXX: is this case ever actually reached? */
495 o(0xc0 + fr
+ r
* 8); /* mov r, fr */
/* Adjust %rsp by 'val' bytes (used after calls to pop arguments).
   NOTE(review): the short-immediate branch body (add with an 8-bit
   immediate) and the closing braces are on lines missing from this
   chunk. */
500 static void gadd_sp(int val
)
/* value fits in a signed byte: the short encoding (missing here) */
502 if (val
== (char)val
) {
/* otherwise use the full 32-bit immediate form */
506 oad(0xc48148, val
); /* add $xxx, %rsp */
510 /* 'is_jmp' is '1' if it is a jump */
511 static void gcall_or_jmp(int is_jmp
)
514 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
)) == VT_CONST
) {
516 if (vtop
->r
& VT_SYM
) {
517 /* relocation case */
518 greloc(cur_text_section
, vtop
->sym
,
519 ind
+ 1, R_X86_64_PC32
);
521 /* put an empty PC32 relocation */
522 put_elf_reloc(symtab_section
, cur_text_section
,
523 ind
+ 1, R_X86_64_PC32
, 0);
525 oad(0xe8 + is_jmp
, vtop
->c
.ul
- 4); /* call/jmp im */
527 /* otherwise, indirect call */
531 o(0xff); /* call/jmp *r */
532 o(0xd0 + REG_VALUE(r
) + (is_jmp
<< 4));
/* Argument-register tables: registers used to pass integer/pointer
   function arguments, in call order.
   NOTE(review): the surrounding #ifdef TCC_TARGET_PE / #else / #endif
   lines and the closing "};" of each array are missing from this
   chunk; the first table matches the Win64 convention, the second the
   System V AMD64 convention -- confirm against the full file. */
538 static uint8_t arg_regs
[] = {
539 TREG_RCX
, TREG_RDX
, TREG_R8
, TREG_R9
543 static uint8_t arg_regs
[REGN
] = {
544 TREG_RDI
, TREG_RSI
, TREG_RDX
, TREG_RCX
, TREG_R8
, TREG_R9
548 /* Generate function call. The function address is pushed first, then
549 all the parameters in call order. This functions pops all the
550 parameters and the function address. */
551 void gfunc_call(int nb_args
)
553 int size
, align
, r
, args_size
, i
;
557 int sse_reg
, gen_reg
;
559 /* calculate the number of integer/float arguments */
561 for(i
= 0; i
< nb_args
; i
++) {
562 if ((vtop
[-i
].type
.t
& VT_BTYPE
) == VT_STRUCT
) {
563 args_size
+= type_size(&vtop
->type
, &align
);
564 } else if ((vtop
[-i
].type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
566 #ifndef TCC_TARGET_PE
567 } else if (is_sse_float(vtop
[-i
].type
.t
)) {
569 if (nb_sse_args
> 8) args_size
+= 8;
573 if (nb_reg_args
> REGN
) args_size
+= 8;
577 /* for struct arguments, we need to call memcpy and the function
578 call breaks register passing arguments we are preparing.
579 So, we process arguments which will be passed by stack first. */
581 gen_reg
= nb_reg_args
;
582 sse_reg
= nb_sse_args
;
585 save_regs(0); /* save used temporary registers */
588 /* adjust stack to align SSE boundary */
589 if (args_size
&= 8) {
590 o(0x50); /* push $rax */
592 for(i
= 0; i
< nb_args
; i
++) {
593 if ((vtop
->type
.t
& VT_BTYPE
) == VT_STRUCT
) {
594 size
= type_size(&vtop
->type
, &align
);
595 /* align to stack align size */
596 size
= (size
+ 3) & ~3;
597 /* allocate the necessary size on stack */
599 oad(0xec81, size
); /* sub $xxx, %rsp */
600 /* generate structure store */
602 o(0x48 + REX_BASE(r
));
603 o(0x89); /* mov %rsp, r */
606 /* following code breaks vtop[1] */
607 SValue tmp
= vtop
[1];
608 vset(&vtop
->type
, r
| VT_LVAL
, 0);
614 } else if ((vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
617 oad(0xec8148, size
); /* sub $xxx, %rsp */
618 o(0x7cdb); /* fstpt 0(%rsp) */
622 } else if (is_sse_float(vtop
->type
.t
)) {
631 o(0x50); /* push $rax */
632 /* movq %xmm0, (%rsp) */
640 /* XXX: implicit cast ? */
643 o(0x50 + r
); /* push r */
651 /* then, we prepare register passing arguments.
652 Note that we cannot set RDX and RCX in this loop because gv()
653 may break these temporary registers. Let's use R10 and R11
655 gen_reg
= nb_reg_args
;
656 sse_reg
= nb_sse_args
;
657 for(i
= 0; i
< nb_args
; i
++) {
658 if ((vtop
->type
.t
& VT_BTYPE
) == VT_STRUCT
||
659 (vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
660 } else if (is_sse_float(vtop
->type
.t
)) {
665 gv(RC_FLOAT
); /* only one float register */
666 /* movaps %xmm0, %xmmN */
670 o(0xd60f66); /* movq %xmm0, (%rsp) */
671 o(0x2404 + (j
<< 3));
673 o(0x58 + d
); /* pop d */
682 /* XXX: implicit cast ? */
692 o(0xc0 + r
* 8 + d
- 8);
699 gv(RC_FLOAT
); /* only one float register */
700 /* movaps %xmm0, %xmmN */
702 o(0xc0 + (sse_reg
<< 3));
707 /* XXX: implicit cast ? */
712 o(0xc0 + r
* 8 + arg_regs
[j
]);
715 /* j=2: r10, j=3: r11 */
719 /* j=4: r8, j=5: r9 */
720 o(0xc0 + r
* 8 + j
- 4);
729 /* allocate scratch space */
733 save_regs(0); /* save used temporary registers */
735 /* Copy R10 and R11 into RDX and RCX, respectively */
736 if (nb_reg_args
> 2) {
737 o(0xd2894c); /* mov %r10, %rdx */
738 if (nb_reg_args
> 3) {
739 o(0xd9894c); /* mov %r11, %rcx */
743 oad(0xb8, nb_sse_args
< 8 ? nb_sse_args
: 8); /* mov nb_sse_args, %eax */
751 #define FUNC_PROLOG_SIZE 11
/* Spill integer argument register number 'i' into the current local
   stack slot at loc(%rbp).
   NOTE(review): the adjustment of 'loc' (presumably loc -= 8) on the
   preceding original line is missing from this chunk. */
753 static void push_arg_reg(int i
) {
/* mov %arg_regs[i], loc(%rbp) */
755 gen_modrm64(0x89, arg_regs
[i
], VT_LOCAL
, NULL
, loc
);
758 /* generate function prolog of type 't' */
759 void gfunc_prolog(CType
*func_type
)
761 int i
, addr
, align
, size
;
762 int param_index
, param_addr
, reg_param_index
, sse_param_index
;
768 sym
= func_type
->ref
;
771 ind
+= FUNC_PROLOG_SIZE
;
772 func_sub_sp_offset
= ind
;
774 #ifndef TCC_TARGET_PE
775 if (func_type
->ref
->c
== FUNC_ELLIPSIS
) {
776 int seen_reg_num
, seen_sse_num
, seen_stack_size
;
777 seen_reg_num
= seen_sse_num
= 0;
778 /* frame pointer and return address */
779 seen_stack_size
= PTR_SIZE
* 2;
780 /* count the number of seen parameters */
781 sym
= func_type
->ref
;
782 while ((sym
= sym
->next
) != NULL
) {
784 if (is_sse_float(type
->t
)) {
785 if (seen_sse_num
< 8) {
788 seen_stack_size
+= 8;
790 } else if ((type
->t
& VT_BTYPE
) == VT_STRUCT
) {
791 size
= type_size(type
, &align
);
792 size
= (size
+ 3) & ~3;
793 seen_stack_size
+= size
;
794 } else if ((type
->t
& VT_BTYPE
) == VT_LDOUBLE
) {
795 seen_stack_size
+= LDOUBLE_SIZE
;
797 if (seen_reg_num
< REGN
) {
800 seen_stack_size
+= 8;
806 /* movl $0x????????, -0x10(%rbp) */
808 gen_le32(seen_reg_num
* 8);
809 /* movl $0x????????, -0xc(%rbp) */
811 gen_le32(seen_sse_num
* 16 + 48);
812 /* movl $0x????????, -0x8(%rbp) */
814 gen_le32(seen_stack_size
);
816 /* save all register passing arguments */
817 for (i
= 0; i
< 8; i
++) {
819 o(0xd60f66); /* movq */
820 gen_modrm(7 - i
, VT_LOCAL
, NULL
, loc
);
821 /* movq $0, loc+8(%rbp) */
826 for (i
= 0; i
< REGN
; i
++) {
827 push_arg_reg(REGN
-1-i
);
832 sym
= func_type
->ref
;
837 /* if the function returns a structure, then add an
838 implicit pointer parameter */
840 if ((func_vt
.t
& VT_BTYPE
) == VT_STRUCT
) {
841 push_arg_reg(reg_param_index
);
848 /* define parameters */
849 while ((sym
= sym
->next
) != NULL
) {
851 size
= type_size(type
, &align
);
852 size
= (size
+ 3) & ~3;
853 #ifndef TCC_TARGET_PE
854 if (is_sse_float(type
->t
)) {
855 if (sse_param_index
< 8) {
856 /* save arguments passed by register */
858 o(0xd60f66); /* movq */
859 gen_modrm(sse_param_index
, VT_LOCAL
, NULL
, loc
);
868 if ((type
->t
& VT_BTYPE
) == VT_STRUCT
||
869 (type
->t
& VT_BTYPE
) == VT_LDOUBLE
) {
874 if (reg_param_index
< REGN
) {
875 /* save arguments passed by register */
876 gen_modrm64(0x89, arg_regs
[reg_param_index
], VT_LOCAL
, NULL
, addr
);
881 if (reg_param_index
< REGN
) {
882 /* save arguments passed by register */
883 push_arg_reg(reg_param_index
);
892 sym_push(sym
->v
& ~SYM_FIELD
, type
,
893 VT_LOCAL
| VT_LVAL
, param_addr
);
897 if (func_type
->ref
->c
== FUNC_ELLIPSIS
) {
898 for (i
= reg_param_index
; i
< REGN
; ++i
) {
899 gen_modrm64(0x89, arg_regs
[i
], VT_LOCAL
, NULL
, addr
);
906 /* generate function epilog */
907 void gfunc_epilog(void)
912 if (func_ret_sub
== 0) {
917 g(func_ret_sub
>> 8);
919 /* align local size to word & save local variables */
920 v
= (-loc
+ 15) & -16;
922 ind
= func_sub_sp_offset
- FUNC_PROLOG_SIZE
;
925 Sym
*sym
= external_global_sym(TOK___chkstk
, &func_old_type
, 0);
926 oad(0xb8, v
); /* mov stacksize, %eax */
927 oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */
928 greloc(cur_text_section
, sym
, ind
-4, R_X86_64_PC32
);
929 o(0x90); /* fill for FUNC_PROLOG_SIZE = 11 bytes */
933 o(0xe5894855); /* push %rbp, mov %rsp, %rbp */
934 o(0xec8148); /* sub rsp, stacksize */
940 /* generate a jump to a label */
/* NOTE(review): the signature (presumably int gjmp(int t)) and braces
   are on lines missing from this chunk; emits jmp rel32 via psym and
   returns the forward-reference chain address. */
943 return psym(0xe9, t
);
946 /* generate a jump to a fixed address */
/* NOTE(review): the short-jump (rel8) fast path of the original is on
   lines missing from this chunk; only the jmp rel32 form is visible
   (opcode 0xe9, displacement relative to the end of the 5-byte
   instruction, hence the -5). */
947 void gjmp_addr(int a
)
955 oad(0xe9, a
- ind
- 5);
959 /* generate a test. set 'inv' to invert test. Stack entry is popped */
960 int gtst(int inv
, int t
)
964 v
= vtop
->r
& VT_VALMASK
;
966 /* fast case : can jump directly since flags are set */
968 t
= psym((vtop
->c
.i
- 16) ^ inv
, t
);
969 } else if (v
== VT_JMP
|| v
== VT_JMPI
) {
970 /* && or || optimization */
971 if ((v
& 1) == inv
) {
972 /* insert vtop->c jump list in t */
975 p
= (int *)(cur_text_section
->data
+ *p
);
983 if (is_float(vtop
->type
.t
) ||
984 (vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
988 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
| VT_SYM
)) == VT_CONST
) {
989 /* constant jmp optimization */
990 if ((vtop
->c
.i
!= 0) != inv
)
997 t
= psym(0x85 ^ inv
, t
);
1004 /* generate an integer binary operation */
1005 void gen_opi(int op
)
1011 case TOK_ADDC1
: /* add with carry generation */
1014 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
| VT_SYM
)) == VT_CONST
&&
1015 !is64_type(vtop
->type
.t
)) {
1019 if (is64_type(vtop
->type
.t
)) {
1020 o(0x48 | REX_BASE(r
));
1025 /* XXX: generate inc and dec for smaller code ? */
1027 o(0xc0 | (opc
<< 3) | REG_VALUE(r
));
1031 oad(0xc0 | (opc
<< 3) | REG_VALUE(r
), c
);
1034 gv2(RC_INT
, RC_INT
);
1038 is64_type(vtop
[0].type
.t
) || (vtop
[0].type
.t
& VT_UNSIGNED
) ||
1039 is64_type(vtop
[-1].type
.t
) || (vtop
[-1].type
.t
& VT_UNSIGNED
)) {
1040 o(0x48 | REX_BASE(r
) | (REX_BASE(fr
) << 2));
1042 o((opc
<< 3) | 0x01);
1043 o(0xc0 + REG_VALUE(r
) + REG_VALUE(fr
) * 8);
1046 if (op
>= TOK_ULT
&& op
<= TOK_GT
) {
1052 case TOK_SUBC1
: /* sub with carry generation */
1055 case TOK_ADDC2
: /* add with carry use */
1058 case TOK_SUBC2
: /* sub with carry use */
1071 gv2(RC_INT
, RC_INT
);
1074 if (is64_type(vtop
[0].type
.t
) || (vtop
[0].type
.t
& VT_UNSIGNED
) ||
1075 is64_type(vtop
[-1].type
.t
) || (vtop
[-1].type
.t
& VT_UNSIGNED
)) {
1076 o(0x48 | REX_BASE(fr
) | (REX_BASE(r
) << 2));
1079 o(0xaf0f); /* imul fr, r */
1080 o(0xc0 + fr
+ r
* 8);
1091 opc
= 0xc0 | (opc
<< 3);
1092 if ((vtop
->r
& (VT_VALMASK
| VT_LVAL
| VT_SYM
)) == VT_CONST
) {
1096 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
1097 o(0x48 | REX_BASE(r
));
1104 o(0xc1); /* shl/shr/sar $xxx, r */
1108 /* we generate the shift in ecx */
1109 gv2(RC_INT
, RC_RCX
);
1111 if ((vtop
[-1].type
.t
& VT_BTYPE
) == VT_LLONG
) {
1112 o(0x48 | REX_BASE(r
));
1114 o(0xd3); /* shl/shr/sar %cl, r */
1125 /* first operand must be in eax */
1126 /* XXX: need better constraint for second operand */
1127 gv2(RC_RAX
, RC_RCX
);
1132 if (op
== TOK_UMULL
) {
1133 o(0xf7); /* mul fr */
1135 vtop
->r2
= TREG_RDX
;
1138 if (op
== TOK_UDIV
|| op
== TOK_UMOD
) {
1139 o(0xf7d231); /* xor %edx, %edx, div fr, %eax */
1142 if ((vtop
->type
.t
& VT_BTYPE
) & VT_LLONG
) {
1143 o(0x9948); /* cqto */
1144 o(0x48 + REX_BASE(fr
));
1148 o(0xf7); /* idiv fr, %eax */
1151 if (op
== '%' || op
== TOK_UMOD
)
1164 void gen_opl(int op
)
1169 /* generate a floating point operation 'v = t1 op t2' instruction. The
1170 two operands are guaranteed to have the same floating point type */
1171 /* XXX: need to use ST1 too */
1172 void gen_opf(int op
)
1174 int a
, ft
, fc
, swapped
, r
;
1176 (vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
? RC_ST0
: RC_FLOAT
;
1178 /* convert constants to memory references */
1179 if ((vtop
[-1].r
& (VT_VALMASK
| VT_LVAL
)) == VT_CONST
) {
1184 if ((vtop
[0].r
& (VT_VALMASK
| VT_LVAL
)) == VT_CONST
)
1187 /* must put at least one value in the floating point register */
1188 if ((vtop
[-1].r
& VT_LVAL
) &&
1189 (vtop
[0].r
& VT_LVAL
)) {
1195 /* swap the stack if needed so that t1 is the register and t2 is
1196 the memory reference */
1197 if (vtop
[-1].r
& VT_LVAL
) {
1201 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
1202 if (op
>= TOK_ULT
&& op
<= TOK_GT
) {
1203 /* load on stack second operand */
1204 load(TREG_ST0
, vtop
);
1205 save_reg(TREG_RAX
); /* eax is used by FP comparison code */
1206 if (op
== TOK_GE
|| op
== TOK_GT
)
1208 else if (op
== TOK_EQ
|| op
== TOK_NE
)
1211 o(0xc9d9); /* fxch %st(1) */
1212 o(0xe9da); /* fucompp */
1213 o(0xe0df); /* fnstsw %ax */
1215 o(0x45e480); /* and $0x45, %ah */
1216 o(0x40fC80); /* cmp $0x40, %ah */
1217 } else if (op
== TOK_NE
) {
1218 o(0x45e480); /* and $0x45, %ah */
1219 o(0x40f480); /* xor $0x40, %ah */
1221 } else if (op
== TOK_GE
|| op
== TOK_LE
) {
1222 o(0x05c4f6); /* test $0x05, %ah */
1225 o(0x45c4f6); /* test $0x45, %ah */
1232 /* no memory reference possible for long double operations */
1233 load(TREG_ST0
, vtop
);
1257 o(0xde); /* fxxxp %st, %st(1) */
1262 if (op
>= TOK_ULT
&& op
<= TOK_GT
) {
1263 /* if saved lvalue, then we must reload it */
1266 if ((r
& VT_VALMASK
) == VT_LLOCAL
) {
1268 r
= get_reg(RC_INT
);
1270 v1
.r
= VT_LOCAL
| VT_LVAL
;
1276 if (op
== TOK_EQ
|| op
== TOK_NE
) {
1279 if (op
== TOK_LE
|| op
== TOK_LT
)
1281 if (op
== TOK_LE
|| op
== TOK_GE
) {
1282 op
= 0x93; /* setae */
1284 op
= 0x97; /* seta */
1289 o(0x7e0ff3); /* movq */
1290 gen_modrm(1, r
, vtop
->sym
, fc
);
1292 if ((vtop
->type
.t
& VT_BTYPE
) == VT_DOUBLE
) {
1295 o(0x2e0f); /* ucomisd %xmm0, %xmm1 */
1298 if ((vtop
->type
.t
& VT_BTYPE
) == VT_DOUBLE
) {
1301 o(0x2e0f); /* ucomisd */
1302 gen_modrm(0, r
, vtop
->sym
, fc
);
1309 /* no memory reference possible for long double operations */
1310 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LDOUBLE
) {
1311 load(TREG_XMM0
, vtop
);
1331 if ((ft
& VT_BTYPE
) == VT_LDOUBLE
) {
1332 o(0xde); /* fxxxp %st, %st(1) */
1335 /* if saved lvalue, then we must reload it */
1337 if ((r
& VT_VALMASK
) == VT_LLOCAL
) {
1339 r
= get_reg(RC_INT
);
1341 v1
.r
= VT_LOCAL
| VT_LVAL
;
1347 /* movq %xmm0,%xmm1 */
1350 load(TREG_XMM0
, vtop
);
1351 /* subsd %xmm1,%xmm0 (f2 0f 5c c1) */
1352 if ((ft
& VT_BTYPE
) == VT_DOUBLE
) {
1361 if ((ft
& VT_BTYPE
) == VT_DOUBLE
) {
1368 gen_modrm(0, r
, vtop
->sym
, fc
);
1376 /* convert integers to fp 't' type. Must handle 'int', 'unsigned int'
1377 and 'long long' cases. */
1378 void gen_cvt_itof(int t
)
1380 if ((t
& VT_BTYPE
) == VT_LDOUBLE
) {
1383 if ((vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
1384 /* signed long long to float/double/long double (unsigned case
1385 is handled generically) */
1386 o(0x50 + (vtop
->r
& VT_VALMASK
)); /* push r */
1387 o(0x242cdf); /* fildll (%rsp) */
1388 o(0x08c48348); /* add $8, %rsp */
1389 } else if ((vtop
->type
.t
& (VT_BTYPE
| VT_UNSIGNED
)) ==
1390 (VT_INT
| VT_UNSIGNED
)) {
1391 /* unsigned int to float/double/long double */
1392 o(0x6a); /* push $0 */
1394 o(0x50 + (vtop
->r
& VT_VALMASK
)); /* push r */
1395 o(0x242cdf); /* fildll (%rsp) */
1396 o(0x10c48348); /* add $16, %rsp */
1398 /* int to float/double/long double */
1399 o(0x50 + (vtop
->r
& VT_VALMASK
)); /* push r */
1400 o(0x2404db); /* fildl (%rsp) */
1401 o(0x08c48348); /* add $8, %rsp */
1405 save_reg(TREG_XMM0
);
1407 o(0xf2 + ((t
& VT_BTYPE
) == VT_FLOAT
));
1408 if ((vtop
->type
.t
& (VT_BTYPE
| VT_UNSIGNED
)) ==
1409 (VT_INT
| VT_UNSIGNED
) ||
1410 (vtop
->type
.t
& VT_BTYPE
) == VT_LLONG
) {
1414 o(0xc0 + (vtop
->r
& VT_VALMASK
)); /* cvtsi2sd */
1415 vtop
->r
= TREG_XMM0
;
1419 /* convert from one floating point type to another */
1420 void gen_cvt_ftof(int t
)
1428 if (bt
== VT_FLOAT
) {
1430 if (tbt
== VT_DOUBLE
) {
1431 o(0xc0140f); /* unpcklps */
1432 o(0xc05a0f); /* cvtps2pd */
1433 } else if (tbt
== VT_LDOUBLE
) {
1434 /* movss %xmm0,-0x10(%rsp) */
1437 o(0xf02444d9); /* flds -0x10(%rsp) */
1440 } else if (bt
== VT_DOUBLE
) {
1442 if (tbt
== VT_FLOAT
) {
1443 o(0xc0140f66); /* unpcklpd */
1444 o(0xc05a0f66); /* cvtpd2ps */
1445 } else if (tbt
== VT_LDOUBLE
) {
1446 /* movsd %xmm0,-0x10(%rsp) */
1449 o(0xf02444dd); /* fldl -0x10(%rsp) */
1454 if (tbt
== VT_DOUBLE
) {
1455 o(0xf0245cdd); /* fstpl -0x10(%rsp) */
1456 /* movsd -0x10(%rsp),%xmm0 */
1459 vtop
->r
= TREG_XMM0
;
1460 } else if (tbt
== VT_FLOAT
) {
1461 o(0xf0245cd9); /* fstps -0x10(%rsp) */
1462 /* movss -0x10(%rsp),%xmm0 */
1465 vtop
->r
= TREG_XMM0
;
1470 /* convert fp to int 't' type */
1471 void gen_cvt_ftoi(int t
)
1473 int ft
, bt
, size
, r
;
1476 if (bt
== VT_LDOUBLE
) {
1477 gen_cvt_ftof(VT_DOUBLE
);
1487 r
= get_reg(RC_INT
);
1488 if (bt
== VT_FLOAT
) {
1490 } else if (bt
== VT_DOUBLE
) {
1496 o(0x48 + REX_BASE(r
));
1498 o(0x2c0f); /* cvttss2si or cvttsd2si */
1499 o(0xc0 + (REG_VALUE(r
) << 3));
1503 /* computed goto support */
1510 /* end of x86-64 code generator */
1511 /*************************************************************/