From 0a9873aa22731077fad295a4aad2fc1f390c8ac7 Mon Sep 17 00:00:00 2001 From: Shinichiro Hamaji Date: Tue, 2 Dec 2008 03:19:25 +0900 Subject: [PATCH] Add support of x86-64. Most changes were done in #ifdef TCC_TARGET_X86_64. So, nothing should be broken by this change. Summary of current status of x86-64 support: - produces x86-64 object files and executables. - the x86-64 code generator is based on x86's. -- for long long integers, we use 64bit registers instead of tcc's generic implementation. -- for float or double, we use SSE. SSE registers are not utilized well (we only use xmm0 and xmm1). -- for long double, we use x87 FPU. - passes make test. - passes ./libtcc_test. - can compile tcc.c. The compiled tcc can compile tcc.c, too. (there should be some bugs since the binary sizes of tcc2 and tcc3 differ where tcc tcc.c -o tcc2 and tcc2 tcc.c -o tcc3) - can compile links browser. It seems to work. - not tested well. I tested this work only on my linux box with few programs. - calling convention of long-double-integer or struct is not exactly the same as GCC's x86-64 ABI. - implementation of tcc -run is naive (tcc -run tcctest.c works, but tcc -run tcc.c doesn't work). Relocating 64bit addresses seems to be not as simple as in 32bit environments. 
- shared object support isn't unimplemented - no bounds checker support - some builtin functions such as __divdi3 aren't supported --- Makefile | 23 +- configure | 6 + libtcc1.c | 5 + stdarg.h | 73 +++- tcc.c | 138 +++++- tccelf.c | 139 +++++- x86_64-gen.c | 1355 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1724 insertions(+), 15 deletions(-) create mode 100644 x86_64-gen.c diff --git a/Makefile b/Makefile index 1e6adad8..6ff3eb56 100644 --- a/Makefile +++ b/Makefile @@ -9,20 +9,28 @@ LIBS=-lm ifndef CONFIG_NOLDL LIBS+=-ldl endif +ifneq ($(ARCH),x86-64) BCHECK_O=bcheck.o endif +endif CFLAGS_P=$(CFLAGS) -pg -static -DCONFIG_TCC_STATIC LIBS_P= +ifneq ($(GCC_MAJOR),2) +CFLAGS+=-fno-strict-aliasing +endif + +ifeq ($(ARCH),i386) CFLAGS+=-mpreferred-stack-boundary=2 ifeq ($(GCC_MAJOR),2) CFLAGS+=-m386 -malign-functions=0 else -CFLAGS+=-march=i386 -falign-functions=0 -fno-strict-aliasing +CFLAGS+=-march=i386 -falign-functions=0 ifneq ($(GCC_MAJOR),3) CFLAGS+=-Wno-pointer-sign -Wno-sign-compare endif endif +endif DISAS=objdump -d INSTALL=install @@ -50,6 +58,9 @@ ifdef CONFIG_CROSS PROGS+=c67-tcc$(EXESUF) i386-win32-tcc$(EXESUF) endif endif +ifeq ($(ARCH),x86-64) +PROGS=tcc$(EXESUF) +endif ifdef CONFIG_USE_LIBGCC LIBTCC1= @@ -163,6 +174,10 @@ ARMFLAGS += $(if $(shell grep -l "^Features.* \(vfp\|iwmmxt\) " /proc/cpuinfo),- tcc$(EXESUF): tcc.c arm-gen.c tccelf.c tccasm.c tcctok.h libtcc.h $(CC) $(CFLAGS) -DTCC_TARGET_ARM $(ARMFLAGS) -o $@ $< $(LIBS) endif +ifeq ($(ARCH),x86-64) +tcc$(EXESUF): tcc.c tccelf.c tccasm.c tcctok.h libtcc.h x86_64-gen.c + $(CC) $(CFLAGS) -DTCC_TARGET_X86_64 -o $@ $< $(LIBS) +endif endif # Cross Tiny C Compilers @@ -238,7 +253,9 @@ else ifndef CONFIG_USE_LIBGCC $(INSTALL) -m644 libtcc1.a "$(DESTDIR)$(tccdir)" endif +ifneq ($(ARCH),x86-64) $(INSTALL) -m644 $(BCHECK_O) "$(DESTDIR)$(tccdir)" +endif $(INSTALL) -m644 stdarg.h stddef.h stdbool.h float.h varargs.h \ tcclib.h "$(DESTDIR)$(tccdir)/include" endif @@ -272,8 
+289,12 @@ libtcc.o: tcc.c i386-gen.c Makefile ifdef CONFIG_WIN32 $(CC) $(CFLAGS) -DTCC_TARGET_PE -DLIBTCC -c -o $@ $< else +ifeq ($(ARCH),x86-64) + $(CC) $(CFLAGS) -DTCC_TARGET_X86_64 -DLIBTCC -c -o $@ $< +else $(CC) $(CFLAGS) -DLIBTCC -c -o $@ $< endif +endif libtcc.a: libtcc.o $(AR) rcs $@ $^ diff --git a/configure b/configure index 63328cc2..f4c718d7 100755 --- a/configure +++ b/configure @@ -39,6 +39,9 @@ case "$cpu" in i386|i486|i586|i686|i86pc|BePC) cpu="x86" ;; + x86_64) + cpu="x86-64" + ;; armv4l) cpu="armv4l" ;; @@ -313,6 +316,9 @@ echo "EXESUF=$EXESUF" >> config.mak if test "$cpu" = "x86" ; then echo "ARCH=i386" >> config.mak echo "#define HOST_I386 1" >> $TMPH +elif test "$cpu" = "x86-64" ; then + echo "ARCH=x86-64" >> config.mak + echo "#define HOST_X86_64 1" >> $TMPH elif test "$cpu" = "armv4l" ; then echo "ARCH=arm" >> config.mak echo "#define HOST_ARM 1" >> $TMPH diff --git a/libtcc1.c b/libtcc1.c index 96bf22cf..b079477e 100644 --- a/libtcc1.c +++ b/libtcc1.c @@ -106,6 +106,9 @@ union float_long { long l; }; +/* XXX: we don't support several builtin supports for now */ +#ifndef __x86_64__ + /* XXX: use gcc/tcc intrinsic ? */ #if defined(__i386__) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ @@ -482,6 +485,8 @@ unsigned short __tcc_fpu_control = 0x137f; unsigned short __tcc_int_fpu_control = 0x137f | 0x0c00; #endif +#endif /* !__x86_64__ */ + /* XXX: fix tcc's code generator to do this instead */ float __floatundisf(unsigned long long a) { diff --git a/stdarg.h b/stdarg.h index a9b22b7b..899358ce 100644 --- a/stdarg.h +++ b/stdarg.h @@ -1,6 +1,75 @@ #ifndef _STDARG_H #define _STDARG_H +#ifdef __x86_64__ + +#ifdef __TINYC__ + +#include + +/* GCC compatible definition of va_list. */ +struct __va_list_struct { + unsigned int gp_offset; + unsigned int fp_offset; + union { + unsigned int overflow_offset; + char *overflow_arg_area; + }; + char *reg_save_area; +}; + +typedef struct __va_list_struct *va_list; + +/* avoid #define malloc tcc_malloc. 
+ XXX: add __malloc or something into libtcc? */ +inline void *__va_list_malloc(size_t size) { return malloc(size); } +inline void __va_list_free(void *ptr) { free(ptr); } + +/* XXX: this lacks the support of aggregated types. */ +#define va_start(ap, last) \ + (ap = (va_list)__va_list_malloc(sizeof(struct __va_list_struct)), \ + *ap = *(struct __va_list_struct*)( \ + (char*)__builtin_frame_address(0) - 16), \ + ap->overflow_arg_area = ((char *)__builtin_frame_address(0) + \ + ap->overflow_offset), \ + ap->reg_save_area = (char *)__builtin_frame_address(0) - 176 - 16 \ + ) +#define va_arg(ap, type) \ + (*(type*)(__builtin_types_compatible_p(type, long double) \ + ? (ap->overflow_arg_area += 16, \ + ap->overflow_arg_area - 16) \ + : __builtin_types_compatible_p(type, double) \ + ? (ap->fp_offset < 128 + 48 \ + ? (ap->fp_offset += 16, \ + ap->reg_save_area + ap->fp_offset - 16) \ + : (ap->overflow_arg_area += 8, \ + ap->overflow_arg_area - 8)) \ + : (ap->gp_offset < 48 \ + ? (ap->gp_offset += 8, \ + ap->reg_save_area + ap->gp_offset - 8) \ + : (ap->overflow_arg_area += 8, \ + ap->overflow_arg_area - 8)) \ + )) +#define va_copy(dest, src) \ + ((dest) = (va_list)malloc(sizeof(struct __va_list_struct)), \ + *(dest) = *(src)) +#define va_end(ap) __va_list_free(ap) + +#else + +/* for GNU C */ + +typedef __builtin_va_list va_list; + +#define va_start(ap, last) __builtin_va_start(ap, last) +#define va_arg(ap, type) __builtin_va_arg(ap, type) +#define va_copy(dest, src) __builtin_va_copy(dest, src) +#define va_end(ap) __builtin_va_end(ap) + +#endif + +#else + typedef char *va_list; /* only correct for i386 */ @@ -9,8 +78,10 @@ typedef char *va_list; #define va_copy(dest, src) (dest) = (src) #define va_end(ap) +#endif + /* fix a buggy dependency on GCC in libio.h */ typedef va_list __gnuc_va_list; #define _VA_LIST_DEFINED -#endif +#endif /* _STDARG_H */ diff --git a/tcc.c b/tcc.c index afb1343d..0bfc5c01 100644 --- a/tcc.c +++ b/tcc.c @@ -79,15 +79,16 @@ //#define 
TCC_TARGET_I386 /* i386 code generator */ //#define TCC_TARGET_ARM /* ARMv4 code generator */ //#define TCC_TARGET_C67 /* TMS320C67xx code generator */ +//#define TCC_TARGET_X86_64 /* x86-64 code generator */ /* default target is I386 */ #if !defined(TCC_TARGET_I386) && !defined(TCC_TARGET_ARM) && \ - !defined(TCC_TARGET_C67) + !defined(TCC_TARGET_C67) && !defined(TCC_TARGET_X86_64) #define TCC_TARGET_I386 #endif #if !defined(_WIN32) && !defined(TCC_UCLIBC) && !defined(TCC_TARGET_ARM) && \ - !defined(TCC_TARGET_C67) + !defined(TCC_TARGET_C67) && !defined(TCC_TARGET_X86_64) #define CONFIG_TCC_BCHECK /* enable bound checking code */ #endif @@ -96,7 +97,8 @@ #endif /* define it to include assembler support */ -#if !defined(TCC_TARGET_ARM) && !defined(TCC_TARGET_C67) +#if !defined(TCC_TARGET_ARM) && !defined(TCC_TARGET_C67) && \ + !defined(TCC_TARGET_X86_64) #define CONFIG_TCC_ASM #endif @@ -531,6 +533,12 @@ struct TCCState { /* output file for preprocessing */ FILE *outfile; + +#ifdef TCC_TARGET_X86_64 + /* buffer to store jump tables */ + char *jmp_table; + int jmp_table_num; +#endif }; /* The current value can be: */ @@ -938,6 +946,10 @@ static inline int is_float(int t) #include "c67-gen.c" #endif +#ifdef TCC_TARGET_X86_64 +#include "x86_64-gen.c" +#endif + #ifdef CONFIG_TCC_STATIC #define RTLD_LAZY 0x001 @@ -4769,26 +4781,33 @@ void save_reg(int r) r = p->r & VT_VALMASK; /* store register in the stack */ type = &p->type; +#ifndef TCC_TARGET_X86_64 if ((p->r & VT_LVAL) || (!is_float(type->t) && (type->t & VT_BTYPE) != VT_LLONG)) type = &int_type; +#else + if (p->r & VT_LVAL) + type = &char_pointer_type; +#endif size = type_size(type, &align); loc = (loc - size) & -align; sv.type.t = type->t; sv.r = VT_LOCAL | VT_LVAL; sv.c.ul = loc; store(r, &sv); -#ifdef TCC_TARGET_I386 +#if defined(TCC_TARGET_I386) || defined(TCC_TARGET_X86_64) /* x86 specific: need to pop fp register ST0 if saved */ if (r == TREG_ST0) { o(0xd9dd); /* fstp %st(1) */ } #endif +#ifndef 
TCC_TARGET_X86_64 /* special long long case */ if ((type->t & VT_BTYPE) == VT_LLONG) { sv.c.ul += 4; store(p->r2, &sv); } +#endif l = loc; saved = 1; } @@ -4939,8 +4958,7 @@ void gbound(void) register value (such as structures). */ int gv(int rc) { - int r, r2, rc2, bit_pos, bit_size, size, align, i; - unsigned long long ll; + int r, rc2, bit_pos, bit_size, size, align, i; /* NOTE: get_reg can modify vstack[] */ if (vtop->type.t & VT_BITFIELD) { @@ -5019,7 +5037,10 @@ int gv(int rc) ((vtop->type.t & VT_BTYPE) == VT_LLONG && !(reg_classes[vtop->r2] & rc2))) { r = get_reg(rc); +#ifndef TCC_TARGET_X86_64 if ((vtop->type.t & VT_BTYPE) == VT_LLONG) { + int r2; + unsigned long long ll; /* two register type load : expand to two words temporarily */ if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) { @@ -5059,7 +5080,9 @@ int gv(int rc) vpop(); /* write second register */ vtop->r2 = r2; - } else if ((vtop->r & VT_LVAL) && !is_float(vtop->type.t)) { + } else +#endif + if ((vtop->r & VT_LVAL) && !is_float(vtop->type.t)) { int t1, t; /* lvalue of scalar type : need to use lvalue type because of possible cast */ @@ -5224,7 +5247,7 @@ void vpop(void) { int v; v = vtop->r & VT_VALMASK; -#ifdef TCC_TARGET_I386 +#if defined(TCC_TARGET_I386) || defined(TCC_TARGET_X86_64) /* for x86, we need to pop the FP stack */ if (v == TREG_ST0 && !nocode_wanted) { o(0xd9dd); /* fstp %st(1) */ @@ -5265,6 +5288,11 @@ void gv_dup(void) sv.type.t = VT_INT; if (is_float(t)) { rc = RC_FLOAT; +#ifdef TCC_TARGET_X86_64 + if ((t & VT_BTYPE) == VT_LDOUBLE) { + rc = RC_ST0; + } +#endif sv.type.t = t; } r = gv(rc); @@ -5278,6 +5306,7 @@ void gv_dup(void) } } +#ifndef TCC_TARGET_X86_64 /* generate CPU independent (unsigned) long long operations */ void gen_opl(int op) { @@ -5512,6 +5541,7 @@ void gen_opl(int op) break; } } +#endif /* handle integer constant optimizations and various machine independent opt */ @@ -5790,7 +5820,11 @@ void gen_op(int op) if (op >= TOK_ULT && op <= TOK_LOR) { 
check_comparison_pointer_types(vtop - 1, vtop, op); /* pointers are handled are unsigned */ +#ifdef TCC_TARGET_X86_64 + t = VT_LLONG | VT_UNSIGNED; +#else t = VT_INT | VT_UNSIGNED; +#endif goto std_op; } /* if both pointers, then it must be the '-' op */ @@ -5802,7 +5836,11 @@ void gen_op(int op) u = pointed_size(&vtop[-1].type); gen_opic(op); /* set to integer type */ +#ifdef TCC_TARGET_X86_64 + vtop->type.t = VT_LLONG; +#else vtop->type.t = VT_INT; +#endif vpushi(u); gen_op(TOK_PDIV); } else { @@ -5815,8 +5853,18 @@ void gen_op(int op) swap(&t1, &t2); } type1 = vtop[-1].type; +#ifdef TCC_TARGET_X86_64 + { + CValue cval; + CType ctype; + ctype.t = VT_LLONG; + cval.ull = pointed_size(&vtop[-1].type); + vsetc(&ctype, VT_CONST, &cval); + } +#else /* XXX: cast to int ? (long long case) */ vpushi(pointed_size(&vtop[-1].type)); +#endif gen_op('*'); #ifdef CONFIG_TCC_BCHECK /* if evaluating constant expression, no code should be @@ -6099,6 +6147,7 @@ static void gen_cast(CType *type) } else if ((dbt & VT_BTYPE) == VT_LLONG) { if ((sbt & VT_BTYPE) != VT_LLONG) { /* scalar to long long */ +#ifndef TCC_TARGET_X86_64 /* machine independent conversion */ gv(RC_INT); /* generate high word */ @@ -6113,6 +6162,14 @@ static void gen_cast(CType *type) /* patch second register */ vtop[-1].r2 = vtop->r; vpop(); +#else + int r = gv(RC_INT); + if (sbt != (VT_INT | VT_UNSIGNED)) { + /* x86_64 specific: movslq */ + o(0x6348); + o(0xc0 + (REG_VALUE(r) << 3) + REG_VALUE(r)); + } +#endif } } else if (dbt == VT_BOOL) { /* scalar to bool */ @@ -6571,20 +6628,31 @@ void vstore(void) #endif if (!nocode_wanted) { rc = RC_INT; - if (is_float(ft)) + if (is_float(ft)) { rc = RC_FLOAT; +#ifdef TCC_TARGET_X86_64 + if ((ft & VT_BTYPE) == VT_LDOUBLE) { + rc = RC_ST0; + } +#endif + } r = gv(rc); /* generate value */ /* if lvalue was saved on stack, must read it */ if ((vtop[-1].r & VT_VALMASK) == VT_LLOCAL) { SValue sv; t = get_reg(RC_INT); +#ifdef TCC_TARGET_X86_64 + sv.type.t = VT_PTR; +#else 
sv.type.t = VT_INT; +#endif sv.r = VT_LOCAL | VT_LVAL; sv.c.ul = vtop[-1].c.ul; load(t, &sv); vtop[-1].r = t | VT_LVAL; } store(r, vtop - 1); +#ifndef TCC_TARGET_X86_64 /* two word case handling : store second register at word + 4 */ if ((ft & VT_BTYPE) == VT_LLONG) { vswap(); @@ -6598,6 +6666,7 @@ void vstore(void) /* XXX: it works because r2 is spilled last ! */ store(vtop->r2, vtop - 1); } +#endif } vswap(); vtop--; /* NOT vpop() because on x86 it would flush the fp stack */ @@ -7107,7 +7176,11 @@ the_end: /* long is never used as type */ if ((t & VT_BTYPE) == VT_LONG) +#ifndef TCC_TARGET_X86_64 t = (t & ~VT_BTYPE) | VT_INT; +#else + t = (t & ~VT_BTYPE) | VT_LLONG; +#endif type->t = t; return type_found; } @@ -8044,8 +8117,14 @@ static void expr_eq(void) if (vtop != vstack) { /* needed to avoid having different registers saved in each branch */ - if (is_float(vtop->type.t)) + if (is_float(vtop->type.t)) { rc = RC_FLOAT; +#ifdef TCC_TARGET_X86_64 + if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { + rc = RC_ST0; + } +#endif + } else rc = RC_INT; gv(rc); @@ -8115,6 +8194,11 @@ static void expr_eq(void) rc = RC_INT; if (is_float(type.t)) { rc = RC_FLOAT; +#ifdef TCC_TARGET_X86_64 + if ((type.t & VT_BTYPE) == VT_LDOUBLE) { + rc = RC_ST0; + } +#endif } else if ((type.t & VT_BTYPE) == VT_LLONG) { /* for long longs, we use fixed registers to avoid having to handle a complicated move */ @@ -9982,6 +10066,30 @@ static int rt_get_caller_pc(unsigned long *paddr, return 0; } } +#elif defined(__x86_64__) +/* return the PC at frame level 'level'. 
Return non zero if not found */ +static int rt_get_caller_pc(unsigned long *paddr, + ucontext_t *uc, int level) +{ + unsigned long fp; + int i; + + if (level == 0) { + /* XXX: only support linux */ + *paddr = uc->uc_mcontext.gregs[REG_RIP]; + return 0; + } else { + fp = uc->uc_mcontext.gregs[REG_RBP]; + for(i=1;i= 0xc0000000) + return -1; + fp = ((unsigned long *)fp)[0]; + } + *paddr = ((unsigned long *)fp)[1]; + return 0; + } +} #else #warning add arch specific rt_get_caller_pc() @@ -10235,6 +10343,9 @@ TCCState *tcc_new(void) #if defined(TCC_TARGET_I386) tcc_define_symbol(s, "__i386__", NULL); #endif +#if defined(TCC_TARGET_X86_64) + tcc_define_symbol(s, "__x86_64__", NULL); +#endif #if defined(TCC_TARGET_ARM) tcc_define_symbol(s, "__ARM_ARCH_4__", NULL); tcc_define_symbol(s, "__arm_elf__", NULL); @@ -10301,6 +10412,10 @@ TCCState *tcc_new(void) /* XXX: currently the PE linker is not ready to support that */ s->leading_underscore = 1; #endif + +#ifdef TCC_TARGET_X86_64 + s->jmp_table = NULL; +#endif return s; } @@ -10336,6 +10451,9 @@ void tcc_delete(TCCState *s1) dynarray_reset(&s1->include_paths, &s1->nb_include_paths); dynarray_reset(&s1->sysinclude_paths, &s1->nb_sysinclude_paths); +#ifdef TCC_TARGET_X86_64 + tcc_free(s1->jmp_table); +#endif tcc_free(s1); } diff --git a/tccelf.c b/tccelf.c index d41d06cc..a0c23808 100644 --- a/tccelf.c +++ b/tccelf.c @@ -282,6 +282,9 @@ static void put_elf_reloc(Section *symtab, Section *s, unsigned long offset, rel = section_ptr_add(sr, sizeof(ElfW_Rel)); rel->r_offset = offset; rel->r_info = ELFW(R_INFO)(symbol, type); +#ifdef TCC_TARGET_X86_64 + rel->r_addend = 0; +#endif } /* put stab debug information */ @@ -469,6 +472,33 @@ static void relocate_syms(TCCState *s1, int do_resolve) } } +#ifdef TCC_TARGET_X86_64 +#define JMP_TABLE_ENTRY_SIZE 14 +#define JMP_TABLE_ENTRY_MAX_NUM 4096 +static unsigned long add_jmp_table(TCCState *s1, unsigned long val) +{ + char *p; + if (!s1->jmp_table) { + int size = JMP_TABLE_ENTRY_SIZE * 
JMP_TABLE_ENTRY_MAX_NUM; + s1->jmp_table_num = 0; + s1->jmp_table = (char *)tcc_malloc(size); + set_pages_executable(s1->jmp_table, size); + } + if (s1->jmp_table_num == JMP_TABLE_ENTRY_MAX_NUM) { + error("relocating >%d symbols are not supported", + JMP_TABLE_ENTRY_MAX_NUM); + } + p = s1->jmp_table + s1->jmp_table_num * JMP_TABLE_ENTRY_SIZE; + s1->jmp_table_num++; + /* jmp *0x0(%rip) */ + p[0] = 0xff; + p[1] = 0x25; + *(int *)(p + 2) = 0; + *(unsigned long *)(p + 6) = val; + return (unsigned long)p; +} +#endif + /* relocate a given section (CPU dependent) */ static void relocate_section(TCCState *s1, Section *s) { @@ -493,6 +523,10 @@ static void relocate_section(TCCState *s1, Section *s) sym_index = ELFW(R_SYM)(rel->r_info); sym = &((ElfW(Sym) *)symtab_section->data)[sym_index]; val = sym->st_value; +#ifdef TCC_TARGET_X86_64 + /* XXX: not tested */ + val += rel->r_addend; +#endif type = ELFW(R_TYPE)(rel->r_info); addr = s->sh_addr + rel->r_offset; @@ -620,6 +654,62 @@ static void relocate_section(TCCState *s1, Section *s) fprintf(stderr,"FIXME: handle reloc type %x at %lx [%.8x] to %lx\n", type,addr,(unsigned int )ptr,val); break; +#elif defined(TCC_TARGET_X86_64) + case R_X86_64_64: + *(long long *)ptr += val; + break; + case R_X86_64_32: + case R_X86_64_32S: + *(int *)ptr += val; + break; + case R_X86_64_PC32: { + long diff = val - addr; + if (diff < -2147483648 || diff > 2147483647) { + /* XXX: naive support for over 32bit jump */ + if (s1->output_type == TCC_OUTPUT_MEMORY) { + val = add_jmp_table(s1, val); + diff = val - addr; + } + if (diff <= -2147483647 || diff > 2147483647) { +#if 0 + /* output memory map to debug easily */ + FILE* fp; + char buf[4096]; + int size; + Dl_info di; + printf("%ld - %ld = %ld\n", val, addr, diff); + dladdr((void *)addr, &di); + printf("addr = %lx = %lx+%lx(%s) ptr=%p\n", + addr, s->sh_addr, rel->r_offset, di.dli_sname, + ptr); + fp = fopen("/proc/self/maps", "r"); + size = fread(buf, 1, 4095, fp); + buf[size] = '\0'; + 
printf("%s", buf); +#endif + error("internal error: relocation failed"); + } + } + *(int *)ptr += val - addr; + } + break; + case R_X86_64_PLT32: + *(int *)ptr += val - addr; + break; + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + *(int *)ptr = val; + break; + case R_X86_64_GOTPCREL: + *(int *)ptr += s1->got->sh_addr - addr; + break; + case R_X86_64_GOTTPOFF: + *(int *)ptr += val - s1->got->sh_addr; + break; + case R_X86_64_GOT32: + /* we load the got offset */ + *(int *)ptr += s1->got_offsets[sym_index]; + break; #else #error unsupported processor #endif @@ -708,7 +798,8 @@ static void put32(unsigned char *p, uint32_t val) p[3] = val >> 24; } -#if defined(TCC_TARGET_I386) || defined(TCC_TARGET_ARM) +#if defined(TCC_TARGET_I386) || defined(TCC_TARGET_ARM) || \ + defined(TCC_TARGET_X86_64) static uint32_t get32(unsigned char *p) { return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); @@ -769,8 +860,14 @@ static void put_got_entry(TCCState *s1, sym = &((ElfW(Sym) *)symtab_section->data)[sym_index]; name = symtab_section->link->data + sym->st_name; offset = sym->st_value; -#ifdef TCC_TARGET_I386 - if (reloc_type == R_386_JMP_SLOT) { +#if defined(TCC_TARGET_I386) || defined(TCC_TARGET_X86_64) + if (reloc_type == +#ifdef TCC_TARGET_X86_64 + R_X86_64_JUMP_SLOT +#else + R_386_JMP_SLOT +#endif + ) { Section *plt; uint8_t *p; int modrm; @@ -934,6 +1031,25 @@ static void build_got_entries(TCCState *s1) sym_index); } break; +#elif defined(TCC_TARGET_X86_64) + case R_X86_64_GOT32: + case R_X86_64_GOTTPOFF: + case R_X86_64_GOTPCREL: + case R_X86_64_PLT32: + if (!s1->got) + build_got(s1); + if (type == R_X86_64_GOT32 || type == R_X86_64_PLT32) { + sym_index = ELFW(R_SYM)(rel->r_info); + sym = &((ElfW(Sym) *)symtab_section->data)[sym_index]; + /* look at the symbol got offset. 
If none, then add one */ + if (type == R_X86_64_GOT32) + reloc_type = R_X86_64_GLOB_DAT; + else + reloc_type = R_X86_64_JUMP_SLOT; + put_got_entry(s1, reloc_type, sym->st_size, sym->st_info, + sym_index); + } + break; #else #error unsupported CPU #endif @@ -1130,6 +1246,8 @@ static char elf_interp[] = "/usr/libexec/ld-elf.so.1"; #else #ifdef TCC_ARM_EABI static char elf_interp[] = "/lib/ld-linux.so.3"; +#elif defined(TCC_TARGET_X86_64) +static char elf_interp[] = "/lib/ld-linux-x86-64.so.2"; #else static char elf_interp[] = "/lib/ld-linux.so.2"; #endif @@ -1593,6 +1711,15 @@ int elf_output_file(TCCState *s1, const char *filename) put32(p + 2, get32(p + 2) + s1->got->sh_addr); p += 16; } +#elif defined(TCC_TARGET_X86_64) + int x = s1->got->sh_addr - s1->plt->sh_addr - 6; + put32(p + 2, get32(p + 2) + x); + put32(p + 8, get32(p + 8) + x - 6); + p += 16; + while (p < p_end) { + put32(p + 2, get32(p + 2) + x + s1->plt->data - p); + p += 16; + } #elif defined(TCC_TARGET_ARM) int x; x=s1->got->sh_addr - s1->plt->sh_addr - 12; @@ -1632,9 +1759,15 @@ int elf_output_file(TCCState *s1, const char *filename) put_dt(dynamic, DT_SYMTAB, s1->dynsym->sh_addr); put_dt(dynamic, DT_STRSZ, dynstr->data_offset); put_dt(dynamic, DT_SYMENT, sizeof(ElfW(Sym))); +#ifdef TCC_TARGET_X86_64 + put_dt(dynamic, DT_RELA, rel_addr); + put_dt(dynamic, DT_RELASZ, rel_size); + put_dt(dynamic, DT_RELAENT, sizeof(ElfW_Rel)); +#else put_dt(dynamic, DT_REL, rel_addr); put_dt(dynamic, DT_RELSZ, rel_size); put_dt(dynamic, DT_RELENT, sizeof(ElfW_Rel)); +#endif if (do_debug) put_dt(dynamic, DT_DEBUG, 0); put_dt(dynamic, DT_NULL, 0); diff --git a/x86_64-gen.c b/x86_64-gen.c new file mode 100644 index 00000000..2435d4d8 --- /dev/null +++ b/x86_64-gen.c @@ -0,0 +1,1355 @@ +/* + * x86-64 code generator for TCC + * + * Copyright (c) 2008 Shinichiro Hamaji + * + * Based on i386-gen.c by Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU 
Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +/* number of available registers */ +#define NB_REGS 5 + +/* a register can belong to several classes. The classes must be + sorted from more general to more precise (see gv2() code which does + assumptions on it). */ +#define RC_INT 0x0001 /* generic integer register */ +#define RC_FLOAT 0x0002 /* generic float register */ +#define RC_RAX 0x0004 +#define RC_RCX 0x0008 +#define RC_RDX 0x0010 +#define RC_XMM0 0x0020 +#define RC_ST0 0x0040 /* only for long double */ +#define RC_IRET RC_RAX /* function return: integer register */ +#define RC_LRET RC_RDX /* function return: second integer register */ +#define RC_FRET RC_XMM0 /* function return: float register */ + +/* pretty names for the registers */ +enum { + TREG_RAX = 0, + TREG_RCX = 1, + TREG_RDX = 2, + TREG_RSI = 6, + TREG_RDI = 7, + TREG_R8 = 8, + TREG_R9 = 9, + TREG_R10 = 10, + TREG_R11 = 11, + + TREG_XMM0 = 3, + TREG_ST0 = 4, +}; + +#define REX_BASE(reg) ((reg) >> 3) +#define REG_VALUE(reg) ((reg) & 7) + +int reg_classes[NB_REGS] = { + /* eax */ RC_INT | RC_RAX, + /* ecx */ RC_INT | RC_RCX, + /* edx */ RC_INT | RC_RDX, + /* xmm0 */ RC_FLOAT | RC_XMM0, + /* st0 */ RC_ST0, +}; + +/* return registers for function */ +#define REG_IRET TREG_RAX /* single word int return register */ +#define REG_LRET TREG_RDX /* second word return register (for long long) */ +#define 
REG_FRET TREG_XMM0 /* float return register */ + +/* defined if function parameters must be evaluated in reverse order */ +#define INVERT_FUNC_PARAMS + +/* pointer size, in bytes */ +#define PTR_SIZE 8 + +/* long double size and alignment, in bytes */ +#define LDOUBLE_SIZE 16 +#define LDOUBLE_ALIGN 8 +/* maximum alignment (for aligned attribute support) */ +#define MAX_ALIGN 8 + +/******************************************************/ +/* ELF defines */ + +#define EM_TCC_TARGET EM_X86_64 + +/* relocation type for 32 bit data relocation */ +#define R_DATA_32 R_X86_64_32 +#define R_JMP_SLOT R_X86_64_JUMP_SLOT +#define R_COPY R_X86_64_COPY + +#define ELF_START_ADDR 0x08048000 +#define ELF_PAGE_SIZE 0x1000 + +/******************************************************/ + +static unsigned long func_sub_sp_offset; +static int func_ret_sub; + +/* XXX: make it faster ? */ +void g(int c) +{ + int ind1; + ind1 = ind + 1; + if (ind1 > cur_text_section->data_allocated) + section_realloc(cur_text_section, ind1); + cur_text_section->data[ind] = c; + ind = ind1; +} + +void o(unsigned int c) +{ + while (c) { + g(c); + c = c >> 8; + } +} + +void gen_le32(int c) +{ + g(c); + g(c >> 8); + g(c >> 16); + g(c >> 24); +} + +void gen_le64(int64_t c) +{ + g(c); + g(c >> 8); + g(c >> 16); + g(c >> 24); + g(c >> 32); + g(c >> 40); + g(c >> 48); + g(c >> 56); +} + +/* output a symbol and patch all calls to it */ +void gsym_addr(int t, int a) +{ + int n, *ptr; + while (t) { + ptr = (int *)(cur_text_section->data + t); + n = *ptr; /* next value */ + *ptr = a - t - 4; + t = n; + } +} + +void gsym(int t) +{ + gsym_addr(t, ind); +} + +/* psym is used to put an instruction with a data field which is a + reference to a symbol. It is in fact the same as oad ! 
*/ +#define psym oad + +static int is64_type(int t) +{ + return ((t & VT_BTYPE) == VT_PTR || + (t & VT_BTYPE) == VT_FUNC || + (t & VT_BTYPE) == VT_LLONG); +} + +static int is_sse_float(int t) { + int bt; + bt = t & VT_BTYPE; + return bt == VT_DOUBLE || bt == VT_FLOAT; +} + +/* instruction + 4 bytes data. Return the address of the data */ +static int oad(int c, int s) +{ + int ind1; + + o(c); + ind1 = ind + 4; + if (ind1 > cur_text_section->data_allocated) + section_realloc(cur_text_section, ind1); + *(int *)(cur_text_section->data + ind) = s; + s = ind; + ind = ind1; + return s; +} + +/* output constant with relocation if 'r & VT_SYM' is true */ +static void gen_addr64(int r, Sym *sym, int64_t c) +{ + if (r & VT_SYM) + greloc(cur_text_section, sym, ind, R_X86_64_64); + gen_le64(c); +} + +/* output constant with relocation if 'r & VT_SYM' is true */ +static void gen_addr32(int r, Sym *sym, int c) +{ + if (r & VT_SYM) + greloc(cur_text_section, sym, ind, R_X86_64_32); + gen_le32(c); +} + +/* output constant with relocation if 'r & VT_SYM' is true */ +static void gen_addrpc32(int r, Sym *sym, int c) +{ + if (r & VT_SYM) + greloc(cur_text_section, sym, ind, R_X86_64_PC32); + gen_le32(c-4); +} + +/* generate a modrm reference. 'op_reg' contains the addtionnal 3 + opcode bits */ +static void gen_modrm(int op_reg, int r, Sym *sym, int c) +{ + op_reg = op_reg << 3; + if ((r & VT_VALMASK) == VT_CONST) { + /* constant memory reference */ + o(0x05 | op_reg); + gen_addrpc32(r, sym, c); + } else if ((r & VT_VALMASK) == VT_LOCAL) { + /* currently, we use only ebp as base */ + if (c == (char)c) { + /* short reference */ + o(0x45 | op_reg); + g(c); + } else { + oad(0x85 | op_reg, c); + } + } else { + g(0x00 | op_reg | (r & VT_VALMASK)); + } +} + +/* generate a modrm reference. 
'op_reg' contains the addtionnal 3 + opcode bits */ +static void gen_modrm64(int opcode, int op_reg, int r, Sym *sym, int c) +{ + int rex = 0x48 | (REX_BASE(op_reg) << 2); + if ((r & VT_VALMASK) != VT_CONST && + (r & VT_VALMASK) != VT_LOCAL) { + rex |= REX_BASE(VT_VALMASK & r); + } + o(rex); + o(opcode); + op_reg = REG_VALUE(op_reg) << 3; + if ((r & VT_VALMASK) == VT_CONST) { + /* constant memory reference */ + o(0x05 | op_reg); + gen_addrpc32(r, sym, c); + } else if ((r & VT_VALMASK) == VT_LOCAL) { + /* currently, we use only ebp as base */ + if (c == (char)c) { + /* short reference */ + o(0x45 | op_reg); + g(c); + } else { + oad(0x85 | op_reg, c); + } + } else { + g(0x00 | op_reg | (r & VT_VALMASK)); + } +} + + +/* load 'r' from value 'sv' */ +void load(int r, SValue *sv) +{ + int v, t, ft, fc, fr; + SValue v1; + + fr = sv->r; + ft = sv->type.t; + fc = sv->c.ul; + + v = fr & VT_VALMASK; + if (fr & VT_LVAL) { + if (v == VT_LLOCAL) { + v1.type.t = VT_PTR; + v1.r = VT_LOCAL | VT_LVAL; + v1.c.ul = fc; + load(r, &v1); + fr = r; + } + if ((ft & VT_BTYPE) == VT_FLOAT) { + o(0x6e0f66); /* movd */ + r = 0; + } else if ((ft & VT_BTYPE) == VT_DOUBLE) { + o(0x7e0ff3); /* movq */ + r = 0; + } else if ((ft & VT_BTYPE) == VT_LDOUBLE) { + o(0xdb); /* fldt */ + r = 5; + } else if ((ft & VT_TYPE) == VT_BYTE) { + o(0xbe0f); /* movsbl */ + } else if ((ft & VT_TYPE) == (VT_BYTE | VT_UNSIGNED)) { + o(0xb60f); /* movzbl */ + } else if ((ft & VT_TYPE) == VT_SHORT) { + o(0xbf0f); /* movswl */ + } else if ((ft & VT_TYPE) == (VT_SHORT | VT_UNSIGNED)) { + o(0xb70f); /* movzwl */ + } else if (is64_type(ft)) { + gen_modrm64(0x8b, r, fr, sv->sym, fc); + return; + } else { + o(0x8b); /* movl */ + } + gen_modrm(r, fr, sv->sym, fc); + } else { + if (v == VT_CONST) { + if ((ft & VT_TYPE) == VT_LLONG) { + o(0x48); + o(0xb8 + REG_VALUE(r)); /* mov $xx, r */ + gen_addr64(fr, sv->sym, sv->c.ull); + } else { + o(0xc748); + o(0xc0 + REG_VALUE(r)); /* mov $xx, r */ + gen_addr32(fr, sv->sym, fc); + } + } 
else if (v == VT_LOCAL) { + o(0x48 | REX_BASE(r)); + o(0x8d); /* lea xxx(%ebp), r */ + gen_modrm(r, VT_LOCAL, sv->sym, fc); + } else if (v == VT_CMP) { + oad(0xb8 + r, 0); /* mov $0, r */ + o(0x0f); /* setxx %br */ + o(fc); + o(0xc0 + r); + } else if (v == VT_JMP || v == VT_JMPI) { + t = v & 1; + oad(0xb8 + r, t); /* mov $1, r */ + o(0x05eb); /* jmp after */ + gsym(fc); + oad(0xb8 + r, t ^ 1); /* mov $0, r */ + } else if (v != r) { + if (r == TREG_XMM0) { + assert(v == TREG_ST0); + /* gen_cvt_ftof(VT_DOUBLE); */ + o(0xf0245cdd); /* fstpl -0x10(%rsp) */ + /* movsd -0x10(%rsp),%xmm0 */ + o(0x44100ff2); + o(0xf024); + } else if (r == TREG_ST0) { + assert(v == TREG_XMM0); + /* gen_cvt_ftof(VT_LDOUBLE); */ + /* movsd %xmm0,-0x10(%rsp) */ + o(0x44110ff2); + o(0xf024); + o(0xf02444dd); /* fldl -0x10(%rsp) */ + } else { + o(0x48 | REX_BASE(r) | (REX_BASE(v) << 2)); + o(0x89); + o(0xc0 + r + v * 8); /* mov v, r */ + } + } + } +} + +/* store register 'r' in lvalue 'v' */ +void store(int r, SValue *v) +{ + int fr, bt, ft, fc; + int op64 = 0; + + ft = v->type.t; + fc = v->c.ul; + fr = v->r & VT_VALMASK; + bt = ft & VT_BTYPE; + /* XXX: incorrect if float reg to reg */ + if (bt == VT_FLOAT) { + o(0x7e0f66); /* movd */ + r = 0; + } else if (bt == VT_DOUBLE) { + o(0xd60f66); /* movq */ + r = 0; + } else if (bt == VT_LDOUBLE) { + o(0xc0d9); /* fld %st(0) */ + o(0xdb); /* fstpt */ + r = 7; + } else { + if (bt == VT_SHORT) + o(0x66); + if (bt == VT_BYTE || bt == VT_BOOL) + o(0x88); + else if (is64_type(bt)) + op64 = 0x89; + else + o(0x89); + } + if (op64) { + if (fr == VT_CONST || + fr == VT_LOCAL || + (v->r & VT_LVAL)) { + gen_modrm64(op64, r, v->r, v->sym, fc); + } else if (fr != r) { + /* XXX: don't we really come here? */ + abort(); + o(0xc0 + fr + r * 8); /* mov r, fr */ + } + } else { + if (fr == VT_CONST || + fr == VT_LOCAL || + (v->r & VT_LVAL)) { + gen_modrm(r, v->r, v->sym, fc); + } else if (fr != r) { + /* XXX: don't we really come here? 
*/ + abort(); + o(0xc0 + fr + r * 8); /* mov r, fr */ + } + } +} + +static void gadd_sp(int val) +{ + if (val == (char)val) { + o(0xc48348); + g(val); + } else { + oad(0xc48148, val); /* add $xxx, %rsp */ + } +} + +/* 'is_jmp' is '1' if it is a jump */ +static void gcall_or_jmp(int is_jmp) +{ + int r; + if ((vtop->r & (VT_VALMASK | VT_LVAL)) == VT_CONST) { + /* constant case */ + if (vtop->r & VT_SYM) { + /* relocation case */ + greloc(cur_text_section, vtop->sym, + ind + 1, R_X86_64_PC32); + } else { + /* put an empty PC32 relocation */ + put_elf_reloc(symtab_section, cur_text_section, + ind + 1, R_X86_64_PC32, 0); + } + oad(0xe8 + is_jmp, vtop->c.ul - 4); /* call/jmp im */ + } else { + /* otherwise, indirect call */ + r = TREG_R11; + load(r, vtop); + o(0x41); /* REX */ + o(0xff); /* call/jmp *r */ + o(0xd0 + REG_VALUE(r) + (is_jmp << 4)); + } +} + +static uint8_t arg_regs[6] = { + TREG_RDI, TREG_RSI, TREG_RDX, TREG_RCX, TREG_R8, TREG_R9 +}; +/* Generate function call. The function address is pushed first, then + all the parameters in call order. This functions pops all the + parameters and the function address. */ +void gfunc_call(int nb_args) +{ + int size, align, r, args_size, i, func_call; + Sym *func_sym; + SValue *orig_vtop; + int nb_reg_args = 0; + int nb_sse_args = 0; + int sse_reg, gen_reg; + + /* calculate the number of integer/float arguments */ + args_size = 0; + for(i = 0; i < nb_args; i++) { + if ((vtop[-i].type.t & VT_BTYPE) == VT_STRUCT) { + args_size += type_size(&vtop->type, &align); + } else if ((vtop[-i].type.t & VT_BTYPE) == VT_LDOUBLE) { + args_size += 16; + } else if (is_sse_float(vtop[-i].type.t)) { + nb_sse_args++; + if (nb_sse_args > 8) args_size += 8; + } else { + nb_reg_args++; + if (nb_reg_args > 6) args_size += 8; + } + } + + /* for struct arguments, we need to call memcpy and the function + call breaks register passing arguments we are preparing. + So, we process arguments which will be passed by stack first. 
*/ + orig_vtop = vtop; + gen_reg = nb_reg_args; + sse_reg = nb_sse_args; + /* adjust stack to align SSE boundary */ + if (args_size &= 8) { + o(0x50); /* push $rax */ + } + for(i = 0; i < nb_args; i++) { + if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) { + size = type_size(&vtop->type, &align); + /* align to stack align size */ + size = (size + 3) & ~3; + /* allocate the necessary size on stack */ + o(0x48); + oad(0xec81, size); /* sub $xxx, %rsp */ + /* generate structure store */ + r = get_reg(RC_INT); + o(0x48 + REX_BASE(r)); + o(0x89); /* mov %rsp, r */ + o(0xe0 + r); + { + /* following code breaks vtop[1] */ + SValue tmp = vtop[1]; + vset(&vtop->type, r | VT_LVAL, 0); + vswap(); + vstore(); + vtop[1] = tmp; + } + args_size += size; + } else if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { + gv(RC_ST0); + size = LDOUBLE_SIZE; + oad(0xec8148, size); /* sub $xxx, %rsp */ + o(0x7cdb); /* fstpt 0(%rsp) */ + g(0x24); + g(0x00); + args_size += size; + } else if (is_sse_float(vtop->type.t)) { + int j = --sse_reg; + if (j >= 8) { + gv(RC_FLOAT); + o(0x50); /* push $rax */ + /* movq %xmm0, (%rsp) */ + o(0x04d60f66); + o(0x24); + args_size += 8; + } + } else { + int j = --gen_reg; + /* simple type */ + /* XXX: implicit cast ? */ + if (j >= 6) { + r = gv(RC_INT); + o(0x50 + r); /* push r */ + args_size += 8; + } + } + vtop--; + } + vtop = orig_vtop; + + /* then, we prepare register passing arguments. + Note that we cannot set RDX and RCX in this loop because gv() + may break these temporary registers. 
Let's use R10 and R11 + instead of them */ + gen_reg = nb_reg_args; + sse_reg = nb_sse_args; + for(i = 0; i < nb_args; i++) { + if ((vtop->type.t & VT_BTYPE) == VT_STRUCT || + (vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { + } else if (is_sse_float(vtop->type.t)) { + int j = --sse_reg; + if (j < 8) { + gv(RC_FLOAT); /* only one float register */ + /* movaps %xmm0, %xmmN */ + o(0x280f); + o(0xc0 + (sse_reg << 3)); + } + } else { + int j = --gen_reg; + /* simple type */ + /* XXX: implicit cast ? */ + if (j < 6) { + r = gv(RC_INT); + if (j < 2) { + o(0x8948); /* mov */ + o(0xc0 + r * 8 + arg_regs[j]); + } else if (j < 4) { + o(0x8949); /* mov */ + /* j=2: r10, j=3: r11 */ + o(0xc0 + r * 8 + j); + } else { + o(0x8949); /* mov */ + /* j=4: r8, j=5: r9 */ + o(0xc0 + r * 8 + j - 4); + } + } + } + vtop--; + } + + /* Copy R10 and R11 into RDX and RCX, respectively */ + if (nb_reg_args > 2) { + o(0xd2894c); /* mov %r10, %rdx */ + if (nb_reg_args > 3) { + o(0xd9894c); /* mov %r11, %rcx */ + } + } + + save_regs(0); /* save used temporary registers */ + + func_sym = vtop->type.ref; + func_call = FUNC_CALL(func_sym->r); + oad(0xb8, nb_sse_args < 8 ? nb_sse_args : 8); /* mov nb_sse_args, %eax */ + gcall_or_jmp(0); + if (args_size) + gadd_sp(args_size); + vtop--; +} + +#ifdef TCC_TARGET_PE +/* XXX: support PE? 
*/ +#warning "PE isn't tested at all" +#define FUNC_PROLOG_SIZE 12 +#else +#define FUNC_PROLOG_SIZE 11 +#endif + +static void push_arg_reg(int i) { + loc -= 8; + gen_modrm64(0x89, arg_regs[i], VT_LOCAL, NULL, loc); +} + +/* generate function prolog of type 't' */ +void gfunc_prolog(CType *func_type) +{ + int i, addr, align, size, func_call; + int param_index, param_addr, reg_param_index, sse_param_index; + Sym *sym; + CType *type; + + func_ret_sub = 0; + + sym = func_type->ref; + func_call = FUNC_CALL(sym->r); + addr = PTR_SIZE * 2; + loc = 0; + ind += FUNC_PROLOG_SIZE; + func_sub_sp_offset = ind; + + if (func_type->ref->c == FUNC_ELLIPSIS) { + int seen_reg_num, seen_sse_num, seen_stack_size; + seen_reg_num = seen_sse_num = 0; + /* frame pointer and return address */ + seen_stack_size = PTR_SIZE * 2; + /* count the number of seen parameters */ + sym = func_type->ref; + while ((sym = sym->next) != NULL) { + type = &sym->type; + if (is_sse_float(type->t)) { + if (seen_sse_num < 8) { + seen_sse_num++; + } else { + seen_stack_size += 8; + } + } else if ((type->t & VT_BTYPE) == VT_STRUCT) { + size = type_size(type, &align); + size = (size + 3) & ~3; + seen_stack_size += size; + } else if ((type->t & VT_BTYPE) == VT_LDOUBLE) { + seen_stack_size += LDOUBLE_SIZE; + } else { + if (seen_reg_num < 6) { + seen_reg_num++; + } else { + seen_stack_size += 8; + } + } + } + + loc -= 16; + /* movl $0x????????, -0x10(%rbp) */ + o(0xf045c7); + gen_le32(seen_reg_num * 8); + /* movl $0x????????, -0xc(%rbp) */ + o(0xf445c7); + gen_le32(seen_sse_num * 16 + 48); + /* movl $0x????????, -0x8(%rbp) */ + o(0xf845c7); + gen_le32(seen_stack_size); + + /* save all register passing arguments */ + for (i = 0; i < 8; i++) { + loc -= 16; + o(0xd60f66); /* movq */ + gen_modrm(7 - i, VT_LOCAL, NULL, loc); + /* movq $0, loc+8(%rbp) */ + o(0x85c748); + gen_le32(loc + 8); + gen_le32(0); + } + for (i = 0; i < 6; i++) { + push_arg_reg(5 - i); + } + } + + sym = func_type->ref; + param_index = 0; + 
reg_param_index = 0; + sse_param_index = 0; + + /* if the function returns a structure, then add an + implicit pointer parameter */ + func_vt = sym->type; + if ((func_vt.t & VT_BTYPE) == VT_STRUCT) { + push_arg_reg(reg_param_index); + param_addr = loc; + + func_vc = loc; + param_index++; + reg_param_index++; + } + /* define parameters */ + while ((sym = sym->next) != NULL) { + type = &sym->type; + size = type_size(type, &align); + size = (size + 3) & ~3; + if (is_sse_float(type->t)) { + if (sse_param_index < 8) { + /* save arguments passed by register */ + loc -= 8; + o(0xd60f66); /* movq */ + gen_modrm(sse_param_index, VT_LOCAL, NULL, loc); + param_addr = loc; + } else { + param_addr = addr; + addr += size; + } + sse_param_index++; + } else if ((type->t & VT_BTYPE) == VT_STRUCT || + (type->t & VT_BTYPE) == VT_LDOUBLE) { + param_addr = addr; + addr += size; + } else { + if (reg_param_index < 6) { + /* save arguments passed by register */ + push_arg_reg(reg_param_index); + param_addr = loc; + } else { + param_addr = addr; + addr += 8; + } + reg_param_index++; + } + sym_push(sym->v & ~SYM_FIELD, type, + VT_LOCAL | VT_LVAL, param_addr); + param_index++; + } +} + +/* generate function epilog */ +void gfunc_epilog(void) +{ + int v, saved_ind; + + o(0xc9); /* leave */ + if (func_ret_sub == 0) { + o(0xc3); /* ret */ + } else { + o(0xc2); /* ret n */ + g(func_ret_sub); + g(func_ret_sub >> 8); + } + /* align local size to word & save local variables */ + v = (-loc + 15) & -16; + saved_ind = ind; + ind = func_sub_sp_offset - FUNC_PROLOG_SIZE; +#ifdef TCC_TARGET_PE + if (v >= 4096) { + Sym *sym = external_global_sym(TOK___chkstk, &func_old_type, 0); + oad(0xb8, v); /* mov stacksize, %eax */ + oad(0xe8, -4); /* call __chkstk, (does the stackframe too) */ + greloc(cur_text_section, sym, ind-4, R_X86_64_PC32); + } else +#endif + { + o(0xe5894855); /* push %rbp, mov %rsp, %rbp */ + o(0xec8148); /* sub rsp, stacksize */ + gen_le32(v); +#if FUNC_PROLOG_SIZE == 12 + o(0x90); /* 
adjust to FUNC_PROLOG_SIZE */ +#endif + } + ind = saved_ind; +} + +/* generate a jump to a label */ +int gjmp(int t) +{ + return psym(0xe9, t); +} + +/* generate a jump to a fixed address */ +void gjmp_addr(int a) +{ + int r; + r = a - ind - 2; + if (r == (char)r) { + g(0xeb); + g(r); + } else { + oad(0xe9, a - ind - 5); + } +} + +/* generate a test. set 'inv' to invert test. Stack entry is popped */ +int gtst(int inv, int t) +{ + int v, *p; + + v = vtop->r & VT_VALMASK; + if (v == VT_CMP) { + /* fast case : can jump directly since flags are set */ + g(0x0f); + t = psym((vtop->c.i - 16) ^ inv, t); + } else if (v == VT_JMP || v == VT_JMPI) { + /* && or || optimization */ + if ((v & 1) == inv) { + /* insert vtop->c jump list in t */ + p = &vtop->c.i; + while (*p != 0) + p = (int *)(cur_text_section->data + *p); + *p = t; + t = vtop->c.i; + } else { + t = gjmp(t); + gsym(vtop->c.i); + } + } else { + /* XXX: not tested */ + if (is_float(vtop->type.t) || + (vtop->type.t & VT_BTYPE) == VT_LLONG) { + vpushi(0); + gen_op(TOK_NE); + } + if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) { + /* constant jmp optimization */ + if ((vtop->c.i != 0) != inv) + t = gjmp(t); + } else { + v = gv(RC_INT); + o(0x85); + o(0xc0 + v * 9); + g(0x0f); + t = psym(0x85 ^ inv, t); + } + } + vtop--; + return t; +} + +/* generate an integer binary operation */ +void gen_opi(int op) +{ + int r, fr, opc, c; + + switch(op) { + case '+': + case TOK_ADDC1: /* add with carry generation */ + opc = 0; + gen_op8: + if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST && + !is64_type(vtop->type.t)) { + /* constant case */ + vswap(); + r = gv(RC_INT); + if (is64_type(vtop->type.t)) { + o(0x48 | REX_BASE(r)); + } + vswap(); + c = vtop->c.i; + if (c == (char)c) { + /* XXX: generate inc and dec for smaller code ? 
*/ + o(0x83); + o(0xc0 | (opc << 3) | REG_VALUE(r)); + g(c); + } else { + o(0x81); + oad(0xc0 | (opc << 3) | REG_VALUE(r), c); + } + } else { + gv2(RC_INT, RC_INT); + r = vtop[-1].r; + fr = vtop[0].r; + if (opc != 7 || + is64_type(vtop[0].type.t) || (vtop[0].type.t & VT_UNSIGNED) || + is64_type(vtop[-1].type.t) || (vtop[-1].type.t & VT_UNSIGNED)) { + o(0x48 | REX_BASE(r) | (REX_BASE(fr) << 2)); + } + o((opc << 3) | 0x01); + o(0xc0 + REG_VALUE(r) + REG_VALUE(fr) * 8); + } + vtop--; + if (op >= TOK_ULT && op <= TOK_GT) { + vtop->r = VT_CMP; + vtop->c.i = op; + } + break; + case '-': + case TOK_SUBC1: /* sub with carry generation */ + opc = 5; + goto gen_op8; + case TOK_ADDC2: /* add with carry use */ + opc = 2; + goto gen_op8; + case TOK_SUBC2: /* sub with carry use */ + opc = 3; + goto gen_op8; + case '&': + opc = 4; + goto gen_op8; + case '^': + opc = 6; + goto gen_op8; + case '|': + opc = 1; + goto gen_op8; + case '*': + gv2(RC_INT, RC_INT); + r = vtop[-1].r; + fr = vtop[0].r; + if (is64_type(vtop[0].type.t) || (vtop[0].type.t & VT_UNSIGNED) || + is64_type(vtop[-1].type.t) || (vtop[-1].type.t & VT_UNSIGNED)) { + o(0x48 | REX_BASE(fr) | (REX_BASE(r) << 2)); + } + vtop--; + o(0xaf0f); /* imul fr, r */ + o(0xc0 + fr + r * 8); + break; + case TOK_SHL: + opc = 4; + goto gen_shift; + case TOK_SHR: + opc = 5; + goto gen_shift; + case TOK_SAR: + opc = 7; + gen_shift: + opc = 0xc0 | (opc << 3); + if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) { + /* constant case */ + vswap(); + r = gv(RC_INT); + if ((vtop->type.t & VT_BTYPE) == VT_LLONG) { + o(0x48 | REX_BASE(r)); + c = 0x3f; + } else { + c = 0x1f; + } + vswap(); + c &= vtop->c.i; + o(0xc1); /* shl/shr/sar $xxx, r */ + o(opc | r); + g(c); + } else { + /* we generate the shift in ecx */ + gv2(RC_INT, RC_RCX); + r = vtop[-1].r; + if ((vtop[-1].type.t & VT_BTYPE) == VT_LLONG) { + o(0x48 | REX_BASE(r)); + } + o(0xd3); /* shl/shr/sar %cl, r */ + o(opc | r); + } + vtop--; + break; + case '/': + case TOK_UDIV: + 
case TOK_PDIV: + case '%': + case TOK_UMOD: + case TOK_UMULL: + /* first operand must be in eax */ + /* XXX: need better constraint for second operand */ + gv2(RC_RAX, RC_RCX); + r = vtop[-1].r; + fr = vtop[0].r; + vtop--; + save_reg(TREG_RDX); + if (op == TOK_UMULL) { + o(0xf7); /* mul fr */ + o(0xe0 + fr); + vtop->r2 = TREG_RDX; + r = TREG_RAX; + } else { + if (op == TOK_UDIV || op == TOK_UMOD) { + o(0xf7d231); /* xor %edx, %edx, div fr, %eax */ + o(0xf0 + fr); + } else { + if ((vtop->type.t & VT_BTYPE) == VT_LLONG) { + o(0x9948); /* cqto */ + o(0x48 + REX_BASE(fr)); + } else { + o(0x99); /* cltd */ + } + o(0xf7); /* idiv fr, %eax */ + o(0xf8 + fr); + } + if (op == '%' || op == TOK_UMOD) + r = TREG_RDX; + else + r = TREG_RAX; + } + vtop->r = r; + break; + default: + opc = 7; + goto gen_op8; + } +} + +void gen_opl(int op) +{ + gen_opi(op); +} + +/* generate a floating point operation 'v = t1 op t2' instruction. The + two operands are guaranteed to have the same floating point type */ +/* XXX: need to use ST1 too */ +void gen_opf(int op) +{ + int a, ft, fc, swapped, r; + int float_type = + (vtop->type.t & VT_BTYPE) == VT_LDOUBLE ? 
RC_ST0 : RC_FLOAT; + + /* convert constants to memory references */ + if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) { + vswap(); + gv(float_type); + vswap(); + } + if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) + gv(float_type); + + /* must put at least one value in the floating point register */ + if ((vtop[-1].r & VT_LVAL) && + (vtop[0].r & VT_LVAL)) { + vswap(); + gv(float_type); + vswap(); + } + swapped = 0; + /* swap the stack if needed so that t1 is the register and t2 is + the memory reference */ + if (vtop[-1].r & VT_LVAL) { + vswap(); + swapped = 1; + } + if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { + if (op >= TOK_ULT && op <= TOK_GT) { + /* load on stack second operand */ + load(TREG_ST0, vtop); + save_reg(TREG_RAX); /* eax is used by FP comparison code */ + if (op == TOK_GE || op == TOK_GT) + swapped = !swapped; + else if (op == TOK_EQ || op == TOK_NE) + swapped = 0; + if (swapped) + o(0xc9d9); /* fxch %st(1) */ + o(0xe9da); /* fucompp */ + o(0xe0df); /* fnstsw %ax */ + if (op == TOK_EQ) { + o(0x45e480); /* and $0x45, %ah */ + o(0x40fC80); /* cmp $0x40, %ah */ + } else if (op == TOK_NE) { + o(0x45e480); /* and $0x45, %ah */ + o(0x40f480); /* xor $0x40, %ah */ + op = TOK_NE; + } else if (op == TOK_GE || op == TOK_LE) { + o(0x05c4f6); /* test $0x05, %ah */ + op = TOK_EQ; + } else { + o(0x45c4f6); /* test $0x45, %ah */ + op = TOK_EQ; + } + vtop--; + vtop->r = VT_CMP; + vtop->c.i = op; + } else { + /* no memory reference possible for long double operations */ + load(TREG_ST0, vtop); + swapped = !swapped; + + switch(op) { + default: + case '+': + a = 0; + break; + case '-': + a = 4; + if (swapped) + a++; + break; + case '*': + a = 1; + break; + case '/': + a = 6; + if (swapped) + a++; + break; + } + ft = vtop->type.t; + fc = vtop->c.ul; + o(0xde); /* fxxxp %st, %st(1) */ + o(0xc1 + (a << 3)); + vtop--; + } + } else { + if (op >= TOK_ULT && op <= TOK_GT) { + /* if saved lvalue, then we must reload it */ + r = vtop->r; + fc = vtop->c.ul; + if 
((r & VT_VALMASK) == VT_LLOCAL) { + SValue v1; + r = get_reg(RC_INT); + v1.type.t = VT_INT; + v1.r = VT_LOCAL | VT_LVAL; + v1.c.ul = fc; + load(r, &v1); + fc = 0; + } + + if (op == TOK_EQ || op == TOK_NE) { + swapped = 0; + } else { + if (op == TOK_LE || op == TOK_LT) + swapped = !swapped; + if (op == TOK_LE || op == TOK_GE) { + op = 0x93; /* setae */ + } else { + op = 0x97; /* seta */ + } + } + + if (swapped) { + o(0x7e0ff3); /* movq */ + gen_modrm(1, r, vtop->sym, fc); + + if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE) { + o(0x66); + } + o(0x2e0f); /* ucomisd %xmm0, %xmm1 */ + o(0xc8); + } else { + if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE) { + o(0x66); + } + o(0x2e0f); /* ucomisd */ + gen_modrm(0, r, vtop->sym, fc); + } + + vtop--; + vtop->r = VT_CMP; + vtop->c.i = op; + } else { + /* no memory reference possible for long double operations */ + if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) { + load(TREG_XMM0, vtop); + swapped = !swapped; + } + switch(op) { + default: + case '+': + a = 0; + break; + case '-': + a = 4; + break; + case '*': + a = 1; + break; + case '/': + a = 6; + break; + } + ft = vtop->type.t; + fc = vtop->c.ul; + if ((ft & VT_BTYPE) == VT_LDOUBLE) { + o(0xde); /* fxxxp %st, %st(1) */ + o(0xc1 + (a << 3)); + } else { + /* if saved lvalue, then we must reload it */ + r = vtop->r; + if ((r & VT_VALMASK) == VT_LLOCAL) { + SValue v1; + r = get_reg(RC_INT); + v1.type.t = VT_INT; + v1.r = VT_LOCAL | VT_LVAL; + v1.c.ul = fc; + load(r, &v1); + fc = 0; + } + if (swapped) { + /* movq %xmm0,%xmm1 */ + o(0x7e0ff3); + o(0xc8); + load(TREG_XMM0, vtop); + /* subsd %xmm1,%xmm0 (f2 0f 5c c1) */ + if ((ft & VT_BTYPE) == VT_DOUBLE) { + o(0xf2); + } else { + o(0xf3); + } + o(0x0f); + o(0x58 + a); + o(0xc1); + } else { + if ((ft & VT_BTYPE) == VT_DOUBLE) { + o(0xf2); + } else { + o(0xf3); + } + o(0x0f); + o(0x58 + a); + gen_modrm(0, r, vtop->sym, fc); + } + } + vtop--; + } + } +} + +/* convert integers to fp 't' type. 
Must handle 'int', 'unsigned int' + and 'long long' cases. */ +void gen_cvt_itof(int t) +{ + if ((t & VT_BTYPE) == VT_LDOUBLE) { + save_reg(TREG_ST0); + gv(RC_INT); + if ((vtop->type.t & VT_BTYPE) == VT_LLONG) { + /* signed long long to float/double/long double (unsigned case + is handled generically) */ + o(0x50 + (vtop->r & VT_VALMASK)); /* push r */ + o(0x242cdf); /* fildll (%rsp) */ + o(0x08c48348); /* add $8, %rsp */ + } else if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) == + (VT_INT | VT_UNSIGNED)) { + /* unsigned int to float/double/long double */ + o(0x6a); /* push $0 */ + g(0x00); + o(0x50 + (vtop->r & VT_VALMASK)); /* push r */ + o(0x242cdf); /* fildll (%rsp) */ + o(0x10c48348); /* add $16, %rsp */ + } else { + /* int to float/double/long double */ + o(0x50 + (vtop->r & VT_VALMASK)); /* push r */ + o(0x2404db); /* fildl (%rsp) */ + o(0x08c48348); /* add $8, %rsp */ + } + vtop->r = TREG_ST0; + } else { + save_reg(TREG_XMM0); + gv(RC_INT); + o(0xf2 + ((t & VT_BTYPE) == VT_FLOAT)); + if ((vtop->type.t & (VT_BTYPE | VT_UNSIGNED)) == + (VT_INT | VT_UNSIGNED) || + (vtop->type.t & VT_BTYPE) == VT_LLONG) { + o(0x48); /* REX */ + } + o(0x2a0f); + o(0xc0 + (vtop->r & VT_VALMASK)); /* cvtsi2sd */ + vtop->r = TREG_XMM0; + } +} + +/* convert from one floating point type to another */ +void gen_cvt_ftof(int t) +{ + int ft, bt, tbt; + + ft = vtop->type.t; + bt = ft & VT_BTYPE; + tbt = t & VT_BTYPE; + + if (bt == VT_FLOAT) { + gv(RC_FLOAT); + if (tbt == VT_DOUBLE) { + o(0xc0140f); /* unpcklps */ + o(0xc05a0f); /* cvtps2pd */ + } else if (tbt == VT_LDOUBLE) { + /* movss %xmm0,-0x10(%rsp) */ + o(0x44110ff3); + o(0xf024); + o(0xf02444d9); /* flds -0x10(%rsp) */ + vtop->r = TREG_ST0; + } + } else if (bt == VT_DOUBLE) { + gv(RC_FLOAT); + if (tbt == VT_FLOAT) { + o(0xc0140f66); /* unpcklpd */ + o(0xc05a0f66); /* cvtpd2ps */ + } else if (tbt == VT_LDOUBLE) { + /* movsd %xmm0,-0x10(%rsp) */ + o(0x44110ff2); + o(0xf024); + o(0xf02444dd); /* fldl -0x10(%rsp) */ + vtop->r = 
TREG_ST0; + } + } else { + gv(RC_ST0); + if (tbt == VT_DOUBLE) { + o(0xf0245cdd); /* fstpl -0x10(%rsp) */ + /* movsd -0x10(%rsp),%xmm0 */ + o(0x44100ff2); + o(0xf024); + vtop->r = TREG_XMM0; + } else if (tbt == VT_FLOAT) { + o(0xf0245cd9); /* fstps -0x10(%rsp) */ + /* movss -0x10(%rsp),%xmm0 */ + o(0x44100ff3); + o(0xf024); + vtop->r = TREG_XMM0; + } + } +} + +/* convert fp to int 't' type */ +void gen_cvt_ftoi(int t) +{ + int ft, bt, size, r; + ft = vtop->type.t; + bt = ft & VT_BTYPE; + if (bt == VT_LDOUBLE) { + gen_cvt_ftof(VT_DOUBLE); + bt = VT_DOUBLE; + } + + gv(RC_FLOAT); + if (t != VT_INT) + size = 8; + else + size = 4; + + r = get_reg(RC_INT); + if (bt == VT_FLOAT) { + o(0xf3); + } else if (bt == VT_DOUBLE) { + o(0xf2); + } else { + assert(0); + } + if (size == 8) { + o(0x48 + REX_BASE(r)); + } + o(0x2c0f); /* cvttss2si or cvttsd2si */ + o(0xc0 + (REG_VALUE(r) << 3)); + vtop->r = r; +} + +/* computed goto support */ +void ggoto(void) +{ + gcall_or_jmp(1); + vtop--; +} + +/* end of x86-64 code generator */ +/*************************************************************/ -- 2.11.4.GIT