From d85d250fa229d5ceb73ce5fedfd1087727f53a79 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 4 May 2008 17:53:31 -0700 Subject: [PATCH] First cut at AVX machinery. First cut at AVX machinery support. The only instruction implemented is VPERMIL2PS, and it's probably buggy. I'm checking this in with the hope that other people can start helping out with (a) testing this, and (b) adding instructions. NDISASM support is not there yet. --- assemble.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---- disasm.c | 10 +++++ insns.dat | 6 +++ insns.h | 2 + insns.pl | 3 +- nasm.c | 12 +++--- nasm.h | 27 ++++++++------ nasmlib.c | 6 +-- preproc.c | 4 +- test/avx.asm | 9 +++++ wsaa.h | 6 +-- 11 files changed, 169 insertions(+), 35 deletions(-) create mode 100644 test/avx.asm diff --git a/assemble.c b/assemble.c index 60fe369e..0c3f5f1b 100644 --- a/assemble.c +++ b/assemble.c @@ -48,11 +48,25 @@ * kindly to a zero byte in the _middle_ of a compile time * string constant, so I had to put this hack in.) * \171 - placement of DREX suffix in the absence of an EA + * \172\ab - the register number from operand a in bits 7..4, with + * the 4-bit immediate from operand b in bits 0..3. * \2ab - a ModRM, calculated on EA in operand a, with the spare * field equal to digit b. * \250..\253 - same as \150..\153, except warn if the 64-bit operand * is not equal to the truncated and sign-extended 32-bit * operand; used for 32-bit immediates in 64-bit mode. + * \260..\263 - this instruction uses VEX rather than REX, with the + * V field taken from operand 0..3. + * \270 - this instruction uses VEX rather than REX, with the + * V field set to 1111b. + * + * VEX prefixes are followed by the sequence: + * \1mm\1wp where mm is the M field; and wp is: + * 01 0ww lpp + * ww = 0 for W = 0 + * ww = 1 for W = 1 + * ww = 2 for W used as REX.W + * * \310 - indicates fixed 16-bit address size, i.e. optional 0x67. * \311 - indicates fixed 32-bit address size, i.e. optional 0x67. 
* \312 - (disassembler only) marker on LOOP, LOOPxx instructions. @@ -190,7 +204,7 @@ static void out(int64_t offset, int32_t segto, const void *data, errfunc(ERR_PANIC, "OUT_ADDRESS with size > 8"); return; } - + WRITEADDR(q, *(int64_t *)data, size); data = p; type = OUT_RAWDATA; @@ -964,7 +978,7 @@ static int64_t calcsize(int32_t segment, int64_t offset, int bits, case 0163: length++; ins->rex |= REX_D; - ins->drexdst = regval(&ins->oprs[c & 3]); + ins->drexdst = regval(opx); break; case 0164: case 0165: @@ -972,19 +986,40 @@ static int64_t calcsize(int32_t segment, int64_t offset, int bits, case 0167: length++; ins->rex |= REX_D|REX_OC; - ins->drexdst = regval(&ins->oprs[c & 3]); + ins->drexdst = regval(opx); break; case 0170: length++; break; case 0171: break; + case 0172: + codes++; + length++; + break; case 0250: case 0251: case 0252: case 0253: length += is_sbyte64(ins, c & 3) ? 1 : 4; break; + case 0260: + case 0261: + case 0262: + case 0263: + length += 2; + ins->rex |= REX_V; + ins->drexdst = regval(opx); + ins->vex_m = *codes++; + ins->vex_wlp = *codes++; + break; + case 0270: + length += 2; + ins->rex |= REX_V; + ins->drexdst = 0; + ins->vex_m = *codes++; + ins->vex_wlp = *codes++; + break; case 0300: case 0301: case 0302: @@ -1093,12 +1128,40 @@ static int64_t calcsize(int32_t segment, int64_t offset, int bits, ins->rex &= rex_mask; - if (ins->rex & REX_D) { + if (ins->rex & REX_V) { + int bad32 = REX_R|REX_W|REX_X|REX_B; + + if (ins->rex & REX_H) { + errfunc(ERR_NONFATAL, "cannot use high register in vex instruction"); + return -1; + } + switch (ins->vex_wlp & 030) { + case 000: + ins->rex &= ~REX_W; + break; + case 010: + ins->rex |= REX_W; + bad32 &= ~REX_W; + break; + default: + /* Follow REX_W */ + break; + } + + if (bits != 64 && ((ins->rex & bad32) || ins->drexdst > 7)) { + errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode"); + return -1; + } + if (ins->vex_m != 1 || (ins->rex & (REX_W|REX_R|REX_B))) + length += 3; + else + length += 2; 
+ } else if (ins->rex & REX_D) { if (ins->rex & REX_H) { errfunc(ERR_NONFATAL, "cannot use high register in drex instruction"); return -1; } - if (bits != 64 && ((ins->rex & (REX_W|REX_X|REX_B)) || + if (bits != 64 && ((ins->rex & (REX_R|REX_W|REX_X|REX_B)) || ins->drexdst > 7)) { errfunc(ERR_NONFATAL, "invalid operands in non-64-bit mode"); return -1; @@ -1126,7 +1189,7 @@ static int64_t calcsize(int32_t segment, int64_t offset, int bits, } #define EMIT_REX() \ - if (!(ins->rex & REX_D) && (ins->rex & REX_REAL) && (bits == 64)) { \ + if (!(ins->rex & (REX_D|REX_V)) && (ins->rex & REX_REAL) && (bits == 64)) { \ ins->rex = (ins->rex & REX_REAL)|REX_P; \ out(offset, segment, &ins->rex, OUT_RAWDATA, 1, NO_SEG, NO_SEG); \ ins->rex = 0; \ @@ -1507,6 +1570,26 @@ static void gencode(int32_t segment, int64_t offset, int bits, offset++; break; + case 0172: + c = *codes++; + opx = &ins->oprs[c >> 3]; + bytes[0] = regvals[opx->basereg] << 4; + opx = &ins->oprs[c & 7]; + if (opx->segment != NO_SEG || opx->wrt != NO_SEG) { + errfunc(ERR_NONFATAL, + "non-absolute expression not permitted as argument %d", + c & 7); + } else { + if (opx->offset & ~15) { + errfunc(ERR_WARNING | ERR_WARN_NOV, + "four-bit argument exceeds bounds"); + } + bytes[0] |= opx->offset & 15; + } + out(offset, segment, bytes, OUT_RAWDATA, 1, NO_SEG, NO_SEG); + offset++; + break; + case 0250: case 0251: case 0252: @@ -1525,6 +1608,28 @@ static void gencode(int32_t segment, int64_t offset, int bits, } break; + case 0260: + case 0261: + case 0262: + case 0263: + case 0270: + codes += 2; + if (ins->vex_m != 1 || (ins->rex & (REX_W|REX_X|REX_B))) { + bytes[0] = 0xc4; + bytes[1] = ins->vex_m | ((ins->rex & 7) << 5); + bytes[2] = ((ins->rex & REX_W) << (7-3)) | + (ins->drexdst << 3) | (ins->vex_wlp & 07); + out(offset, segment, &bytes, OUT_RAWDATA, 3, NO_SEG, NO_SEG); + offset += 3; + } else { + bytes[0] = 0xc5; + bytes[1] = ((ins->rex & REX_R) << (7-2)) | + (ins->drexdst << 3) | (ins->vex_wlp & 07); + out(offset, 
segment, &bytes, OUT_RAWDATA, 2, NO_SEG, NO_SEG); + offset += 2; + } + break; + case 0300: case 0301: case 0302: @@ -1887,7 +1992,7 @@ static int matches(const struct itemplate *itemp, insn * instruction, int bits) int32_t type = instruction->oprs[i].type; if (!(type & SIZE_MASK)) type |= size[i]; - + if (itemp->opd[i] & SAME_AS) { int j = itemp->opd[i] & ~SAME_AS; if (type != instruction->oprs[j].type || diff --git a/disasm.c b/disasm.c index e6b97ad7..8263d508 100644 --- a/disasm.c +++ b/disasm.c @@ -643,6 +643,16 @@ static int matches(const struct itemplate *t, uint8_t *data, return false; break; + case 0172: + { + uint8_t ximm = *data++; + c = *r++; + ins->oprs[c >> 3].basereg = ximm >> 4; + ins->oprs[c >> 3].segment |= SEG_RMREG; + ins->oprs[c & 7].offset = ximm & 15; + } + break; + case4(0200): case4(0204): case4(0210): diff --git a/insns.dat b/insns.dat index 23027035..7d8e050d 100644 --- a/insns.dat +++ b/insns.dat @@ -2023,6 +2023,12 @@ GETSEC void \2\x0F\x37 KATMAI PFRCP mmxreg,mmxrm \323\2\x0F\x0F\110\1\x86 PENT,3DNOW,SQ,CYRIX PFRSQRT mmxreg,mmxrm \323\2\x0F\x0F\110\1\x87 PENT,3DNOW,SQ,CYRIX +;# Intel AVX instructions +VPERMIL2PS xmmreg,xmmreg,xmmrm,xmmreg,imm \260\103\101\1\x48\123\172\34 AVX,SANDYBANKS +VPERMIL2PS xmmreg,xmmreg,xmmreg,xmmrm,imm \260\103\111\1\x48\132\172\14 AVX,SANDYBANKS +VPERMIL2PS ymmreg,ymmreg,ymmrm,ymmreg,imm \260\103\105\1\x48\123\172\34 AVX,SANDYBANKS +VPERMIL2PS ymmreg,ymmreg,ymmreg,ymmrm,imm \260\103\115\1\x48\132\172\14 AVX,SANDYBANKS + ;# VIA (Centaur) security instructions XSTORE void \3\x0F\xA7\xC0 PENT,CYRIX XCRYPTECB void \333\3\x0F\xA7\xC8 PENT,CYRIX diff --git a/insns.h b/insns.h index 49cc1851..7a0ecff7 100644 --- a/insns.h +++ b/insns.h @@ -98,6 +98,7 @@ extern const struct disasm_index itable[256]; #define IF_SSE41 0x00800000UL /* it's an SSE4.1 instruction */ #define IF_SSE42 0x00800000UL /* HACK NEED TO REORGANIZE THESE BITS */ #define IF_SSE5 0x00800000UL /* HACK NEED TO REORGANIZE THESE BITS */ +#define IF_AVX 
0x00800000UL /* HACK NEED TO REORGANIZE THESE BITS */ #define IF_PMASK 0xFF000000UL /* the mask for processor types */ #define IF_PLEVEL 0x0F000000UL /* the mask for processor instr. level */ /* also the highest possible processor */ @@ -114,6 +115,7 @@ extern const struct disasm_index itable[256]; #define IF_PRESCOTT 0x09000000UL /* Prescott instructions */ #define IF_X86_64 0x0A000000UL /* x86-64 instruction (long or legacy mode) */ #define IF_NEHALEM 0x0B000000UL /* Nehalem instruction */ +#define IF_SANDYBANKS 0x0C000000UL /* Sandy Banks instruction */ #define IF_X64 (IF_LONG|IF_X86_64) #define IF_IA64 0x0F000000UL /* IA64 instructions (in x86 mode) */ #define IF_CYRIX 0x10000000UL /* Cyrix-specific instruction */ diff --git a/insns.pl b/insns.pl index 8192e90e..3218ef4e 100644 --- a/insns.pl +++ b/insns.pl @@ -241,8 +241,7 @@ sub format { $operands =~ s/imm(\d+)/imm|bits$1/g; $operands =~ s/imm/immediate/g; $operands =~ s/rm(\d+)/rm_gpr|bits$1/g; - $operands =~ s/mmxrm/rm_mmx/g; - $operands =~ s/xmmrm/rm_xmm/g; + $operands =~ s/(mmx|xmm|ymm)rm/rm_$1/g; $operands =~ s/\=([0-9]+)/same_as|$1/g; if ($operands eq 'void') { @ops = (); diff --git a/nasm.c b/nasm.c index 2af74ef1..85865e8e 100644 --- a/nasm.c +++ b/nasm.c @@ -216,7 +216,7 @@ static void define_macros_early(void) strftime(temp, sizeof temp, "__UTC_TIME_NUM__=%H%M%S", &gm); pp_pre_define(temp); } - + if (gm_p) posix_time = posix_mktime(&gm); else if (lt_p) @@ -502,7 +502,7 @@ static bool process_arg(char *p, char *q) case 'O': /* Optimization level */ { int opt; - + if (!*param) { /* Naked -O == -Ox */ optimizing = INT_MAX >> 1; /* Almost unlimited */ @@ -512,7 +512,7 @@ static bool process_arg(char *p, char *q) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': opt = strtoul(param, &param, 10); - + /* -O0 -> optimizing == -1, 0.98 behaviour */ /* -O1 -> optimizing == 0, 0.98.09 behaviour */ if (opt < 2) @@ -520,18 +520,18 @@ static bool process_arg(char *p,
char *q) else optimizing = opt; break; - + case 'v': case '+': param++; opt_verbose_info = true; break; - + case 'x': param++; optimizing = INT_MAX >> 1; /* Almost unlimited */ break; - + default: report_error(ERR_FATAL, "unknown optimization option -O%c\n", diff --git a/nasm.h b/nasm.h index 0f8d51fc..052bbe73 100644 --- a/nasm.h +++ b/nasm.h @@ -540,7 +540,7 @@ typedef uint32_t opflags_t; #define IP_REL 0x0002c000U /* IP-relative offset */ /* memory which matches any type of r/m operand */ -#define MEMORY_ANY (MEMORY|RM_GPR|RM_MMX|RM_XMM) +#define MEMORY_ANY (MEMORY|RM_GPR|RM_MMX|RM_XMM|RM_YMM) /* special type of immediate operand */ #define UNITY 0x00012000U /* for shift/rotate instructions */ @@ -564,16 +564,17 @@ enum ccode { /* condition code names */ /* * REX flags */ -#define REX_OC 0x0200 /* DREX suffix has the OC0 bit set */ -#define REX_D 0x0100 /* Instruction uses DREX instead of REX */ -#define REX_H 0x80 /* High register present, REX forbidden */ -#define REX_P 0x40 /* REX prefix present/required */ -#define REX_L 0x20 /* Use LOCK prefix instead of REX.R */ -#define REX_W 0x08 /* 64-bit operand size */ -#define REX_R 0x04 /* ModRM reg extension */ -#define REX_X 0x02 /* SIB index extension */ -#define REX_B 0x01 /* ModRM r/m extension */ #define REX_REAL 0x4f /* Actual REX prefix bits */ +#define REX_B 0x01 /* ModRM r/m extension */ +#define REX_X 0x02 /* SIB index extension */ +#define REX_R 0x04 /* ModRM reg extension */ +#define REX_W 0x08 /* 64-bit operand size */ +#define REX_L 0x20 /* Use LOCK prefix instead of REX.R */ +#define REX_P 0x40 /* REX prefix present/required */ +#define REX_H 0x80 /* High register present, REX forbidden */ +#define REX_D 0x0100 /* Instruction uses DREX instead of REX */ +#define REX_OC 0x0200 /* DREX suffix has the OC0 bit set */ +#define REX_V 0x0400 /* Instruction uses VEX instead of REX */ /* * Note that because segment registers may be used as instruction @@ -651,7 +652,7 @@ enum prefix_pos { MAXPREFIX /* Total 
number of prefix slots */ }; -#define MAX_OPERANDS 4 +#define MAX_OPERANDS 5 typedef struct insn { /* an instruction itself */ char *label; /* the label defined, or NULL */ @@ -667,7 +668,9 @@ typedef struct insn { /* an instruction itself */ int32_t times; /* repeat count (TIMES prefix) */ int forw_ref; /* is there a forward reference? */ int rex; /* Special REX Prefix */ - int drexdst; /* Destination register for DREX suffix */ + int drexdst; /* Destination register for DREX/VEX suffix */ + int vex_m; /* M field for VEX prefix */ + int vex_wlp; /* W, P and L information for VEX prefix */ } insn; enum geninfo { GI_SWITCH }; diff --git a/nasmlib.c b/nasmlib.c index d5cf207f..03a28bb9 100644 --- a/nasmlib.c +++ b/nasmlib.c @@ -671,7 +671,7 @@ void saa_wleb128u(struct SAA *psaa, int value) ptemp++; len++; } while (value != 0); - saa_wbytes(psaa, temp, len); + saa_wbytes(psaa, temp, len); } /* write signed LEB128 value to SAA */ @@ -703,8 +703,8 @@ void saa_wleb128s(struct SAA *psaa, int value) *ptemp = byte; ptemp++; len++; - } - saa_wbytes(psaa, temp, len); + } + saa_wbytes(psaa, temp, len); } void saa_rewind(struct SAA *s) diff --git a/preproc.c b/preproc.c index 3a12fc84..f3ef2729 100644 --- a/preproc.c +++ b/preproc.c @@ -1585,14 +1585,14 @@ static bool if_condition(Token * tline, enum preproc_token ct) iftype: t = tline = expand_smacro(tline); - + while (tok_type_(t, TOK_WHITESPACE) || (needtype == TOK_NUMBER && tok_type_(t, TOK_OTHER) && (t->text[0] == '-' || t->text[0] == '+') && !t->text[1])) t = t->next; - + j = tok_type_(t, needtype); break; diff --git a/test/avx.asm b/test/avx.asm new file mode 100644 index 00000000..9f35940c --- /dev/null +++ b/test/avx.asm @@ -0,0 +1,9 @@ + bits 64 + vpermil2ps xmm0,xmm1,[rdi],xmm3,0 + vpermil2ps xmm0,xmm1,xmm2,[rdi],1 + vpermil2ps ymm0,ymm1,ymm2,ymm3,2 + vpermil2ps ymm0,ymm1,ymm2,[rdi],3 + vpermil2ps ymm0,ymm1,[rdi],ymm3,2 + vpermil2ps ymm0,ymm1,ymm2,[rdi],3 + + \ No newline at end of file diff --git a/wsaa.h
b/wsaa.h index 69124d69..63f8335d 100644 --- a/wsaa.h +++ b/wsaa.h @@ -32,13 +32,13 @@ #else /* !X86_MEMORY */ -#define WSAACHAR(s,p,v) \ +#define WSAACHAR(s,p,v) \ do { \ *(uint8_t *)(p) = (v); \ saa_wbytes(s, p, 1); \ } while (0) -#define WSAASHORT(s,p,v) \ +#define WSAASHORT(s,p,v) \ do { \ uint16_t _wss_v = (v); \ uint8_t *_wss_p = (uint8_t *)(p); \ @@ -58,7 +58,7 @@ saa_wbytes(s, _wsl_p, 4); \ } while (0) -#define WSAADLONG(s,p,v) \ +#define WSAADLONG(s,p,v) \ do { \ uint64_t _wsq_v = (v); \ uint8_t *_wsq_p = (uint8_t *)(p); \ -- 2.11.4.GIT