From 472b067e39a11a47ae3fa7cd7d3142558f78969d Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Sun, 17 Mar 2019 21:41:42 +0100 Subject: [PATCH] amd64: Implement RDRAND, VCVTPH2PS and VCVTPS2PH. Bug 398870 - Please add support for instruction vcvtps2ph Bug 353370 - RDRAND amd64->IR: unhandled instruction bytes: 0x48 0xF 0xC7 0xF0 This commit implements: * amd64 RDRAND instruction, on hosts that have it. * amd64 VCVTPH2PS and VCVTPS2PH, on hosts that have them. The presence/absence of these on the host is now reflected in the CPUID results returned to the guest. So code that tests for these features in CPUID and acts accordingly should "just work". * New test cases, none/tests/amd64/rdrand and none/tests/amd64/f16c. These are built if the host's assembler can handle them, in the usual way. --- VEX/priv/guest_amd64_defs.h | 8 +- VEX/priv/guest_amd64_helpers.c | 43 +- VEX/priv/guest_amd64_toIR.c | 482 +++++++++++++----- VEX/priv/host_amd64_defs.c | 55 ++- VEX/priv/host_amd64_defs.h | 5 +- VEX/priv/host_amd64_isel.c | 73 ++- VEX/priv/ir_defs.c | 12 + VEX/priv/main_main.c | 2 + VEX/pub/libvex.h | 2 + VEX/pub/libvex_ir.h | 8 +- configure.ac | 47 +- coregrind/m_machine.c | 22 +- memcheck/mc_translate.c | 81 ++++ memcheck/tests/vbit-test/irops.c | 3 + none/tests/amd64/Makefile.am | 8 + none/tests/amd64/f16c.c | 198 ++++++++ none/tests/amd64/f16c.stderr.exp | 0 none/tests/amd64/f16c.stdout.exp | 972 +++++++++++++++++++++++++++++++++++++ none/tests/amd64/f16c.vgtest | 3 + none/tests/amd64/rdrand.c | 116 +++++ none/tests/amd64/rdrand.stderr.exp | 0 none/tests/amd64/rdrand.stdout.exp | 33 ++ none/tests/amd64/rdrand.vgtest | 3 + tests/x86_amd64_features.c | 10 +- 24 files changed, 2031 insertions(+), 155 deletions(-) create mode 100644 none/tests/amd64/f16c.c create mode 100644 none/tests/amd64/f16c.stderr.exp create mode 100644 none/tests/amd64/f16c.stdout.exp create mode 100644 none/tests/amd64/f16c.vgtest create mode 100644 none/tests/amd64/rdrand.c create mode 100644 
none/tests/amd64/rdrand.stderr.exp create mode 100644 none/tests/amd64/rdrand.stdout.exp create mode 100644 none/tests/amd64/rdrand.vgtest diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h index 88593e65d..169b122ba 100644 --- a/VEX/priv/guest_amd64_defs.h +++ b/VEX/priv/guest_amd64_defs.h @@ -168,7 +168,8 @@ extern void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st ); extern void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st ); extern void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st ); extern void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st ); -extern void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st ); +extern void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st, + ULong hasF16C, ULong hasRDRAND ); extern void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ); @@ -192,6 +193,11 @@ extern void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, extern void amd64g_dirtyhelper_SxDT ( void* address, ULong op /* 0 or 1 */ ); +// This returns a 32-bit value from the host's RDRAND in bits 31:0, and the +// resulting C flag value in bit 32. +extern ULong amd64g_dirtyhelper_RDRAND ( void ); + + /* Helps with PCMP{I,E}STR{I,M}. CALLED FROM GENERATED CODE: DIRTY HELPER(s). 
(But not really, diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index f12b71e94..c7a0719c6 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -3313,8 +3313,11 @@ void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st ) address sizes : 39 bits physical, 48 bits virtual power management: */ -void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st ) +void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st, + ULong hasF16C, ULong hasRDRAND ) { + vassert((hasF16C >> 1) == 0ULL); + vassert((hasRDRAND >> 1) == 0ULL); # define SET_ABCD(_a,_b,_c,_d) \ do { st->guest_RAX = (ULong)(_a); \ st->guest_RBX = (ULong)(_b); \ @@ -3329,10 +3332,14 @@ void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st ) case 0x00000000: SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69); break; - case 0x00000001: - /* Don't advertise RDRAND support, bit 30 in ECX. */ - SET_ABCD(0x000306c3, 0x02100800, 0x3ffafbff, 0xbfebfbff); + case 0x00000001: { + // As a baseline, advertise neither F16C (ecx:29) nor RDRAND (ecx:30), + // but patch in support for them as directed by the caller. + UInt ecx_extra + = (hasF16C ? (1U << 29) : 0) | (hasRDRAND ? (1U << 30) : 0); + SET_ABCD(0x000306c3, 0x02100800, (0x1ffafbff | ecx_extra), 0xbfebfbff); break; + } case 0x00000002: SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000); break; @@ -3740,6 +3747,34 @@ void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) { # endif } +/* CALLED FROM GENERATED CODE */ +/* DIRTY HELPER (non-referentially-transparent) */ +/* Horrible hack. On non-amd64 platforms, do nothing. On amd64 targets, get a + 32 bit random number using RDRAND, and return it and the associated rflags.C + value. 
*/ +ULong amd64g_dirtyhelper_RDRAND ( void ) { +# if defined(__x86_64__) + ULong res = 0; + ULong cflag = 0; + __asm__ __volatile__( + "movq $0, %%r11 ; " + "movq $0, %%r12 ; " + "rdrand %%r11d ; " + "setc %%r12b ; " + "movq %%r11, %0 ; " + "movq %%r12, %1" + : "=r"(res), "=r"(cflag) : : "r11", "r12" + ); + res &= 0xFFFFFFFFULL; + cflag &= 1ULL; + return (cflag << 32) | res; +# else + /* There's nothing we can sensibly do. Return a value denoting + "I succeeded, and the random bits are all zero" :-/ */ + return 1ULL << 32; +# endif +} + /*---------------------------------------------------------------*/ /*--- Helpers for MMX/SSE/SSE2. ---*/ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 664cad605..7e57933d5 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -761,6 +761,13 @@ static Bool haveF2noF3 ( Prefix pfx ) toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2); } +/* Return True iff pfx has F2 and F3 clear */ +static Bool haveNoF2noF3 ( Prefix pfx ) +{ + return + toBool((pfx & (PFX_F2|PFX_F3)) == 0); +} + /* Return True iff pfx has 66, F2 and F3 clear */ static Bool haveNo66noF2noF3 ( Prefix pfx ) { @@ -16931,6 +16938,41 @@ static Long dis_xTESTy_256 ( const VexAbiInfo* vbi, Prefix pfx, } +/* Handles 128 and 256 bit versions of VCVTPH2PS. */ +static Long dis_VCVTPH2PS ( const VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool is256bit ) +{ + /* This is a width-doubling load or reg-reg move, that does conversion on the + transferred data. */ + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp srcE = newTemp(is256bit ? Ity_V128 : Ity_I64); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + assign(srcE, is256bit ? unop(Iop_V256toV128_0, getYMMReg(rE)) + : unop(Iop_V128to64, getXMMReg(rE))); + delta += 1; + DIP("vcvtph2ps %s,%s\n", nameXMMReg(rE), + (is256bit ? 
nameYMMReg: nameXMMReg)(rG)); + } else { + Int alen = 0; + HChar dis_buf[50]; + IRTemp addr = disAMode(&alen, vbi, pfx, delta, dis_buf, 0); + // I don't think we need an alignment check here (not 100% sure tho.) + assign(srcE, loadLE(is256bit ? Ity_V128 : Ity_I64, mkexpr(addr))); + delta += alen; + DIP( "vcvtph2ps %s,%s\n", dis_buf, + (is256bit ? nameYMMReg: nameXMMReg)(rG)); + } + + IRExpr* res = unop(is256bit ? Iop_F16toF32x8 : Iop_F16toF32x4, mkexpr(srcE)); + (is256bit ? putYMMReg : putYMMRegLoAndZU)(rG, res); + + return delta; +} + + /* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */ static Long dis_PMOVxXBW_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) @@ -21966,8 +22008,18 @@ Long dis_ESC_0F ( } vassert(fName); vassert(fAddr); - d = unsafeIRDirty_0_N ( 0/*regparms*/, - fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) ); + IRExpr** args = NULL; + if (fAddr == &amd64g_dirtyhelper_CPUID_avx2) { + Bool hasF16C = (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C) != 0; + Bool hasRDRAND = (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDRAND) != 0; + args = mkIRExprVec_3(IRExpr_GSPTR(), + mkIRExpr_HWord(hasF16C ? 1 : 0), + mkIRExpr_HWord(hasRDRAND ? 1 : 0)); + } else { + args = mkIRExprVec_1(IRExpr_GSPTR()); + } + d = unsafeIRDirty_0_N ( 0/*regparms*/, fName, fAddr, args ); + /* declare guest state effects */ d->nFxState = 4; vex_bzero(&d->fxState, sizeof(d->fxState)); @@ -22169,141 +22221,233 @@ Long dis_ESC_0F ( return delta; } - case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */ - IRType elemTy = sz==4 ? Ity_I32 : Ity_I64; - IRTemp expdHi = newTemp(elemTy); - IRTemp expdLo = newTemp(elemTy); - IRTemp dataHi = newTemp(elemTy); - IRTemp dataLo = newTemp(elemTy); - IRTemp oldHi = newTemp(elemTy); - IRTemp oldLo = newTemp(elemTy); - IRTemp flags_old = newTemp(Ity_I64); - IRTemp flags_new = newTemp(Ity_I64); - IRTemp success = newTemp(Ity_I1); - IROp opOR = sz==4 ? Iop_Or32 : Iop_Or64; - IROp opXOR = sz==4 ? 
Iop_Xor32 : Iop_Xor64; - IROp opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64; - IRExpr* zero = sz==4 ? mkU32(0) : mkU64(0); - IRTemp expdHi64 = newTemp(Ity_I64); - IRTemp expdLo64 = newTemp(Ity_I64); - - /* Translate this using a DCAS, even if there is no LOCK - prefix. Life is too short to bother with generating two - different translations for the with/without-LOCK-prefix - cases. */ - *expect_CAS = True; - - /* Decode, and generate address. */ - if (have66(pfx)) goto decode_failure; - if (sz != 4 && sz != 8) goto decode_failure; - if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) - goto decode_failure; + case 0xC7: { modrm = getUChar(delta); - if (epartIsReg(modrm)) goto decode_failure; - if (gregLO3ofRM(modrm) != 1) goto decode_failure; - if (haveF2orF3(pfx)) { - /* Since the e-part is memory only, F2 or F3 (one or the - other) is acceptable if LOCK is also present. But only - for cmpxchg8b. */ - if (sz == 8) goto decode_failure; - if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure; - } - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - delta += alen; + // Detecting valid CMPXCHG combinations is pretty complex. + Bool isValidCMPXCHG = gregLO3ofRM(modrm) == 1; + if (isValidCMPXCHG) { + if (have66(pfx)) isValidCMPXCHG = False; + if (sz != 4 && sz != 8) isValidCMPXCHG = False; + if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) + isValidCMPXCHG = False; + if (epartIsReg(modrm)) isValidCMPXCHG = False; + if (haveF2orF3(pfx)) { + /* Since the e-part is memory only, F2 or F3 (one or the + other) is acceptable if LOCK is also present. But only + for cmpxchg8b. */ + if (sz == 8) isValidCMPXCHG = False; + if (haveF2andF3(pfx) || !haveLOCK(pfx)) isValidCMPXCHG = False; + } + } + + /* 0F C7 /1 (with qualifications) = CMPXCHG */ + if (isValidCMPXCHG) { + // Note that we've already read the modrm byte by this point, but we + // haven't moved delta past it. + IRType elemTy = sz==4 ? 
Ity_I32 : Ity_I64; + IRTemp expdHi = newTemp(elemTy); + IRTemp expdLo = newTemp(elemTy); + IRTemp dataHi = newTemp(elemTy); + IRTemp dataLo = newTemp(elemTy); + IRTemp oldHi = newTemp(elemTy); + IRTemp oldLo = newTemp(elemTy); + IRTemp flags_old = newTemp(Ity_I64); + IRTemp flags_new = newTemp(Ity_I64); + IRTemp success = newTemp(Ity_I1); + IROp opOR = sz==4 ? Iop_Or32 : Iop_Or64; + IROp opXOR = sz==4 ? Iop_Xor32 : Iop_Xor64; + IROp opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64; + IRExpr* zero = sz==4 ? mkU32(0) : mkU64(0); + IRTemp expdHi64 = newTemp(Ity_I64); + IRTemp expdLo64 = newTemp(Ity_I64); + + /* Translate this using a DCAS, even if there is no LOCK + prefix. Life is too short to bother with generating two + different translations for the with/without-LOCK-prefix + cases. */ + *expect_CAS = True; - /* cmpxchg16b requires an alignment check. */ - if (sz == 8) - gen_SEGV_if_not_16_aligned( addr ); + /* Generate address */ + vassert(!epartIsReg(modrm)); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; - /* Get the expected and new values. */ - assign( expdHi64, getIReg64(R_RDX) ); - assign( expdLo64, getIReg64(R_RAX) ); - - /* These are the correctly-sized expected and new values. - However, we also get expdHi64/expdLo64 above as 64-bits - regardless, because we will need them later in the 32-bit - case (paradoxically). */ - assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64)) - : mkexpr(expdHi64) ); - assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64)) - : mkexpr(expdLo64) ); - assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) ); - assign( dataLo, sz==4 ? 
getIReg32(R_RBX) : getIReg64(R_RBX) ); - - /* Do the DCAS */ - stmt( IRStmt_CAS( - mkIRCAS( oldHi, oldLo, - Iend_LE, mkexpr(addr), - mkexpr(expdHi), mkexpr(expdLo), - mkexpr(dataHi), mkexpr(dataLo) - ))); - - /* success when oldHi:oldLo == expdHi:expdLo */ - assign( success, - binop(opCasCmpEQ, - binop(opOR, - binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)), - binop(opXOR, mkexpr(oldLo), mkexpr(expdLo)) - ), - zero - )); - - /* If the DCAS is successful, that is to say oldHi:oldLo == - expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX, - which is where they came from originally. Both the actual - contents of these two regs, and any shadow values, are - unchanged. If the DCAS fails then we're putting into - RDX:RAX the value seen in memory. */ - /* Now of course there's a complication in the 32-bit case - (bah!): if the DCAS succeeds, we need to leave RDX:RAX - unchanged; but if we use the same scheme as in the 64-bit - case, we get hit by the standard rule that a write to the - bottom 32 bits of an integer register zeros the upper 32 - bits. And so the upper halves of RDX and RAX mysteriously - become zero. So we have to stuff back in the original - 64-bit values which we previously stashed in - expdHi64:expdLo64, even if we're doing a cmpxchg8b. */ - /* It's just _so_ much fun ... */ - putIRegRDX( 8, - IRExpr_ITE( mkexpr(success), - mkexpr(expdHi64), - sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi)) - : mkexpr(oldHi) - )); - putIRegRAX( 8, - IRExpr_ITE( mkexpr(success), - mkexpr(expdLo64), - sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo)) - : mkexpr(oldLo) - )); - - /* Copy the success bit into the Z flag and leave the others - unchanged */ - assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all())); - assign( - flags_new, - binop(Iop_Or64, - binop(Iop_And64, mkexpr(flags_old), - mkU64(~AMD64G_CC_MASK_Z)), - binop(Iop_Shl64, - binop(Iop_And64, - unop(Iop_1Uto64, mkexpr(success)), mkU64(1)), - mkU8(AMD64G_CC_SHIFT_Z)) )); + /* cmpxchg16b requires an alignment check. 
*/ + if (sz == 8) + gen_SEGV_if_not_16_aligned( addr ); - stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); - stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) )); - stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); - /* Set NDEP even though it isn't used. This makes - redundant-PUT elimination of previous stores to this field - work better. */ - stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + /* Get the expected and new values. */ + assign( expdHi64, getIReg64(R_RDX) ); + assign( expdLo64, getIReg64(R_RAX) ); + + /* These are the correctly-sized expected and new values. + However, we also get expdHi64/expdLo64 above as 64-bits + regardless, because we will need them later in the 32-bit + case (paradoxically). */ + assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64)) + : mkexpr(expdHi64) ); + assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64)) + : mkexpr(expdLo64) ); + assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) ); + assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) ); + + /* Do the DCAS */ + stmt( IRStmt_CAS( + mkIRCAS( oldHi, oldLo, + Iend_LE, mkexpr(addr), + mkexpr(expdHi), mkexpr(expdLo), + mkexpr(dataHi), mkexpr(dataLo) + ))); - /* Sheesh. Aren't you glad it was me and not you that had to - write and validate all this grunge? */ + /* success when oldHi:oldLo == expdHi:expdLo */ + assign( success, + binop(opCasCmpEQ, + binop(opOR, + binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)), + binop(opXOR, mkexpr(oldLo), mkexpr(expdLo)) + ), + zero + )); + + /* If the DCAS is successful, that is to say oldHi:oldLo == + expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX, + which is where they came from originally. Both the actual + contents of these two regs, and any shadow values, are + unchanged. If the DCAS fails then we're putting into + RDX:RAX the value seen in memory. 
*/ + /* Now of course there's a complication in the 32-bit case + (bah!): if the DCAS succeeds, we need to leave RDX:RAX + unchanged; but if we use the same scheme as in the 64-bit + case, we get hit by the standard rule that a write to the + bottom 32 bits of an integer register zeros the upper 32 + bits. And so the upper halves of RDX and RAX mysteriously + become zero. So we have to stuff back in the original + 64-bit values which we previously stashed in + expdHi64:expdLo64, even if we're doing a cmpxchg8b. */ + /* It's just _so_ much fun ... */ + putIRegRDX( 8, + IRExpr_ITE( mkexpr(success), + mkexpr(expdHi64), + sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi)) + : mkexpr(oldHi) + )); + putIRegRAX( 8, + IRExpr_ITE( mkexpr(success), + mkexpr(expdLo64), + sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo)) + : mkexpr(oldLo) + )); + + /* Copy the success bit into the Z flag and leave the others + unchanged */ + assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all())); + assign( + flags_new, + binop(Iop_Or64, + binop(Iop_And64, mkexpr(flags_old), + mkU64(~AMD64G_CC_MASK_Z)), + binop(Iop_Shl64, + binop(Iop_And64, + unop(Iop_1Uto64, mkexpr(success)), mkU64(1)), + mkU8(AMD64G_CC_SHIFT_Z)) )); - DIP("cmpxchg8b %s\n", dis_buf); - return delta; + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + /* Set NDEP even though it isn't used. This makes + redundant-PUT elimination of previous stores to this field + work better. */ + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + + /* Sheesh. Aren't you glad it was me and not you that had to + write and validate all this grunge? 
*/ + + DIP("cmpxchg8b %s\n", dis_buf); + return delta; + } // if (isValidCMPXCHG) + + /* 0F C7 /6 no-F2-or-F3 = RDRAND */ + if (gregLO3ofRM(modrm) == 6/*RDRAND*/ + && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDRAND) + && epartIsReg(modrm) && haveNoF2noF3(pfx) + && (sz == 8 || sz == 4 || sz == 2)) { + delta++; // move past modrm + IRType ty = szToITy(sz); + + // Pull a first 32 bits of randomness, plus C flag, out of the host. + IRTemp pairLO = newTemp(Ity_I64); + IRDirty* dLO + = unsafeIRDirty_1_N(pairLO, 0/*regparms*/, + "amd64g_dirtyhelper_RDRAND", + &amd64g_dirtyhelper_RDRAND, mkIRExprVec_0()); + // There are no guest state or memory effects to declare for |dLO|. + stmt( IRStmt_Dirty(dLO) ); + + IRTemp randsLO = newTemp(Ity_I32); + assign(randsLO, unop(Iop_64to32, mkexpr(pairLO))); + IRTemp cLO = newTemp(Ity_I64); + assign(cLO, binop(Iop_Shr64, mkexpr(pairLO), mkU8(32))); + + // We'll assemble the final pairing in (cFinal, randsNearlyFinal). + IRTemp randsNearlyFinal = newTemp(Ity_I64); + IRTemp cFinal = newTemp(Ity_I64); + + if (ty == Ity_I64) { + // Pull another 32 bits of randomness out of the host. + IRTemp pairHI = newTemp(Ity_I64); + IRDirty* dHI + = unsafeIRDirty_1_N(pairHI, 0/*regparms*/, + "amd64g_dirtyhelper_RDRAND", + &amd64g_dirtyhelper_RDRAND, mkIRExprVec_0()); + // There are no guest state or memory effects to declare for |dHI|. + stmt( IRStmt_Dirty(dHI) ); + + IRTemp randsHI = newTemp(Ity_I32); + assign(randsHI, unop(Iop_64to32, mkexpr(pairHI))); + IRTemp cHI = newTemp(Ity_I64); + assign(cHI, binop(Iop_Shr64, mkexpr(pairHI), mkU8(32))); + assign(randsNearlyFinal, binop(Iop_32HLto64, + mkexpr(randsHI), mkexpr(randsLO))); + assign(cFinal, binop(Iop_And64, + binop(Iop_And64, mkexpr(cHI), mkexpr(cLO)), + mkU64(1))); + } else { + assign(randsNearlyFinal, unop(Iop_32Uto64, mkexpr(randsLO))); + assign(cFinal, binop(Iop_And64, mkexpr(cLO), mkU64(1))); + } + + /* Now cFinal[0] is the final success/failure flag (cFinal[0] == 1 + means success). 
But there's another twist. If we failed then the + returned value must be forced to zero. Otherwise we could have the + situation, when sz==8, where one of the host calls failed but the + other didn't. This would give cFinal[0] == 0 (correctly) but + randsNearlyFinal not being zero, because it contains the 32 bit + result of the non-failing call. */ + IRTemp randsFinal = newTemp(Ity_I64); + assign(randsFinal, + binop(Iop_And64, + mkexpr(randsNearlyFinal), + binop(Iop_Sar64, + binop(Iop_Shl64, mkexpr(cFinal), mkU8(63)), + mkU8(63)) + )); + + // So, finally, update the guest state. + putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(randsFinal))); + + // Set C=cFinal[0], O,S,Z,A,P = 0. cFinal has already been + // masked so only the lowest bit remains. + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(cFinal) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + + DIP("rdrand %s\n", nameIRegE(sz, pfx, modrm)); + return delta; + } + + goto decode_failure; } case 0xC8: /* BSWAP %eax */ @@ -28354,6 +28498,23 @@ Long dis_ESC_0F38__VEX ( } break; + case 0x13: + /* VCVTPH2PS xmm2/m64, xmm1 = VEX.128.66.0F38.W0 13 /r */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/ + && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) { + delta = dis_VCVTPH2PS( vbi, pfx, delta, /*is256bit=*/False ); + goto decode_success; + } + /* VCVTPH2PS xmm2/m128, ymm1 = VEX.256.66.0F38.W0 13 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/ + && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) { + delta = dis_VCVTPH2PS( vbi, pfx, delta, /*is256bit=*/True ); + goto decode_success; + } + break; + case 0x16: /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */ if (have66noF2noF3(pfx) @@ -30283,6 +30444,50 @@ static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 ) return res; } +/* Handles 128 and 256 bit versions of VCVTPS2PH. 
*/ +static Long dis_VCVTPS2PH ( const VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool is256bit ) +{ + /* This is a width-halving store or reg-reg move, that does conversion on the + transferred data. */ + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp rm = newTemp(Ity_I32); + IROp op = is256bit ? Iop_F32toF16x8 : Iop_F32toF16x4; + IRExpr* srcG = (is256bit ? getYMMReg : getXMMReg)(rG); + + /* (imm & 3) contains an Intel-encoded rounding mode. Because that encoding + is the same as the encoding for IRRoundingMode, we can use that value + directly in the IR as a rounding mode. */ + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + UInt imm = getUChar(delta); + assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3)); + IRExpr* res = binop(op, mkexpr(rm), srcG); + if (!is256bit) + res = unop(Iop_64UtoV128, res); + putYMMRegLoAndZU(rE, res); + DIP("vcvtps2ph $%u,%s,%s\n", + imm, (is256bit ? nameYMMReg : nameXMMReg)(rG), nameXMMReg(rE)); + } else { + Int alen = 0; + HChar dis_buf[50]; + IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + UInt imm = getUChar(delta); + assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3)); + IRExpr* res = binop(op, mkexpr(rm), srcG); + storeLE(mkexpr(addr), res); + DIP("vcvtps2ph $%u,%s,%s\n", + imm, (is256bit ? 
nameYMMReg : nameXMMReg)(rG), dis_buf); + } + delta++; + /* doesn't use vvvv */ + return delta; +} + __attribute__((noinline)) static Long dis_ESC_0F3A__VEX ( @@ -31247,6 +31452,23 @@ Long dis_ESC_0F3A__VEX ( } break; + case 0x1D: + /* VCVTPS2PH imm8, xmm2, xmm1/m64 = VEX.128.66.0F3A.W0 1D /r ib */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/ + && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) { + delta = dis_VCVTPS2PH( vbi, pfx, delta, /*is256bit=*/False ); + goto decode_success; + } + /* VCVTPS2PH imm8, ymm2, xmm1/m128 = VEX.256.66.0F3A.W0 1D /r ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/ + && (archinfo->hwcaps & VEX_HWCAPS_AMD64_F16C)) { + delta = dis_VCVTPS2PH( vbi, pfx, delta, /*is256bit=*/True ); + goto decode_success; + } + break; + case 0x20: /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */ if (have66noF2noF3(pfx) diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index 9a2e2bd16..5cb49b151 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -590,6 +590,8 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) { case Asse_UNPCKLQ: return "punpcklq"; case Asse_PSHUFB: return "pshufb"; case Asse_PMADDUBSW: return "pmaddubsw"; + case Asse_F32toF16: return "vcvtps2ph(rm_field=$0x4)."; + case Asse_F16toF32: return "vcvtph2ps."; default: vpanic("showAMD64SseOp"); } } @@ -1672,7 +1674,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) || i->Ain.Sse32Fx4.op == Asse_RSQRTF || i->Ain.Sse32Fx4.op == Asse_SQRTF || i->Ain.Sse32Fx4.op == Asse_I2F - || i->Ain.Sse32Fx4.op == Asse_F2I ); + || i->Ain.Sse32Fx4.op == Asse_F2I + || i->Ain.Sse32Fx4.op == Asse_F32toF16 + || i->Ain.Sse32Fx4.op == Asse_F16toF32 ); addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src); addHRegUse(u, unary ? 
HRmWrite : HRmModify, i->Ain.Sse32Fx4.dst); @@ -3690,15 +3694,52 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, i->Ain.SseLdzLO.addr); goto done; - case Ain_Sse32Fx4: + case Ain_Sse32Fx4: { + UInt srcRegNo = vregEnc3210(i->Ain.Sse32Fx4.src); + UInt dstRegNo = vregEnc3210(i->Ain.Sse32Fx4.dst); + // VEX encoded cases + switch (i->Ain.Sse32Fx4.op) { + case Asse_F16toF32: { // vcvtph2ps %xmmS, %xmmD + UInt s = srcRegNo; + UInt d = dstRegNo; + // VCVTPH2PS %xmmS, %xmmD (s and d are both xmm regs, range 0 .. 15) + // 0xC4 : ~d3 1 ~s3 0 0 0 1 0 : 0x79 : 0x13 : 1 1 d2 d1 d0 s2 s1 s0 + UInt byte2 = ((((~d)>>3)&1)<<7) | (1<<6) + | ((((~s)>>3)&1)<<5) | (1<<1); + UInt byte5 = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0); + *p++ = 0xC4; + *p++ = byte2; + *p++ = 0x79; + *p++ = 0x13; + *p++ = byte5; + goto done; + } + case Asse_F32toF16: { // vcvtps2ph $4, %xmmS, %xmmD + UInt s = srcRegNo; + UInt d = dstRegNo; + // VCVTPS2PH $4, %xmmS, %xmmD (s and d both xmm regs, range 0 .. 15) + // 0xC4 : ~s3 1 ~d3 0 0 0 1 1 : 0x79 + // : 0x1D : 11 s2 s1 s0 d2 d1 d0 : 0x4 + UInt byte2 = ((((~s)>>3)&1)<<7) | (1<<6) + | ((((~d)>>3)&1)<<5) | (1<<1) | (1 << 0); + UInt byte5 = (1<<7) | (1<<6) | ((s&7) << 3) | ((d&7) << 0); + *p++ = 0xC4; + *p++ = byte2; + *p++ = 0x79; + *p++ = 0x1D; + *p++ = byte5; + *p++ = 0x04; + goto done; + } + default: break; + } + // After this point, REX encoded cases only xtra = 0; switch (i->Ain.Sse32Fx4.op) { case Asse_F2I: *p++ = 0x66; break; default: break; } - *p++ = clearWBit( - rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst), - vregEnc3210(i->Ain.Sse32Fx4.src) )); + *p++ = clearWBit(rexAMode_R_enc_enc(dstRegNo, srcRegNo)); *p++ = 0x0F; switch (i->Ain.Sse32Fx4.op) { case Asse_ADDF: *p++ = 0x58; break; @@ -3718,11 +3759,11 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break; default: goto bad; } - p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst), - vregEnc3210(i->Ain.Sse32Fx4.src) ); + p = 
doAMode_R_enc_enc(p, dstRegNo, srcRegNo); if (xtra & 0x100) *p++ = toUChar(xtra & 0xFF); goto done; + } case Ain_Sse64Fx2: xtra = 0; diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index 0a665fec0..93a6a5adf 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -345,7 +345,10 @@ typedef Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ, // Only for SSSE3 capable hosts: Asse_PSHUFB, - Asse_PMADDUBSW + Asse_PMADDUBSW, + // Only for F16C capable hosts: + Asse_F32toF16, // F32 to F16 conversion, aka vcvtps2ph + Asse_F16toF32, // F16 to F32 conversion, aka vcvtph2ps } AMD64SseOp; diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index 1f226d980..673909029 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -1434,6 +1434,19 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) return dst; } + // Half-float vector conversion + if (e->Iex.Binop.op == Iop_F32toF16x4 + && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) { + HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2); + HReg dstV = newVRegV(env); + HReg dstI = newVRegI(env); + set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); + addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV)); + set_SSE_rounding_default(env); + addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False)); + return dstI; + } + break; } @@ -3354,6 +3367,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) } case Iop_32UtoV128: { + // FIXME maybe just use MOVQ here? HReg dst = newVRegV(env); AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP()); AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg); @@ -3363,6 +3377,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) } case Iop_64UtoV128: { + // FIXME maybe just use MOVQ here? 
HReg dst = newVRegV(env); AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg); @@ -3379,6 +3394,17 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo; } + case Iop_F16toF32x4: { + if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) { + HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); + HReg dst = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True)); + addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst)); + return dst; + } + break; + } + default: break; } /* switch (e->Iex.Unop.op) */ @@ -3787,6 +3813,31 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) return dst; } + // Half-float vector conversion + case Iop_F32toF16x8: { + if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) { + HReg srcHi, srcLo; + iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); + addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi)); + addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo)); + set_SSE_rounding_default(env); + // Now we have the result in dstHi[63:0] and dstLo[63:0], but we + // need to compact all that into one register. There's probably a + // more elegant way to do this, but .. 
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi)); + // dstHi is now 127:64 = useful data, 63:0 = zero + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo)); + // dstLo is now 127:64 = zero, 63:0 = useful data + addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo)); + return dstLo; + } + break; + } + default: break; } /* switch (e->Iex.Binop.op) */ @@ -4017,6 +4068,24 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, return; } + case Iop_F16toF32x8: { + if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) { + HReg src = iselVecExpr(env, e->Iex.Unop.arg); + HReg srcCopy = newVRegV(env); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + // Copy src, since we'll need to modify it. + addInstr(env, mk_vMOVsd_RR(src, srcCopy)); + addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy)); + addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi)); + *rHi = dstHi; + *rLo = dstLo; + return; + } + break; + } + default: break; } /* switch (e->Iex.Unop.op) */ @@ -5138,7 +5207,9 @@ HInstrArray* iselSB_AMD64 ( const IRSB* bb, | VEX_HWCAPS_AMD64_AVX | VEX_HWCAPS_AMD64_RDTSCP | VEX_HWCAPS_AMD64_BMI - | VEX_HWCAPS_AMD64_AVX2))); + | VEX_HWCAPS_AMD64_AVX2 + | VEX_HWCAPS_AMD64_F16C + | VEX_HWCAPS_AMD64_RDRAND))); /* Check that the host's endianness is as expected. 
*/ vassert(archinfo_host->endness == VexEndnessLE); diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 6c1224963..0502f02fb 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -444,6 +444,7 @@ void ppIROp ( IROp op ) case Iop_F32toI32Sx4: vex_printf("F32toI32Sx4"); return; case Iop_F32toF16x4_DEP: vex_printf("F32toF16x4_DEP"); return; + case Iop_F32toF16x4: vex_printf("F32toF16x4"); return; case Iop_F16toF32x4: vex_printf("F16toF32x4"); return; case Iop_F16toF64x2: vex_printf("F16toF64x2"); return; case Iop_F64toF16x2_DEP: vex_printf("F64toF16x2_DEP"); return; @@ -1243,6 +1244,8 @@ void ppIROp ( IROp op ) case Iop_Div32Fx8: vex_printf("Div32Fx8"); return; case Iop_I32StoF32x8: vex_printf("I32StoF32x8"); return; case Iop_F32toI32Sx8: vex_printf("F32toI32Sx8"); return; + case Iop_F32toF16x8: vex_printf("F32toF16x8"); return; + case Iop_F16toF32x8: vex_printf("F16toF32x8"); return; case Iop_AndV256: vex_printf("AndV256"); return; case Iop_OrV256: vex_printf("OrV256"); return; case Iop_XorV256: vex_printf("XorV256"); return; @@ -3000,6 +3003,9 @@ void typeOfPrimop ( IROp op, case Iop_F32toI32Sx4: BINARY(ity_RMode,Ity_V128, Ity_V128); + case Iop_F32toF16x4: + BINARY(ity_RMode,Ity_V128, Ity_I64); + case Iop_64HLtoV128: BINARY(Ity_I64,Ity_I64, Ity_V128); @@ -3592,9 +3598,15 @@ void typeOfPrimop ( IROp op, case Iop_F32toI32Sx8: BINARY(ity_RMode,Ity_V256, Ity_V256); + case Iop_F32toF16x8: + BINARY(ity_RMode,Ity_V256, Ity_V128); + case Iop_V256toV128_1: case Iop_V256toV128_0: UNARY(Ity_V256, Ity_V128); + case Iop_F16toF32x8: + UNARY(Ity_V128, Ity_V256); + case Iop_QandUQsh8x16: case Iop_QandUQsh16x8: case Iop_QandUQsh32x4: case Iop_QandUQsh64x2: case Iop_QandSQsh8x16: case Iop_QandSQsh16x8: diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index f387f166b..c045887a7 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -1579,6 +1579,8 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps ) { VEX_HWCAPS_AMD64_AVX, "avx" }, { 
VEX_HWCAPS_AMD64_AVX2, "avx2" }, { VEX_HWCAPS_AMD64_BMI, "bmi" }, + { VEX_HWCAPS_AMD64_F16C, "f16c" }, + { VEX_HWCAPS_AMD64_RDRAND, "rdrand" }, }; /* Allocate a large enough buffer */ static HChar buf[sizeof prefix + diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h index 629a258a9..f4152a1fc 100644 --- a/VEX/pub/libvex.h +++ b/VEX/pub/libvex.h @@ -99,6 +99,8 @@ typedef #define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */ #define VEX_HWCAPS_AMD64_BMI (1<<10) /* BMI1 instructions */ #define VEX_HWCAPS_AMD64_AVX2 (1<<11) /* AVX2 instructions */ +#define VEX_HWCAPS_AMD64_RDRAND (1<<13) /* RDRAND instructions */ +#define VEX_HWCAPS_AMD64_F16C (1<<14) /* F16C instructions */ /* ppc32: baseline capability is integer only */ #define VEX_HWCAPS_PPC32_F (1<<8) /* basic (non-optional) FP */ diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 61d22016d..4b0b05ebc 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1417,8 +1417,9 @@ typedef /* --- Single to/from half conversion --- */ /* FIXME: what kind of rounding in F32x4 -> F16x4 case? 
*/ // FIXME these carry no rounding mode - Iop_F32toF16x4_DEP, /* F32x4 -> F16x4, NO ROUNDING MODE */ - Iop_F16toF32x4, /* F16x4 -> F32x4 */ + Iop_F32toF16x4_DEP, /* F32x4(==V128) -> F16x4(==I64), NO ROUNDING MODE */ + Iop_F32toF16x4, /* IRRoundingMode(I32) x V128 -> I64 */ + Iop_F16toF32x4, /* F16x4 -> F32x4 */ /* -- Double to/from half conversion -- */ Iop_F64toF16x2_DEP, // F64x2 -> F16x2, NO ROUNDING MODE @@ -1987,6 +1988,9 @@ typedef Iop_I32StoF32x8, /* IRRoundingMode(I32) x V256 -> V256 */ Iop_F32toI32Sx8, /* IRRoundingMode(I32) x V256 -> V256 */ + Iop_F32toF16x8, /* IRRoundingMode(I32) x V256 -> V128 */ + Iop_F16toF32x8, /* F16x8(==V128) -> F32x8(==V256) */ + Iop_Sqrt32Fx8, Iop_Sqrt64Fx4, Iop_RSqrtEst32Fx8, diff --git a/configure.ac b/configure.ac index 8779cf029..912dec3a9 100644 --- a/configure.ac +++ b/configure.ac @@ -2925,9 +2925,52 @@ AC_MSG_RESULT([no]) AM_CONDITIONAL(BUILD_ADX_TESTS, test x$ac_have_as_adx = xyes) -# Does the C compiler support the "ifunc" attribute +# does the amd64 assembler understand the RDRAND instruction? # Note, this doesn't generate a C-level symbol. It generates a -# automake-level symbol (BUILD_IFUNC_TESTS), used in test Makefile.am's +# automake-level symbol (BUILD_RDRAND_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if amd64 assembler knows the RDRAND instruction]) + +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[ + do { + asm ("rdrand %r14"); + asm ("rdrand %r14d"); + asm ("rdrand %r14w"); + } while (0) +]])], [ +ac_have_as_rdrand=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_as_rdrand=no +AC_MSG_RESULT([no]) +]) + +AM_CONDITIONAL(BUILD_RDRAND_TESTS, test x$ac_have_as_rdrand = xyes) + + +# does the amd64 assembler understand the F16C instructions (VCVTPH2PS and +# VCVTPS2PH) ? +# Note, this doesn't generate a C-level symbol. 
It generates a +# automake-level symbol (BUILD_F16C_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if amd64 assembler knows the F16C instructions]) + +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[ + do { + asm ("vcvtph2ps %xmm5, %ymm10"); + // If we put the dollar sign and zero together, the shell processing + // this configure.ac script substitutes the command name in. Sigh. + asm ("vcvtps2ph $" "0, %ymm10, %xmm5"); + } while (0) +]])], [ +ac_have_as_f16c=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_as_f16c=no +AC_MSG_RESULT([no]) +]) + +AM_CONDITIONAL(BUILD_F16C_TESTS, test x$ac_have_as_f16c = xyes) + + # does the x86/amd64 assembler understand MOVBE? # Note, this doesn't generate a C-level symbol. It generates a # automake-level symbol (BUILD_MOVBE_TESTS), used in test Makefile.am's diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c index 7aa051bd1..69425a3ac 100644 --- a/coregrind/m_machine.c +++ b/coregrind/m_machine.c @@ -945,7 +945,7 @@ Bool VG_(machine_get_hwcaps)( void ) #elif defined(VGA_amd64) { Bool have_sse3, have_ssse3, have_cx8, have_cx16; Bool have_lzcnt, have_avx, have_bmi, have_avx2; - Bool have_rdtscp; + Bool have_rdtscp, have_rdrand, have_f16c; UInt eax, ebx, ecx, edx, max_basic, max_extended; ULong xgetbv_0 = 0; HChar vstr[13]; @@ -953,7 +953,7 @@ Bool VG_(machine_get_hwcaps)( void ) have_sse3 = have_ssse3 = have_cx8 = have_cx16 = have_lzcnt = have_avx = have_bmi = have_avx2 - = have_rdtscp = False; + = have_rdtscp = have_rdrand = have_f16c = False; eax = ebx = ecx = edx = max_basic = max_extended = 0; @@ -983,13 +983,15 @@ Bool VG_(machine_get_hwcaps)( void ) // we assume that SSE1 and SSE2 are available by default have_sse3 = (ecx & (1<<0)) != 0; /* True => have sse3 insns */ have_ssse3 = (ecx & (1<<9)) != 0; /* True => have Sup SSE3 insns */ + // fma is ecx:12 // sse41 is ecx:19 // sse42 is ecx:20 - // xsave is ecx:26 // osxsave is ecx:27 // avx is ecx:28 - // fma is ecx:12 + have_f16c = (ecx & (1<<29)) != 0; /* True => have F16C 
insns */ + have_rdrand = (ecx & (1<<30)) != 0; /* True => have RDRAND insns */ + have_avx = False; /* have_fma = False; */ if ( (ecx & ((1<<28)|(1<<27)|(1<<26))) == ((1<<28)|(1<<27)|(1<<26)) ) { @@ -1057,6 +1059,14 @@ Bool VG_(machine_get_hwcaps)( void ) have_avx2 = (ebx & (1<<5)) != 0; /* True => have AVX2 */ } + /* Sanity check for RDRAND and F16C. These don't actually *need* AVX2, but + it's convenient to restrict them to the AVX2 case since the simulated + CPUID we'll offer them on has AVX2 as a base. */ + if (!have_avx2) { + have_f16c = False; + have_rdrand = False; + } + va = VexArchAMD64; vai.endness = VexEndnessLE; vai.hwcaps = (have_sse3 ? VEX_HWCAPS_AMD64_SSE3 : 0) @@ -1066,7 +1076,9 @@ Bool VG_(machine_get_hwcaps)( void ) | (have_avx ? VEX_HWCAPS_AMD64_AVX : 0) | (have_bmi ? VEX_HWCAPS_AMD64_BMI : 0) | (have_avx2 ? VEX_HWCAPS_AMD64_AVX2 : 0) - | (have_rdtscp ? VEX_HWCAPS_AMD64_RDTSCP : 0); + | (have_rdtscp ? VEX_HWCAPS_AMD64_RDTSCP : 0) + | (have_f16c ? VEX_HWCAPS_AMD64_F16C : 0) + | (have_rdrand ? VEX_HWCAPS_AMD64_RDRAND : 0); VG_(machine_get_cache_info)(&vai); diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index e3086b613..1b06f8402 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -4838,6 +4838,40 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, binop(Iop_V128HLtoV256, qV, shV)); } + case Iop_F32toF16x4: { + // First, PCast the input vector, retaining the 32x4 format. + IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4 + // Now truncate each 32 bit lane to 16 bits. Since we already PCasted + // the input, we're not going to lose any information. + IRAtom* pcHI64 + = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2 + IRAtom* pcLO64 + = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2 + IRAtom* narrowed + = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4, + pcHI64, pcLO64)); // 16x4 + // Finally, roll in any badness from the rounding mode. 
+ IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1); + return mkUifU64(mce, narrowed, rmPCasted); + } + + case Iop_F32toF16x8: { + // Same scheme as for Iop_F32toF16x4. + IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8 + IRAtom* pcHI128 + = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1, + pcasted)); // 32x4 + IRAtom* pcLO128 + = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0, + pcasted)); // 32x4 + IRAtom* narrowed + = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8, + pcHI128, pcLO128)); // 16x8 + // Finally, roll in any badness from the rounding mode. + IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1); + return mkUifUV128(mce, narrowed, rmPCasted); + } + default: ppIROp(op); VG_(tool_panic)("memcheck:expr2vbits_Binop"); @@ -5166,6 +5200,10 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) case Iop_QNarrowUn64Sto32Sx2: case Iop_QNarrowUn64Sto32Ux2: case Iop_QNarrowUn64Uto32Ux2: + return vectorNarrowUnV128(mce, op, vatom); + + // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not + // right. case Iop_F32toF16x4_DEP: return vectorNarrowUnV128(mce, op, vatom); @@ -5175,9 +5213,52 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) case Iop_Widen16Uto32x4: case Iop_Widen32Sto64x2: case Iop_Widen32Uto64x2: + return vectorWidenI64(mce, op, vatom); + case Iop_F16toF32x4: + // JRS 2019 Mar 17: this definitely isn't right, but it probably works + // OK by accident if -- as seems likely -- the F16 to F32 conversion + // will generate an output 32 bits with at least one 1 bit + // set if there's one or more 1 bits set in the input 16 bits. More + // correct code for this is just below, but commented out, so as to + // avoid short-term backend failures on targets that can't do + // Iop_Interleave{LO,HI}16x4. return vectorWidenI64(mce, op, vatom); + case Iop_F16toF32x8: { + // PCast the input at 16x8. This makes each lane hold either all + // zeroes or all ones.
+ IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8 + // Now double the width of each lane to 32 bits. Because the lanes are + // all zeroes or all ones, we can just copy each lane twice into + // the result. Here's the low half: + IRAtom* widenedLO // :: I32x4 + = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8, + pcasted, pcasted)); + // And the high half: + IRAtom* widenedHI // :: I32x4 + = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8, + pcasted, pcasted)); + // Glue them back together: + return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, + widenedHI, widenedLO)); + } + + // See comment just above, for Iop_F16toF32x4 + //case Iop_F16toF32x4: { + // // Same scheme as F16toF32x8 + // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4 + // IRAtom* widenedLO // :: I32x2 + // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4, + // pcasted, pcasted)); + // IRAtom* widenedHI // :: I32x2 + // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4, + // pcasted, pcasted)); + // // Glue them back together: + // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, + // widenedHI, widenedLO)); + //} + case Iop_PwAddL32Ux2: case Iop_PwAddL32Sx2: return mkPCastTo(mce, Ity_I64, diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c index 03cc5c12e..65617b273 100644 --- a/memcheck/tests/vbit-test/irops.c +++ b/memcheck/tests/vbit-test/irops.c @@ -655,6 +655,7 @@ static irop_t irops[] = { { DEFOP(Iop_Fixed32UToF32x4_RN, UNDEF_UNKNOWN), }, { DEFOP(Iop_Fixed32SToF32x4_RN, UNDEF_UNKNOWN), }, { DEFOP(Iop_F32toF16x4_DEP, UNDEF_UNKNOWN), }, + { DEFOP(Iop_F32toF16x4, UNDEF_UNKNOWN), }, { DEFOP(Iop_F16toF32x4, UNDEF_UNKNOWN), }, { DEFOP(Iop_F64toF16x2_DEP, UNDEF_UNKNOWN), }, { DEFOP(Iop_F16toF64x2, UNDEF_UNKNOWN), }, @@ -1128,6 +1129,8 @@ static irop_t irops[] = { { DEFOP(Iop_Div32Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_I32StoF32x8, UNDEF_UNKNOWN), }, { DEFOP(Iop_F32toI32Sx8, UNDEF_UNKNOWN), }, + {
DEFOP(Iop_F32toF16x8, UNDEF_UNKNOWN), }, + { DEFOP(Iop_F16toF32x8, UNDEF_UNKNOWN) }, { DEFOP(Iop_Sqrt32Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sqrt64Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_RSqrtEst32Fx8, UNDEF_UNKNOWN), }, diff --git a/none/tests/amd64/Makefile.am b/none/tests/amd64/Makefile.am index a5e4d9964..2ec7682bc 100644 --- a/none/tests/amd64/Makefile.am +++ b/none/tests/amd64/Makefile.am @@ -42,6 +42,7 @@ EXTRA_DIST = \ clc.vgtest clc.stdout.exp clc.stderr.exp \ crc32.vgtest crc32.stdout.exp crc32.stderr.exp \ cmpxchg.vgtest cmpxchg.stdout.exp cmpxchg.stderr.exp \ + f16c.vgtest f16c.stderr.exp f16c.stdout.exp \ faultstatus.disabled faultstatus.stderr.exp \ fb_test_amd64.vgtest \ fb_test_amd64.stderr.exp fb_test_amd64.stdout.exp \ @@ -76,6 +77,7 @@ EXTRA_DIST = \ pcmpxstrx64w.stderr.exp pcmpxstrx64w.stdout.exp \ pcmpxstrx64w.vgtest \ rcl-amd64.vgtest rcl-amd64.stdout.exp rcl-amd64.stderr.exp \ + rdrand.vgtest rdrand.stdout.exp rdrand.stderr.exp \ redundantRexW.vgtest redundantRexW.stdout.exp \ redundantRexW.stderr.exp \ smc1.stderr.exp smc1.stdout.exp smc1.vgtest \ @@ -155,6 +157,12 @@ endif if BUILD_MPX_TESTS check_PROGRAMS += mpx endif +if BUILD_F16C_TESTS + check_PROGRAMS += f16c +endif +if BUILD_RDRAND_TESTS + check_PROGRAMS += rdrand +endif # DDD: these need to be made to work on Darwin like the x86/ ones were. 
diff --git a/none/tests/amd64/f16c.c b/none/tests/amd64/f16c.c new file mode 100644 index 000000000..db27e4f43 --- /dev/null +++ b/none/tests/amd64/f16c.c @@ -0,0 +1,198 @@ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include "tests/malloc.h" + +typedef unsigned char UChar; +typedef unsigned int UInt; +typedef unsigned long int UWord; +typedef unsigned long long int ULong; + +#define IS_32_ALIGNED(_ptr) (0 == (0x1F & (UWord)(_ptr))) + +typedef union { UChar u8[32]; UInt u32[8]; } YMM; + +typedef struct { YMM a1; YMM a2; YMM a3; YMM a4; ULong u64; } Block; + +void showYMM ( YMM* vec ) +{ + int i; + assert(IS_32_ALIGNED(vec)); + for (i = 31; i >= 0; i--) { + printf("%02x", (UInt)vec->u8[i]); + if (i > 0 && 0 == ((i+0) & 7)) printf("."); + } +} + +void showBlock ( char* msg, Block* block ) +{ + printf(" %s\n", msg); + printf(" "); showYMM(&block->a1); printf("\n"); + printf(" "); showYMM(&block->a2); printf("\n"); + printf(" "); showYMM(&block->a3); printf("\n"); + printf(" "); showYMM(&block->a4); printf("\n"); + printf(" %016llx\n", block->u64); +} + +UChar randUChar ( void ) +{ + static UInt seed = 80021; + seed = 1103515245 * seed + 12345; + return (seed >> 17) & 0xFF; +} + +void randBlock ( Block* b ) +{ + int i; + UChar* p = (UChar*)b; + for (i = 0; i < sizeof(Block); i++) + p[i] = randUChar(); +} + + +/* Generate a function test_NAME, that tests the given insn, in both + its mem and reg forms. The reg form of the insn may mention, as + operands only %ymm6, %ymm7, %ymm8, %ymm9 and %r14. The mem form of + the insn may mention as operands only (%rax), %ymm7, %ymm8, %ymm9 + and %r14. It's OK for the insn to clobber ymm0, as this is needed + for testing PCMPxSTRx, and ymm6, as this is needed for testing + MOVMASK variants.
*/ + +#define GEN_test_RandM(_name, _reg_form, _mem_form) \ + \ + __attribute__ ((noinline)) static void test_##_name ( void ) \ + { \ + Block* b = memalign32(sizeof(Block)); \ + randBlock(b); \ + printf("%s(reg)\n", #_name); \ + showBlock("before", b); \ + __asm__ __volatile__( \ + "vmovdqa 0(%0),%%ymm7" "\n\t" \ + "vmovdqa 32(%0),%%ymm8" "\n\t" \ + "vmovdqa 64(%0),%%ymm6" "\n\t" \ + "vmovdqa 96(%0),%%ymm9" "\n\t" \ + "movq 128(%0),%%r14" "\n\t" \ + _reg_form "\n\t" \ + "vmovdqa %%ymm7, 0(%0)" "\n\t" \ + "vmovdqa %%ymm8, 32(%0)" "\n\t" \ + "vmovdqa %%ymm6, 64(%0)" "\n\t" \ + "vmovdqa %%ymm9, 96(%0)" "\n\t" \ + "movq %%r14, 128(%0)" "\n\t" \ + : /*OUT*/ \ + : /*IN*/"r"(b) \ + : /*TRASH*/"xmm0","xmm7","xmm8","xmm6","xmm9","r14","memory","cc" \ + ); \ + showBlock("after", b); \ + randBlock(b); \ + printf("%s(mem)\n", #_name); \ + showBlock("before", b); \ + __asm__ __volatile__( \ + "leaq 0(%0),%%rax" "\n\t" \ + "vmovdqa 32(%0),%%ymm8" "\n\t" \ + "vmovdqa 64(%0),%%ymm7" "\n\t" \ + "vmovdqa 96(%0),%%ymm9" "\n\t" \ + "movq 128(%0),%%r14" "\n\t" \ + _mem_form "\n\t" \ + "vmovdqa %%ymm8, 32(%0)" "\n\t" \ + "vmovdqa %%ymm7, 64(%0)" "\n\t" \ + "vmovdqa %%ymm9, 96(%0)" "\n\t" \ + "movq %%r14, 128(%0)" "\n\t" \ + : /*OUT*/ \ + : /*IN*/"r"(b) \ + : /*TRASH*/"xmm6", \ + "xmm0","xmm8","xmm7","xmm9","r14","rax","memory","cc" \ + ); \ + showBlock("after", b); \ + printf("\n"); \ + free(b); \ + } + +#define GEN_test_Ronly(_name, _reg_form) \ + GEN_test_RandM(_name, _reg_form, "") +#define GEN_test_Monly(_name, _mem_form) \ + GEN_test_RandM(_name, "", _mem_form) + +GEN_test_RandM(VCVTPH2PS_128, + "vcvtph2ps %%xmm6, %%xmm8", + "vcvtph2ps (%%rax), %%xmm8"); +GEN_test_RandM(VCVTPH2PS_256, + "vcvtph2ps %%xmm6, %%ymm8", + "vcvtph2ps (%%rax), %%ymm8"); + +GEN_test_RandM(VCVTPS2PH_128_0, + "vcvtps2ph $0, %%xmm8, %%xmm6", + "vcvtps2ph $0, %%xmm8, (%%rax)"); +GEN_test_RandM(VCVTPS2PH_256_0, + "vcvtps2ph $0, %%ymm8, %%xmm6", + "vcvtps2ph $0, %%ymm8, (%%rax)"); + 
+GEN_test_RandM(VCVTPS2PH_128_1, + "vcvtps2ph $1, %%xmm8, %%xmm6", + "vcvtps2ph $1, %%xmm8, (%%rax)"); +GEN_test_RandM(VCVTPS2PH_256_1, + "vcvtps2ph $1, %%ymm8, %%xmm6", + "vcvtps2ph $1, %%ymm8, (%%rax)"); + +GEN_test_RandM(VCVTPS2PH_128_2, + "vcvtps2ph $2, %%xmm8, %%xmm6", + "vcvtps2ph $2, %%xmm8, (%%rax)"); +GEN_test_RandM(VCVTPS2PH_256_2, + "vcvtps2ph $2, %%ymm8, %%xmm6", + "vcvtps2ph $2, %%ymm8, (%%rax)"); + +GEN_test_RandM(VCVTPS2PH_128_3, + "vcvtps2ph $3, %%xmm8, %%xmm6", + "vcvtps2ph $3, %%xmm8, (%%rax)"); +GEN_test_RandM(VCVTPS2PH_256_3, + "vcvtps2ph $3, %%ymm8, %%xmm6", + "vcvtps2ph $3, %%ymm8, (%%rax)"); + +GEN_test_RandM(VCVTPS2PH_128_4, + "vcvtps2ph $4, %%xmm8, %%xmm6", + "vcvtps2ph $4, %%xmm8, (%%rax)"); +GEN_test_RandM(VCVTPS2PH_256_4, + "vcvtps2ph $4, %%ymm8, %%xmm6", + "vcvtps2ph $4, %%ymm8, (%%rax)"); + +/* Comment duplicated above, for convenient reference: + Allowed operands in test insns: + Reg form: %ymm6, %ymm7, %ymm8, %ymm9 and %r14. + Mem form: (%rax), %ymm7, %ymm8, %ymm9 and %r14. + Imm8 etc fields are also allowed, where they make sense. + Both forms may use ymm0 as scratch. Mem form may also use + ymm6 as scratch. 
+*/ + +#define N_DEFAULT_ITERS 3 + +// Do the specified test some number of times +#define DO_N(_iters, _testfn) \ + do { int i; for (i = 0; i < (_iters); i++) { test_##_testfn(); } } while (0) + +// Do the specified test the default number of times +#define DO_D(_testfn) DO_N(N_DEFAULT_ITERS, _testfn) + + +int main ( void ) +{ + DO_D( VCVTPH2PS_128 ); + DO_D( VCVTPH2PS_256 ); + + DO_D( VCVTPS2PH_128_0 ); + DO_D( VCVTPS2PH_256_0 ); + + DO_D( VCVTPS2PH_128_1 ); + DO_D( VCVTPS2PH_256_1 ); + + DO_D( VCVTPS2PH_128_2 ); + DO_D( VCVTPS2PH_256_2 ); + + DO_D( VCVTPS2PH_128_3 ); + DO_D( VCVTPS2PH_256_3 ); + + DO_D( VCVTPS2PH_128_4 ); + DO_D( VCVTPS2PH_256_4 ); + + return 0; +} diff --git a/none/tests/amd64/f16c.stderr.exp b/none/tests/amd64/f16c.stderr.exp new file mode 100644 index 000000000..e69de29bb diff --git a/none/tests/amd64/f16c.stdout.exp b/none/tests/amd64/f16c.stdout.exp new file mode 100644 index 000000000..27cf7611b --- /dev/null +++ b/none/tests/amd64/f16c.stdout.exp @@ -0,0 +1,972 @@ +VCVTPH2PS_128(reg) + before + 7d6528c5fa956a0d.69c3e9a6af27d13b.5175e39d19c9ca1e.98f24a4984175700 + b6d2fb5aa7bc5127.fe9915e556a044b2.60b160857d45c484.47b8d8c0eeef1e50 + 065d77195d623e6b.842adc6450659e17.19a348215c3a67fd.399182c2dbcc2d38 + cb509970b8136c85.d740b80eb7839b97.d89998df5035ed36.4a4bc43968bc40e5 + 56b01a12b0ca1583 + after + 7d6528c5fa956a0d.69c3e9a6af27d13b.5175e39d19c9ca1e.98f24a4984175700 + 0000000000000000.0000000000000000.3f322000b8308000.c37980003da70000 + 065d77195d623e6b.842adc6450659e17.19a348215c3a67fd.399182c2dbcc2d38 + cb509970b8136c85.d740b80eb7839b97.d89998df5035ed36.4a4bc43968bc40e5 + 56b01a12b0ca1583 +VCVTPH2PS_128(mem) + before + 398e0039cf03663d.5ff85bc9535c191f.d3a727d1a705f65d.f9dd4a29f8c093db + cfaff39be272ef40.20a1bb92cbc97fe8.542da4983df76c96.d8bc5c6dee699597 + f4e06e2205236eb7.6897b536bbe4da8a.369dab4f9465b86e.d182c916cebc2e17 + 84ededbc53239dcf.95264321bf3b68b2.55c2b9e2c95c9810.407b8d9035449b06 + 81f2a547be8d1811 + after + 
398e0039cf03663d.5ff85bc9535c191f.d3a727d1a705f65d.f9dd4a29f8c093db + 0000000000000000.0000000000000000.c73ba00041452000.c7180000ba7b6000 + f4e06e2205236eb7.6897b536bbe4da8a.369dab4f9465b86e.d182c916cebc2e17 + 84ededbc53239dcf.95264321bf3b68b2.55c2b9e2c95c9810.407b8d9035449b06 + 81f2a547be8d1811 + +VCVTPH2PS_128(reg) + before + f0350ca70523e0e4.5ba1ec54e87d39b3.019963bf7459630b.8d69483df7e8c6a9 + e98ebd1ca893312a.54cae7d5e13dfe91.0a3e0f7c75cb0842.b95ed64d3b13ff64 + c84ab71340684590.4d325b2d5a70a792.0a5f45c55f1c9202.b76ddefcb0ebfe6e + e9b5f3f66b2e58c1.21a6c3476d21f1e5.5f490104ced83ff8.6262dd37727c80f3 + 96084deb9ed0411e + after + f0350ca70523e0e4.5ba1ec54e87d39b3.019963bf7459630b.8d69483df7e8c6a9 + 0000000000000000.0000000000000000.beeda000c3df8000.be1d6000ffcdc000 + c84ab71340684590.4d325b2d5a70a792.0a5f45c55f1c9202.b76ddefcb0ebfe6e + e9b5f3f66b2e58c1.21a6c3476d21f1e5.5f490104ced83ff8.6262dd37727c80f3 + 96084deb9ed0411e +VCVTPH2PS_128(mem) + before + 2e2dac0350f6fd1c.a81b6e33c572a86a.acf29b0f395c98b4.63483da65c8c49d0 + 089b756aa3f77018.61c82534e9bf6f37.c9e25f72d82e582b.73a8f718a8c3ec35 + ff1f240eb3e1553f.6f07136773a2ead3.56428c5a66a2ec77.ecb42ac54b0966d4 + ee8536da9dbf68bc.3026343700a654eb.2ddd9db4ffc411c4.28bad218e4ebf159 + 8404eb7f0cf4ca6f + after + 2e2dac0350f6fd1c.a81b6e33c572a86a.acf29b0f395c98b4.63483da65c8c49d0 + 0000000000000000.0000000000000000.446900003fb4c000.43918000413a0000 + ff1f240eb3e1553f.6f07136773a2ead3.56428c5a66a2ec77.ecb42ac54b0966d4 + ee8536da9dbf68bc.3026343700a654eb.2ddd9db4ffc411c4.28bad218e4ebf159 + 8404eb7f0cf4ca6f + +VCVTPH2PS_128(reg) + before + 5cdf726562b02dc2.b39925ba7d9d67bc.ff6f850f2c57ea2a.2c810e6dc1a1833d + 0c9761367fac55ff.28276f9a6e880c6b.372f015d9242e83d.2ef85b6fc544fd0f + f078b65e01737fd2.2bfa8f668c8b14f4.36b2a38dcef18acf.0e0f01a829ba3c66 + 65ce6d498492e7e7.96df010bf4b23b84.57436a097df30b8d.aa927a03090dfc6d + dc4c446c804bf950 + after + 5cdf726562b02dc2.b39925ba7d9d67bc.ff6f850f2c57ea2a.2c810e6dc1a1833d + 
0000000000000000.0000000000000000.39c1e00037d40000.3d3740003f8cc000 + f078b65e01737fd2.2bfa8f668c8b14f4.36b2a38dcef18acf.0e0f01a829ba3c66 + 65ce6d498492e7e7.96df010bf4b23b84.57436a097df30b8d.aa927a03090dfc6d + dc4c446c804bf950 +VCVTPH2PS_128(mem) + before + 810bdacfab80ee3d.c5e48064a393c8e9.47a34273c10a3c47.f5304f3e3ad1a923 + 769ab818a5b7985e.6d08ed19fa045f84.1810cd8c109ed568.6ec34f98a2199d3c + 95c45b338afcb3df.b984aed62671e865.e6f21d40fc7bc013.1c4a678450562685 + bc563e0c775bfaed.05a5c205c3659f38.8e17b17da2acb976.5d0f926ce1157eaa + 8b5fccbef0e1e256 + after + 810bdacfab80ee3d.c5e48064a393c8e9.47a34273c10a3c47.f5304f3e3ad1a923 + 0000000000000000.0000000000000000.c6a6000041e7c000.3f5a2000bd246000 + 95c45b338afcb3df.b984aed62671e865.e6f21d40fc7bc013.1c4a678450562685 + bc563e0c775bfaed.05a5c205c3659f38.8e17b17da2acb976.5d0f926ce1157eaa + 8b5fccbef0e1e256 + +VCVTPH2PS_256(reg) + before + 048612e51a468e36.c51cdd8f87e12ab4.acb722146c6cbfa9.ea4a022e1d3d7dbb + 22cf5e4cfad1bdf5.8de2b4a9d799ff5f.0c05cb6ebd128663.d7568e3e8a3ac80e + 4288ae612c0dad40.f0733f448390351b.80ddba7e53e42d12.3208cf9b04b0569c + c1fbfd8f4d8698c2.cb9dfb4ea5d18713.6489eab2c96df363.d52c4330a7aae391 + 9d8e66ea90352a18 + after + 048612e51a468e36.c51cdd8f87e12ab4.acb722146c6cbfa9.ea4a022e1d3d7dbb + b75d0000bf4fc000.427c80003da24000.3e410000c1f36000.3896000042d38000 + 4288ae612c0dad40.f0733f448390351b.80ddba7e53e42d12.3208cf9b04b0569c + c1fbfd8f4d8698c2.cb9dfb4ea5d18713.6489eab2c96df363.d52c4330a7aae391 + 9d8e66ea90352a18 +VCVTPH2PS_256(mem) + before + 66fab2b3db5ce85e.f9754842f9c9ba28.f82a63b15c68b274.14575775bc3a1202 + 0c3ca578a32bd88e.474289e7cb61501e.54e7f35bc162726a.ec91fe34c7d6c79a + 6b1fba2604afb8d5.08aebee85fda964f.bba02737f3c98220.4784d95987cd4ed8 + 5f706da71bf2425f.9605e2b252c1c868.09217c310baca0c3.837be65197abe268 + fbc4208894fdc0f5 + after + 66fab2b3db5ce85e.f9754842f9c9ba28.f82a63b15c68b274.14575775bc3a1202 + c705400044762000.438d0000be4e8000.3a8ae00042eea000.bf8740003a404000 + 
6b1fba2604afb8d5.08aebee85fda964f.bba02737f3c98220.4784d95987cd4ed8 + 5f706da71bf2425f.9605e2b252c1c868.09217c310baca0c3.837be65197abe268 + fbc4208894fdc0f5 + +VCVTPH2PS_256(reg) + before + 0aaa836b194e242c.c5fc3ae904033357.4e92f1b240a12214.1a366d352714867e + 0e780c65c22b4ab8.778d9ed6d9eb46ea.8ca3e752c306df00.caab752f630ff07e + 627bb6e12d1f6d46.51ef145cb9b83843.ac82c1007a7d3cd8.f54b130cdaa89cef + 61ff7d4df3b6ca81.31f01866bd76c58f.0a7c7a27fe917447.77e3c0b6a9ec44fc + 2c3ffa1aebe6a4d2 + after + 0aaa836b194e242c.c5fc3ae904033357.4e92f1b240a12214.1a366d352714867e + bd904000c0200000.474fa0003f9b0000.c6a960003a618000.c3550000bb9de000 + 627bb6e12d1f6d46.51ef145cb9b83843.ac82c1007a7d3cd8.f54b130cdaa89cef + 61ff7d4df3b6ca81.31f01866bd76c58f.0a7c7a27fe917447.77e3c0b6a9ec44fc + 2c3ffa1aebe6a4d2 +VCVTPH2PS_256(mem) + before + f02b3b25bca27a9c.69505d14b27d9d16.f25b26e0042fa9fa.02dd0e32eecfc5fa + 9f7301c1392d8087.d4ba52a206ff21b1.70fbbab6a7f19faf.f0f1798fe3c1699c + 15e3c8dc7e9273bf.0088596389c893fd.879d51d4c5c764db.3004b7a97cf69dda + 2d460a61a5dd0f6f.47086cc3da642fa7.130d662777beb4a9.1e61c5ec52f79c60 + 16559ec50352a3d9 + after + f02b3b25bca27a9c.69505d14b27d9d16.f25b26e0042fa9fa.02dd0e32eecfc5fa + c64b60003cdc0000.3885e000bd3f4000.3837400039c64000.c5d9e000c0bf4000 + 15e3c8dc7e9273bf.0088596389c893fd.879d51d4c5c764db.3004b7a97cf69dda + 2d460a61a5dd0f6f.47086cc3da642fa7.130d662777beb4a9.1e61c5ec52f79c60 + 16559ec50352a3d9 + +VCVTPH2PS_256(reg) + before + 742c3e9e2b92eef2.c569453ccd1b0fc4.0784892e9360315b.f0177599dbe14b46 + 9432a2e46543b956.b819f459105730e9.9a49ac115048d4c4.f987fa170d3ce4dd + d2b3c4044ef23fb2.e22093a48a9d2e0b.5da3cfd6aea6558e.0c28728e28dc3c9c + 89fba268812abdb2.1e4a9e0958fac555.adddf0eb4808f067.04c857e949cc0fac + bc3127138b19183c + after + 742c3e9e2b92eef2.c569453ccd1b0fc4.0784892e9360315b.f0177599dbe14b46 + 43b46000c1fac000.bdd4c00042b1c000.398500004651c000.3d1b80003f938000 + d2b3c4044ef23fb2.e22093a48a9d2e0b.5da3cfd6aea6558e.0c28728e28dc3c9c + 
89fba268812abdb2.1e4a9e0958fac555.adddf0eb4808f067.04c857e949cc0fac + bc3127138b19183c +VCVTPH2PS_256(mem) + before + 12305efa0acd1475.1755377e9a786f01.4a6592749579b0f4.e4450ababbfae0f9 + e1917689e3f6bf86.d70f7fb13667914c.413cead25e27ac14.5f2619b1a20662f0 + 0420edac31a0d599.2573776df1835e3e.de9a220dce0e75e0.7acb193b9abab2f9 + 59a93d4f11d611db.5cce191e65591384.ff4cb613013cc685.918107c43ea20cc0 + 0194ddb82b49abf0 + after + 12305efa0acd1475.1755377e9a786f01.4a6592749579b0f4.e4450ababbfae0f9 + 414ca000ba4e8000.baaf2000be1e8000.c488a00039574000.bf7f4000c41f2000 + 0420edac31a0d599.2573776df1835e3e.de9a220dce0e75e0.7acb193b9abab2f9 + 59a93d4f11d611db.5cce191e65591384.ff4cb613013cc685.918107c43ea20cc0 + 0194ddb82b49abf0 + +VCVTPS2PH_128_0(reg) + before + 24509983fc3bcc36.baf7e45e9fa43077.da6c63303173ecc9.7e1e22cf15bd5c2f + 570037914d04ab3d.05d75ec6f616ee9a.fa99500fef6024ba.39dce32c239cf309 + f6f2b14fbb3184b2.141625713239066f.17a0dc273ba9f803.0a52741849e54740 + 5d700527e24d9241.c57eb74d70183523.8fcf04e5b2dca44f.cf4c517ea3a413ff + 6295f64a4ce61473 + after + 24509983fc3bcc36.baf7e45e9fa43077.da6c63303173ecc9.7e1e22cf15bd5c2f + 570037914d04ab3d.05d75ec6f616ee9a.fa99500fef6024ba.39dce32c239cf309 + 0000000000000000.0000000000000000.0000000000000000.fc00fc000ee70000 + 5d700527e24d9241.c57eb74d70183523.8fcf04e5b2dca44f.cf4c517ea3a413ff + 6295f64a4ce61473 +VCVTPS2PH_128_0(mem) + before + a0fae06860b606c7.e8c72e865de41295.f2db8f44cbbf37e2.bc70c3b3ef84644b + 66478ac4fc21a428.f34428d9c8833f5b.78fb29445f3bc8d7.fcd015ff8f2e73a3 + 8e48704b3c31abc2.da30ef8bc0b5573e.34a901384a97a32f.a93bf6332d650e02 + f5c90ee73af5d7c0.f9da7f07e00794eb.00b0940ba5e08516.20fd62bd65b57115 + be625608d5abd787 + after + a0fae06860b606c7.e8c72e865de41295.f2db8f44cbbf37e2.7c007c00fc008000 + 66478ac4fc21a428.f34428d9c8833f5b.78fb29445f3bc8d7.fcd015ff8f2e73a3 + 8e48704b3c31abc2.da30ef8bc0b5573e.34a901384a97a32f.a93bf6332d650e02 + f5c90ee73af5d7c0.f9da7f07e00794eb.00b0940ba5e08516.20fd62bd65b57115 + be625608d5abd787 + 
+VCVTPS2PH_128_0(reg) + before + ddb5cd8016d27d05.7796e0861576e44f.ac8dd5bbc503330e.b9dd5dab8e212ab7 + db43c391c6b69f3a.f17a6312e7c28d9a.4e94ec120b386f52.3bfcd80321664d3e + 125934a781e479d3.3d431279cce48fce.3d3cc0784c2f8563.63d9810079bbabd9 + df411d2ee2e7467c.38bb69a6e1e9a617.d4d14e592776b1ef.0b40d58cb22d00b1 + 10fd4e94e9c808f5 + after + ddb5cd8016d27d05.7796e0861576e44f.ac8dd5bbc503330e.b9dd5dab8e212ab7 + db43c391c6b69f3a.f17a6312e7c28d9a.4e94ec120b386f52.3bfcd80321664d3e + 0000000000000000.0000000000000000.0000000000000000.7c0000001fe70000 + df411d2ee2e7467c.38bb69a6e1e9a617.d4d14e592776b1ef.0b40d58cb22d00b1 + 10fd4e94e9c808f5 +VCVTPS2PH_128_0(mem) + before + 4edb6a053a967ecf.9e477892854b43e0.beafe48541dc8da0.6f9f902235982fa0 + a1a7a4c9c0a51f6b.acb9433f079dacac.abeb000208c90296.69f2843d15223a22 + e52e79ce9700a7f7.63e279a20368bc8b.db3b370954bcbf24.20162517609f0f22 + f63a63fedcb4d29c.200d17261638b12a.2a6a07863ec28077.ef56701db49bea4c + 31005fb9ada2074b + after + 4edb6a053a967ecf.9e477892854b43e0.beafe48541dc8da0.800000007c000000 + a1a7a4c9c0a51f6b.acb9433f079dacac.abeb000208c90296.69f2843d15223a22 + e52e79ce9700a7f7.63e279a20368bc8b.db3b370954bcbf24.20162517609f0f22 + f63a63fedcb4d29c.200d17261638b12a.2a6a07863ec28077.ef56701db49bea4c + 31005fb9ada2074b + +VCVTPS2PH_128_0(reg) + before + 445ef059e641a1cc.b097e047aacc5b89.3f871736dc9ac535.7446eb65e4e703bb + 83bd1e68fb03f57b.ef136b941e54ffe8.1c9c7740ef193457.959960926235021b + 4969e55289753f03.8f7980d1535979e5.139832afee423c3d.6930e0fad3ba39c4 + f4ad41832c22ba11.6c949cea66e687ae.80c745ef729f1792.ccd7e987538166e1 + 9a5af627ff97439f + after + 445ef059e641a1cc.b097e047aacc5b89.3f871736dc9ac535.7446eb65e4e703bb + 83bd1e68fb03f57b.ef136b941e54ffe8.1c9c7740ef193457.959960926235021b + 0000000000000000.0000000000000000.0000000000000000.0000fc0080007c00 + f4ad41832c22ba11.6c949cea66e687ae.80c745ef729f1792.ccd7e987538166e1 + 9a5af627ff97439f +VCVTPS2PH_128_0(mem) + before + 
af8484c5f3078d2a.ded72f677f96a350.623139cb7207e36c.bf75aa6c1abe0103 + e6230d4d4add00ad.6431aa6a1e5e366d.4c1cd56194c94a4e.2ced5f927f2b383c + 1d010fab20265755.e309aef8a605af13.0821eb96e737777e.237d5fcd3f71f6e8 + 2feb05cb92ed4f4d.b5a9377eb31749ef.710cf757885d2728.006fa689f61c78b4 + 1f1030333fb8fa4b + after + af8484c5f3078d2a.ded72f677f96a350.623139cb7207e36c.7c00800000007c00 + e6230d4d4add00ad.6431aa6a1e5e366d.4c1cd56194c94a4e.2ced5f927f2b383c + 1d010fab20265755.e309aef8a605af13.0821eb96e737777e.237d5fcd3f71f6e8 + 2feb05cb92ed4f4d.b5a9377eb31749ef.710cf757885d2728.006fa689f61c78b4 + 1f1030333fb8fa4b + +VCVTPS2PH_256_0(reg) + before + dbacfa35b7d2b75a.f8ad6b99bb3fa4c2.385e4166df2141ad.63a8769192481679 + 928efefdf9f5ec8d.5313bd01b82612e0.c673c91ec9aed3f8.b9c3e32f2103009d + 9f043af6a1aed58f.1ee978efa4b054d2.bc36ca100a4a3a7d.5127ba1c529aa0bf + 5e58aa8b4c88ae0d.34fa174f9ce927c4.76f140aa4182b4e7.06a17746411ab40c + b3fd9698098ef5b0 + after + dbacfa35b7d2b75a.f8ad6b99bb3fa4c2.385e4166df2141ad.63a8769192481679 + 928efefdf9f5ec8d.5313bd01b82612e0.c673c91ec9aed3f8.b9c3e32f2103009d + 0000000000000000.0000000000000000.8000fc007c008298.f39efc008e1f0000 + 5e58aa8b4c88ae0d.34fa174f9ce927c4.76f140aa4182b4e7.06a17746411ab40c + b3fd9698098ef5b0 +VCVTPS2PH_256_0(mem) + before + 360794fec60222d6.2ad7482a960fb2b2.7014160ebbdb47e4.51f2275707e17ae4 + 698bec649583f5aa.61cd123e19cf1e2b.b001f1161e946f5c.a7837c83faf3cb1d + 2a541ab7911c2b5a.5e86033374552e23.ce8e2455e0205c58.d5f13a9ab645e140 + 532f9ae1d7da8010.7c4e1775412d1d47.a8872cb61d8aca05.37885d08d662faf9 + 2993e139f7d64ff4 + after + 360794fec60222d6.2ad7482a960fb2b2.7c0080007c000000.800000008000fc00 + 698bec649583f5aa.61cd123e19cf1e2b.b001f1161e946f5c.a7837c83faf3cb1d + 2a541ab7911c2b5a.5e86033374552e23.ce8e2455e0205c58.d5f13a9ab645e140 + 532f9ae1d7da8010.7c4e1775412d1d47.a8872cb61d8aca05.37885d08d662faf9 + 2993e139f7d64ff4 + +VCVTPS2PH_256_0(reg) + before + 0760c299b42e1fdc.c2e9e9cf82c7aff8.19714a711ce12843.18b88425f2de758f + 
2d39fd95a9f5a45d.514c816eaff2763f.8f3a9991a2ff8bc2.fceca88e7b281821 + f76b8d9773b81b24.de24e0a879648e11.3cf6fe426e128171.2ef114ddd37570e8 + c1426e0dae01c0dd.433f816bfd2bb699.7af177f11da748fc.8b9145fe16d0390f + f099b6dd61462ec3 + after + 0760c299b42e1fdc.c2e9e9cf82c7aff8.19714a711ce12843.18b88425f2de758f + 2d39fd95a9f5a45d.514c816eaff2763f.8f3a9991a2ff8bc2.fceca88e7b281821 + 0000000000000000.0000000000000000.000080007c008000.80008000fc007c00 + c1426e0dae01c0dd.433f816bfd2bb699.7af177f11da748fc.8b9145fe16d0390f + f099b6dd61462ec3 +VCVTPS2PH_256_0(mem) + before + 37d5e366d0e20c30.e70a9c61f55fce33.5d68e1a25652a804.a77700084a491a0e + 3d1148867eb08f81.c50f1401e45b82d3.086a7a39a1e6217d.1dd493f591843454 + de18612787bc73e3.b79cd05818831869.2112ca1cf9f1dd31.3a542e238fe5d179 + f7b8ab3708137382.19ffced22c62cba0.822c4c377b82984c.5842cbfee0f72e2a + d4ec68f21f468712 + after + 37d5e366d0e20c30.e70a9c61f55fce33.288a7c00e879fc00.0000800000008000 + 3d1148867eb08f81.c50f1401e45b82d3.086a7a39a1e6217d.1dd493f591843454 + de18612787bc73e3.b79cd05818831869.2112ca1cf9f1dd31.3a542e238fe5d179 + f7b8ab3708137382.19ffced22c62cba0.822c4c377b82984c.5842cbfee0f72e2a + d4ec68f21f468712 + +VCVTPS2PH_256_0(reg) + before + 0b9c016be95f18de.62bba1a11cc04c89.478209dbbd84d925.08847c7642a20df9 + 579f90d5d9cd1c3a.fceebf50e0d0ba24.9c727edf66767ca3.8fe6d7c56a5ff965 + 1541139c8b1cd0d1.a11d81326f4e7880.761b274ac4c4f0c7.f31ed81010c417bc + a1cd852d9cd97050.2d146432e64644c9.30c9028972f8733d.11f7fa4450de2529 + c33ebc4b44b8ddd8 + after + 0b9c016be95f18de.62bba1a11cc04c89.478209dbbd84d925.08847c7642a20df9 + 579f90d5d9cd1c3a.fceebf50e0d0ba24.9c727edf66767ca3.8fe6d7c56a5ff965 + 0000000000000000.0000000000000000.7c00fc00fc00fc00.80007c0080007c00 + a1cd852d9cd97050.2d146432e64644c9.30c9028972f8733d.11f7fa4450de2529 + c33ebc4b44b8ddd8 +VCVTPS2PH_256_0(mem) + before + e6c097130b5efcf6.5791e2f2a78f3762.7c9fe23c60c5d82b.25c80a060da03fb0 + 56470887bfdd3daf.94d7265949ca62b4.6a8a793cf9d5f0d1.b3633c2f304791cd + 
ef9f8c927c405d2f.b2ed4ecc1e172df2.d3a0a41fce854ae7.35e7926e777aa43f + 8d969e225f9318a0.0e1d55b9c001d4c7.93aee0cffbdea09a.06a10a317fc4b5b3 + 84db9fe3e4b100d4 + after + e6c097130b5efcf6.5791e2f2a78f3762.7c00beea80007c00.7c00fc0080010000 + 56470887bfdd3daf.94d7265949ca62b4.6a8a793cf9d5f0d1.b3633c2f304791cd + ef9f8c927c405d2f.b2ed4ecc1e172df2.d3a0a41fce854ae7.35e7926e777aa43f + 8d969e225f9318a0.0e1d55b9c001d4c7.93aee0cffbdea09a.06a10a317fc4b5b3 + 84db9fe3e4b100d4 + +VCVTPS2PH_128_1(reg) + before + 09e14df041cdc14f.0bf7ba2283e22a31.04b4378bce1492e0.8680a7399beeae16 + f30110c432a534d0.478d5d7e053a4e0c.f0fdf0aee1dda4e8.88e2774acbc13287 + 9c86e5cb54c59402.1c25022200a7415e.2e467d8e98e7468c.75a0cbeda561e618 + 62bbc77143b71e92.668b24fb9133bf52.1adad8978cbfb478.29861f0d48dc87f5 + 805ff098ce3ed14b + after + 09e14df041cdc14f.0bf7ba2283e22a31.04b4378bce1492e0.8680a7399beeae16 + f30110c432a534d0.478d5d7e053a4e0c.f0fdf0aee1dda4e8.88e2774acbc13287 + 0000000000000000.0000000000000000.0000000000000000.fc00fc008001fc00 + 62bbc77143b71e92.668b24fb9133bf52.1adad8978cbfb478.29861f0d48dc87f5 + 805ff098ce3ed14b +VCVTPS2PH_128_1(mem) + before + 55f9b97953917f46.9fedb2229a090d2c.018b42f3d3ec8415.1004ff355bf02957 + 861ef69cf4e34e11.b168a24af5479e7b.c9f1d5f8e2de4bd3.6c11edd5a106e2d6 + ee9b23edcc40fad9.f2789356f1fb0d2b.99885af4db13d1b7.894d9fe1f98d1aa0 + 683cbc58f8b23fca.bf6982b029b396ea.4f1e4ed5da99d2ee.c5040fc700120f62 + 7b813bf15120fbc8 + after + 55f9b97953917f46.9fedb2229a090d2c.018b42f3d3ec8415.fc00fc007bff8001 + 861ef69cf4e34e11.b168a24af5479e7b.c9f1d5f8e2de4bd3.6c11edd5a106e2d6 + ee9b23edcc40fad9.f2789356f1fb0d2b.99885af4db13d1b7.894d9fe1f98d1aa0 + 683cbc58f8b23fca.bf6982b029b396ea.4f1e4ed5da99d2ee.c5040fc700120f62 + 7b813bf15120fbc8 + +VCVTPS2PH_128_1(reg) + before + 07121ecd88441b7d.d2cc3eca9347d80f.74876ac63afb7562.c67d2c86fa7c09a3 + c501b4c64209aa2e.0719232dba0b82d5.6e1d4703bf5de53f.d97270f257c73303 + 109cfa471afbe686.e2ede96f8809f947.077815d35567232e.66c997070e860c39 + 
462deabeada60932.41150c7a1a4df892.9ce5d1a297a56adb.474e1bb03bc55073 + 5acd7ad9f991bada + after + 07121ecd88441b7d.d2cc3eca9347d80f.74876ac63afb7562.c67d2c86fa7c09a3 + c501b4c64209aa2e.0719232dba0b82d5.6e1d4703bf5de53f.d97270f257c73303 + 0000000000000000.0000000000000000.0000000000000000.7bffbaf0fc007bff + 462deabeada60932.41150c7a1a4df892.9ce5d1a297a56adb.474e1bb03bc55073 + 5acd7ad9f991bada +VCVTPS2PH_128_1(mem) + before + 79112f6f64f5079c.c201829797974fdd.fe5d063c8be33ce1.89ad76dc21a1f8f1 + 82e7b65c99fdf3e5.df63bd3c7359f634.f791559ff8d88161.2a1f00ed91e9071d + 4f1c8c8db3b639e1.fba1981add7938e3.067d74917c37833e.db866b418009d40a + be7a2cefcf2b96bb.70050d9d72825295.09eddffd330cfda3.f82db3448c8c9a65 + de62d56351fe96da + after + 79112f6f64f5079c.c201829797974fdd.fe5d063c8be33ce1.fc00fc0000008001 + 82e7b65c99fdf3e5.df63bd3c7359f634.f791559ff8d88161.2a1f00ed91e9071d + 4f1c8c8db3b639e1.fba1981add7938e3.067d74917c37833e.db866b418009d40a + be7a2cefcf2b96bb.70050d9d72825295.09eddffd330cfda3.f82db3448c8c9a65 + de62d56351fe96da + +VCVTPS2PH_128_1(reg) + before + e76fcc086aeb0414.a9cd126c0869c6a0.9cdd1a32cd007ff7.daac12cf3a64acbd + 6fa194a173e020c0.ede3baf27b7b85bb.d973ba438b80fdb5.56878af3ad4a4cb8 + d444299809682589.6787a06c436d8e39.8514e93e478d067a.5a4ac156a6cb98bf + 71a4885bc70f501c.f18441c67d4b9e45.fa0ba48e9db3d6f2.c0c135e244f24dfe + 65b86284a1cb27a3 + after + e76fcc086aeb0414.a9cd126c0869c6a0.9cdd1a32cd007ff7.daac12cf3a64acbd + 6fa194a173e020c0.ede3baf27b7b85bb.d973ba438b80fdb5.56878af3ad4a4cb8 + 0000000000000000.0000000000000000.0000000000000000.fc0080017bff8001 + 71a4885bc70f501c.f18441c67d4b9e45.fa0ba48e9db3d6f2.c0c135e244f24dfe + 65b86284a1cb27a3 +VCVTPS2PH_128_1(mem) + before + 26fbc229d962e2d7.a20cab554a62dd24.68a718ec4422710c.95a6e59e2a7fabcb + dd55d3bb09c439c9.c3ca90f22dec084f.a9bca1cab4fdc2ba.b330aadc8a7cbfaf + 67940f284cfce9a3.3028339e0d3a0c46.8e8f584ceae94e7a.aef4eeb358364f4a + a082f55bbf17ae91.45738ec585d726b8.f4ecb95e02f1d179.e33fad8f313a9649 + af5de4ddb013d258 
+ after + 26fbc229d962e2d7.a20cab554a62dd24.68a718ec4422710c.8001800880018001 + dd55d3bb09c439c9.c3ca90f22dec084f.a9bca1cab4fdc2ba.b330aadc8a7cbfaf + 67940f284cfce9a3.3028339e0d3a0c46.8e8f584ceae94e7a.aef4eeb358364f4a + a082f55bbf17ae91.45738ec585d726b8.f4ecb95e02f1d179.e33fad8f313a9649 + af5de4ddb013d258 + +VCVTPS2PH_256_1(reg) + before + 6c9a8e07714d3d22.64ecfe407d2043c1.5df79fd3324f914f.b79f41ec172107e2 + 7742a77a11751354.8f9ea7c3a323665c.d6006035af2e8bb7.b3736be34585abe2 + 2ca02ba32b169299.fd646dd04c2dd191.0bd9cf5599014e9d.c435b32da92a7aa5 + e70216ec5cbcf49e.8a09cb539549408a.57d0e8a18b5417ad.c6b295b85f1c3056 + 95b48e6f81658922 + after + 6c9a8e07714d3d22.64ecfe407d2043c1.5df79fd3324f914f.b79f41ec172107e2 + 7742a77a11751354.8f9ea7c3a323665c.d6006035af2e8bb7.b3736be34585abe2 + 0000000000000000.0000000000000000.7bff000080018001.fc00800180016c2d + e70216ec5cbcf49e.8a09cb539549408a.57d0e8a18b5417ad.c6b295b85f1c3056 + 95b48e6f81658922 +VCVTPS2PH_256_1(mem) + before + 0d07193d2e134034.05b265c33ff4760f.125b3d3899837173.182fa58322b12192 + 0a79b210803112b9.e0fd139371495497.7124406c74e81e7a.a9430469f9a6aaf9 + 68d5d5d393ccbadd.d52f1cc78e47c9e3.83314ed9438203c8.655800beacbeec8e + 03e6fe7283eff6cb.40947ccd307b129e.244ee56d2260de8c.aaba95edd88623fc + d2b5bf6419898df0 + after + 0d07193d2e134034.05b265c33ff4760f.00008001fc007bff.7bff7bff8001fc00 + 0a79b210803112b9.e0fd139371495497.7124406c74e81e7a.a9430469f9a6aaf9 + 68d5d5d393ccbadd.d52f1cc78e47c9e3.83314ed9438203c8.655800beacbeec8e + 03e6fe7283eff6cb.40947ccd307b129e.244ee56d2260de8c.aaba95edd88623fc + d2b5bf6419898df0 + +VCVTPS2PH_256_1(reg) + before + 3b947b8f0a536415.b779aada6ea680b0.7d772f10f5706b75.304780122c8b69f0 + 3fa5c4d84771e518.605a54f56dfe15b7.e82632fc79b30f14.83e79bb67d116120 + 3b3296ac6d6e4ba4.d95578b09e02700d.ddeb80fe57ce3c26.f9fcb34432fe8249 + 8c88ed7717d6d466.003692f81dbe4ede.b71315802c502c58.6d5043a8665c8797 + bdb0c6ce36392d36 + after + 3b947b8f0a536415.b779aada6ea680b0.7d772f10f5706b75.304780122c8b69f0 + 
3fa5c4d84771e518.605a54f56dfe15b7.e82632fc79b30f14.83e79bb67d116120 + 0000000000000000.0000000000000000.3d2e7b8f7bff7bff.fc007bff80017bff + 8c88ed7717d6d466.003692f81dbe4ede.b71315802c502c58.6d5043a8665c8797 + bdb0c6ce36392d36 +VCVTPS2PH_256_1(mem) + before + c4e8bba2bda13050.8cf3c5a6e236ba0a.b0c81fb7053f6b55.d4eaedef93c21b55 + 5dc49b10189f4c14.98bf1ba36919393b.c4d999db7390839e.8fbc05b829b247ca + 6610e404623f3cac.0d37eadc490b8fa6.1a337e4f82bd51e7.44d5584589abea63 + ba97106bb88dbd45.45a92ebc1d99f6f6.8da34afe4ed3935a.4f80e2044f3a41cb + 0b0b9f6018e987ae + after + c4e8bba2bda13050.8cf3c5a6e236ba0a.7bff000080017bff.e6cd7bff80010000 + 5dc49b10189f4c14.98bf1ba36919393b.c4d999db7390839e.8fbc05b829b247ca + 6610e404623f3cac.0d37eadc490b8fa6.1a337e4f82bd51e7.44d5584589abea63 + ba97106bb88dbd45.45a92ebc1d99f6f6.8da34afe4ed3935a.4f80e2044f3a41cb + 0b0b9f6018e987ae + +VCVTPS2PH_256_1(reg) + before + d6c08bc57f47f9ba.34279d2f35968b0a.9d5fe4af824eabd8.f8f577d6f4dd0223 + 0beca39f21ddd399.b28a073ef6656128.71a6062013b6eaf8.39f583c290e85d6f + 05dbe25a9a3951f7.0e8dc8821606fcca.1eca927d6d5eee01.2a6fe8ae3cfe5e6a + 22d9446284e6ae81.26fc5ee9b286181e.fe1783322bd1f4a0.a92e2587172ec23f + 90ffb3373b81451b + after + d6c08bc57f47f9ba.34279d2f35968b0a.9d5fe4af824eabd8.f8f577d6f4dd0223 + 0beca39f21ddd399.b28a073ef6656128.71a6062013b6eaf8.39f583c290e85d6f + 0000000000000000.0000000000000000.000000008001fc00.7bff00000fac8001 + 22d9446284e6ae81.26fc5ee9b286181e.fe1783322bd1f4a0.a92e2587172ec23f + 90ffb3373b81451b +VCVTPS2PH_256_1(mem) + before + bdaf0fabc405a22a.bd31c5237e7128e3.d4a3445ee5f0714d.6ed9d5a9ea9b3880 + 0a08b6d0cfc59797.3131620a2265f8c8.f64df6cdcb51c286.ca5b844f4549f54e + 55d7239077cddd8e.dc2316810c4e5ddd.66c8f02281b3c8f2.6eeb8d90d86668b6 + 78e7d2d9d92a333d.1854ddf6d8b991ce.01deaf4923243fc0.b6d3ebd9407ecd63 + fe609a94181e6002 + after + bdaf0fabc405a22a.bd31c5237e7128e3.0000fc0000000000.fc00fc00fc006a4f + 0a08b6d0cfc59797.3131620a2265f8c8.f64df6cdcb51c286.ca5b844f4549f54e + 
55d7239077cddd8e.dc2316810c4e5ddd.66c8f02281b3c8f2.6eeb8d90d86668b6 + 78e7d2d9d92a333d.1854ddf6d8b991ce.01deaf4923243fc0.b6d3ebd9407ecd63 + fe609a94181e6002 + +VCVTPS2PH_128_2(reg) + before + a2de962ffdd15c3e.50063f9610e753cd.4210b3d32431d146.a45cad2eccb0e21a + fe98dc158b24fec4.bafee7b33811fa6d.b7a39486894259f1.290e68be98626e2d + 6ddc67b25da28240.909c451c6eb3e447.d1587d7aa579647d.6dc05be3a4469f24 + 4df433720fd7245d.afacd5bdced9cd88.ee7d691b14613094.4d3d038a0b69312c + a353dba0ead5df70 + after + a2de962ffdd15c3e.50063f9610e753cd.4210b3d32431d146.a45cad2eccb0e21a + fe98dc158b24fec4.bafee7b33811fa6d.b7a39486894259f1.290e68be98626e2d + 0000000000000000.0000000000000000.0000000000000000.8147800000018000 + 4df433720fd7245d.afacd5bdced9cd88.ee7d691b14613094.4d3d038a0b69312c + a353dba0ead5df70 +VCVTPS2PH_128_2(mem) + before + 4ccc5e105c99661d.f92e3cc13e4f1fc8.f3fa1382738f705b.685c54d57186f6e2 + 24750ac67ebe825f.cdd47e0b8597b02c.38527c577ae28aed.9c423a145875f514 + 071b5bad6b52ee61.2533f6bc813a1336.5b808a28feded669.e77b184466b967d6 + d187cbb340606850.5c979f40cdc58392.364fbbe21b8d12fc.a353e8d137de89d3 + 2d16a827667197b8 + after + 4ccc5e105c99661d.f92e3cc13e4f1fc8.f3fa1382738f705b.034a7c0080007c00 + 24750ac67ebe825f.cdd47e0b8597b02c.38527c577ae28aed.9c423a145875f514 + 071b5bad6b52ee61.2533f6bc813a1336.5b808a28feded669.e77b184466b967d6 + d187cbb340606850.5c979f40cdc58392.364fbbe21b8d12fc.a353e8d137de89d3 + 2d16a827667197b8 + +VCVTPS2PH_128_2(reg) + before + e11053b38ffdcd30.5e88d8c318f5aa57.d04b750405c33deb.a68d8a6feefdf8d2 + 1b8ce6e04f0e66e8.8ae9fdca101c70a3.dc9d7472c7c07dee.870474bd92394516 + 37d75b1941319f8c.3175b6b243e17860.dbd798f8ac487f46.b581f3b7244eb4f5 + 913db0cc02f1b3c7.2ff97f68cd517cb9.2b46de0152e87ea0.0ccf8549bf47029a + 68bca55e8030eb32 + after + e11053b38ffdcd30.5e88d8c318f5aa57.d04b750405c33deb.a68d8a6feefdf8d2 + 1b8ce6e04f0e66e8.8ae9fdca101c70a3.dc9d7472c7c07dee.870474bd92394516 + 0000000000000000.0000000000000000.0000000000000000.fbfffbff80008000 + 
913db0cc02f1b3c7.2ff97f68cd517cb9.2b46de0152e87ea0.0ccf8549bf47029a + 68bca55e8030eb32 +VCVTPS2PH_128_2(mem) + before + a513cfe482162be8.850ae0642ddae046.6041d5d9cb7738db.263641f8552cb7a7 + 9e9f80c6e2047dea.6f8ae74d5f7960b4.a01933ef595f6af1.2af3bd4b509e6608 + 312d32f1bb069e61.ab09c2f3335970be.cb4d15989216cc28.91c94f65dfccc66f + 3989634f2a294a7c.95d26cc246074b10.bda9f7bf92a71bac.b903f1b29f411487 + fcefa19f2c8a8cfd + after + a513cfe482162be8.850ae0642ddae046.6041d5d9cb7738db.80007c0000017c00 + 9e9f80c6e2047dea.6f8ae74d5f7960b4.a01933ef595f6af1.2af3bd4b509e6608 + 312d32f1bb069e61.ab09c2f3335970be.cb4d15989216cc28.91c94f65dfccc66f + 3989634f2a294a7c.95d26cc246074b10.bda9f7bf92a71bac.b903f1b29f411487 + fcefa19f2c8a8cfd + +VCVTPS2PH_128_2(reg) + before + b8d75a9620326a7d.927f8ecd4a783d65.8932e026330d2e55.52f8564f761e13a8 + 470818041ac5e9b2.18db305838ff3248.e3761d8b97fa553a.6508ac365a886f48 + 06ced856b4d04648.a668c3da0fcbe652.ffe81c5e0d57fc6a.d4a3775f58f0ecba + 527594f68adebded.1af4c541ebe715af.39d4db0931b25e92.7a9632b68f624628 + 32ad5a2818eb39be + after + b8d75a9620326a7d.927f8ecd4a783d65.8932e026330d2e55.52f8564f761e13a8 + 470818041ac5e9b2.18db305838ff3248.e3761d8b97fa553a.6508ac365a886f48 + 0000000000000000.0000000000000000.0000000000000000.fbff80007c007c00 + 527594f68adebded.1af4c541ebe715af.39d4db0931b25e92.7a9632b68f624628 + 32ad5a2818eb39be +VCVTPS2PH_128_2(mem) + before + dab3699f129680a9.85484a52397b894a.4f49b178e95f7a8a.ed8854faa096b85e + 4d76dd08966fd815.fc95f5d55c34e70e.2034036b2540d210.764f859cf68f4679 + 66c03150c383fd2d.13a692ea909413e3.6b813705ba95d96d.32746a5ace2a448f + 035ee161b2ddaa1e.27c81bff70274976.0afcca34c46a4acc.7c44fda2c4f3ed4e + b00b3cdf75747e60 + after + dab3699f129680a9.85484a52397b894a.4f49b178e95f7a8a.000100017c00fbff + 4d76dd08966fd815.fc95f5d55c34e70e.2034036b2540d210.764f859cf68f4679 + 66c03150c383fd2d.13a692ea909413e3.6b813705ba95d96d.32746a5ace2a448f + 035ee161b2ddaa1e.27c81bff70274976.0afcca34c46a4acc.7c44fda2c4f3ed4e + b00b3cdf75747e60 
+ +VCVTPS2PH_256_2(reg) + before + 2915227d7d3b3371.fe1c6a2981899c14.92478e7f987ac472.db7137e460cce35a + 45aeabe876d3472e.35c647934c948f3a.b16fe6d6a518c184.b9abfaffa9c65e42 + 5e21d38dffc9f743.8228f38b2e093fbc.e08c1f71338e7c57.7f778f72bc6577b1 + d2bc96d6b1a87f5b.c30eedfc43f567c8.7be936badd663098.0aa27329b5b3ecd2 + 37f62011aebf77d2 + after + 2915227d7d3b3371.fe1c6a2981899c14.92478e7f987ac472.db7137e460cce35a + 45aeabe876d3472e.35c647934c948f3a.b16fe6d6a518c184.b9abfaffa9c65e42 + 0000000000000000.0000000000000000.6d767c0000197c00.800080008d5f8000 + d2bc96d6b1a87f5b.c30eedfc43f567c8.7be936badd663098.0aa27329b5b3ecd2 + 37f62011aebf77d2 +VCVTPS2PH_256_2(mem) + before + e0401415c692d5dd.fbc9f1302bfc1b23.d243aed4a862c488.ded3251e3f2e1bf3 + e34bca20163ac21e.3795df0806520647.8d94b3ff795f1228.6597ea0af6727713 + 1ae5bf20bcc2f9c2.e06c5cc8e1357d72.cece7967d1f50cd5.6c7f80e89ebd80a5 + 62d86d00d43737f5.549a65de5531bc50.72bb7bf9cc326fbb.4fd7e326d29b7454 + 6aed102f2e988dcd + after + e0401415c692d5dd.fbc9f1302bfc1b23.fbff0001012c0001.80007c007c00fbff + e34bca20163ac21e.3795df0806520647.8d94b3ff795f1228.6597ea0af6727713 + 1ae5bf20bcc2f9c2.e06c5cc8e1357d72.cece7967d1f50cd5.6c7f80e89ebd80a5 + 62d86d00d43737f5.549a65de5531bc50.72bb7bf9cc326fbb.4fd7e326d29b7454 + 6aed102f2e988dcd + +VCVTPS2PH_256_2(reg) + before + 190c026f4f4108bb.97f152ac79a338e2.ed6bf4b500d2fe8f.552735a28721f705 + b87fb552d02120cc.96fce910c815b7b5.082a07b97ea580d9.54e0244c1dcf60e0 + a29325444ec512a9.39c5af18dc96719b.022499566a367eda.49b0c2e5ab476577 + 35954eb164b81a01.5d181eb0d13422c0.35a6a7f8600f343f.11658d574d95c3f7 + 88c900901dc5368c + after + 190c026f4f4108bb.97f152ac79a338e2.ed6bf4b500d2fe8f.552735a28721f705 + b87fb552d02120cc.96fce910c815b7b5.082a07b97ea580d9.54e0244c1dcf60e0 + 0000000000000000.0000000000000000.83fefbff8000fbff.00017c007c000001 + 35954eb164b81a01.5d181eb0d13422c0.35a6a7f8600f343f.11658d574d95c3f7 + 88c900901dc5368c +VCVTPS2PH_256_2(mem) + before + 
8aa9987b39e47961.cccf2d05af86747e.dec1b4c5c4fa8650.fefa2b0bfdbeddb4 + f3d1cf04bdfd4aa3.c38dbdaccabb5bcc.988bec41d1f55876.ba6d23fbddcfb6e4 + 9f0d2317c41d637d.751dfa1352e40c98.674442111330555e.c34a8a359bcdfb7c + 6a280fa06b4f801c.40c9e0a4e28cc38e.27b63222a6b73935.76df5c23d344e727 + 2f76953322c0b892 + after + 8aa9987b39e47961.cccf2d05af86747e.fbffafeadc6dfbff.8000fbff9369fbff + f3d1cf04bdfd4aa3.c38dbdaccabb5bcc.988bec41d1f55876.ba6d23fbddcfb6e4 + 9f0d2317c41d637d.751dfa1352e40c98.674442111330555e.c34a8a359bcdfb7c + 6a280fa06b4f801c.40c9e0a4e28cc38e.27b63222a6b73935.76df5c23d344e727 + 2f76953322c0b892 + +VCVTPS2PH_256_2(reg) + before + 4a5c32cf23cea869.30f00f8bcd9f5fac.7fdf6bcd1740bc59.b4ae395fa797c027 + 23de2e6573f9f357.cd2f9fc5071aba58.8da998f88c8b32a6.eaf8d1b431daa560 + 16458560adcdd709.1db23c3834cb4d4d.c8746293ddf96221.a55f780d618fa50b + 7fe0332c6ed78e2a.fc4561d270bed6b6.8a8cc509a7178875.c1b1aa5552bf7b54 + 1ab8e17b2178e568 + after + 4a5c32cf23cea869.30f00f8bcd9f5fac.7fdf6bcd1740bc59.b4ae395fa797c027 + 23de2e6573f9f357.cd2f9fc5071aba58.8da998f88c8b32a6.eaf8d1b431daa560 + 0000000000000000.0000000000000000.00017c00fbff0001.80008000fbff0001 + 7fe0332c6ed78e2a.fc4561d270bed6b6.8a8cc509a7178875.c1b1aa5552bf7b54 + 1ab8e17b2178e568 +VCVTPS2PH_256_2(mem) + before + 8c419b68e9c69d73.bafa353551a25467.46b48a7dd8000fc0.313cbec68670df4e + f3185309c7b360a0.23de85e7f3ba676c.d7ca3327879cb597.17d247361590a45a + 2a09854ad64de91c.16da21aeefac01e4.8b55d9bb9a9e8466.9a985ec5f0031343 + 0fe0ad1832a0f513.ef3804f7e2035f7c.3d1ff6252d13375a.14dcfee0b45668b5 + e2e823f1fc15de5d + after + 8c419b68e9c69d73.bafa353551a25467.fbfffbff0001fbff.fbff800000010001 + f3185309c7b360a0.23de85e7f3ba676c.d7ca3327879cb597.17d247361590a45a + 2a09854ad64de91c.16da21aeefac01e4.8b55d9bb9a9e8466.9a985ec5f0031343 + 0fe0ad1832a0f513.ef3804f7e2035f7c.3d1ff6252d13375a.14dcfee0b45668b5 + e2e823f1fc15de5d + +VCVTPS2PH_128_3(reg) + before + 6109ca6565cab2e7.7d69475df9b640b0.0a452b2c674cbddf.cbf508515b068b9e + 
eb8aeda98a0320fe.506fd007449d8620.c34d90bb1a1256ba.10a38a2b40833c5f + ddb98a28084c634f.63bfc3013161828e.759b310e98e167b9.e8f5f99ff99706c8 + 94e09c4d7a2fb985.94259c37dc0df227.7e7d09937d452c87.2eb7cf99a14da407 + c0b48a0655b1d345 + after + 6109ca6565cab2e7.7d69475df9b640b0.0a452b2c674cbddf.cbf508515b068b9e + eb8aeda98a0320fe.506fd007449d8620.c34d90bb1a1256ba.10a38a2b40833c5f + 0000000000000000.0000000000000000.0000000000000000.da6c000000004419 + 94e09c4d7a2fb985.94259c37dc0df227.7e7d09937d452c87.2eb7cf99a14da407 + c0b48a0655b1d345 +VCVTPS2PH_128_3(mem) + before + 78baa5d030d04fb1.6a4d20867d3a5b4d.bd6dd8955fad8f17.393d14b564cbe1d0 + 34939ce54eb5d374.bc4a103eacf98853.bc63f107d94d1889.02284fdfe9fec142 + ce0cec2fcc6d1cbd.e6246ae1a4f77a42.6cd3657964fa47a9.348ab47fa96b0987 + 24f10f9cc602e6b6.442729db00c06ec7.a888afd71cbfd9a5.2daf41013f9df44b + 48e3f1cf4820c03b + after + 78baa5d030d04fb1.6a4d20867d3a5b4d.bd6dd8955fad8f17.a31ffbff0000fbff + 34939ce54eb5d374.bc4a103eacf98853.bc63f107d94d1889.02284fdfe9fec142 + ce0cec2fcc6d1cbd.e6246ae1a4f77a42.6cd3657964fa47a9.348ab47fa96b0987 + 24f10f9cc602e6b6.442729db00c06ec7.a888afd71cbfd9a5.2daf41013f9df44b + 48e3f1cf4820c03b + +VCVTPS2PH_128_3(reg) + before + e072c1566081a703.100e83175782ed8c.329e49985ce0a08d.4e504c0d1ea88aa7 + 53a7ab02214be64e.702ec38c9cf9ec6a.0cd7c78555e44c41.38f5b60885c215db + fbcfad402a0ab8c9.1e1f4ce7b072a07d.2e1c9d0c8757ad8f.43446bb26e18386e + 3637c27a144a5b20.f8ab9814aff9c5f0.f4bac99b8dc50022.4c09e6f9f4b7ac8c + 2d0fa3c734a93060 + after + e072c1566081a703.100e83175782ed8c.329e49985ce0a08d.4e504c0d1ea88aa7 + 53a7ab02214be64e.702ec38c9cf9ec6a.0cd7c78555e44c41.38f5b60885c215db + 0000000000000000.0000000000000000.0000000000000000.00007bff07ad8000 + 3637c27a144a5b20.f8ab9814aff9c5f0.f4bac99b8dc50022.4c09e6f9f4b7ac8c + 2d0fa3c734a93060 +VCVTPS2PH_128_3(mem) + before + c3241e0a49fd7e17.5e28e61e7d9809fe.d89f25ffb69a16f0.bafd469c03bb81a7 + eb12d4ad50bc53dc.d1f115970180fe0f.9bc76e95e06250a9.dc31117d86c46bc9 + 
7fa6409c64f46bdc.e9dd4c503b8c7801.1defefc04a5c2f46.b6a224a9b26dfb35 + 5caddec3a1b08243.033786b7c84ab17d.3be2256e10956ff4.a49c7d8b21406d97 + 026a179172ccfc9a + after + c3241e0a49fd7e17.5e28e61e7d9809fe.d89f25ffb69a16f0.8000fbfffbff8000 + eb12d4ad50bc53dc.d1f115970180fe0f.9bc76e95e06250a9.dc31117d86c46bc9 + 7fa6409c64f46bdc.e9dd4c503b8c7801.1defefc04a5c2f46.b6a224a9b26dfb35 + 5caddec3a1b08243.033786b7c84ab17d.3be2256e10956ff4.a49c7d8b21406d97 + 026a179172ccfc9a + +VCVTPS2PH_128_3(reg) + before + 2c59ee263f9ae6eb.5ef02a0e24fd533c.7c4dbf374346e632.cf6e8a894c18cbde + 7db5feb724386535.623ea06909e69bf4.ae69f33c480a53ca.b65d9cff1df10031 + 53ca44aebd31b525.4262bdc16b771596.f6d81f33742433f2.cc7dd6bb9c2cca19 + 09e4bb78a8121467.db27fc0066bc7f4f.b0e63d866320c355.ed98b4a9e8d6e4c1 + f57ab3b51afb0c56 + after + 2c59ee263f9ae6eb.5ef02a0e24fd533c.7c4dbf374346e632.cf6e8a894c18cbde + 7db5feb724386535.623ea06909e69bf4.ae69f33c480a53ca.b65d9cff1df10031 + 0000000000000000.0000000000000000.0000000000000000.80007bff80370000 + 09e4bb78a8121467.db27fc0066bc7f4f.b0e63d866320c355.ed98b4a9e8d6e4c1 + f57ab3b51afb0c56 +VCVTPS2PH_128_3(mem) + before + c0f14ecb50a5fc04.fb4f5f827e66bca6.095bd91417c2934b.39df4ba2b0883fa0 + 2ac801d7a6e270f6.84562c36ddb9ea8e.a8c8d0e79a950eb5.eb0e45f4f7eae27e + 0fca48c537bd2658.02471f026197d9cd.943b5e67093fabba.23c025e6d5d2e99c + 4bc49f812043d857.cf6c3250a58dc60d.a980fe7f83ce2785.3d42e9e6e5cb90a6 + 95de8b5fc4611347 + after + c0f14ecb50a5fc04.fb4f5f827e66bca6.095bd91417c2934b.80008000fbfffbff + 2ac801d7a6e270f6.84562c36ddb9ea8e.a8c8d0e79a950eb5.eb0e45f4f7eae27e + 0fca48c537bd2658.02471f026197d9cd.943b5e67093fabba.23c025e6d5d2e99c + 4bc49f812043d857.cf6c3250a58dc60d.a980fe7f83ce2785.3d42e9e6e5cb90a6 + 95de8b5fc4611347 + +VCVTPS2PH_256_3(reg) + before + 87e109bc0d20ad2c.ba8283f87c7f421f.4912638e4626edfa.c3622c1b224d3e43 + 6f975f6b5d959b00.38d06f14677d22db.cb85ad27dfef8a41.beaf642702c9ac20 + a94b87d74f4b1970.a17adfc3fe4a32b8.b0100d870c73d98e.7631228f404d2c47 + 
914b7f6c80ce6328.d14c4ff05df12fe2.56017d1a6a3e158c.c6b5e33ff7e57be5 + 8c072223439e5525 + after + 87e109bc0d20ad2c.ba8283f87c7f421f.4912638e4626edfa.c3622c1b224d3e43 + 6f975f6b5d959b00.38d06f14677d22db.cb85ad27dfef8a41.beaf642702c9ac20 + 0000000000000000.0000000000000000.7bff7bff06837bff.fbfffbffb57b0000 + 914b7f6c80ce6328.d14c4ff05df12fe2.56017d1a6a3e158c.c6b5e33ff7e57be5 + 8c072223439e5525 +VCVTPS2PH_256_3(mem) + before + a3f35b2742837634.83e142978babb4d3.a4113b879f7ed584.17a2fb4c94dd7be8 + e7450a380da0993e.db5accc20d6d491e.f5972073e0fedfcb.5040828927db464e + 33c9ac40c2027b6e.f502195aa1a15db4.a2a879a23d7c0ae2.5e270e3ebfc4b369 + 654abc2d7db4d8f1.2c2526cce3d22e42.8611c200d10412f6.9ba39cd5f625fee5 + 640027bc6b896370 + after + a3f35b2742837634.83e142978babb4d3.fbff0000fbff0000.fbfffbff7bff0000 + e7450a380da0993e.db5accc20d6d491e.f5972073e0fedfcb.5040828927db464e + 33c9ac40c2027b6e.f502195aa1a15db4.a2a879a23d7c0ae2.5e270e3ebfc4b369 + 654abc2d7db4d8f1.2c2526cce3d22e42.8611c200d10412f6.9ba39cd5f625fee5 + 640027bc6b896370 + +VCVTPS2PH_256_3(reg) + before + 168aaa5db77c1eb3.5895b6ea59c26bf2.de0fee83708cf673.7d9e7877b9a3b333 + 0b8c2426798b6a5d.e77616637239f19e.660b6deae45bf2f5.621a15f41064a8c4 + a0538e824a418418.f0b95884e5242620.20669f6fdacc5d71.13531763f353aed5 + 312ce5ddc92aa790.4e2af939ce90c5a5.676d807dee6a7596.6a13f9b17d7d8194 + 452737c8cbeddc2b + after + 168aaa5db77c1eb3.5895b6ea59c26bf2.de0fee83708cf673.7d9e7877b9a3b333 + 0b8c2426798b6a5d.e77616637239f19e.660b6deae45bf2f5.621a15f41064a8c4 + 0000000000000000.0000000000000000.00007bfffbff7bff.7bfffbff7bff0000 + 312ce5ddc92aa790.4e2af939ce90c5a5.676d807dee6a7596.6a13f9b17d7d8194 + 452737c8cbeddc2b +VCVTPS2PH_256_3(mem) + before + 7f5b4e96f8b07cc6.1a6126a38fd31173.dc95746e47878c59.996a0d80bdc2740e + f37bb80620d01d92.b83e4c403ac7fc6a.78c5450f6f173567.11aa41e4e25f9685 + 7e53f304605c7bbb.651153206692a424.fc88e808604c7cfc.2a781815facd19a8 + fdaf7a7cd45d516c.7e2538b0aec1474b.46a8d94636311f44.42228e7fa1993723 + b3f2a08f714e2da1 
+ after + 7f5b4e96f8b07cc6.1a6126a38fd31173.fbff000082f9163f.7bff7bff0000fbff + f37bb80620d01d92.b83e4c403ac7fc6a.78c5450f6f173567.11aa41e4e25f9685 + 7e53f304605c7bbb.651153206692a424.fc88e808604c7cfc.2a781815facd19a8 + fdaf7a7cd45d516c.7e2538b0aec1474b.46a8d94636311f44.42228e7fa1993723 + b3f2a08f714e2da1 + +VCVTPS2PH_256_3(reg) + before + db3648af097836cf.4a5aca5a97e15cd2.5fc5f55aaedf1f8b.30f295b30ed2d86a + 1438844d02a38f59.43215d8ac5f35818.643e888b03796992.9732973d033b649a + bc4550d3fa5c74ea.c2d1b1f87b9f006c.e9dcecb049196109.58335bce32797f02 + 2da9a6af8d2f212e.a4344190d5f211f7.4aeb1e341b4e429f.4dc35e54b697e4cc + 53ac1abaaba25024 + after + db3648af097836cf.4a5aca5a97e15cd2.5fc5f55aaedf1f8b.30f295b30ed2d86a + 1438844d02a38f59.43215d8ac5f35818.643e888b03796992.9732973d033b649a + 0000000000000000.0000000000000000.00000000590aef9a.7bff000080000000 + 2da9a6af8d2f212e.a4344190d5f211f7.4aeb1e341b4e429f.4dc35e54b697e4cc + 53ac1abaaba25024 +VCVTPS2PH_256_3(mem) + before + 47bb0dec2ea57f37.c5af844c56a6d2d3.c616893fedf747e7.e3b7188215a149fe + 03bdb2d65bac2c31.dea5e516f24fc282.024505efe2bb5e68.0f8bd808d4a0b2d2 + 647b85644dc3143d.d5d5c579fcb62eea.358c328ece4911a6.2cb55931f3d6b9c8 + 46c4038221f7f388.078c20e1106551b5.3bb68b07cdad1dcc.957f97690fcf998c + a6368e1cc3188fca + after + 47bb0dec2ea57f37.c5af844c56a6d2d3.00007bfffbfffbff.0000fbff0000fbff + 03bdb2d65bac2c31.dea5e516f24fc282.024505efe2bb5e68.0f8bd808d4a0b2d2 + 647b85644dc3143d.d5d5c579fcb62eea.358c328ece4911a6.2cb55931f3d6b9c8 + 46c4038221f7f388.078c20e1106551b5.3bb68b07cdad1dcc.957f97690fcf998c + a6368e1cc3188fca + +VCVTPS2PH_128_4(reg) + before + bb263bb7ac3dd62d.8563a61df253853d.ce16f2bacbea6990.f0908c45fcf43e06 + 2f9b99a465c8ac61.fd23ec1fdce48589.87bf3870c9d1b026.30e6b13676282f82 + 60e0a4508b474b13.8ad25076fcb5b098.8ed3ed6fa5a46224.d78477c55858ae69 + a9435828b945f0ef.083a4f0c6dd2c295.409d0d24fbf1bd35.c23659debd8d75ea + cbc7d36dc1d5402f + after + bb263bb7ac3dd62d.8563a61df253853d.ce16f2bacbea6990.f0908c45fcf43e06 + 
2f9b99a465c8ac61.fd23ec1fdce48589.87bf3870c9d1b026.30e6b13676282f82 + 0000000000000000.0000000000000000.0000000000000000.8000fc0000007c00 + a9435828b945f0ef.083a4f0c6dd2c295.409d0d24fbf1bd35.c23659debd8d75ea + cbc7d36dc1d5402f +VCVTPS2PH_128_4(mem) + before + cf05615f813bcd64.650eb2968b4fd6a0.532863cf4c4877ad.f76b95fa6844fb06 + a9ba7f9e19ccd6b6.f28eac089ff03bd3.47680aaab4228a0b.10877f5c87275943 + 37b3aa17a4931751.aa0f44e98eb45934.c0c5bf89c26cb8dc.e73ec9b8f5291397 + 54bc5db73e9c4e61.ebdd75a5f6276c6e.9a0dfe589133bc4c.d8dc6e794dd364af + 3fad6a0b2cb38936 + after + cf05615f813bcd64.650eb2968b4fd6a0.532863cf4c4877ad.7b40800300008000 + a9ba7f9e19ccd6b6.f28eac089ff03bd3.47680aaab4228a0b.10877f5c87275943 + 37b3aa17a4931751.aa0f44e98eb45934.c0c5bf89c26cb8dc.e73ec9b8f5291397 + 54bc5db73e9c4e61.ebdd75a5f6276c6e.9a0dfe589133bc4c.d8dc6e794dd364af + 3fad6a0b2cb38936 + +VCVTPS2PH_128_4(reg) + before + 78fcbada2d54bed9.dca1146904f43511.0f443ca873d6b22d.b10a44033e825486 + df175852ed423e44.ab2d4b1812a6898d.7490935e9f4d651f.e1890b76e4653ab7 + d04842df070a4722.9d2e7eb283be0602.740c78331916c2ee.0656d19da0e92b0a + a7dc73ed18371320.8e6e2a227349679c.6d05e6937bbf0446.fc3d11658d19e2ac + 9e0a48b8c8011cc8 + after + 78fcbada2d54bed9.dca1146904f43511.0f443ca873d6b22d.b10a44033e825486 + df175852ed423e44.ab2d4b1812a6898d.7490935e9f4d651f.e1890b76e4653ab7 + 0000000000000000.0000000000000000.0000000000000000.7c008000fc00fc00 + a7dc73ed18371320.8e6e2a227349679c.6d05e6937bbf0446.fc3d11658d19e2ac + 9e0a48b8c8011cc8 +VCVTPS2PH_128_4(mem) + before + ca89f0846cae958b.bf1fe8e9bb56dee9.59baca54ff526986.b9c7d9eb61d469d4 + 5b8587b3952b0921.765d9b3d8cf2e62a.dcdeda3442e5c8ed.b59e4ea568df2b44 + 2ccb8833608433b6.27e28a572897658e.f2a6d6ae590f40fd.bed6402f2b6e8641 + 1b276fefe9c6d174.2ef9b0a22bd197c3.76de3baf5fdb8ce1.2ebbabf3470db878 + 62988b5f5746fb94 + after + ca89f0846cae958b.bf1fe8e9bb56dee9.59baca54ff526986.fc00572e80147c00 + 5b8587b3952b0921.765d9b3d8cf2e62a.dcdeda3442e5c8ed.b59e4ea568df2b44 + 
2ccb8833608433b6.27e28a572897658e.f2a6d6ae590f40fd.bed6402f2b6e8641 + 1b276fefe9c6d174.2ef9b0a22bd197c3.76de3baf5fdb8ce1.2ebbabf3470db878 + 62988b5f5746fb94 + +VCVTPS2PH_128_4(reg) + before + b6badcdef8a78c42.0365b8d34bfc9c8a.e7f00989302dba72.46518421715669c6 + 876d9bdcc5bca72e.bf51e0cba2325322.ad11927ad336084a.3ccd2df1aa8a93d7 + 2ffdc2c55b0f8703.2db762c30b75b069.dea946e0b179bef5.361cb20c2785c541 + 0bb64f05552e696e.2762baa7a1d0708a.d50420276581181f.0f0b8f5d0353bc2f + a26641cf5aff34ce + after + b6badcdef8a78c42.0365b8d34bfc9c8a.e7f00989302dba72.46518421715669c6 + 876d9bdcc5bca72e.bf51e0cba2325322.ad11927ad336084a.3ccd2df1aa8a93d7 + 0000000000000000.0000000000000000.0000000000000000.8000fc0026698000 + 0bb64f05552e696e.2762baa7a1d0708a.d50420276581181f.0f0b8f5d0353bc2f + a26641cf5aff34ce +VCVTPS2PH_128_4(mem) + before + ccf943504995e94a.77e43d084fa5891a.8b20646381504fb2.ed6e1ebb8cda5175 + 6a8f11cbec2196ce.cfd2893ae6ff22b4.33bbdde4c7ff080c.84323c09c110a7a3 + 56f6272c5eb0f887.6fce2e08b6c871e8.fc009f148dec59c5.f49e747ba1b05354 + 6ef8025fbb4dcba1.b32227dc5a8cb261.c3bb28e1f220fb09.9bfcc47ec7469435 + d399277fd05ca4f2 + after + ccf943504995e94a.77e43d084fa5891a.8b20646381504fb2.0001fc008000c885 + 6a8f11cbec2196ce.cfd2893ae6ff22b4.33bbdde4c7ff080c.84323c09c110a7a3 + 56f6272c5eb0f887.6fce2e08b6c871e8.fc009f148dec59c5.f49e747ba1b05354 + 6ef8025fbb4dcba1.b32227dc5a8cb261.c3bb28e1f220fb09.9bfcc47ec7469435 + d399277fd05ca4f2 + +VCVTPS2PH_256_4(reg) + before + fac199e95780c036.8c621d512005ca47.f71b72246ed821cc.62b7f4350fa9ad03 + 6ac01727f93e24ad.8d00f4eca9e2b2a5.97050b4a8f37f9d4.b7c27dfe029229e0 + 81e19ba751200b05.4e9e031d71f33fe9.ef2cecbc58357726.9ca64127e7e72ccc + 997223d4fcb4b3e7.a908c6d194412d3c.5bdb13e5665fd76e.ee30ba9cd9b572f2 + 892d6649f507b77e + after + fac199e95780c036.8c621d512005ca47.f71b72246ed821cc.62b7f4350fa9ad03 + 6ac01727f93e24ad.8d00f4eca9e2b2a5.97050b4a8f37f9d4.b7c27dfe029229e0 + 0000000000000000.0000000000000000.7c00fc0080008000.8000800081850000 + 
997223d4fcb4b3e7.a908c6d194412d3c.5bdb13e5665fd76e.ee30ba9cd9b572f2 + 892d6649f507b77e +VCVTPS2PH_256_4(mem) + before + 4966c11a56eab69e.0fbdaa1a95855502.7b09baf22fda37cd.37607b3155405557 + 0baa45fb18692c7b.3f0f2ce5b8203000.a06e5a6e5dc91ac6.de05200cbf652c8e + a9c470d95890d444.86b5a6a102107e8e.f40422303b1b9254.ae17ffe2435999be + 007fbd9e326c6c23.3ef813ba0fe17c57.53958e24e4db5aa2.02e2121f7aa8d894 + 36b2573003de24bc + after + 4966c11a56eab69e.0fbdaa1a95855502.0000000038798281.80007c00fc00bb29 + 0baa45fb18692c7b.3f0f2ce5b8203000.a06e5a6e5dc91ac6.de05200cbf652c8e + a9c470d95890d444.86b5a6a102107e8e.f40422303b1b9254.ae17ffe2435999be + 007fbd9e326c6c23.3ef813ba0fe17c57.53958e24e4db5aa2.02e2121f7aa8d894 + 36b2573003de24bc + +VCVTPS2PH_256_4(reg) + before + a5d2c97f7788bae1.eca9a838c108ae44.c4276f9d7a206608.9aed1b36751530dd + ad90647a7432d6af.490cad91217056d5.748db6b4df58784c.a3da435209d5ce30 + a936258b9666b4d4.f37549976fb022ff.aa75e46ddb16edd0.4b278464bc28f0c8 + f411072078b28ffa.c6b0f635c6d5ccf1.c32cf63309e40240.6e9f5a58ac1a54d1 + e80f3f1bf2b5b476 + after + a5d2c97f7788bae1.eca9a838c108ae44.c4276f9d7a206608.9aed1b36751530dd + ad90647a7432d6af.490cad91217056d5.748db6b4df58784c.a3da435209d5ce30 + 0000000000000000.0000000000000000.80007c007c000000.7c00fc0080000000 + f411072078b28ffa.c6b0f635c6d5ccf1.c32cf63309e40240.6e9f5a58ac1a54d1 + e80f3f1bf2b5b476 +VCVTPS2PH_256_4(mem) + before + 9444b197ac07cce3.ec6d05a4b6a1a4cd.9e88325743eb11d5.19fee7710650f247 + 50072abaf61c5a46.eb961e83edc02ffa.57cb79e901fcadd7.0937b3956de6fb92 + f727286eebfe18c9.4fd84b29b99a6b2d.cd4345d71d165b24.ef23560adb3157cc + 656f27a1a7bbc398.727e8a02b5bb9511.dbbd140db245d8e8.270ecc3cebbd43a2 + 0d4312973a16fac0 + after + 9444b197ac07cce3.ec6d05a4b6a1a4cd.7c00fc00fc00fc00.7c00000000007c00 + 50072abaf61c5a46.eb961e83edc02ffa.57cb79e901fcadd7.0937b3956de6fb92 + f727286eebfe18c9.4fd84b29b99a6b2d.cd4345d71d165b24.ef23560adb3157cc + 656f27a1a7bbc398.727e8a02b5bb9511.dbbd140db245d8e8.270ecc3cebbd43a2 + 0d4312973a16fac0 
+ +VCVTPS2PH_256_4(reg) + before + fd0f238763c9b9d1.76aaa13e475e17e0.b2d6d57a7db0e953.5f056177dd93e04f + 52bffb790361bc82.06a61431e6f4cfcd.692a2afdae04a39e.34e7a802b90e2f84 + 6a9d96d7b56b3f7e.f02dfb66a188a88b.f4c785f8e443fea0.362f659862c280b3 + a0f5f10f15717d72.120cd2c993275e44.b0f9e0d5b9fa3702.41a91527f6b99009 + 302032998e011bb2 + after + fd0f238763c9b9d1.76aaa13e475e17e0.b2d6d57a7db0e953.5f056177dd93e04f + 52bffb790361bc82.06a61431e6f4cfcd.692a2afdae04a39e.34e7a802b90e2f84 + 0000000000000000.0000000000000000.7c0000000000fc00.7c00800000078871 + a0f5f10f15717d72.120cd2c993275e44.b0f9e0d5b9fa3702.41a91527f6b99009 + 302032998e011bb2 +VCVTPS2PH_256_4(mem) + before + e1613adc48a6dcd9.5015078bc002b309.470f1546d9dbad27.f70c3901ccb48a72 + 2f38a8db40b290ab.d648d4b952a71df1.6a0141c98eb2505e.264b8be9b6fd329c + f571f9829134f354.8dd9540466eef7d3.59b0d13fcfb80416.9a04d2f816626c2c + 11d8a7bd5735c0ff.d31583d898627c5e.efe64192b7f7857a.ad810a9a856e74cd + bc0f303ba1ad862b + after + e1613adc48a6dcd9.5015078bc002b309.00004595fc007c00.7c0080000000807f + 2f38a8db40b290ab.d648d4b952a71df1.6a0141c98eb2505e.264b8be9b6fd329c + f571f9829134f354.8dd9540466eef7d3.59b0d13fcfb80416.9a04d2f816626c2c + 11d8a7bd5735c0ff.d31583d898627c5e.efe64192b7f7857a.ad810a9a856e74cd + bc0f303ba1ad862b + diff --git a/none/tests/amd64/f16c.vgtest b/none/tests/amd64/f16c.vgtest new file mode 100644 index 000000000..759ea0726 --- /dev/null +++ b/none/tests/amd64/f16c.vgtest @@ -0,0 +1,3 @@ +prog: f16c +prereq: test -x f16c && ../../../tests/x86_amd64_features amd64-f16c +vgopts: -q diff --git a/none/tests/amd64/rdrand.c b/none/tests/amd64/rdrand.c new file mode 100644 index 000000000..124d7a8be --- /dev/null +++ b/none/tests/amd64/rdrand.c @@ -0,0 +1,116 @@ + +#include +#include +#include +#include "tests/malloc.h" + +typedef unsigned char UChar; +typedef unsigned int UInt; +typedef unsigned long int UWord; +typedef unsigned long long int ULong; + +// What can we actually test here? 
The instructions take no input and +// produce output which is by definition totally random. So apart from +// not simply failing insn decode, there's nothing much to test. + +// Get 10 values of each size, and check that they are not all the same +// (otherwise something's obviously wrong). Now, statistically, it's +// highly unlikely that they are all the same. For ten 16-bit ints, the +// probability of them being all the same is (I'd guess) (2^-16) ^ (10-1), +// that is, 2^-144. + +ULong do_rdrand64 ( void ) +{ + while (1) { + ULong res = 0; + ULong cflag = 0; + __asm__ __volatile__( + "movabsq $0x5555555555555555, %%r11 ; " + "movq $0, %%r12 ; " + "rdrand %%r11 ; " + "setc %%r12b ; " + "movq %%r11, %0 ; " + "movq %%r12, %1" + : "=r"(res), "=r"(cflag) : : "r11", "r12" + ); + if (cflag == 1) + return res; + } + /*NOTREACHED*/ +} + +ULong do_rdrand32 ( void ) +{ + while (1) { + ULong res = 0; + ULong cflag = 0; + __asm__ __volatile__( + "movabsq $0x5555555555555555, %%r11 ; " + "movq $0, %%r12 ; " + "rdrand %%r11d ; " + "setc %%r12b ; " + "movq %%r11, %0 ; " + "movq %%r12, %1" + : "=r"(res), "=r"(cflag) : : "r11", "r12" + ); + if (cflag == 1) + return res; + } + /*NOTREACHED*/ +} + +ULong do_rdrand16 ( void ) +{ + while (1) { + ULong res = 0; + ULong cflag = 0; + __asm__ __volatile__( + "movabsq $0x5555555555555555, %%r11 ; " + "movq $0, %%r12 ; " + "rdrand %%r11w ; " + "setc %%r12b ; " + "movq %%r11, %0 ; " + "movq %%r12, %1" + : "=r"(res), "=r"(cflag) : : "r11", "r12" + ); + if (cflag == 1) + return res; + } + /*NOTREACHED*/ +} + +void do_test ( ULong(*fn)(void), + ULong mask + /* with 1s indicating the random bits in the result */ ) +{ + ULong arr[10]; + for (UInt i = 0; i < 10; i++) { + arr[i] = fn(); + } + + // They really should all be different (to an extremely high probability). + // See comment above. 
+ int allSame = 1/*true*/; // really, a Bool + for (UInt i = 1; i < 10; i++) { + if (arr[i] != arr[0]) { + allSame = 0/*false*/; + break; + } + } + assert(!allSame); + + // The 0/32/48 leading bits of the result should have a particular value, + // depending on the insn. So print them, with the random part masked out. + for (UInt i = 0; i < 10; i++) { + printf("0x%016llx\n", arr[i] & ~mask); + } + printf("\n"); +} + +int main ( void ) +{ + do_test( do_rdrand64, 0xFFFFFFFFFFFFFFFFULL ); + do_test( do_rdrand32, 0x00000000FFFFFFFFULL ); + do_test( do_rdrand16, 0x000000000000FFFFULL ); + return 0; +} diff --git a/none/tests/amd64/rdrand.stderr.exp b/none/tests/amd64/rdrand.stderr.exp new file mode 100644 index 000000000..e69de29bb diff --git a/none/tests/amd64/rdrand.stdout.exp b/none/tests/amd64/rdrand.stdout.exp new file mode 100644 index 000000000..dccfdc913 --- /dev/null +++ b/none/tests/amd64/rdrand.stdout.exp @@ -0,0 +1,33 @@ +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 + +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 +0x0000000000000000 + +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 +0x5555555555550000 + diff --git a/none/tests/amd64/rdrand.vgtest b/none/tests/amd64/rdrand.vgtest new file mode 100644 index 000000000..9d1c57b5e --- /dev/null +++ b/none/tests/amd64/rdrand.vgtest @@ -0,0 +1,3 @@ +prog: rdrand +prereq: test -x rdrand && ../../../tests/x86_amd64_features amd64-rdrand +vgopts: -q diff --git a/tests/x86_amd64_features.c b/tests/x86_amd64_features.c index c74589888..3f6d3a736 100644 --- a/tests/x86_amd64_features.c +++ 
b/tests/x86_amd64_features.c @@ -7,8 +7,8 @@ // This file determines x86/AMD64 features a processor supports. // // We return: -// - 0 if the machine matches the asked-for feature. -// - 1 if the machine does not. +// - 0 if the machine has the asked-for feature. +// - 1 if the machine doesn't have the asked-for feature. // - 2 if the asked-for feature isn't recognised (this will be the case for // any feature if run on a non-x86/AMD64 machine). // - 3 if there was a usage error (it also prints an error message). @@ -129,6 +129,12 @@ static Bool go(char* cpu) level = 0x80000001; cmask = 1 << 16; require_amd = True; + } else if (strcmp (cpu, "amd64-f16c" ) == 0) { + level = 1; + cmask = 1 << 29; + } else if (strcmp (cpu, "amd64-rdrand" ) == 0) { + level = 1; + cmask = 1 << 30; #endif } else { return UNRECOGNISED_FEATURE; -- 2.11.4.GIT