From b078fabb56e34115e55357c319d589b9455dd189 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Sat, 22 Dec 2018 07:23:00 +0100
Subject: [PATCH] amd64 pipeline: generate much better code for pshufb
 mm/xmm/ymm.  n-i-bz.

pshufb mm/xmm/ymm rearranges byte lanes in vector registers.  It's fairly
widely used, but we generated terrible code for it.  With this patch, we
just generate, at the back end, pshufb plus a bit of masking, which is a
great improvement.
---
 VEX/priv/guest_amd64_toIR.c      | 107 +++++----------------------------------
 VEX/priv/host_amd64_defs.c       |   3 ++
 VEX/priv/host_amd64_defs.h       |   3 +-
 VEX/priv/host_amd64_isel.c       |  15 +++++-
 VEX/priv/host_generic_simd128.c  |  14 +++++
 VEX/priv/host_generic_simd128.h  |   5 ++
 VEX/priv/host_generic_simd64.c   |  25 ++++++++-
 VEX/priv/host_generic_simd64.h   |   3 +-
 VEX/priv/ir_defs.c               |   7 ++-
 VEX/pub/libvex_ir.h              |  18 ++++++-
 memcheck/mc_translate.c          |  26 +++++++---
 memcheck/tests/vbit-test/irops.c |   2 +
 12 files changed, 120 insertions(+), 108 deletions(-)

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index e753ffa72..2451a292e 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -15583,90 +15583,16 @@ Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
 static IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/,
                                 IRTemp sV/*perm*/ )
 {
-   IRTemp sHi = newTemp(Ity_I64);
-   IRTemp sLo = newTemp(Ity_I64);
-   IRTemp dHi = newTemp(Ity_I64);
-   IRTemp dLo = newTemp(Ity_I64);
-   IRTemp rHi = newTemp(Ity_I64);
-   IRTemp rLo = newTemp(Ity_I64);
-   IRTemp sevens = newTemp(Ity_I64);
-   IRTemp mask0x80hi = newTemp(Ity_I64);
-   IRTemp mask0x80lo = newTemp(Ity_I64);
-   IRTemp maskBit3hi = newTemp(Ity_I64);
-   IRTemp maskBit3lo = newTemp(Ity_I64);
-   IRTemp sAnd7hi = newTemp(Ity_I64);
-   IRTemp sAnd7lo = newTemp(Ity_I64);
-   IRTemp permdHi = newTemp(Ity_I64);
-   IRTemp permdLo = newTemp(Ity_I64);
-   IRTemp res = newTemp(Ity_V128);
-
-   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
-   assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
-   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
-   assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
-
-   assign( sevens, mkU64(0x0707070707070707ULL) );
-
-   /* mask0x80hi = Not(SarN8x8(sHi,7))
-      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
-      sAnd7hi    = And(sHi,sevens)
-      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
-                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
-      rHi        = And(permdHi,mask0x80hi)
-   */
-   assign(
-      mask0x80hi,
-      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
-
-   assign(
-      maskBit3hi,
-      binop(Iop_SarN8x8,
-            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
-            mkU8(7)));
-
-   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
-
-   assign(
-      permdHi,
-      binop(
-         Iop_Or64,
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
-               mkexpr(maskBit3hi)),
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
-               unop(Iop_Not64,mkexpr(maskBit3hi))) ));
-
-   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
-
-   /* And the same for the lower half of the result.  What fun. */
-
-   assign(
-      mask0x80lo,
-      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
-
-   assign(
-      maskBit3lo,
-      binop(Iop_SarN8x8,
-            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
-            mkU8(7)));
-
-   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
-
-   assign(
-      permdLo,
-      binop(
-         Iop_Or64,
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
-               mkexpr(maskBit3lo)),
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
-               unop(Iop_Not64,mkexpr(maskBit3lo))) ));
-
-   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
-
-   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
+   IRTemp halfMask = newTemp(Ity_I64);
+   assign(halfMask, mkU64(0x8F8F8F8F8F8F8F8FULL));
+   IRExpr* mask = binop(Iop_64HLtoV128, mkexpr(halfMask), mkexpr(halfMask));
+   IRTemp res = newTemp(Ity_V128);
+   assign(res,
+          binop(Iop_PermOrZero8x16,
+                mkexpr(dV),
+                // Mask off bits [6:4] of each source operand lane
+                binop(Iop_AndV128, mkexpr(sV), mask)
+         ));
    return res;
 }
 
@@ -15945,15 +15871,10 @@ Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
          putMMXReg(
             gregLO3ofRM(modrm),
             binop(
-               Iop_And64,
-               /* permute the lanes */
-               binop(
-                  Iop_Perm8x8,
-                  mkexpr(dV),
-                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
-               ),
-               /* mask off lanes which have (index & 0x80) == 0x80 */
-               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
+               Iop_PermOrZero8x8,
+               mkexpr(dV),
+               // Mask off bits [6:3] of each source operand lane
+               binop(Iop_And64, mkexpr(sV), mkU64(0x8787878787878787ULL))
             )
          );
          goto decode_success;
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index a554e28ed..48ca268ab 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -584,6 +584,7 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) {
       case Asse_UNPCKLW: return "punpcklw";
       case Asse_UNPCKLD: return "punpckld";
       case Asse_UNPCKLQ: return "punpcklq";
+      case Asse_PSHUFB:  return "pshufb";
       default: vpanic("showAMD64SseOp");
    }
 }
@@ -3799,6 +3800,8 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
         case Asse_UNPCKLW: XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
         case Asse_UNPCKLD: XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
         case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
+        case Asse_PSHUFB:  XX(0x66); XX(rex);
+                           XX(0x0F); XX(0x38); XX(0x00); break;
         default: goto bad;
      }
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index 68e199ad3..6a72943f9 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -339,7 +339,8 @@ typedef
       Asse_SAR16, Asse_SAR32,
       Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
       Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
-      Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ
+      Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ,
+      Asse_PSHUFB  // Only for SSSE3 capable hosts
    }
    AMD64SseOp;
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 05e2e725a..7974c8036 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -1122,8 +1122,8 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
          case Iop_CatEvenLanes16x4:
             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
-         case Iop_Perm8x8:
-            fn = (HWord)h_generic_calc_Perm8x8; break;
+         case Iop_PermOrZero8x8:
+            fn = (HWord)h_generic_calc_PermOrZero8x8; break;
 
          case Iop_Max8Ux8:
            fn = (HWord)h_generic_calc_Max8Ux8; break;
@@ -3437,6 +3437,17 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
          return dst;
       }

+      case Iop_PermOrZero8x16:
+         if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
+            op = Asse_PSHUFB;
+            goto do_SseReRg;
+         }
+         // Otherwise we'll have to generate a call to
+         // h_generic_calc_PermOrZero8x16 (ATK).  But that would only be for a
+         // host which doesn't have SSSE3, in which case we don't expect this
+         // IROp to enter the compilation pipeline in the first place.
+         break;
+
       case Iop_QNarrowBin32Sto16Sx8:
          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
       case Iop_QNarrowBin16Sto8Sx16:
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 62cb88eb5..8c6f70770 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -368,6 +368,20 @@ void VEX_REGPARM(3)
    res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
 }
 
+//void VEX_REGPARM(3)
+//     h_generic_calc_PermOrZero8x16 ( /*OUT*/V128* res,
+//                                     V128* argL, V128* argR )
+//{
+//   for (UInt i = 0; i < 16; i++) {
+//      UChar ix = argR->w8[i];
+//      Char zeroingMask = (Char)ix;
+//      zeroingMask ^= 0x80;
+//      zeroingMask >>= 7;
+//      ix &= 15;
+//      res->w8[i] = (argL->w8[ix] & zeroingMask) & 0xFF;
+//   }
+//}
+
 UInt /*not-regparm*/
      h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
 {
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 0b63ab364..18b35180a 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -86,6 +86,11 @@ extern VEX_REGPARM(3)
 extern VEX_REGPARM(3)
        void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
 
+// This is correct and tested, but isn't used because we just generate
+// PSHUFB on amd64 instead.
+//extern VEX_REGPARM(3)
+//       void h_generic_calc_PermOrZero8x16 ( /*OUT*/V128*, V128*, V128* );
+
 extern /*not-regparm*/
        UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo );
 
diff --git a/VEX/priv/host_generic_simd64.c b/VEX/priv/host_generic_simd64.c
index 48a55bde5..a7e80b4d0 100644
--- a/VEX/priv/host_generic_simd64.c
+++ b/VEX/priv/host_generic_simd64.c
@@ -137,6 +137,14 @@ static inline UChar index8x8 ( ULong w64, UChar ix ) {
    return toUChar((w64 >> (8*ix)) & 0xFF);
 }
 
+static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
+   Char zeroingMask = (Char)ix;
+   zeroingMask ^= 0x80;
+   zeroingMask >>= 7;
+   ix &= 7;
+   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
+}
+
 /* Scalar helpers. */
 
@@ -974,7 +982,8 @@ ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
    );
 }
 
-/* misc hack looking for a proper home */
+/* ------------ Permutation ------------ */
+
 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
 {
    return mk8x8(
@@ -989,6 +998,20 @@ ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
    );
 }
 
+ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
+{
+   return mk8x8(
+      indexOrZero8x8(aa, sel8x8_7(bb)),
+      indexOrZero8x8(aa, sel8x8_6(bb)),
+      indexOrZero8x8(aa, sel8x8_5(bb)),
+      indexOrZero8x8(aa, sel8x8_4(bb)),
+      indexOrZero8x8(aa, sel8x8_3(bb)),
+      indexOrZero8x8(aa, sel8x8_2(bb)),
+      indexOrZero8x8(aa, sel8x8_1(bb)),
+      indexOrZero8x8(aa, sel8x8_0(bb))
+   );
+}
+
 /* ------------ Shifting ------------ */
 /* Note that because these primops are undefined if the shift amount
    equals or exceeds the lane width, the shift amount is masked so
diff --git a/VEX/priv/host_generic_simd64.h b/VEX/priv/host_generic_simd64.h
index a0f1ed8e2..e26ef6c8a 100644
--- a/VEX/priv/host_generic_simd64.h
+++ b/VEX/priv/host_generic_simd64.h
@@ -102,7 +102,8 @@ extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong );
 extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong );
 extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong );
 
-extern ULong h_generic_calc_Perm8x8 ( ULong, ULong );
+extern ULong h_generic_calc_Perm8x8        ( ULong, ULong );
+extern ULong h_generic_calc_PermOrZero8x8  ( ULong, ULong );
 
 extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt );
 extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt );
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index 322103385..ae1c20373 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -622,6 +622,7 @@ void ppIROp ( IROp op )
       case Iop_Sal32x2: vex_printf("Sal32x2"); return;
       case Iop_Sal64x1: vex_printf("Sal64x1"); return;
       case Iop_Perm8x8: vex_printf("Perm8x8"); return;
+      case Iop_PermOrZero8x8: vex_printf("PermOrZero8x8"); return;
       case Iop_Reverse8sIn16_x4: vex_printf("Reverse8sIn16_x4"); return;
       case Iop_Reverse8sIn32_x2: vex_printf("Reverse8sIn32_x2"); return;
       case Iop_Reverse16sIn32_x2: vex_printf("Reverse16sIn32_x2"); return;
@@ -1125,6 +1126,7 @@ void ppIROp ( IROp op )
 
       case Iop_SliceV128: vex_printf("SliceV128"); return;
       case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+      case Iop_PermOrZero8x16: vex_printf("PermOrZero8x16"); return;
       case Iop_Perm32x4: vex_printf("Perm32x4"); return;
       case Iop_Perm8x16x2: vex_printf("Perm8x16x2"); return;
       case Iop_Reverse8sIn16_x8: vex_printf("Reverse8sIn16_x8"); return;
@@ -2661,7 +2663,7 @@ void typeOfPrimop ( IROp op,
       case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4:
       case Iop_InterleaveOddLanes8x8: case Iop_InterleaveEvenLanes8x8:
       case Iop_InterleaveOddLanes16x4: case Iop_InterleaveEvenLanes16x4:
-      case Iop_Perm8x8:
+      case Iop_Perm8x8: case Iop_PermOrZero8x8:
       case Iop_Max8Ux8: case Iop_Max16Ux4: case Iop_Max32Ux2:
       case Iop_Max8Sx8: case Iop_Max16Sx4: case Iop_Max32Sx2:
       case Iop_Max32Fx2: case Iop_Min32Fx2:
@@ -3132,7 +3134,8 @@ void typeOfPrimop ( IROp op,
       case Iop_PackOddLanes8x16: case Iop_PackEvenLanes8x16:
       case Iop_PackOddLanes16x8: case Iop_PackEvenLanes16x8:
       case Iop_PackOddLanes32x4: case Iop_PackEvenLanes32x4:
-      case Iop_Perm8x16: case Iop_Perm32x4:
+      case Iop_Perm8x16: case Iop_PermOrZero8x16:
+      case Iop_Perm32x4:
       case Iop_RecipStep32Fx4: case Iop_RecipStep64Fx2:
       case Iop_RSqrtStep32Fx4: case Iop_RSqrtStep64Fx2:
       case Iop_CipherV128:
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 93fa5acad..459d14b6c 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1061,9 +1061,16 @@ typedef
          as indexed by control vector bytes:
             for i in 0 .. 7 . result[i] = argL[ argR[i] ]
          argR[i] values may only be in the range 0 .. 7, else behaviour
-         is undefined. */
+         is undefined.  That is, argR[i][7:3] must be zero. */
       Iop_Perm8x8,
 
+      /* PERMUTING with optional zeroing:
+            for i in 0 .. 7 . result[i] = if argR[i] bit 7 is set
+                                          then zero else argL[ argR[i] ]
+         argR[i][6:3] must be zero, else behaviour is undefined.
+      */
+      Iop_PermOrZero8x8,
+
       /* MISC CONVERSION -- get high bits of each byte lane, a la
          x86/amd64 pmovmskb */
       Iop_GetMSBs8x8, /* I64 -> I8 */
@@ -1842,10 +1849,17 @@ typedef
          as indexed by control vector bytes:
             for i in 0 .. 15 . result[i] = argL[ argR[i] ]
          argR[i] values may only be in the range 0 .. 15, else behaviour
-         is undefined. */
+         is undefined.  That is, argR[i][7:4] must be zero. */
       Iop_Perm8x16,
       Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
 
+      /* PERMUTING with optional zeroing:
+            for i in 0 .. 15 . result[i] = if argR[i] bit 7 is set
+                                           then zero else argL[ argR[i] ]
+         argR[i][6:4] must be zero, else behaviour is undefined.
+      */
+      Iop_PermOrZero8x16,
+
       /* same, but Triop (argL consists of two 128-bit parts) */
       /* correct range for argR values is 0..31 */
       /* (V128, V128, V128) -> V128 */
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 04ed864a1..6e449e2c9 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -3641,10 +3641,22 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
          complainIfUndefined(mce, atom2, NULL);
          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
 
-      /* Perm8x8: rearrange values in left arg using steering values
-         from right arg.  So rearrange the vbits in the same way but
-         pessimise wrt steering values. */
+      /* Perm8x8: rearrange values in left arg using steering values from
+         right arg.  So rearrange the vbits in the same way but pessimise wrt
+         steering values.  We assume that unused bits in the steering value
+         are defined zeros, so we can safely PCast within each lane of the
+         steering value without having to take precautions to avoid a
+         dependency on those unused bits.
+
+         This is also correct for PermOrZero8x8, but it is a bit subtle.  For
+         each lane, if bit 7 of the steering value is zero, then we'll steer
+         the shadow value exactly as per Perm8x8.  If that bit is one, then
+         the operation will set the resulting (concrete) value to zero.  That
+         means it is defined, and should have a shadow value of zero.  Hence
+         in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
+         as Perm8x8) and then pessimise against the steering values. */
       case Iop_Perm8x8:
+      case Iop_PermOrZero8x8:
          return mkUifU64(
                    mce,
                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
@@ -4121,10 +4133,12 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
          complainIfUndefined(mce, atom2, NULL);
          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
 
-      /* Perm8x16: rearrange values in left arg using steering values
-         from right arg.  So rearrange the vbits in the same way but
-         pessimise wrt steering values.  Perm32x4 ditto. */
+      /* Perm8x16: rearrange values in left arg using steering values
+         from right arg.  So rearrange the vbits in the same way but
+         pessimise wrt steering values.  Perm32x4 ditto. */
+      /* PermOrZero8x16: see comments above for PermOrZero8x8. */
       case Iop_Perm8x16:
+      case Iop_PermOrZero8x16:
         return mkUifUV128(
                    mce,
                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c
index e8bf67dcd..66b40ef00 100644
--- a/memcheck/tests/vbit-test/irops.c
+++ b/memcheck/tests/vbit-test/irops.c
@@ -534,6 +534,7 @@ static irop_t irops[] = {
   { DEFOP(Iop_Reverse16sIn64_x1, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Reverse32sIn64_x1, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm8x8, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_PermOrZero8x8, UNDEF_UNKNOWN), },
   { DEFOP(Iop_GetMSBs8x8, UNDEF_UNKNOWN), },
   { DEFOP(Iop_RecipEst32Ux2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_RSqrtEst32Ux2, UNDEF_UNKNOWN), },
@@ -1033,6 +1034,7 @@ static irop_t irops[] = {
   { DEFOP(Iop_Reverse32sIn64_x2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Reverse1sIn8_x16, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm8x16, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_PermOrZero8x16, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm32x4, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm8x16x2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_GetMSBs8x16, UNDEF_UNKNOWN), },
-- 
2.11.4.GIT
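
Reference model (illustrative sketch, not part of the patch): the per-lane rule
that Iop_PermOrZero8x16 / pshufb-xmm follows -- selector bit 7 set means the
result lane is zero, otherwise the low 4 bits pick a byte from the data operand
-- can be checked against the commented-out h_generic_calc_PermOrZero8x16
helper above.  The standalone C below mirrors that helper; the function name
permOrZero8x16 and the test values in main are invented for the example.

/* Per-lane model of Iop_PermOrZero8x16 (editor's illustration). */
#include <stdio.h>

typedef unsigned char UChar;

static void permOrZero8x16 ( UChar* res, const UChar* data, const UChar* sel )
{
   for (int i = 0; i < 16; i++) {
      UChar ix   = sel[i];
      /* keep == 0xFF when selector bit 7 is clear, 0x00 when it is set */
      UChar keep = (UChar)( ((signed char)(ix ^ 0x80)) >> 7 );
      res[i] = (UChar)(data[ix & 15] & keep);
   }
}

int main ( void )
{
   UChar data[16], sel[16], res[16];
   for (int i = 0; i < 16; i++) {
      data[i] = (UChar)(0x10 + i);   /* recognisable byte values */
      sel[i]  = (UChar)(15 - i);     /* reverse the lane order   */
   }
   sel[0] = 0x80;                    /* but zero lane 0          */
   permOrZero8x16(res, data, sel);
   for (int i = 0; i < 16; i++)
      printf("%02x ", res[i]);
   printf("\n");
   return 0;
}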