From a4eee021952f965c2fd789868be5f6887db1f696 Mon Sep 17 00:00:00 2001 From: Calvin Buckley Date: Thu, 28 Feb 2019 10:31:12 -0400 Subject: [PATCH] PPC JIT optimizations (System.Math instruction inlining) (#11964) * [ppc] Optimize floating point performance in the JIT sqrt/sqrtf/abs * [ppc] Integer Min/Max inlining wiht POWER5 isel instruction Compresses a function call down to two instruction, for both signed and unsigned, long and int. Needs to check for POWE5 though, and I don't know the effects of this on ppc32. FP version using fsel is also desirable, but not in the scope of this commit. * [ppc] Check for POWER5 compatible CPU before using isel * [ppc] Don't use long min/max inlining for ppc32 Other 64-bit operations are marked for ppc64 only. * [ppc] also ifdef that out for ppc32 too * [ppc] Rounding function optimizations --- mono/arch/ppc/ppc-codegen.h | 27 ++++++++ mono/mini/cpu-ppc.md | 16 +++++ mono/mini/cpu-ppc64.md | 15 ++++ mono/mini/mini-ops.h | 5 +- mono/mini/mini-ppc.c | 160 ++++++++++++++++++++++++++++++++++++++++++- mono/utils/mono-hwcap-ppc.c | 32 +++++---- mono/utils/mono-hwcap-vars.h | 1 + 7 files changed, 240 insertions(+), 16 deletions(-) diff --git a/mono/arch/ppc/ppc-codegen.h b/mono/arch/ppc/ppc-codegen.h index 25b805a4a09..44f0ce0cb3e 100644 --- a/mono/arch/ppc/ppc-codegen.h +++ b/mono/arch/ppc/ppc-codegen.h @@ -754,6 +754,33 @@ my and Ximian's copyright to this code. ;) /* this marks the end of my work, ct */ +/* Introduced in Power ISA 2.02 (P4?) */ +#define ppc_frinx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (392 << 1) | Rc) +#define ppc_frin(c,D,B) ppc_frinx(c,D,B,0) +#define ppc_frind(c,D,B) ppc_frinx(c,D,B,1) + +#define ppc_fripx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (456 << 1) | Rc) +#define ppc_frip(c,D,B) ppc_fripx(c,D,B,0) +#define ppc_fripd(c,D,B) ppc_fripx(c,D,B,1) + +#define ppc_frizx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (424 << 1) | Rc) +#define ppc_friz(c,D,B) ppc_frizx(c,D,B,0) +#define ppc_frizd(c,D,B) ppc_frizx(c,D,B,1) + +#define ppc_frimx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (488 << 1) | Rc) +#define ppc_frim(c,D,B) ppc_frimx(c,D,B,0) +#define ppc_frimd(c,D,B) ppc_frimx(c,D,B,1) + +/* + * Introduced in Power ISA 2.03 (P5) + * This is an A-form instruction like many of the FP arith ops, + * but arranged slightly differently (swap record and reserved area) + */ +#define ppc_isel(c,D,A,B,C) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (15 << 1) | 0) +#define ppc_isellt(c,D,A,B) ppc_isel(c,D,A,B,0) +#define ppc_iselgt(c,D,A,B) ppc_isel(c,D,A,B,1) +#define ppc_iseleq(c,D,A,B) ppc_isel(c,D,A,B,2) + /* PPC64 */ /* The following FP instructions are not are available to 32-bit diff --git a/mono/mini/cpu-ppc.md b/mono/mini/cpu-ppc.md index 4e7e5f6591a..cb31d18c908 100644 --- a/mono/mini/cpu-ppc.md +++ b/mono/mini/cpu-ppc.md @@ -209,7 +209,13 @@ endfilter: src1:i len:32 aotconst: dest:i len:8 load_gotaddr: dest:i len:32 got_entry: dest:i src1:b len:32 +abs: dest:f src1:f len:4 sqrt: dest:f src1:f len:4 +sqrtf: dest:f src1:f len:4 +round: dest:f src1:f len:4 +ppc_trunc: dest:f src1:f len:4 +ppc_ceil: dest:f src1:f len:4 +ppc_floor: dest:f src1:f len:4 adc: dest:i src1:i src2:i len:4 addcc: dest:i src1:i src2:i len:4 subcc: dest:i src1:i src2:i len:4 @@ -323,6 +329,16 @@ icompare_imm: src1:i len:12 long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:32 +# shouldn't use long stuff on ppc32 +#long_min: dest:i src1:i src2:i len:8 clob:1 +#long_min_un: dest:i src1:i src2:i len:8 clob:1 +#long_max: dest:i src1:i src2:i len:8 clob:1 +#long_max_un: dest:i src1:i src2:i len:8 clob:1 +int_min: dest:i src1:i src2:i len:8 clob:1 +int_max: dest:i src1:i src2:i len:8 clob:1 +int_min_un: dest:i src1:i src2:i len:8 clob:1 +int_max_un: dest:i src1:i src2:i len:8 clob:1 + vcall2: len:20 clob:c vcall2_reg: src1:i len:8 clob:c vcall2_membase: src1:b len:16 clob:c diff --git a/mono/mini/cpu-ppc64.md b/mono/mini/cpu-ppc64.md index 46596201b75..f0651f5f8d0 100644 --- a/mono/mini/cpu-ppc64.md +++ b/mono/mini/cpu-ppc64.md @@ -213,7 +213,13 @@ endfilter: src1:i len:20 aotconst: dest:i len:8 load_gotaddr: dest:i len:32 got_entry: dest:i src1:b len:32 +abs: dest:f src1:f len:4 sqrt: dest:f src1:f len:4 +sqrtf: dest:f src1:f len:4 +round: dest:f src1:f len:4 +ppc_trunc: dest:f src1:f len:4 +ppc_ceil: dest:f src1:f len:4 +ppc_floor: dest:f src1:f len:4 adc: dest:i src1:i src2:i len:4 addcc: dest:i src1:i src2:i len:4 subcc: dest:i src1:i src2:i len:4 @@ -389,6 +395,15 @@ long_xor_imm: dest:i src1:i clob:1 len:4 lcompare: src1:i src2:i len:4 lcompare_imm: src1:i len:12 +long_min: dest:i src1:i src2:i len:8 clob:1 +long_min_un: dest:i src1:i src2:i len:8 clob:1 +long_max: dest:i src1:i src2:i len:8 clob:1 +long_max_un: dest:i src1:i src2:i len:8 clob:1 +int_min: dest:i src1:i src2:i len:8 clob:1 +int_max: dest:i src1:i src2:i len:8 clob:1 +int_min_un: dest:i src1:i src2:i len:8 clob:1 +int_max_un: dest:i src1:i src2:i len:8 clob:1 + #long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:30 vcall2: len:36 clob:c diff --git a/mono/mini/mini-ops.h b/mono/mini/mini-ops.h index 6582659477a..d8688362d8b 100644 --- a/mono/mini/mini-ops.h +++ b/mono/mini/mini-ops.h @@ -1205,7 +1205,10 @@ MINI_OP(OP_AMD64_SAVE_SP_TO_LMF, "amd64_save_sp_to_lmf", NONE, NONE, NON #if defined(TARGET_POWERPC) MINI_OP(OP_PPC_SUBFIC, "ppc_subfic", IREG, IREG, NONE) MINI_OP(OP_PPC_SUBFZE, "ppc_subfze", IREG, IREG, NONE) -MINI_OP(OP_PPC_CHECK_FINITE, "ppc_check_finite", NONE, IREG, NONE) +MINI_OP(OP_PPC_CHECK_FINITE, "ppc_check_finite", NONE, IREG, NONE) +MINI_OP(OP_PPC_CEIL, "ppc_ceil", FREG, FREG, NONE) +MINI_OP(OP_PPC_FLOOR, "ppc_floor", FREG, FREG, NONE) +MINI_OP(OP_PPC_TRUNC, "ppc_trunc", FREG, FREG, NONE) #endif #if defined(TARGET_ARM) || defined(TARGET_ARM64) diff --git a/mono/mini/mini-ppc.c b/mono/mini/mini-ppc.c index e3f6e838ec3..7e96dce6025 100644 --- a/mono/mini/mini-ppc.c +++ b/mono/mini/mini-ppc.c @@ -40,6 +40,9 @@ #include #endif +static GENERATE_TRY_GET_CLASS_WITH_CACHE (math, "System", "Math") +static GENERATE_TRY_GET_CLASS_WITH_CACHE (mathf, "System", "MathF") + #define FORCE_INDIR_CALL 1 enum { @@ -62,6 +65,7 @@ enum { PPC_ISA_2X = 1 << 3, PPC_ISA_64 = 1 << 4, PPC_MOVE_FPR_GPR = 1 << 5, + PPC_ISA_2_03 = 1 << 6, PPC_HW_CAP_END }; @@ -553,6 +557,9 @@ mono_arch_init (void) if (mono_hwcap_ppc_is_isa_2x) cpu_hw_caps |= PPC_ISA_2X; + if (mono_hwcap_ppc_is_isa_2_03) + cpu_hw_caps |= PPC_ISA_2_03; + if (mono_hwcap_ppc_is_isa_64) cpu_hw_caps |= PPC_ISA_64; @@ -4214,6 +4221,24 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; #endif } + case OP_ROUND: + ppc_frind (code, ins->dreg, ins->sreg1); + break; + case OP_PPC_TRUNC: + ppc_frizd (code, ins->dreg, ins->sreg1); + break; + case OP_PPC_CEIL: + ppc_fripd (code, ins->dreg, ins->sreg1); + break; + case OP_PPC_FLOOR: + ppc_frimd (code, ins->dreg, ins->sreg1); + break; + case OP_ABS: + ppc_fabsd (code, ins->dreg, ins->sreg1); + break; + case OP_SQRTF: + ppc_fsqrtsd (code, ins->dreg, ins->sreg1); + break; case OP_SQRT: ppc_fsqrtd (code, ins->dreg, ins->sreg1); break; @@ -4236,6 +4261,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) /* emulated */ g_assert_not_reached (); break; + /* These min/max require POWER5 */ + case OP_IMIN: + ppc_cmp (code, 0, 0, ins->sreg1, ins->sreg2); + ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + case OP_IMIN_UN: + ppc_cmpl (code, 0, 0, ins->sreg1, ins->sreg2); + ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + case OP_IMAX: + ppc_cmp (code, 0, 0, ins->sreg1, ins->sreg2); + ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + case OP_IMAX_UN: + ppc_cmpl (code, 0, 0, ins->sreg1, ins->sreg2); + ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + CASE_PPC64 (OP_LMIN) + ppc_cmpl (code, 0, 1, ins->sreg1, ins->sreg2); + ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + CASE_PPC64 (OP_LMIN_UN) + ppc_cmpl (code, 0, 1, ins->sreg1, ins->sreg2); + ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + CASE_PPC64 (OP_LMAX) + ppc_cmp (code, 0, 1, ins->sreg1, ins->sreg2); + ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; + CASE_PPC64 (OP_LMAX_UN) + ppc_cmpl (code, 0, 1, ins->sreg1, ins->sreg2); + ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2); + break; case OP_FCOMPARE: ppc_fcmpu (code, 0, ins->sreg1, ins->sreg2); break; @@ -5610,8 +5668,106 @@ mono_arch_get_cie_program (void) MonoInst* mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { - /* FIXME: */ - return NULL; + MonoInst *ins = NULL; + int opcode = 0; + + if (cmethod->klass == mono_class_try_get_math_class ()) { + if (strcmp (cmethod->name, "Sqrt") == 0) { + opcode = OP_SQRT; + } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) { + opcode = OP_ABS; + } + + if (opcode && fsig->param_count == 1) { + MONO_INST_NEW (cfg, ins, opcode); + ins->type = STACK_R8; + ins->dreg = mono_alloc_freg (cfg); + ins->sreg1 = args [0]->dreg; + MONO_ADD_INS (cfg->cbb, ins); + } + + /* Check for Min/Max for (u)int(32|64) */ + opcode = 0; + if (cpu_hw_caps & PPC_ISA_2_03) { + if (strcmp (cmethod->name, "Min") == 0) { + if (fsig->params [0]->type == MONO_TYPE_I4) + opcode = OP_IMIN; + if (fsig->params [0]->type == MONO_TYPE_U4) + opcode = OP_IMIN_UN; +#ifdef __mono_ppc64__ + else if (fsig->params [0]->type == MONO_TYPE_I8) + opcode = OP_LMIN; + else if (fsig->params [0]->type == MONO_TYPE_U8) + opcode = OP_LMIN_UN; +#endif + } else if (strcmp (cmethod->name, "Max") == 0) { + if (fsig->params [0]->type == MONO_TYPE_I4) + opcode = OP_IMAX; + if (fsig->params [0]->type == MONO_TYPE_U4) + opcode = OP_IMAX_UN; +#ifdef __mono_ppc64__ + else if (fsig->params [0]->type == MONO_TYPE_I8) + opcode = OP_LMAX; + else if (fsig->params [0]->type == MONO_TYPE_U8) + opcode = OP_LMAX_UN; +#endif + } + /* + * TODO: Floating point version with fsel, but fsel has + * some peculiarities (need a scratch reg unless + * comparing with 0, NaN/Inf behaviour (then MathF too) + */ + } + + if (opcode && fsig->param_count == 2) { + MONO_INST_NEW (cfg, ins, opcode); + ins->type = fsig->params [0]->type == MONO_TYPE_I4 ? STACK_I4 : STACK_I8; + ins->dreg = mono_alloc_ireg (cfg); + ins->sreg1 = args [0]->dreg; + ins->sreg2 = args [1]->dreg; + MONO_ADD_INS (cfg->cbb, ins); + } + + /* Rounding instructions */ + opcode = 0; + if ((cpu_hw_caps & PPC_ISA_2X) && (fsig->param_count == 1) && (fsig->params [0]->type == MONO_TYPE_R8)) { + /* + * XXX: sysmath.c and frin imply round is a little bit + * more complicated than expected? but amd64 does this? + * (also, no float versions of these ops, but frsp + * could be preprended?) + */ + if (!strcmp (cmethod->name, "Round")) + opcode = OP_ROUND; + else if (!strcmp (cmethod->name, "Floor")) + opcode = OP_PPC_FLOOR; + else if (!strcmp (cmethod->name, "Ceiling")) + opcode = OP_PPC_CEIL; + else if (!strcmp (cmethod->name, "Truncate")) + opcode = OP_PPC_TRUNC; + if (opcode != 0) { + MONO_INST_NEW (cfg, ins, opcode); + ins->type = STACK_R8; + ins->dreg = mono_alloc_freg (cfg); + ins->sreg1 = args [0]->dreg; + MONO_ADD_INS (cfg->cbb, ins); + } + } + } + if (cmethod->klass == mono_class_try_get_mathf_class ()) { + if (strcmp (cmethod->name, "Sqrt") == 0) { + opcode = OP_SQRTF; + } /* XXX: POWER has no single-precision normal FPU abs? */ + + if (opcode && fsig->param_count == 1) { + MONO_INST_NEW (cfg, ins, opcode); + ins->type = STACK_R4; + ins->dreg = mono_alloc_freg (cfg); + ins->sreg1 = args [0]->dreg; + MONO_ADD_INS (cfg->cbb, ins); + } + } + return ins; } host_mgreg_t diff --git a/mono/utils/mono-hwcap-ppc.c b/mono/utils/mono-hwcap-ppc.c index 2c1cb154317..0680ee37e2e 100644 --- a/mono/utils/mono-hwcap-ppc.c +++ b/mono/utils/mono-hwcap-ppc.c @@ -60,6 +60,16 @@ mono_hwcap_arch_init (void) if (hwcap & (0x00080000 | 0x00040000 | 0x00020000 | 0x00010000 | 0x00000800 | 0x00001000)) mono_hwcap_ppc_is_isa_2x = TRUE; + /* PPC_FEATURE_POWER4, PPC_FEATURE_POWER5, PPC_FEATURE_POWER5_PLUS, + PPC_FEATURE_CELL_BE, PPC_FEATURE_PA6T, PPC_FEATURE_ARCH_2_05 */ + if (hwcap & (0x00080000 | 0x00040000 | 0x00020000 | 0x00010000 | 0x00000800 | 0x00001000)) + mono_hwcap_ppc_is_isa_2x = TRUE; + + /* PPC_FEATURE_POWER5, PPC_FEATURE_POWER5_PLUS, + PPC_FEATURE_CELL_BE, PPC_FEATURE_PA6T, PPC_FEATURE_ARCH_2_05 */ + if (hwcap & (0x00040000 | 0x00020000 | 0x00010000 | 0x00000800 | 0x00001000)) + mono_hwcap_ppc_is_isa_2_03 = TRUE; + /* PPC_FEATURE_64 */ if (hwcap & 0x40000000) mono_hwcap_ppc_is_isa_64 = TRUE; @@ -76,26 +86,22 @@ mono_hwcap_arch_init (void) mono_hwcap_ppc_has_multiple_ls_units = TRUE; } #elif defined(_AIX) - if (__cpu64()) - mono_hwcap_ppc_is_isa_64 = TRUE; - if (__power_4_andup()) - mono_hwcap_ppc_is_isa_2x = TRUE; - if (__power_5_andup()) + /* Compatible platforms for Mono (V7R1, 6.1.9) require at least P4. */ + mono_hwcap_ppc_is_isa_64 = TRUE; + mono_hwcap_ppc_is_isa_2x = TRUE; + if (__power_5_andup()) { + mono_hwcap_ppc_is_isa_2_03 = TRUE; mono_hwcap_ppc_has_icache_snoop = TRUE; + } /* not on POWER8 */ if (__power_4() || __power_5() || __power_6() || __power_7()) mono_hwcap_ppc_has_multiple_ls_units = TRUE; /* - * I dont see a way to get extended POWER6 and the PV_6_1 - * def seems to be trigged on the POWER6 here despite not - * having these extended instructions, so POWER7 it is - */ - /* - * WARNING: reports that this doesn't actually work, try - * to re-enable after more investigation + * This instruction is only available in POWER6 "raw mode" and unlikely + * to work; I couldn't get it to work on the POWER6s I tried. */ /* - if (__power_7_andup()) + if (__power_6()) mono_hwcap_ppc_has_move_fpr_gpr = TRUE; */ #endif diff --git a/mono/utils/mono-hwcap-vars.h b/mono/utils/mono-hwcap-vars.h index 5fefe5ca5e4..1596ca48d71 100644 --- a/mono/utils/mono-hwcap-vars.h +++ b/mono/utils/mono-hwcap-vars.h @@ -27,6 +27,7 @@ MONO_HWCAP_VAR(arm_has_thumb2) MONO_HWCAP_VAR(ppc_has_icache_snoop) MONO_HWCAP_VAR(ppc_is_isa_2x) +MONO_HWCAP_VAR(ppc_is_isa_2_03) MONO_HWCAP_VAR(ppc_is_isa_64) MONO_HWCAP_VAR(ppc_has_move_fpr_gpr) MONO_HWCAP_VAR(ppc_has_multiple_ls_units) -- 2.11.4.GIT