From a4eee021952f965c2fd789868be5f6887db1f696 Mon Sep 17 00:00:00 2001
From: Calvin Buckley <calvin@cmpct.info>
Date: Thu, 28 Feb 2019 10:31:12 -0400
Subject: [PATCH] PPC JIT optimizations (System.Math instruction inlining)
 (#11964)

* [ppc] Optimize floating point performance in the JIT sqrt/sqrtf/abs

* [ppc] Integer Min/Max inlining with POWER5 isel instruction

Compresses a function call down to two instructions, for both signed
and unsigned, long and int. Needs to check for POWER5 though, and I
don't know the effects of this on ppc32. FP version using fsel is
also desirable, but not in the scope of this commit.

* [ppc] Check for POWER5 compatible CPU before using isel

* [ppc] Don't use long min/max inlining for ppc32

Other 64-bit operations are marked for ppc64 only.

* [ppc] also ifdef that out for ppc32 too

* [ppc] Rounding function optimizations
---
 mono/arch/ppc/ppc-codegen.h  |  27 ++++++++
 mono/mini/cpu-ppc.md         |  16 +++++
 mono/mini/cpu-ppc64.md       |  15 ++++
 mono/mini/mini-ops.h         |   5 +-
 mono/mini/mini-ppc.c         | 160 ++++++++++++++++++++++++++++++++++++++++++-
 mono/utils/mono-hwcap-ppc.c  |  32 +++++----
 mono/utils/mono-hwcap-vars.h |   1 +
 7 files changed, 240 insertions(+), 16 deletions(-)

diff --git a/mono/arch/ppc/ppc-codegen.h b/mono/arch/ppc/ppc-codegen.h
index 25b805a4a09..44f0ce0cb3e 100644
--- a/mono/arch/ppc/ppc-codegen.h
+++ b/mono/arch/ppc/ppc-codegen.h
@@ -754,6 +754,33 @@ my and Ximian's copyright to this code. ;)
 
 /* this marks the end of my work, ct */
 
+/* Introduced in Power ISA 2.02 (P4?). FP round-to-integer: frin = nearest, frip = toward +inf, friz = toward zero, frim = toward -inf; the *d wrappers set Rc (dot form). */
+#define ppc_frinx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (392 << 1) | (Rc))
+#define ppc_frin(c,D,B) ppc_frinx(c,D,B,0)
+#define ppc_frind(c,D,B) ppc_frinx(c,D,B,1)
+
+#define ppc_fripx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (456 << 1) | (Rc))
+#define ppc_frip(c,D,B) ppc_fripx(c,D,B,0)
+#define ppc_fripd(c,D,B) ppc_fripx(c,D,B,1)
+
+#define ppc_frizx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (424 << 1) | (Rc))
+#define ppc_friz(c,D,B) ppc_frizx(c,D,B,0)
+#define ppc_frizd(c,D,B) ppc_frizx(c,D,B,1)
+
+#define ppc_frimx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (488 << 1) | (Rc))
+#define ppc_frim(c,D,B) ppc_frimx(c,D,B,0)
+#define ppc_frimd(c,D,B) ppc_frimx(c,D,B,1)
+
+/*
+ * Introduced in Power ISA 2.03 (P5)
+ * This is an A-form instruction like many of the FP arith ops,
+ * but arranged slightly differently (swap record and reserved area)
+ */
+#define ppc_isel(c,D,A,B,C) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((C) << 6) | (15 << 1) | 0) /* D = (CR bit C is set) ? A : B */
+#define ppc_isellt(c,D,A,B) ppc_isel(c,D,A,B,0) /* tests CR bit 0: LT of CR0 */
+#define ppc_iselgt(c,D,A,B) ppc_isel(c,D,A,B,1) /* tests CR bit 1: GT of CR0 */
+#define ppc_iseleq(c,D,A,B) ppc_isel(c,D,A,B,2) /* tests CR bit 2: EQ of CR0 */
+
 /* PPC64 */
 
 /* The following FP instructions are not are available to 32-bit
diff --git a/mono/mini/cpu-ppc.md b/mono/mini/cpu-ppc.md
index 4e7e5f6591a..cb31d18c908 100644
--- a/mono/mini/cpu-ppc.md
+++ b/mono/mini/cpu-ppc.md
@@ -209,7 +209,13 @@ endfilter: src1:i len:32
 aotconst: dest:i len:8
 load_gotaddr: dest:i len:32
 got_entry: dest:i src1:b len:32
+abs: dest:f src1:f len:4
 sqrt: dest:f src1:f len:4
+sqrtf: dest:f src1:f len:4
+round: dest:f src1:f len:4
+ppc_trunc: dest:f src1:f len:4
+ppc_ceil: dest:f src1:f len:4
+ppc_floor: dest:f src1:f len:4
 adc: dest:i src1:i src2:i len:4
 addcc: dest:i src1:i src2:i len:4
 subcc: dest:i src1:i src2:i len:4
@@ -323,6 +329,16 @@ icompare_imm: src1:i len:12
 
 long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:32
 
+# shouldn't use long stuff on ppc32
+#long_min: dest:i src1:i src2:i len:8 clob:1
+#long_min_un: dest:i src1:i src2:i len:8 clob:1
+#long_max: dest:i src1:i src2:i len:8 clob:1
+#long_max_un: dest:i src1:i src2:i len:8 clob:1
+int_min: dest:i src1:i src2:i len:8 clob:1
+int_max: dest:i src1:i src2:i len:8 clob:1
+int_min_un: dest:i src1:i src2:i len:8 clob:1
+int_max_un: dest:i src1:i src2:i len:8 clob:1
+
 vcall2: len:20 clob:c
 vcall2_reg: src1:i len:8 clob:c
 vcall2_membase: src1:b len:16 clob:c
diff --git a/mono/mini/cpu-ppc64.md b/mono/mini/cpu-ppc64.md
index 46596201b75..f0651f5f8d0 100644
--- a/mono/mini/cpu-ppc64.md
+++ b/mono/mini/cpu-ppc64.md
@@ -213,7 +213,13 @@ endfilter: src1:i len:20
 aotconst: dest:i len:8
 load_gotaddr: dest:i len:32
 got_entry: dest:i src1:b len:32
+abs: dest:f src1:f len:4
 sqrt: dest:f src1:f len:4
+sqrtf: dest:f src1:f len:4
+round: dest:f src1:f len:4
+ppc_trunc: dest:f src1:f len:4
+ppc_ceil: dest:f src1:f len:4
+ppc_floor: dest:f src1:f len:4
 adc: dest:i src1:i src2:i len:4
 addcc: dest:i src1:i src2:i len:4
 subcc: dest:i src1:i src2:i len:4
@@ -389,6 +395,15 @@ long_xor_imm: dest:i src1:i clob:1 len:4
 lcompare: src1:i src2:i len:4
 lcompare_imm: src1:i len:12
 
+long_min: dest:i src1:i src2:i len:8 clob:1
+long_min_un: dest:i src1:i src2:i len:8 clob:1
+long_max: dest:i src1:i src2:i len:8 clob:1
+long_max_un: dest:i src1:i src2:i len:8 clob:1
+int_min: dest:i src1:i src2:i len:8 clob:1
+int_max: dest:i src1:i src2:i len:8 clob:1
+int_min_un: dest:i src1:i src2:i len:8 clob:1
+int_max_un: dest:i src1:i src2:i len:8 clob:1
+
 #long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:30
 
 vcall2: len:36 clob:c
diff --git a/mono/mini/mini-ops.h b/mono/mini/mini-ops.h
index 6582659477a..d8688362d8b 100644
--- a/mono/mini/mini-ops.h
+++ b/mono/mini/mini-ops.h
@@ -1205,7 +1205,10 @@ MINI_OP(OP_AMD64_SAVE_SP_TO_LMF,         "amd64_save_sp_to_lmf", NONE, NONE, NON
 #if  defined(TARGET_POWERPC)
 MINI_OP(OP_PPC_SUBFIC,             "ppc_subfic", IREG, IREG, NONE)
 MINI_OP(OP_PPC_SUBFZE,             "ppc_subfze", IREG, IREG, NONE)
-MINI_OP(OP_PPC_CHECK_FINITE,           "ppc_check_finite", NONE, IREG, NONE)
+MINI_OP(OP_PPC_CHECK_FINITE,       "ppc_check_finite", NONE, IREG, NONE)
+MINI_OP(OP_PPC_CEIL,               "ppc_ceil", FREG, FREG, NONE)
+MINI_OP(OP_PPC_FLOOR,              "ppc_floor", FREG, FREG, NONE)
+MINI_OP(OP_PPC_TRUNC,              "ppc_trunc", FREG, FREG, NONE)
 #endif
 
 #if defined(TARGET_ARM) || defined(TARGET_ARM64)
diff --git a/mono/mini/mini-ppc.c b/mono/mini/mini-ppc.c
index e3f6e838ec3..7e96dce6025 100644
--- a/mono/mini/mini-ppc.c
+++ b/mono/mini/mini-ppc.c
@@ -40,6 +40,9 @@
 #include <sys/systemcfg.h>
 #endif
 
+static GENERATE_TRY_GET_CLASS_WITH_CACHE (math, "System", "Math")
+static GENERATE_TRY_GET_CLASS_WITH_CACHE (mathf, "System", "MathF")
+
 #define FORCE_INDIR_CALL 1
 
 enum {
@@ -62,6 +65,7 @@ enum {
 	PPC_ISA_2X            = 1 << 3,
 	PPC_ISA_64            = 1 << 4,
 	PPC_MOVE_FPR_GPR      = 1 << 5,
+	PPC_ISA_2_03          = 1 << 6,
 	PPC_HW_CAP_END
 };
 
@@ -553,6 +557,9 @@ mono_arch_init (void)
 	if (mono_hwcap_ppc_is_isa_2x)
 		cpu_hw_caps |= PPC_ISA_2X;
 
+	if (mono_hwcap_ppc_is_isa_2_03)
+		cpu_hw_caps |= PPC_ISA_2_03;
+
 	if (mono_hwcap_ppc_is_isa_64)
 		cpu_hw_caps |= PPC_ISA_64;
 
@@ -4214,6 +4221,24 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 			break;
 #endif
 		}
+		case OP_ROUND:
+			ppc_frind (code, ins->dreg, ins->sreg1);
+			break;
+		case OP_PPC_TRUNC:
+			ppc_frizd (code, ins->dreg, ins->sreg1);
+			break;
+		case OP_PPC_CEIL:
+			ppc_fripd (code, ins->dreg, ins->sreg1);
+			break;
+		case OP_PPC_FLOOR:
+			ppc_frimd (code, ins->dreg, ins->sreg1);
+			break;
+		case OP_ABS:
+			ppc_fabsd (code, ins->dreg, ins->sreg1);
+			break;
+		case OP_SQRTF:
+			ppc_fsqrtsd (code, ins->dreg, ins->sreg1);
+			break;
 		case OP_SQRT:
 			ppc_fsqrtd (code, ins->dreg, ins->sreg1);
 			break;
@@ -4236,6 +4261,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 			/* emulated */
 			g_assert_not_reached ();
 			break;
+		/* These min/max require POWER5 */
+		case OP_IMIN:
+			ppc_cmp (code, 0, 0, ins->sreg1, ins->sreg2);
+			ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		case OP_IMIN_UN:
+			ppc_cmpl (code, 0, 0, ins->sreg1, ins->sreg2);
+			ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		case OP_IMAX:
+			ppc_cmp (code, 0, 0, ins->sreg1, ins->sreg2);
+			ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		case OP_IMAX_UN:
+			ppc_cmpl (code, 0, 0, ins->sreg1, ins->sreg2);
+			ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		CASE_PPC64 (OP_LMIN)
+			ppc_cmp (code, 0, 1, ins->sreg1, ins->sreg2); /* signed 64-bit compare; ppc_cmpl (unsigned) belongs to OP_LMIN_UN only */
+			ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		CASE_PPC64 (OP_LMIN_UN)
+			ppc_cmpl (code, 0, 1, ins->sreg1, ins->sreg2);
+			ppc_isellt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		CASE_PPC64 (OP_LMAX)
+			ppc_cmp (code, 0, 1, ins->sreg1, ins->sreg2);
+			ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
+		CASE_PPC64 (OP_LMAX_UN)
+			ppc_cmpl (code, 0, 1, ins->sreg1, ins->sreg2);
+			ppc_iselgt (code, ins->dreg, ins->sreg1, ins->sreg2);
+			break;
 		case OP_FCOMPARE:
 			ppc_fcmpu (code, 0, ins->sreg1, ins->sreg2);
 			break;
@@ -5610,8 +5668,106 @@ mono_arch_get_cie_program (void)
 MonoInst*
 mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
 {
-	/* FIXME: */
-	return NULL;
+	/*
+	 * Inline selected System.Math/System.MathF calls as single PPC opcodes.
+	 * Returns the instruction appended to cfg->cbb, or NULL if cmethod is not handled.
+	 */
+	MonoInst *ins = NULL;
+	int opcode = 0;
+
+	if (cmethod->klass == mono_class_try_get_math_class ()) {
+		if (strcmp (cmethod->name, "Sqrt") == 0)
+			opcode = OP_SQRT;
+		else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8)
+			opcode = OP_ABS;
+
+		if (opcode && fsig->param_count == 1) {
+			MONO_INST_NEW (cfg, ins, opcode);
+			ins->type = STACK_R8;
+			ins->dreg = mono_alloc_freg (cfg);
+			ins->sreg1 = args [0]->dreg;
+			MONO_ADD_INS (cfg->cbb, ins);
+		}
+
+		/* Integer Min/Max for (u)int(32|64); only when the ISA 2.03 capability is set */
+		opcode = 0;
+		if (cpu_hw_caps & PPC_ISA_2_03) {
+			if (strcmp (cmethod->name, "Min") == 0) {
+				if (fsig->params [0]->type == MONO_TYPE_I4)
+					opcode = OP_IMIN;
+				else if (fsig->params [0]->type == MONO_TYPE_U4)
+					opcode = OP_IMIN_UN;
+#ifdef __mono_ppc64__
+				else if (fsig->params [0]->type == MONO_TYPE_I8)
+					opcode = OP_LMIN;
+				else if (fsig->params [0]->type == MONO_TYPE_U8)
+					opcode = OP_LMIN_UN;
+#endif
+			} else if (strcmp (cmethod->name, "Max") == 0) {
+				if (fsig->params [0]->type == MONO_TYPE_I4)
+					opcode = OP_IMAX;
+				else if (fsig->params [0]->type == MONO_TYPE_U4)
+					opcode = OP_IMAX_UN;
+#ifdef __mono_ppc64__
+				else if (fsig->params [0]->type == MONO_TYPE_I8)
+					opcode = OP_LMAX;
+				else if (fsig->params [0]->type == MONO_TYPE_U8)
+					opcode = OP_LMAX_UN;
+#endif
+			}
+			/*
+			 * TODO: Floating point version with fsel, but fsel has
+			 * some peculiarities (need a scratch reg unless
+			 * comparing with 0, NaN/Inf behaviour (then MathF too)
+			 */
+		}
+
+		if (opcode && fsig->param_count == 2) {
+			MONO_INST_NEW (cfg, ins, opcode);
+			/* I4 and U4 are both 32-bit stack values; only the long variants are STACK_I8 */
+			ins->type = (fsig->params [0]->type == MONO_TYPE_I4 || fsig->params [0]->type == MONO_TYPE_U4) ? STACK_I4 : STACK_I8;
+			ins->dreg = mono_alloc_ireg (cfg);
+			ins->sreg1 = args [0]->dreg;
+			ins->sreg2 = args [1]->dreg;
+			MONO_ADD_INS (cfg->cbb, ins);
+		}
+
+		/* Rounding instructions */
+		opcode = 0;
+		if ((cpu_hw_caps & PPC_ISA_2X) && (fsig->param_count == 1) && (fsig->params [0]->type == MONO_TYPE_R8)) {
+			/*
+			 * Math.Round is not inlined: frin rounds ties away from zero
+			 * but Math.Round (double) rounds ties to even. (No float
+			 * versions of these ops, but frsp could be prepended?)
+			 */
+			if (!strcmp (cmethod->name, "Floor"))
+				opcode = OP_PPC_FLOOR;
+			else if (!strcmp (cmethod->name, "Ceiling"))
+				opcode = OP_PPC_CEIL;
+			else if (!strcmp (cmethod->name, "Truncate"))
+				opcode = OP_PPC_TRUNC;
+			if (opcode != 0) {
+				MONO_INST_NEW (cfg, ins, opcode);
+				ins->type = STACK_R8;
+				ins->dreg = mono_alloc_freg (cfg);
+				ins->sreg1 = args [0]->dreg;
+				MONO_ADD_INS (cfg->cbb, ins);
+			}
+		}
+	}
+	if (cmethod->klass == mono_class_try_get_mathf_class ()) {
+		/* Only Sqrt is handled. XXX: POWER has no single-precision normal FPU abs? */
+		if (strcmp (cmethod->name, "Sqrt") == 0)
+			opcode = OP_SQRTF;
+		if (opcode && fsig->param_count == 1) {
+			MONO_INST_NEW (cfg, ins, opcode);
+			ins->type = STACK_R4;
+			ins->dreg = mono_alloc_freg (cfg);
+			ins->sreg1 = args [0]->dreg;
+			MONO_ADD_INS (cfg->cbb, ins);
+		}
+	}
+	return ins;
 }
 
 host_mgreg_t
diff --git a/mono/utils/mono-hwcap-ppc.c b/mono/utils/mono-hwcap-ppc.c
index 2c1cb154317..0680ee37e2e 100644
--- a/mono/utils/mono-hwcap-ppc.c
+++ b/mono/utils/mono-hwcap-ppc.c
@@ -60,6 +60,16 @@ mono_hwcap_arch_init (void)
 		if (hwcap & (0x00080000 | 0x00040000 | 0x00020000 | 0x00010000 | 0x00000800 | 0x00001000))
 			mono_hwcap_ppc_is_isa_2x = TRUE;
 
+		/* PPC_FEATURE_POWER4, PPC_FEATURE_POWER5, PPC_FEATURE_POWER5_PLUS,
+		   PPC_FEATURE_CELL_BE, PPC_FEATURE_PA6T, PPC_FEATURE_ARCH_2_05 */
+		if (hwcap & (0x00080000 | 0x00040000 | 0x00020000 | 0x00010000 | 0x00000800 | 0x00001000))
+			mono_hwcap_ppc_is_isa_2x = TRUE;
+
+		/* PPC_FEATURE_POWER5, PPC_FEATURE_POWER5_PLUS,
+		   PPC_FEATURE_CELL_BE, PPC_FEATURE_PA6T, PPC_FEATURE_ARCH_2_05 */
+		if (hwcap & (0x00040000 | 0x00020000 | 0x00010000 | 0x00000800 | 0x00001000))
+			mono_hwcap_ppc_is_isa_2_03 = TRUE;
+
 		/* PPC_FEATURE_64 */
 		if (hwcap & 0x40000000)
 			mono_hwcap_ppc_is_isa_64 = TRUE;
@@ -76,26 +86,22 @@ mono_hwcap_arch_init (void)
 			mono_hwcap_ppc_has_multiple_ls_units = TRUE;
 	}
 #elif defined(_AIX)
-	if (__cpu64())
-		mono_hwcap_ppc_is_isa_64 = TRUE;
-	if (__power_4_andup())
-		mono_hwcap_ppc_is_isa_2x = TRUE;
-	if (__power_5_andup())
+	/* Compatible platforms for Mono (V7R1, 6.1.9) require at least P4. */
+	mono_hwcap_ppc_is_isa_64 = TRUE;
+	mono_hwcap_ppc_is_isa_2x = TRUE;
+	if (__power_5_andup()) {
+		mono_hwcap_ppc_is_isa_2_03 = TRUE;
 		mono_hwcap_ppc_has_icache_snoop = TRUE;
+	}
 	/* not on POWER8 */
 	if (__power_4() || __power_5() || __power_6() || __power_7())
 		mono_hwcap_ppc_has_multiple_ls_units = TRUE;
 	/*
-	 * I dont see a way to get extended POWER6 and the PV_6_1
-	 * def seems to be trigged on the POWER6 here despite not
-	 * having these extended instructions, so POWER7 it is
-	 */
-	/*
-	 * WARNING: reports that this doesn't actually work, try
-	 * to re-enable after more investigation
+	 * This instruction is only available in POWER6 "raw mode" and unlikely
+	 * to work; I couldn't get it to work on the POWER6s I tried.
 	 */
 	/*
-	if (__power_7_andup())
+	if (__power_6())
 		mono_hwcap_ppc_has_move_fpr_gpr = TRUE;
 	 */
 #endif
diff --git a/mono/utils/mono-hwcap-vars.h b/mono/utils/mono-hwcap-vars.h
index 5fefe5ca5e4..1596ca48d71 100644
--- a/mono/utils/mono-hwcap-vars.h
+++ b/mono/utils/mono-hwcap-vars.h
@@ -27,6 +27,7 @@ MONO_HWCAP_VAR(arm_has_thumb2)
 
 MONO_HWCAP_VAR(ppc_has_icache_snoop)
 MONO_HWCAP_VAR(ppc_is_isa_2x)
+MONO_HWCAP_VAR(ppc_is_isa_2_03)
 MONO_HWCAP_VAR(ppc_is_isa_64)
 MONO_HWCAP_VAR(ppc_has_move_fpr_gpr)
 MONO_HWCAP_VAR(ppc_has_multiple_ls_units)
-- 
2.11.4.GIT