From 2e9c5df855566402a314864580ec7dd908be2112 Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Thu, 29 Aug 2019 11:32:43 +0300
Subject: [PATCH] LLVM: Add more Math intrinsics, introduce `--cpu` flag
 (#16309)

* Add Math Intrinsics, introduce --cpu flag

* cpu -> mcpu
---
 man/mono.1                                         |  3 +
 mono/mini/aot-compiler.c                           | 22 ++++++-
 mono/mini/intrinsics.c                             | 26 +++++++-
 mono/mini/mini-llvm.c                              | 76 ++++++++++++++++++++++
 mono/mini/mini-ops.h                               |  6 ++
 mono/mini/mini.c                                   |  1 +
 mono/mini/mini.h                                   |  3 +
 netcore/System.Private.CoreLib/src/System/Math.cs  |  2 +-
 netcore/System.Private.CoreLib/src/System/MathF.cs |  2 +-
 9 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/man/mono.1 b/man/mono.1
index 2c31830eb26..4370decd60e 100644
--- a/man/mono.1
+++ b/man/mono.1
@@ -246,6 +246,9 @@ program that comes with Mono, and calling it like this:
 
 .fi
 .TP
+.I mcpu=[native or generic]
+mcpu=native allows AOT mode to use all instructions the current CPU supports, e.g. AVX2, SSE4.2, etc.
+Default value is 'generic'.
 .I llvm-outfile=[filename]
 Gives the path for the temporary LLVM bitcode file created during AOT.
 .I dedup
diff --git a/mono/mini/aot-compiler.c b/mono/mini/aot-compiler.c
index 38dfc73b2ee..1cb15715294 100644
--- a/mono/mini/aot-compiler.c
+++ b/mono/mini/aot-compiler.c
@@ -230,6 +230,7 @@ typedef struct MonoAotOptions {
 	char *logfile;
 	char *llvm_opts;
 	char *llvm_llc;
+	gboolean use_current_cpu;
 	gboolean dump_json;
 	gboolean profile_only;
 	gboolean no_opt;
@@ -1103,7 +1104,7 @@ arch_init (MonoAotCompile *acfg)
 	acfg->user_symbol_prefix = "";
 
 #if TARGET_X86 || TARGET_AMD64
-	const gboolean has_custom_args = !!acfg->aot_opts.llvm_llc;
+	const gboolean has_custom_args = !!acfg->aot_opts.llvm_llc || acfg->aot_opts.use_current_cpu;
 #endif
 
 #if defined(TARGET_X86)
@@ -8050,6 +8051,15 @@ mono_aot_parse_options (const char *aot_options, MonoAotOptions *opts)
 			opts->no_opt = TRUE;
 		} else if (str_begins_with (arg, "clangxx=")) {
 			opts->clangxx = g_strdup (arg + strlen ("clangxx="));
+		} else if (str_begins_with (arg, "mcpu=")) { /* "native" sets use_current_cpu, "generic" clears it */
+			if (!strcmp (arg, "mcpu=native")) {
+				opts->use_current_cpu = TRUE;
+			} else if (!strcmp (arg, "mcpu=generic")) {
+				opts->use_current_cpu = FALSE;
+			} else {
+				fprintf (stderr, "mcpu can only be 'native' or 'generic' (default).\n");
+				exit (1); /* usage error: must not report success to the invoking script */
+			}
 		} else if (str_begins_with (arg, "depfile=")) {
 			opts->depfile = g_strdup (arg + strlen ("depfile="));
 		} else if (str_begins_with (arg, "help") || str_begins_with (arg, "?")) {
@@ -8446,6 +8456,8 @@ compile_method (MonoAotCompile *acfg, MonoMethod *method)
 		flags = (JitFlags)(flags | JIT_FLAG_DIRECT_PINVOKE);
 	if (acfg->aot_opts.interp)
 		flags = (JitFlags)(flags | JIT_FLAG_INTERP);
+	if (acfg->aot_opts.use_current_cpu)
+		flags = (JitFlags)(flags | JIT_FLAG_USE_CURRENT_CPU);
 
 	jit_time_start = mono_time_track_start ();
 	cfg = mini_method_compile (method, acfg->opts, mono_get_root_domain (), flags, 0, index);
@@ -9633,6 +9645,10 @@ emit_llvm_file (MonoAotCompile *acfg)
 		opts = g_strdup_printf ("%s %s", opts, acfg->aot_opts.llvm_opts);
 	}
 
+	if (acfg->aot_opts.use_current_cpu) {
+		opts = g_strdup_printf ("%s -mcpu=native", opts);
+	}
+
 	if (mono_use_fast_math) {
 		// same parameters are passed to llc and LLVM JIT
 		opts = g_strdup_printf ("%s -fp-contract=fast -enable-no-infs-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -enable-no-trapping-fp-math -enable-unsafe-fp-math", opts);
@@ -9705,6 +9721,10 @@ emit_llvm_file (MonoAotCompile *acfg)
 		g_string_append_printf (acfg->llc_args, " %s", acfg->aot_opts.llvm_llc);
 	}
 
+	if (acfg->aot_opts.use_current_cpu) {
+		g_string_append (acfg->llc_args, " -mcpu=native");
+	}
+
 	command = g_strdup_printf ("\"%sllc\" %s -o \"%s\" \"%s.opt.bc\"", acfg->aot_opts.llvm_path, acfg->llc_args->str, output_fname, acfg->tmpbasename);
 	g_free (output_fname);
 
diff --git a/mono/mini/intrinsics.c b/mono/mini/intrinsics.c
index a276b5f72f4..4546e03098c 100644
--- a/mono/mini/intrinsics.c
+++ b/mono/mini/intrinsics.c
@@ -119,6 +119,12 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
 			opcode = OP_ABSF;
 		else if (!strcmp (cmethod->name, "Sqrt"))
 			opcode = OP_SQRTF;
+		else if (!strcmp (cmethod->name, "Floor"))
+			opcode = OP_FLOORF;
+		else if (!strcmp (cmethod->name, "Ceiling"))
+			opcode = OP_CEILF;
+		else if (!strcmp (cmethod->name, "FusedMultiplyAdd"))
+			opcode = OP_FMAF;
 		// Max and Min can only be optimized in fast math mode
 		else if (!strcmp (cmethod->name, "Max") && mono_use_fast_math)
 			opcode = OP_RMAX;
@@ -126,13 +132,17 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
 			opcode = OP_RMIN;
 		else if (!strcmp (cmethod->name, "Pow"))
 			opcode = OP_RPOW;
-		if (opcode) {
+		if (opcode && fsig->param_count > 0) {
 			MONO_INST_NEW (cfg, ins, opcode);
 			ins->type = STACK_R8;
 			ins->dreg = mono_alloc_dreg (cfg, (MonoStackType)ins->type);
 			ins->sreg1 = args [0]->dreg;
-			if (fsig->param_count == 2)
+			if (fsig->param_count == 2) { // binary ops: Pow, plus Max/Min when fast math is enabled
+				ins->sreg2 = args [1]->dreg;
+			} else if (fsig->param_count == 3) { // ternary op: FusedMultiplyAdd
+				ins->sreg2 = args [1]->dreg;
+				ins->sreg3 = args [2]->dreg;
+			}
 			MONO_ADD_INS (cfg->cbb, ins);
 		}
 	}
@@ -144,15 +154,25 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
 			opcode = OP_COS;
 		} else if (strcmp (cmethod->name, "Sqrt") == 0) {
 			opcode = OP_SQRT;
+		} else if (strcmp (cmethod->name, "Floor") == 0 && fsig->params [0]->type == MONO_TYPE_R8) { // double overload only, same guard as Abs
+			opcode = OP_FLOOR;
+		} else if (strcmp (cmethod->name, "Ceiling") == 0 && fsig->params [0]->type == MONO_TYPE_R8) { // double overload only, same guard as Abs
+			opcode = OP_CEIL;
+		} else if (strcmp (cmethod->name, "FusedMultiplyAdd") == 0) {
+			opcode = OP_FMA;
 		} else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) {
 			opcode = OP_ABS;
 		}
 
-		if (opcode && fsig->param_count == 1) {
+		if (opcode && fsig->param_count > 0) {
 			MONO_INST_NEW (cfg, ins, opcode);
 			ins->type = STACK_R8;
 			ins->dreg = mono_alloc_dreg (cfg, (MonoStackType)ins->type);
 			ins->sreg1 = args [0]->dreg;
+			if (fsig->param_count == 3) { // FMA
+				ins->sreg2 = args [1]->dreg;
+				ins->sreg3 = args [2]->dreg;
+			}
 			MONO_ADD_INS (cfg->cbb, ins);
 		}
 
diff --git a/mono/mini/mini-llvm.c b/mono/mini/mini-llvm.c
index 623f4937d7c..94dd6afedac 100644
--- a/mono/mini/mini-llvm.c
+++ b/mono/mini/mini-llvm.c
@@ -293,11 +293,17 @@ typedef enum {
 	INTRINS_SIN,
 	INTRINS_COS,
 	INTRINS_SQRT,
+	INTRINS_FLOOR,
+	INTRINS_CEIL,
+	INTRINS_FMA,
 	INTRINS_FABS,
 	INTRINS_ABSF,
 	INTRINS_SINF,
 	INTRINS_COSF,
 	INTRINS_SQRTF,
+	INTRINS_FLOORF,
+	INTRINS_CEILF,
+	INTRINS_FMAF,
 	INTRINS_POWF,
 	INTRINS_EXPECT_I8,
 	INTRINS_EXPECT_I1,
@@ -5967,6 +5973,54 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_SQRTF), args, 1, dname);
 			break;
 		}
+		case OP_FLOOR: { /* double floor: call the llvm.floor.f64 intrinsic on lhs converted to double */
+			LLVMValueRef args [1];
+
+			args [0] = convert (ctx, lhs, LLVMDoubleType ());
+			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FLOOR), args, 1, dname);
+			break;
+		}
+		case OP_FLOORF: { /* float floor: call the llvm.floor.f32 intrinsic on lhs converted to float */
+			LLVMValueRef args [1];
+
+			args [0] = convert (ctx, lhs, LLVMFloatType ());
+			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FLOORF), args, 1, dname);
+			break;
+		}
+		case OP_CEIL: { /* double ceiling: call the llvm.ceil.f64 intrinsic on lhs converted to double */
+			LLVMValueRef args [1];
+
+			args [0] = convert (ctx, lhs, LLVMDoubleType ());
+			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_CEIL), args, 1, dname);
+			break;
+		}
+		case OP_CEILF: { /* float ceiling: call the llvm.ceil.f32 intrinsic on lhs converted to float */
+			LLVMValueRef args [1];
+
+			args [0] = convert (ctx, lhs, LLVMFloatType ());
+			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_CEILF), args, 1, dname);
+			break;
+		}
+		case OP_FMA: { /* double fused multiply-add: call llvm.fma.f64 with sreg1, sreg2, sreg3 as operands */
+			LLVMValueRef args [3];
+
+			args [0] = convert (ctx, values [ins->sreg1], LLVMDoubleType ());
+			args [1] = convert (ctx, values [ins->sreg2], LLVMDoubleType ());
+			args [2] = convert (ctx, values [ins->sreg3], LLVMDoubleType ());
+
+			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FMA), args, 3, dname);
+			break;
+		}
+		case OP_FMAF: { /* float fused multiply-add: call llvm.fma.f32 with sreg1, sreg2, sreg3 as operands */
+			LLVMValueRef args [3];
+
+			args [0] = convert (ctx, values [ins->sreg1], LLVMFloatType ());
+			args [1] = convert (ctx, values [ins->sreg2], LLVMFloatType ());
+			args [2] = convert (ctx, values [ins->sreg3], LLVMFloatType ());
+
+			values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FMAF), args, 3, dname);
+			break;
+		}
 		case OP_ABS: {
 			LLVMValueRef args [1];
 
@@ -8470,6 +8524,12 @@ static IntrinsicDesc intrinsics[] = {
 	{INTRINS_SIN, "llvm.sin.f64"},
 	{INTRINS_COS, "llvm.cos.f64"},
 	{INTRINS_SQRT, "llvm.sqrt.f64"},
+	{INTRINS_FLOOR, "llvm.floor.f64"},
+	{INTRINS_FLOORF, "llvm.floor.f32"},
+	{INTRINS_CEIL, "llvm.ceil.f64"},
+	{INTRINS_CEILF, "llvm.ceil.f32"},
+	{INTRINS_FMA, "llvm.fma.f64"},
+	{INTRINS_FMAF, "llvm.fma.f32"},
 	/* This isn't an intrinsic, instead llvm seems to special case it by name */
 	{INTRINS_FABS, "fabs"},
 	{INTRINS_ABSF, "llvm.fabs.f32"},
@@ -8616,9 +8676,23 @@ add_intrinsic (LLVMModuleRef module, int id)
 		AddFunc (module, name, ret_type, params, 2);
 		break;
 	}
+	case INTRINS_FMA: { /* declare the FMA intrinsic: three double parameters, double result */
+		LLVMTypeRef params [] = { LLVMDoubleType (), LLVMDoubleType (), LLVMDoubleType () };
+
+		AddFunc (module, name, LLVMDoubleType (), params, 3);
+		break;
+	}
+	case INTRINS_FMAF: { /* declare the float FMA intrinsic: three float parameters, float result */
+		LLVMTypeRef params [] = { LLVMFloatType (), LLVMFloatType (), LLVMFloatType () };
+
+		AddFunc (module, name, LLVMFloatType (), params, 3);
+		break;
+	}
 	case INTRINS_SIN:
 	case INTRINS_COS:
 	case INTRINS_SQRT:
+	case INTRINS_FLOOR:
+	case INTRINS_CEIL:
 	case INTRINS_FABS: {
 		LLVMTypeRef params [] = { LLVMDoubleType () };
 
@@ -8628,6 +8702,8 @@ add_intrinsic (LLVMModuleRef module, int id)
 	case INTRINS_SINF:
 	case INTRINS_COSF:
 	case INTRINS_SQRTF:
+	case INTRINS_FLOORF:
+	case INTRINS_CEILF:
 	case INTRINS_ABSF: {
 		LLVMTypeRef params [] = { LLVMFloatType () };
 
diff --git a/mono/mini/mini-ops.h b/mono/mini/mini-ops.h
index 175a388fee1..714431b1a31 100644
--- a/mono/mini/mini-ops.h
+++ b/mono/mini/mini-ops.h
@@ -712,10 +712,16 @@ MINI_OP(OP_TAN,     "tan", FREG, FREG, NONE)
 MINI_OP(OP_ATAN,    "atan", FREG, FREG, NONE)
 MINI_OP(OP_SQRT,    "sqrt", FREG, FREG, NONE)
 MINI_OP(OP_ROUND,   "round", FREG, FREG, NONE)
+MINI_OP(OP_CEIL,    "ceil", FREG, FREG, NONE)
+MINI_OP(OP_FLOOR,   "floor", FREG, FREG, NONE)
+MINI_OP3(OP_FMA,     "fma", FREG, FREG, FREG, FREG)
 MINI_OP(OP_SINF,     "sinf", FREG, FREG, NONE)
 MINI_OP(OP_COSF,     "cosf", FREG, FREG, NONE)
 MINI_OP(OP_ABSF,     "absf", FREG, FREG, NONE)
 MINI_OP(OP_SQRTF,    "sqrtf", FREG, FREG, NONE)
+MINI_OP(OP_CEILF,    "ceilf", FREG, FREG, NONE)
+MINI_OP(OP_FLOORF,   "floorf", FREG, FREG, NONE)
+MINI_OP3(OP_FMAF,     "fmaf", FREG, FREG, FREG, FREG)
 
 /* Operations that can be computed at constants at JIT time  */
 MINI_OP(OP_ACOS,     "acos", FREG, FREG, NONE)
diff --git a/mono/mini/mini.c b/mono/mini/mini.c
index c21c27a4d4a..7bc4b70c5af 100644
--- a/mono/mini/mini.c
+++ b/mono/mini/mini.c
@@ -3146,6 +3146,7 @@ mini_method_compile (MonoMethod *method, guint32 opts, MonoDomain *domain, JitFl
 	cfg->gen_sdb_seq_points = mini_debug_options.gen_sdb_seq_points;
 	cfg->llvm_only = (flags & JIT_FLAG_LLVM_ONLY) != 0;
 	cfg->interp = (flags & JIT_FLAG_INTERP) != 0;
+	cfg->use_current_cpu = (flags & JIT_FLAG_USE_CURRENT_CPU) != 0;
 	cfg->backend = current_backend;
 
 #ifdef HOST_ANDROID
diff --git a/mono/mini/mini.h b/mono/mini/mini.h
index 7b95bf096b2..ac98a092e15 100644
--- a/mono/mini/mini.h
+++ b/mono/mini/mini.h
@@ -1220,6 +1220,8 @@ typedef enum {
 	JIT_FLAG_DISCARD_RESULTS = (1 << 8),
 	/* Whenever to generate code which can work with the interpreter */
 	JIT_FLAG_INTERP = (1 << 9),
+	/* Allow AOT to use all current CPU instructions */
+	JIT_FLAG_USE_CURRENT_CPU = (1 << 10),
 } JitFlags;
 
 /* Bit-fields in the MonoBasicBlock.region */
@@ -1433,6 +1435,7 @@ typedef struct {
 	guint            r4fp : 1;
 	guint            llvm_only : 1;
 	guint            interp : 1;
+	guint            use_current_cpu : 1;
 	guint            domainvar_inited : 1;
 	guint8           uses_simd_intrinsics;
 	int              r4_stack_type;
diff --git a/netcore/System.Private.CoreLib/src/System/Math.cs b/netcore/System.Private.CoreLib/src/System/Math.cs
index addda9d3732..6f6fe9eb724 100644
--- a/netcore/System.Private.CoreLib/src/System/Math.cs
+++ b/netcore/System.Private.CoreLib/src/System/Math.cs
@@ -77,7 +77,7 @@ namespace System
 		[MethodImpl (MethodImplOptions.InternalCall)]
 		public static extern double Tanh (double value);
 
-		// [Intrinsic] TODO: implement FMA intrinsic
+		[Intrinsic]
 		[MethodImpl (MethodImplOptions.InternalCall)]
 		public static extern double FusedMultiplyAdd (double x, double y, double z);
 
diff --git a/netcore/System.Private.CoreLib/src/System/MathF.cs b/netcore/System.Private.CoreLib/src/System/MathF.cs
index 86061a319d8..16d04343814 100644
--- a/netcore/System.Private.CoreLib/src/System/MathF.cs
+++ b/netcore/System.Private.CoreLib/src/System/MathF.cs
@@ -71,7 +71,7 @@ namespace System
 		[MethodImpl (MethodImplOptions.InternalCall)]
 		public static extern float Tanh (float x);
 
-		// [Intrinsic] TODO: implement intrinsic  (FMA)
+		[Intrinsic]
 		[MethodImpl  (MethodImplOptions.InternalCall)]
 		public static extern float FusedMultiplyAdd (float x, float y, float z);
 
-- 
2.11.4.GIT