From 2e9c5df855566402a314864580ec7dd908be2112 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Thu, 29 Aug 2019 11:32:43 +0300 Subject: [PATCH] LLVM: Add more Math intrinsics, introduce `--cpu` flag (#16309) * Add Math Intrinsics, introduce --cpu flag * cpu -> mcpu --- man/mono.1 | 3 + mono/mini/aot-compiler.c | 22 ++++++- mono/mini/intrinsics.c | 26 +++++++- mono/mini/mini-llvm.c | 76 ++++++++++++++++++++++ mono/mini/mini-ops.h | 6 ++ mono/mini/mini.c | 1 + mono/mini/mini.h | 3 + netcore/System.Private.CoreLib/src/System/Math.cs | 2 +- netcore/System.Private.CoreLib/src/System/MathF.cs | 2 +- 9 files changed, 135 insertions(+), 6 deletions(-) diff --git a/man/mono.1 b/man/mono.1 index 2c31830eb26..4370decd60e 100644 --- a/man/mono.1 +++ b/man/mono.1 @@ -246,6 +246,9 @@ program that comes with Mono, and calling it like this: .fi .TP +.I mcpu=[native or generic] +mcpu=native allows AOT mode to use all instructions the current CPU supports, e.g. AVX2, SSE4.2, etc. +Default value is 'generic'. .I llvm-outfile=[filename] Gives the path for the temporary LLVM bitcode file created during AOT. 
.I dedup diff --git a/mono/mini/aot-compiler.c b/mono/mini/aot-compiler.c index 38dfc73b2ee..1cb15715294 100644 --- a/mono/mini/aot-compiler.c +++ b/mono/mini/aot-compiler.c @@ -230,6 +230,7 @@ typedef struct MonoAotOptions { char *logfile; char *llvm_opts; char *llvm_llc; + gboolean use_current_cpu; gboolean dump_json; gboolean profile_only; gboolean no_opt; @@ -1103,7 +1104,7 @@ arch_init (MonoAotCompile *acfg) acfg->user_symbol_prefix = ""; #if TARGET_X86 || TARGET_AMD64 - const gboolean has_custom_args = !!acfg->aot_opts.llvm_llc; + const gboolean has_custom_args = !!acfg->aot_opts.llvm_llc || acfg->aot_opts.use_current_cpu; #endif #if defined(TARGET_X86) @@ -8050,6 +8051,15 @@ mono_aot_parse_options (const char *aot_options, MonoAotOptions *opts) opts->no_opt = TRUE; } else if (str_begins_with (arg, "clangxx=")) { opts->clangxx = g_strdup (arg + strlen ("clangxx=")); + } else if (str_begins_with (arg, "mcpu=")) { + if (!strcmp(arg, "mcpu=native")) { + opts->use_current_cpu = TRUE; + } else if (!strcmp(arg, "mcpu=generic")) { + opts->use_current_cpu = FALSE; + } else { + printf ("mcpu can only be 'native' or 'generic' (default).\n"); + exit (0); + } } else if (str_begins_with (arg, "depfile=")) { opts->depfile = g_strdup (arg + strlen ("depfile=")); } else if (str_begins_with (arg, "help") || str_begins_with (arg, "?")) { @@ -8446,6 +8456,8 @@ compile_method (MonoAotCompile *acfg, MonoMethod *method) flags = (JitFlags)(flags | JIT_FLAG_DIRECT_PINVOKE); if (acfg->aot_opts.interp) flags = (JitFlags)(flags | JIT_FLAG_INTERP); + if (acfg->aot_opts.use_current_cpu) + flags = (JitFlags)(flags | JIT_FLAG_USE_CURRENT_CPU); jit_time_start = mono_time_track_start (); cfg = mini_method_compile (method, acfg->opts, mono_get_root_domain (), flags, 0, index); @@ -9633,6 +9645,10 @@ emit_llvm_file (MonoAotCompile *acfg) opts = g_strdup_printf ("%s %s", opts, acfg->aot_opts.llvm_opts); } + if (acfg->aot_opts.use_current_cpu) { + opts = g_strdup_printf ("%s -mcpu=native", 
opts); + } + if (mono_use_fast_math) { // same parameters are passed to llc and LLVM JIT opts = g_strdup_printf ("%s -fp-contract=fast -enable-no-infs-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -enable-no-trapping-fp-math -enable-unsafe-fp-math", opts); @@ -9705,6 +9721,10 @@ emit_llvm_file (MonoAotCompile *acfg) g_string_append_printf (acfg->llc_args, " %s", acfg->aot_opts.llvm_llc); } + if (acfg->aot_opts.use_current_cpu) { + g_string_append (acfg->llc_args, " -mcpu=native"); + } + command = g_strdup_printf ("\"%sllc\" %s -o \"%s\" \"%s.opt.bc\"", acfg->aot_opts.llvm_path, acfg->llc_args->str, output_fname, acfg->tmpbasename); g_free (output_fname); diff --git a/mono/mini/intrinsics.c b/mono/mini/intrinsics.c index a276b5f72f4..4546e03098c 100644 --- a/mono/mini/intrinsics.c +++ b/mono/mini/intrinsics.c @@ -119,6 +119,12 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign opcode = OP_ABSF; else if (!strcmp (cmethod->name, "Sqrt")) opcode = OP_SQRTF; + else if (!strcmp (cmethod->name, "Floor")) + opcode = OP_FLOORF; + else if (!strcmp (cmethod->name, "Ceiling")) + opcode = OP_CEILF; + else if (!strcmp (cmethod->name, "FusedMultiplyAdd")) + opcode = OP_FMAF; // Max and Min can only be optimized in fast math mode else if (!strcmp (cmethod->name, "Max") && mono_use_fast_math) opcode = OP_RMAX; @@ -126,13 +132,17 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign opcode = OP_RMIN; else if (!strcmp (cmethod->name, "Pow")) opcode = OP_RPOW; - if (opcode) { + if (opcode && fsig->param_count > 0) { MONO_INST_NEW (cfg, ins, opcode); ins->type = STACK_R8; ins->dreg = mono_alloc_dreg (cfg, (MonoStackType)ins->type); ins->sreg1 = args [0]->dreg; - if (fsig->param_count == 2) + if (fsig->param_count == 2) { // POW ins->sreg2 = args [1]->dreg; + } else if (fsig->param_count == 3) { // FMA + ins->sreg2 = args [1]->dreg; + ins->sreg3 = args [2]->dreg; + } MONO_ADD_INS (cfg->cbb, ins); } } @@ 
-144,15 +154,25 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign opcode = OP_COS; } else if (strcmp (cmethod->name, "Sqrt") == 0) { opcode = OP_SQRT; + } else if (strcmp (cmethod->name, "Floor") == 0) { + opcode = OP_FLOOR; + } else if (strcmp (cmethod->name, "Ceiling") == 0) { + opcode = OP_CEIL; + } else if (strcmp (cmethod->name, "FusedMultiplyAdd") == 0) { + opcode = OP_FMA; } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) { opcode = OP_ABS; } - if (opcode && fsig->param_count == 1) { + if (opcode && fsig->param_count > 0) { MONO_INST_NEW (cfg, ins, opcode); ins->type = STACK_R8; ins->dreg = mono_alloc_dreg (cfg, (MonoStackType)ins->type); ins->sreg1 = args [0]->dreg; + if (fsig->param_count == 3) { // FMA + ins->sreg2 = args [1]->dreg; + ins->sreg3 = args [2]->dreg; + } MONO_ADD_INS (cfg->cbb, ins); } diff --git a/mono/mini/mini-llvm.c b/mono/mini/mini-llvm.c index 623f4937d7c..94dd6afedac 100644 --- a/mono/mini/mini-llvm.c +++ b/mono/mini/mini-llvm.c @@ -293,11 +293,17 @@ typedef enum { INTRINS_SIN, INTRINS_COS, INTRINS_SQRT, + INTRINS_FLOOR, + INTRINS_CEIL, + INTRINS_FMA, INTRINS_FABS, INTRINS_ABSF, INTRINS_SINF, INTRINS_COSF, INTRINS_SQRTF, + INTRINS_FLOORF, + INTRINS_CEILF, + INTRINS_FMAF, INTRINS_POWF, INTRINS_EXPECT_I8, INTRINS_EXPECT_I1, @@ -5967,6 +5973,54 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_SQRTF), args, 1, dname); break; } + case OP_FLOOR: { + LLVMValueRef args [1]; + + args [0] = convert (ctx, lhs, LLVMDoubleType ()); + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FLOOR), args, 1, dname); + break; + } + case OP_FLOORF: { + LLVMValueRef args [1]; + + args [0] = convert (ctx, lhs, LLVMFloatType ()); + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FLOORF), args, 1, dname); + break; + } + case OP_CEIL: { + LLVMValueRef args [1]; + + args [0] = 
convert (ctx, lhs, LLVMDoubleType ()); + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_CEIL), args, 1, dname); + break; + } + case OP_CEILF: { + LLVMValueRef args [1]; + + args [0] = convert (ctx, lhs, LLVMFloatType ()); + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_CEILF), args, 1, dname); + break; + } + case OP_FMA: { + LLVMValueRef args [3]; + + args [0] = convert (ctx, values [ins->sreg1], LLVMDoubleType ()); + args [1] = convert (ctx, values [ins->sreg2], LLVMDoubleType ()); + args [2] = convert (ctx, values [ins->sreg3], LLVMDoubleType ()); + + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FMA), args, 3, dname); + break; + } + case OP_FMAF: { + LLVMValueRef args [3]; + + args [0] = convert (ctx, values [ins->sreg1], LLVMFloatType ()); + args [1] = convert (ctx, values [ins->sreg2], LLVMFloatType ()); + args [2] = convert (ctx, values [ins->sreg3], LLVMFloatType ()); + + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, INTRINS_FMAF), args, 3, dname); + break; + } case OP_ABS: { LLVMValueRef args [1]; @@ -8470,6 +8524,12 @@ static IntrinsicDesc intrinsics[] = { {INTRINS_SIN, "llvm.sin.f64"}, {INTRINS_COS, "llvm.cos.f64"}, {INTRINS_SQRT, "llvm.sqrt.f64"}, + {INTRINS_FLOOR, "llvm.floor.f64"}, + {INTRINS_FLOORF, "llvm.floor.f32"}, + {INTRINS_CEIL, "llvm.ceil.f64"}, + {INTRINS_CEILF, "llvm.ceil.f32"}, + {INTRINS_FMA, "llvm.fma.f64"}, + {INTRINS_FMAF, "llvm.fma.f32"}, /* This isn't an intrinsic, instead llvm seems to special case it by name */ {INTRINS_FABS, "fabs"}, {INTRINS_ABSF, "llvm.fabs.f32"}, @@ -8616,9 +8676,23 @@ add_intrinsic (LLVMModuleRef module, int id) AddFunc (module, name, ret_type, params, 2); break; } + case INTRINS_FMA: { + LLVMTypeRef params [] = { LLVMDoubleType (), LLVMDoubleType (), LLVMDoubleType () }; + + AddFunc (module, name, LLVMDoubleType (), params, 3); + break; + } + case INTRINS_FMAF: { + LLVMTypeRef params [] = { LLVMFloatType (), LLVMFloatType 
(), LLVMFloatType () }; + + AddFunc (module, name, LLVMFloatType (), params, 3); + break; + } case INTRINS_SIN: case INTRINS_COS: case INTRINS_SQRT: + case INTRINS_FLOOR: + case INTRINS_CEIL: case INTRINS_FABS: { LLVMTypeRef params [] = { LLVMDoubleType () }; @@ -8628,6 +8702,8 @@ add_intrinsic (LLVMModuleRef module, int id) case INTRINS_SINF: case INTRINS_COSF: case INTRINS_SQRTF: + case INTRINS_FLOORF: + case INTRINS_CEILF: case INTRINS_ABSF: { LLVMTypeRef params [] = { LLVMFloatType () }; diff --git a/mono/mini/mini-ops.h b/mono/mini/mini-ops.h index 175a388fee1..714431b1a31 100644 --- a/mono/mini/mini-ops.h +++ b/mono/mini/mini-ops.h @@ -712,10 +712,16 @@ MINI_OP(OP_TAN, "tan", FREG, FREG, NONE) MINI_OP(OP_ATAN, "atan", FREG, FREG, NONE) MINI_OP(OP_SQRT, "sqrt", FREG, FREG, NONE) MINI_OP(OP_ROUND, "round", FREG, FREG, NONE) +MINI_OP(OP_CEIL, "ceil", FREG, FREG, NONE) +MINI_OP(OP_FLOOR, "floor", FREG, FREG, NONE) +MINI_OP3(OP_FMA, "fma", FREG, FREG, FREG, FREG) MINI_OP(OP_SINF, "sinf", FREG, FREG, NONE) MINI_OP(OP_COSF, "cosf", FREG, FREG, NONE) MINI_OP(OP_ABSF, "absf", FREG, FREG, NONE) MINI_OP(OP_SQRTF, "sqrtf", FREG, FREG, NONE) +MINI_OP(OP_CEILF, "ceilf", FREG, FREG, NONE) +MINI_OP(OP_FLOORF, "floorf", FREG, FREG, NONE) +MINI_OP3(OP_FMAF, "fmaf", FREG, FREG, FREG, FREG) /* Operations that can be computed at constants at JIT time */ MINI_OP(OP_ACOS, "acos", FREG, FREG, NONE) diff --git a/mono/mini/mini.c b/mono/mini/mini.c index c21c27a4d4a..7bc4b70c5af 100644 --- a/mono/mini/mini.c +++ b/mono/mini/mini.c @@ -3146,6 +3146,7 @@ mini_method_compile (MonoMethod *method, guint32 opts, MonoDomain *domain, JitFl cfg->gen_sdb_seq_points = mini_debug_options.gen_sdb_seq_points; cfg->llvm_only = (flags & JIT_FLAG_LLVM_ONLY) != 0; cfg->interp = (flags & JIT_FLAG_INTERP) != 0; + cfg->use_current_cpu = (flags & JIT_FLAG_USE_CURRENT_CPU) != 0; cfg->backend = current_backend; #ifdef HOST_ANDROID diff --git a/mono/mini/mini.h b/mono/mini/mini.h index 
7b95bf096b2..ac98a092e15 100644 --- a/mono/mini/mini.h +++ b/mono/mini/mini.h @@ -1220,6 +1220,8 @@ typedef enum { JIT_FLAG_DISCARD_RESULTS = (1 << 8), /* Whenever to generate code which can work with the interpreter */ JIT_FLAG_INTERP = (1 << 9), + /* Allow AOT to use all current CPU instructions */ + JIT_FLAG_USE_CURRENT_CPU = (1 << 10), } JitFlags; /* Bit-fields in the MonoBasicBlock.region */ @@ -1433,6 +1435,7 @@ typedef struct { guint r4fp : 1; guint llvm_only : 1; guint interp : 1; + guint use_current_cpu : 1; guint domainvar_inited : 1; guint8 uses_simd_intrinsics; int r4_stack_type; diff --git a/netcore/System.Private.CoreLib/src/System/Math.cs b/netcore/System.Private.CoreLib/src/System/Math.cs index addda9d3732..6f6fe9eb724 100644 --- a/netcore/System.Private.CoreLib/src/System/Math.cs +++ b/netcore/System.Private.CoreLib/src/System/Math.cs @@ -77,7 +77,7 @@ namespace System [MethodImpl (MethodImplOptions.InternalCall)] public static extern double Tanh (double value); - // [Intrinsic] TODO: implement FMA intrinsic + [Intrinsic] [MethodImpl (MethodImplOptions.InternalCall)] public static extern double FusedMultiplyAdd (double x, double y, double z); diff --git a/netcore/System.Private.CoreLib/src/System/MathF.cs b/netcore/System.Private.CoreLib/src/System/MathF.cs index 86061a319d8..16d04343814 100644 --- a/netcore/System.Private.CoreLib/src/System/MathF.cs +++ b/netcore/System.Private.CoreLib/src/System/MathF.cs @@ -71,7 +71,7 @@ namespace System [MethodImpl (MethodImplOptions.InternalCall)] public static extern float Tanh (float x); - // [Intrinsic] TODO: implement intrinsic (FMA) + [Intrinsic] [MethodImpl (MethodImplOptions.InternalCall)] public static extern float FusedMultiplyAdd (float x, float y, float z); -- 2.11.4.GIT