[Facades] Use the Open.snk key for the System.ValueTuple facade (#4173)
[mono-project.git] / mono / mini / simd-intrinsics.c
blob9b31c3fa69b07f4cc5e90e8722a29678fc524da2
1 /*
2 * simd-intrinsics.c: simd support for intrinsics
4 * Author:
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
8 */
10 #include <config.h>
11 #include <stdio.h>
13 #include "mini.h"
14 #include "ir-emit.h"
15 #include "mono/utils/bsearch.h"
16 #include <mono/metadata/abi-details.h>
19 General notes on SIMD intrinsics
21 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
22 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
23 TODO extend op_to_op_dest_membase to handle simd ops
24 TODO add support for indexed versions of simd ops
25 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
26 TODO make sure locals, arguments and spills are properly aligned.
27 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
28 TODO add stuff to man pages
29 TODO document this under /docs
30 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
31 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
32 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
33 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
34 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
35 TODO pass simd args byval to a non-intrinsic method cause some useless local var load/store to happen.
36 TODO check if we need to init the SSE control word with better precision.
37 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
38 TODO make SimdRuntime.get_AccelMode work under AOT
39 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
40 TODO extend bounds checking code to support range checking.
42 General notes for SIMD intrinsics.
44 -Bad extractor and constructor performance
45 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
46 It will be loaded in the FP stack just to be pushed on the call stack.
48 A similar thing happens with the Vector4f constructor, which requires float vars to be loaded onto the FP stack first.
50 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
51 trip to the FP stack is desirable.
53 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
54 for simd and fp.
57 -Promote OP_EXTRACT_I4 to a STORE op
58 The advantage of this change is that it could have a _membase version and promote further optimizations.
60 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
61 without an OP_LDADDR.
64 #if defined (MONO_ARCH_SIMD_INTRINSICS)
66 #if defined (DISABLE_JIT)
/*
 * mono_simd_intrinsics_init:
 *
 *   Empty variant compiled when DISABLE_JIT is defined: there is nothing to
 * set up, but the symbol is still provided (presumably so that callers link
 * regardless of the build configuration).
 */
void
mono_simd_intrinsics_init (void)
{
}
73 #else
/*
 * Debug tracing helpers.
 *
 * Swap in the definition below to switch tracing off unconditionally.
 */
//#define IS_DEBUG_ON(cfg) (0)

/* Non-zero when the verbose_level reachable through `cfg` is 3 or more. */
#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)

/*
 * Executes statement `a` only when tracing is on.  The macro does not take
 * the compile context as an argument: it expects a variable named `cfg` to
 * be in scope at the point of use.
 */
#define DEBUG(a) \
	do { \
		if (IS_DEBUG_ON(cfg)) { \
			a; \
		} \
	} while (0)
/*
 * Emission strategies referenced by the intrinsic tables.
 *
 * A value from this list is stored in the 4-bit SimdIntrinsic.simd_emit_mode
 * field, so the list must not grow beyond 16 entries.  Values are implicit,
 * starting at 0, in the order written here.
 */
enum {
	SIMD_EMIT_BINARY,		/*  0 */
	SIMD_EMIT_UNARY,		/*  1 */
	SIMD_EMIT_SETTER,		/*  2 */
	SIMD_EMIT_GETTER,		/*  3 */
	SIMD_EMIT_GETTER_QWORD,		/*  4 */
	SIMD_EMIT_CTOR,			/*  5 */
	SIMD_EMIT_CAST,			/*  6 */
	SIMD_EMIT_SHUFFLE,		/*  7 */
	SIMD_EMIT_SHIFT,		/*  8 */
	SIMD_EMIT_EQUALITY,		/*  9 */
	SIMD_EMIT_LOAD_ALIGNED,		/* 10 */
	SIMD_EMIT_STORE,		/* 11 */
	SIMD_EMIT_EXTRACT_MASK,		/* 12 */
	SIMD_EMIT_PREFETCH		/* 13 */
};
96 #ifdef HAVE_ARRAY_ELEM_INIT
97 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
98 #define MSGSTRFIELD1(line) str##line
99 static const struct msgstr_t {
100 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
101 #include "simd-methods.h"
102 #undef SIMD_METHOD
103 } method_names = {
104 #define SIMD_METHOD(str,name) str,
105 #include "simd-methods.h"
106 #undef SIMD_METHOD
109 enum {
110 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
111 #include "simd-methods.h"
113 #define method_name(idx) ((const char*)&method_names + (idx))
115 #else
116 #define SIMD_METHOD(str,name) str,
117 static const char * const method_names [] = {
118 #include "simd-methods.h"
119 NULL
121 #undef SIMD_METHOD
122 #define SIMD_METHOD(str,name) name,
123 enum {
124 #include "simd-methods.h"
125 SN_LAST
128 #define method_name(idx) (method_names [(idx)])
130 #endif
132 typedef struct {
133 guint16 name;
134 guint16 opcode;
135 guint8 simd_version_flags;
136 guint8 simd_emit_mode : 4;
137 guint8 flags : 4;
138 } SimdIntrinsic;
140 static const SimdIntrinsic vector4f_intrinsics[] = {
141 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
142 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
143 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
144 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
145 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
146 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
147 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
148 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
149 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
150 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
151 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
152 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
153 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
154 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
155 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
156 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
157 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
158 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
159 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
160 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
161 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
162 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
163 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
164 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
165 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
166 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
167 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
168 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
169 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
170 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
171 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
172 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
173 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
174 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
175 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
176 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
177 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
178 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
179 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
180 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
181 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
182 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
183 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
185 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
186 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
187 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
188 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
189 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
190 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
191 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
194 static const SimdIntrinsic vector2d_intrinsics[] = {
195 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
196 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
197 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
198 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
199 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
200 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
201 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
202 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
203 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
204 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
205 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
206 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
207 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
208 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
209 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
210 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
211 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
212 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
213 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
214 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
215 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
217 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
218 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
219 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
220 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
221 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
222 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
223 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
224 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
225 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
226 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
227 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
228 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
229 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
232 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
233 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
234 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
235 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
238 static const SimdIntrinsic vector2ul_intrinsics[] = {
239 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
240 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
241 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
242 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
243 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
244 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
245 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
246 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
247 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
248 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
249 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
251 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
252 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
253 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
254 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
255 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
256 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
257 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
258 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
259 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
261 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
262 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
265 static const SimdIntrinsic vector2l_intrinsics[] = {
266 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
267 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
268 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
269 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
270 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
271 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
272 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
273 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
274 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
275 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
276 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
277 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
278 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
279 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
280 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
281 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
282 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
283 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
284 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
285 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
286 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
287 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
288 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
289 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
290 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
293 static const SimdIntrinsic vector4ui_intrinsics[] = {
294 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
295 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
296 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
297 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
298 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
299 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
300 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
301 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
302 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
303 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
304 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
305 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
306 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
307 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
308 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
309 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
310 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
311 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
312 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
313 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
314 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
315 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
316 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
317 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
318 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
319 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
320 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
321 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
322 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
323 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
324 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
325 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
326 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
327 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
328 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
331 static const SimdIntrinsic vector4i_intrinsics[] = {
332 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
333 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
334 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
335 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
336 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
337 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
338 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
339 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
340 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
341 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
342 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
343 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
344 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
345 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
346 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
347 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
348 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
349 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
350 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
351 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
352 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
353 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
354 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
355 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
356 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
357 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
358 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
359 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
360 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
361 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
362 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
363 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
364 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
365 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
366 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
367 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
368 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
369 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
372 static const SimdIntrinsic vector8us_intrinsics[] = {
373 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
374 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
375 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
376 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
377 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
378 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
379 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
380 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
381 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
383 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
384 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
385 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
386 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
387 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
388 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
389 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
390 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
391 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
393 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
395 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
396 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
397 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
398 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
399 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
400 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
401 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
402 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
403 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
406 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
407 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
408 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
409 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
410 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
411 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
412 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
414 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
415 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
416 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
417 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
418 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
419 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
420 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
423 static const SimdIntrinsic vector8s_intrinsics[] = {
424 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
425 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
426 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
427 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
428 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
429 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
430 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
431 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
436 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
437 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
438 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
439 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
440 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
441 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
442 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
443 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
444 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
446 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
447 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
448 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
449 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
450 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
451 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
452 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
453 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
454 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
457 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
458 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
459 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
460 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
461 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
462 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
463 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
464 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
465 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
466 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
467 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
468 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
469 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
470 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
471 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
474 static const SimdIntrinsic vector16b_intrinsics[] = {
475 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
476 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
477 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
478 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
480 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
481 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
482 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
483 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
484 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
485 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
486 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
487 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
488 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
489 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
490 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
492 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
493 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
494 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
498 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
499 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
500 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
501 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
502 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
503 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
504 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
505 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
506 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
507 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
508 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
509 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
510 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
511 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
512 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
513 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
514 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
515 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
516 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
517 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
518 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
521 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
522 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
523 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
524 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
525 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
526 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
527 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
528 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
529 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
530 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
531 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
535 Missing:
536 setters
538 static const SimdIntrinsic vector16sb_intrinsics[] = {
539 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
540 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
541 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
542 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
544 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
545 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
546 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
547 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
548 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
549 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
550 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
551 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
552 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
553 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
554 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
555 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
561 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
562 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
563 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
564 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
565 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
566 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
567 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
568 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
569 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
570 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
571 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
572 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
573 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
574 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
575 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
576 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
577 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
578 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
579 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
584 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
585 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
586 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
587 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
588 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
589 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
590 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
591 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
592 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
593 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
594 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
597 static guint32 simd_supported_versions;
599 static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
600 static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
602 /*TODO match using number of parameters as well*/
603 static int
604 simd_intrinsic_compare_by_name (const void *key, const void *value)
606 return strcmp (key, method_name (((SimdIntrinsic *)value)->name));
/*
 * Per-vreg state bits kept in the vreg_flags array of
 * mono_simd_simplify_indirection ().
 */
typedef enum {
	VREG_USED             = 1 << 0,	/* vreg belongs to a SIMD var being tracked */
	VREG_HAS_XZERO_BB0    = 1 << 1,	/* an OP_XZERO of the vreg was seen in the first bb */
	VREG_HAS_OTHER_OP_BB0 = 1 << 2,	/* some other instruction of the first bb touches it */
	VREG_SINGLE_BB_USE    = 1 << 3,	/* touched by exactly one of the following bbs so far */
	VREG_MANY_BB_USE      = 1 << 4,	/* touched by more than one of the following bbs */
} KillFlags;
617 void
618 mono_simd_intrinsics_init (void)
620 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
621 /*TODO log the supported flags*/
624 static inline gboolean
625 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
627 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
628 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
629 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
630 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
631 return TRUE;
633 return FALSE;
636 static inline gboolean
637 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
639 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
640 return FALSE;
642 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
643 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
644 vreg_flags [reg] |= VREG_MANY_BB_USE;
645 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
646 return TRUE;
647 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
648 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
649 target_bb [reg] = bb;
650 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
651 return TRUE;
653 return FALSE;
657 This pass recalculate which vars need MONO_INST_INDIRECT.
659 We cannot do this for non SIMD vars since code like mono_get_vtable_var
660 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
662 void
663 mono_simd_simplify_indirection (MonoCompile *cfg)
665 int i, max_vreg = 0;
666 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
667 MonoInst *ins;
668 char *vreg_flags;
670 for (i = 0; i < cfg->num_varinfo; i++) {
671 MonoInst *var = cfg->varinfo [i];
672 if (var->klass->simd_type) {
673 var->flags &= ~MONO_INST_INDIRECT;
674 max_vreg = MAX (var->dreg, max_vreg);
678 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
679 if (!first_bb && bb->code)
680 first_bb = bb;
681 for (ins = bb->code; ins; ins = ins->next) {
682 if (ins->opcode == OP_LDADDR) {
683 MonoInst *var = (MonoInst*)ins->inst_p0;
684 if (var->klass->simd_type) {
685 var->flags |= MONO_INST_INDIRECT;
691 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
692 vreg_flags = (char *)g_malloc0 (max_vreg + 1);
693 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
695 for (i = 0; i < cfg->num_varinfo; i++) {
696 MonoInst *var = cfg->varinfo [i];
697 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
698 vreg_flags [var->dreg] = VREG_USED;
699 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
703 /*Scan the first basic block looking xzeros not used*/
704 for (ins = first_bb->code; ins; ins = ins->next) {
705 int num_sregs;
706 int sregs [MONO_MAX_SRC_REGS];
708 if (ins->opcode == OP_XZERO) {
709 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
710 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
711 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
713 continue;
715 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
716 continue;
717 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
718 continue;
719 num_sregs = mono_inst_get_src_registers (ins, sregs);
720 for (i = 0; i < num_sregs; ++i) {
721 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
722 break;
726 if (IS_DEBUG_ON (cfg)) {
727 for (i = 0; i < cfg->num_varinfo; i++) {
728 MonoInst *var = cfg->varinfo [i];
729 if (var->klass->simd_type) {
730 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
731 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
732 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
733 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
738 /*TODO stop here if no var is xzero only*/
741 Scan all other bb and check if it has only one other use
742 Ideally this would be done after an extended bb formation pass
744 FIXME This pass could use dominator information to properly
745 place the XZERO on the bb that dominates all uses of the var,
746 but this will have zero effect with the current local reg alloc
748 TODO simply the use of flags.
751 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
752 for (ins = bb->code; ins; ins = ins->next) {
753 int num_sregs;
754 int sregs [MONO_MAX_SRC_REGS];
756 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
757 continue;
758 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
759 continue;
760 num_sregs = mono_inst_get_src_registers (ins, sregs);
761 for (i = 0; i < num_sregs; ++i) {
762 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
763 max_vreg, vreg_flags, target_bb))
764 continue;
769 for (i = 0; i < cfg->num_varinfo; i++) {
770 MonoInst *var = cfg->varinfo [i];
771 if (!var->klass->simd_type)
772 continue;
773 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
774 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
775 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
776 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
778 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
779 continue;
780 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
781 int num_sregs, j;
782 int sregs [MONO_MAX_SRC_REGS];
783 gboolean found = FALSE;
785 num_sregs = mono_inst_get_src_registers (ins, sregs);
786 for (j = 0; j < num_sregs; ++j) {
787 if (sregs [j] == var->dreg)
788 found = TRUE;
790 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
791 if (ins->dreg == var->dreg && !found) {
792 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
793 break;
794 } else if (found) {
795 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
796 MonoInst *tmp;
797 MONO_INST_NEW (cfg, tmp, OP_XZERO);
798 tmp->dreg = var->dreg;
799 tmp->type = STACK_VTYPE;
800 tmp->klass = var->klass;
801 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
802 break;
807 for (ins = first_bb->code; ins; ins = ins->next) {
808 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
809 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
810 NULLIFY_INS (ins);
814 g_free (vreg_flags);
815 g_free (target_bb);
819 * This function expect that src be a value.
821 static int
822 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
824 const char *spec = INS_INFO (src->opcode);
826 if (src->opcode == OP_XMOVE) {
827 return src->sreg1;
828 } else if (spec [MONO_INST_DEST] == 'x') {
829 return src->dreg;
830 } else if (src->opcode == OP_VCALL) {
831 return src->dreg;
834 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
835 mono_print_ins (src);
836 g_assert_not_reached ();
840 * This function will load the value if needed.
842 static int
843 load_simd_vreg_class (MonoCompile *cfg, MonoClass *klass, MonoInst *src, gboolean *indirect)
845 const char *spec = INS_INFO (src->opcode);
847 if (indirect)
848 *indirect = FALSE;
849 if (src->opcode == OP_XMOVE) {
850 return src->sreg1;
851 } else if (src->opcode == OP_LDADDR) {
852 int res = ((MonoInst*)src->inst_p0)->dreg;
853 NULLIFY_INS (src);
854 return res;
855 } else if (spec [MONO_INST_DEST] == 'x') {
856 return src->dreg;
857 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
858 MonoInst *ins;
859 if (indirect)
860 *indirect = TRUE;
862 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
863 ins->klass = klass;
864 ins->sreg1 = src->dreg;
865 ins->type = STACK_VTYPE;
866 ins->dreg = alloc_ireg (cfg);
867 MONO_ADD_INS (cfg->cbb, ins);
868 return ins->dreg;
870 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
871 mono_print_ins (src);
872 g_assert_not_reached ();
875 static int
876 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
878 return load_simd_vreg_class (cfg, cmethod->klass, src, indirect);
881 /*We share the var with fconv_to_r8_x to save some stack space.*/
882 static MonoInst*
883 get_double_spill_area (MonoCompile *cfg)
885 if (!cfg->fconv_to_r8_x_var) {
886 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
887 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
889 return cfg->fconv_to_r8_x_var;
891 static MonoInst*
892 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
894 if (!cfg->simd_ctor_var) {
895 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
896 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
898 return cfg->simd_ctor_var;
901 static int
902 mono_type_to_expand_op (MonoType *type)
904 switch (type->type) {
905 case MONO_TYPE_I1:
906 case MONO_TYPE_U1:
907 return OP_EXPAND_I1;
908 case MONO_TYPE_I2:
909 case MONO_TYPE_U2:
910 return OP_EXPAND_I2;
911 case MONO_TYPE_I4:
912 case MONO_TYPE_U4:
913 return OP_EXPAND_I4;
914 case MONO_TYPE_I8:
915 case MONO_TYPE_U8:
916 return OP_EXPAND_I8;
917 case MONO_TYPE_R4:
918 return OP_EXPAND_R4;
919 case MONO_TYPE_R8:
920 return OP_EXPAND_R8;
921 default:
922 g_assert_not_reached ();
926 static int
927 type_to_comp_op (MonoType *t)
929 switch (t->type) {
930 case MONO_TYPE_I1:
931 case MONO_TYPE_U1:
932 return OP_PCMPEQB;
933 case MONO_TYPE_I2:
934 case MONO_TYPE_U2:
935 return OP_PCMPEQW;
936 case MONO_TYPE_I4:
937 case MONO_TYPE_U4:
938 return OP_PCMPEQD;
939 case MONO_TYPE_I8:
940 case MONO_TYPE_U8:
941 return OP_PCMPEQQ;
942 case MONO_TYPE_R4:
943 return OP_COMPPS;
944 case MONO_TYPE_R8:
945 return OP_COMPPD;
946 default:
947 g_assert_not_reached ();
948 return -1;
952 static int
953 type_to_gt_op (MonoType *t)
955 switch (t->type) {
956 case MONO_TYPE_I1:
957 return OP_PCMPGTB;
958 case MONO_TYPE_I2:
959 return OP_PCMPGTW;
960 case MONO_TYPE_I4:
961 return OP_PCMPGTD;
962 case MONO_TYPE_I8:
963 return OP_PCMPGTQ;
964 default:
965 return -1;
969 static int
970 type_to_padd_op (MonoType *t)
972 switch (t->type) {
973 case MONO_TYPE_U1:
974 case MONO_TYPE_I1:
975 return OP_PADDB;
976 case MONO_TYPE_U2:
977 case MONO_TYPE_I2:
978 return OP_PADDW;
979 case MONO_TYPE_U4:
980 case MONO_TYPE_I4:
981 return OP_PADDD;
982 case MONO_TYPE_U8:
983 case MONO_TYPE_I8:
984 return OP_PADDQ;
985 case MONO_TYPE_R4:
986 return OP_ADDPS;
987 case MONO_TYPE_R8:
988 return OP_ADDPD;
989 default:
990 break;
992 return -1;
995 static int
996 type_to_psub_op (MonoType *t)
998 switch (t->type) {
999 case MONO_TYPE_U1:
1000 case MONO_TYPE_I1:
1001 return OP_PSUBB;
1002 case MONO_TYPE_U2:
1003 case MONO_TYPE_I2:
1004 return OP_PSUBW;
1005 case MONO_TYPE_U4:
1006 case MONO_TYPE_I4:
1007 return OP_PSUBD;
1008 case MONO_TYPE_U8:
1009 case MONO_TYPE_I8:
1010 return OP_PSUBQ;
1011 case MONO_TYPE_R4:
1012 return OP_SUBPS;
1013 case MONO_TYPE_R8:
1014 return OP_SUBPD;
1015 default:
1016 break;
1018 return -1;
1021 static int
1022 type_to_pmul_op (MonoType *t)
1024 switch (t->type) {
1025 case MONO_TYPE_U2:
1026 case MONO_TYPE_I2:
1027 return OP_PMULW;
1028 case MONO_TYPE_U4:
1029 case MONO_TYPE_I4:
1030 return OP_PMULD;
1031 case MONO_TYPE_U8:
1032 case MONO_TYPE_I8:
1033 return OP_PMULQ;
1034 case MONO_TYPE_R4:
1035 return OP_MULPS;
1036 case MONO_TYPE_R8:
1037 return OP_MULPD;
1038 default:
1039 break;
1041 return -1;
1044 static int
1045 type_to_pdiv_op (MonoType *t)
1047 switch (t->type) {
1048 case MONO_TYPE_R4:
1049 return OP_DIVPS;
1050 case MONO_TYPE_R8:
1051 return OP_DIVPD;
1052 default:
1053 break;
1055 return -1;
1058 static int
1059 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoClass *klass, MonoType *param_type, MonoInst *src)
1061 MonoInst *ins;
1062 int expand_op;
1064 if (mono_class_from_mono_type (param_type)->simd_type)
1065 return get_simd_vreg (cfg, NULL, src);
1067 expand_op = mono_type_to_expand_op (param_type);
1068 MONO_INST_NEW (cfg, ins, expand_op);
1069 ins->klass = klass;
1070 ins->sreg1 = src->dreg;
1071 ins->type = STACK_VTYPE;
1072 ins->dreg = alloc_ireg (cfg);
1073 MONO_ADD_INS (cfg->cbb, ins);
1075 if (expand_op == OP_EXPAND_R4)
1076 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1077 else if (expand_op == OP_EXPAND_R8)
1078 ins->backend.spill_var = get_double_spill_area (cfg);
1080 return ins->dreg;
1084 * simd_intrinsic_emit_binary_op:
1086 * Emit a binary SIMD opcode.
1087 * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
1088 * expanded to the SIMD type.
1090 static MonoInst*
1091 simd_intrinsic_emit_binary_op (MonoCompile *cfg, int opcode, int flags, MonoClass *klass, MonoType *lhs_type, MonoType *rhs_type, MonoInst *lhs, MonoInst *rhs)
1093 MonoInst* ins;
1094 int left_vreg, right_vreg;
1096 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, lhs_type, lhs);
1097 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, rhs_type, rhs);
1099 MONO_INST_NEW (cfg, ins, opcode);
1100 ins->klass = klass;
1101 ins->sreg1 = left_vreg;
1102 ins->sreg2 = right_vreg;
1103 ins->type = STACK_VTYPE;
1104 ins->dreg = alloc_ireg (cfg);
1105 ins->inst_c0 = flags;
1106 MONO_ADD_INS (cfg->cbb, ins);
1107 return ins;
1110 static MonoInst*
1111 simd_intrinsic_emit_binary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1113 MonoMethodSignature *sig = mono_method_signature (cmethod);
1115 g_assert (sig->param_count == 2);
1117 return simd_intrinsic_emit_binary_op (cfg, intrinsic->opcode, intrinsic->flags, cmethod->klass, sig->params [0], sig->params [1], args [0], args [1]);
1120 static MonoInst*
1121 simd_intrinsic_emit_unary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1123 MonoInst* ins;
1124 int vreg;
1126 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1128 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1129 ins->klass = cmethod->klass;
1130 ins->sreg1 = vreg;
1131 ins->type = STACK_VTYPE;
1132 ins->dreg = alloc_ireg (cfg);
1133 MONO_ADD_INS (cfg->cbb, ins);
1134 return ins;
1137 static int
1138 mono_type_to_extract_op (MonoType *type)
1140 switch (type->type) {
1141 case MONO_TYPE_I1:
1142 return OP_EXTRACT_I1;
1143 case MONO_TYPE_U1:
1144 return OP_EXTRACT_U1;
1145 case MONO_TYPE_I2:
1146 return OP_EXTRACT_I2;
1147 case MONO_TYPE_U2:
1148 return OP_EXTRACT_U2;
1149 case MONO_TYPE_I4:
1150 case MONO_TYPE_U4:
1151 case MONO_TYPE_R4:
1152 return OP_EXTRACT_I4;
1153 default:
1154 g_assert_not_reached ();
1158 /*Returns the amount to shift the element index to get the dword it belongs to*/
1159 static int
1160 mono_type_elements_shift_bits (MonoType *type)
1162 switch (type->type) {
1163 case MONO_TYPE_I1:
1164 case MONO_TYPE_U1:
1165 return 2;
1166 case MONO_TYPE_I2:
1167 case MONO_TYPE_U2:
1168 return 1;
1169 case MONO_TYPE_I4:
1170 case MONO_TYPE_U4:
1171 case MONO_TYPE_R4:
1172 return 0;
1173 default:
1174 g_assert_not_reached ();
1178 static G_GNUC_UNUSED int
1179 mono_type_to_insert_op (MonoType *type)
1181 switch (type->type) {
1182 case MONO_TYPE_I1:
1183 case MONO_TYPE_U1:
1184 return OP_INSERT_I1;
1185 case MONO_TYPE_I2:
1186 case MONO_TYPE_U2:
1187 return OP_INSERT_I2;
1188 case MONO_TYPE_I4:
1189 case MONO_TYPE_U4:
1190 return OP_INSERT_I4;
1191 case MONO_TYPE_I8:
1192 case MONO_TYPE_U8:
1193 return OP_INSERT_I8;
1194 case MONO_TYPE_R4:
1195 return OP_INSERT_R4;
1196 case MONO_TYPE_R8:
1197 return OP_INSERT_R8;
1198 default:
1199 g_assert_not_reached ();
1203 static int
1204 mono_type_to_slow_insert_op (MonoType *type)
1206 switch (type->type) {
1207 case MONO_TYPE_I1:
1208 case MONO_TYPE_U1:
1209 return OP_INSERTX_U1_SLOW;
1210 case MONO_TYPE_I2:
1211 case MONO_TYPE_U2:
1212 return OP_INSERT_I2;
1213 case MONO_TYPE_I4:
1214 case MONO_TYPE_U4:
1215 return OP_INSERTX_I4_SLOW;
1216 case MONO_TYPE_I8:
1217 case MONO_TYPE_U8:
1218 return OP_INSERTX_I8_SLOW;
1219 case MONO_TYPE_R4:
1220 return OP_INSERTX_R4_SLOW;
1221 case MONO_TYPE_R8:
1222 return OP_INSERTX_R8_SLOW;
1223 default:
1224 g_assert_not_reached ();
1228 static MonoInst*
1229 simd_intrinsic_emit_setter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1231 MonoInst *ins;
1232 MonoMethodSignature *sig = mono_method_signature (cmethod);
1233 int size, align;
1234 gboolean indirect;
1235 int dreg;
1237 size = mono_type_size (sig->params [0], &align);
1239 if (COMPILE_LLVM (cfg)) {
1240 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1241 ins->klass = cmethod->klass;
1242 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1243 ins->sreg2 = args [1]->dreg;
1244 ins->inst_c0 = intrinsic->opcode;
1245 MONO_ADD_INS (cfg->cbb, ins);
1246 } else if (size == 2 || size == 4 || size == 8) {
1247 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1248 ins->klass = cmethod->klass;
1249 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1250 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1251 ins->sreg2 = args [1]->dreg;
1252 ins->inst_c0 = intrinsic->opcode;
1253 if (sig->params [0]->type == MONO_TYPE_R4)
1254 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1255 else if (sig->params [0]->type == MONO_TYPE_R8)
1256 ins->backend.spill_var = get_double_spill_area (cfg);
1257 MONO_ADD_INS (cfg->cbb, ins);
1258 } else {
1259 int vreg, sreg;
1261 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1262 ins->klass = cmethod->klass;
1263 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1264 ins->type = STACK_I4;
1265 ins->dreg = vreg = alloc_ireg (cfg);
1266 ins->inst_c0 = intrinsic->opcode / 2;
1267 MONO_ADD_INS (cfg->cbb, ins);
1269 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1270 ins->klass = cmethod->klass;
1271 ins->sreg1 = vreg;
1272 ins->sreg2 = args [1]->dreg;
1273 ins->dreg = sreg;
1274 ins->inst_c0 = intrinsic->opcode;
1275 MONO_ADD_INS (cfg->cbb, ins);
1278 if (indirect) {
1279 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1280 ins->klass = cmethod->klass;
1281 ins->dreg = args [0]->dreg;
1282 ins->sreg1 = dreg;
1283 MONO_ADD_INS (cfg->cbb, ins);
1285 return ins;
1289 * simd_intrinsic_emit_getter_op:
1291 * Emit IR for loading an element of a SIMD value.
1293 * @klass is the simd type, @type is the element type.
1295 static MonoInst*
1296 simd_intrinsic_emit_getter_op (MonoCompile *cfg, int index, MonoClass *klass, MonoType *type, MonoInst *arg)
1298 MonoInst *ins;
1299 int vreg, shift_bits;
1301 vreg = load_simd_vreg_class (cfg, klass, arg, NULL);
1303 if (type->type == MONO_TYPE_I8 || type->type == MONO_TYPE_U8 || type->type == MONO_TYPE_R8) {
1304 MonoInst *ins;
1305 gboolean is_r8 = type->type == MONO_TYPE_R8;
1307 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1308 ins->klass = klass;
1309 ins->sreg1 = vreg;
1310 ins->inst_c0 = index;
1311 if (is_r8) {
1312 ins->type = STACK_R8;
1313 ins->dreg = alloc_freg (cfg);
1314 ins->backend.spill_var = get_double_spill_area (cfg);
1315 } else {
1316 ins->type = STACK_I8;
1317 ins->dreg = alloc_lreg (cfg);
1319 MONO_ADD_INS (cfg->cbb, ins);
1320 return ins;
1323 shift_bits = mono_type_elements_shift_bits (type);
1325 if ((index >> shift_bits) && !cfg->compile_llvm) {
1326 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1327 ins->klass = klass;
1328 ins->sreg1 = vreg;
1329 ins->inst_c0 = index >> shift_bits;
1330 ins->type = STACK_VTYPE;
1331 ins->dreg = vreg = alloc_ireg (cfg);
1332 MONO_ADD_INS (cfg->cbb, ins);
1335 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (type));
1336 ins->klass = klass;
1337 ins->sreg1 = vreg;
1338 ins->type = STACK_I4;
1339 ins->dreg = vreg = alloc_ireg (cfg);
1340 if (cfg->compile_llvm)
1341 ins->inst_c0 = index;
1342 else
1343 ins->inst_c0 = index & ((1 << shift_bits) - 1);
1344 MONO_ADD_INS (cfg->cbb, ins);
1346 if (type->type == MONO_TYPE_R4) {
1347 MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1348 ins->klass = mono_defaults.single_class;
1349 ins->sreg1 = vreg;
1350 ins->type = cfg->r4_stack_type;
1351 ins->dreg = alloc_freg (cfg);
1352 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1353 MONO_ADD_INS (cfg->cbb, ins);
1355 return ins;
1358 static MonoInst*
1359 simd_intrinsic_emit_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1361 MonoMethodSignature *sig = mono_method_signature (cmethod);
1363 return simd_intrinsic_emit_getter_op (cfg, intrinsic->opcode, cmethod->klass, sig->ret, args [0]);
1366 static MonoInst*
1367 simd_intrinsic_emit_long_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1369 MonoInst *ins;
1370 int vreg;
1371 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1373 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1375 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1376 ins->klass = cmethod->klass;
1377 ins->sreg1 = vreg;
1378 ins->inst_c0 = intrinsic->opcode;
1379 if (is_r8) {
1380 ins->type = STACK_R8;
1381 ins->dreg = alloc_freg (cfg);
1382 ins->backend.spill_var = get_double_spill_area (cfg);
1383 } else {
1384 ins->type = STACK_I8;
1385 ins->dreg = alloc_lreg (cfg);
1387 MONO_ADD_INS (cfg->cbb, ins);
1389 return ins;
1392 static MonoInst*
1393 simd_intrinsic_emit_ctor (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1395 MonoInst *ins = NULL;
1396 int i, addr_reg;
1397 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1398 MonoMethodSignature *sig = mono_method_signature (cmethod);
1399 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1400 int arg_size = mono_type_size (sig->params [0], &i);
1401 int opcode;
1403 if (sig->param_count == 1) {
1404 int dreg;
1406 if (is_ldaddr) {
1407 dreg = args [0]->inst_i0->dreg;
1408 NULLIFY_INS (args [0]);
1409 } else {
1410 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1411 dreg = alloc_ireg (cfg);
1414 if (intrinsic)
1415 opcode = intrinsic->opcode;
1416 else
1417 opcode = mono_type_to_expand_op (sig->params [0]);
1418 MONO_INST_NEW (cfg, ins, opcode);
1419 ins->klass = cmethod->klass;
1420 ins->sreg1 = args [1]->dreg;
1421 ins->type = STACK_VTYPE;
1422 ins->dreg = dreg;
1424 MONO_ADD_INS (cfg->cbb, ins);
1425 if (sig->params [0]->type == MONO_TYPE_R4)
1426 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1427 else if (sig->params [0]->type == MONO_TYPE_R8)
1428 ins->backend.spill_var = get_double_spill_area (cfg);
1430 if (!is_ldaddr) {
1431 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1432 ins->dreg = args [0]->dreg;
1433 ins->sreg1 = dreg;
1434 MONO_ADD_INS (cfg->cbb, ins);
1436 return ins;
1439 if (is_ldaddr) {
1440 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1441 MONO_ADD_INS (cfg->cbb, ins);
1442 addr_reg = ins->dreg;
1443 } else {
1444 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1445 addr_reg = args [0]->dreg;
1448 for (i = sig->param_count - 1; i >= 0; --i) {
1449 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1452 if (sig->param_count * arg_size < 16) {
1453 /* If there are not enough arguments, fill the rest with 0s */
1454 for (i = sig->param_count; i < 16 / arg_size; ++i) {
1455 switch (arg_size) {
1456 case 4:
1457 MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg, OP_STOREI4_MEMBASE_IMM, addr_reg, i * arg_size, 0);
1458 break;
1459 default:
1460 g_assert_not_reached ();
1461 break;
1466 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1467 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1468 NULLIFY_INS (args [0]);
1470 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1471 ins->klass = cmethod->klass;
1472 ins->sreg1 = addr_reg;
1473 ins->type = STACK_VTYPE;
1474 ins->dreg = vreg;
1475 MONO_ADD_INS (cfg->cbb, ins);
1477 return ins;
1480 static MonoInst*
1481 simd_intrinsic_emit_cast (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1483 MonoInst *ins;
1484 MonoClass *klass;
1485 int vreg;
1487 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1489 if (cmethod->is_inflated)
1490 /* Vector<T> */
1491 klass = mono_class_from_mono_type (mono_method_signature (cmethod)->ret);
1492 else
1493 klass = cmethod->klass;
1495 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1496 ins->klass = klass;
1497 ins->type = STACK_VTYPE;
1498 ins->sreg1 = vreg;
1499 ins->dreg = alloc_ireg (cfg);
1500 MONO_ADD_INS (cfg->cbb, ins);
1501 return ins;
1504 static MonoInst*
1505 simd_intrinsic_emit_shift (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1507 MonoInst *ins;
1508 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1510 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1512 if (args [1]->opcode != OP_ICONST) {
1513 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1514 ins->klass = mono_defaults.int32_class;
1515 ins->sreg1 = args [1]->dreg;
1516 ins->type = STACK_I4;
1517 ins->dreg = vreg2 = alloc_ireg (cfg);
1518 MONO_ADD_INS (cfg->cbb, ins);
1520 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1523 MONO_INST_NEW (cfg, ins, opcode);
1524 ins->klass = cmethod->klass;
1525 ins->sreg1 = vreg;
1526 ins->sreg2 = vreg2;
1528 if (args [1]->opcode == OP_ICONST) {
1529 ins->inst_imm = args [1]->inst_c0;
1530 NULLIFY_INS (args [1]);
1533 ins->type = STACK_VTYPE;
1534 ins->dreg = alloc_ireg (cfg);
1535 MONO_ADD_INS (cfg->cbb, ins);
1536 return ins;
1539 static inline gboolean
1540 mono_op_is_packed_compare (int op)
1542 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
1545 static MonoInst*
1546 simd_intrinsic_emit_equality_op (MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args, int opcode, int flags)
1548 MonoInst* ins;
1549 int left_vreg, right_vreg, tmp_vreg;
1551 left_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1552 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1554 MONO_INST_NEW (cfg, ins, opcode);
1555 ins->klass = cmethod->klass;
1556 ins->sreg1 = left_vreg;
1557 ins->sreg2 = right_vreg;
1558 ins->type = STACK_VTYPE;
1559 ins->klass = cmethod->klass;
1560 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1561 ins->inst_c0 = flags;
1562 MONO_ADD_INS (cfg->cbb, ins);
1564 /*FIXME the next ops are SSE specific*/
1565 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1566 ins->klass = cmethod->klass;
1567 ins->sreg1 = tmp_vreg;
1568 ins->type = STACK_I4;
1569 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1570 MONO_ADD_INS (cfg->cbb, ins);
1572 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1573 if (mono_op_is_packed_compare (opcode) || flags == SIMD_COMP_EQ) {
1574 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1575 NEW_UNALU (cfg, ins, flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1576 } else {
1577 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1578 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1580 MONO_ADD_INS (cfg->cbb, ins);
1581 return ins;
1584 static MonoInst*
1585 simd_intrinsic_emit_equality (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1587 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, intrinsic->opcode, intrinsic->flags);
1590 static MonoInst*
1591 simd_intrinsic_emit_shuffle (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1593 MonoInst *ins;
1594 int vreg, vreg2 = -1;
1595 int param_count = mono_method_signature (cmethod)->param_count;
1597 if (args [param_count - 1]->opcode != OP_ICONST) {
1598 /*TODO Shuffle with non literals is not yet supported */
1599 return NULL;
1602 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1603 if (param_count == 3)
1604 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1606 NULLIFY_INS (args [param_count - 1]);
1609 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1610 ins->klass = cmethod->klass;
1611 ins->sreg1 = vreg;
1612 ins->sreg2 = vreg2;
1613 ins->inst_c0 = args [param_count - 1]->inst_c0;
1614 ins->type = STACK_VTYPE;
1615 ins->dreg = alloc_ireg (cfg);
1616 MONO_ADD_INS (cfg->cbb, ins);
1618 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1619 ins->opcode = OP_SHUFPS;
1620 return ins;
1623 static MonoInst*
1624 simd_intrinsic_emit_load_aligned (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1626 MonoInst *ins;
1628 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1629 ins->klass = cmethod->klass;
1630 ins->sreg1 = args [0]->dreg;
1631 ins->type = STACK_VTYPE;
1632 ins->dreg = alloc_ireg (cfg);
1633 MONO_ADD_INS (cfg->cbb, ins);
1634 return ins;
1637 static MonoInst*
1638 simd_intrinsic_emit_store (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1640 MonoInst *ins;
1641 int vreg;
1643 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1645 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1646 ins->klass = cmethod->klass;
1647 ins->dreg = args [0]->dreg;
1648 ins->sreg1 = vreg;
1649 ins->type = STACK_VTYPE;
1650 MONO_ADD_INS (cfg->cbb, ins);
1651 return ins;
1654 static MonoInst*
1655 simd_intrinsic_emit_extract_mask (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1657 MonoInst *ins;
1658 int vreg;
1660 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1662 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1663 ins->klass = cmethod->klass;
1664 ins->sreg1 = vreg;
1665 ins->type = STACK_I4;
1666 ins->dreg = alloc_ireg (cfg);
1667 MONO_ADD_INS (cfg->cbb, ins);
1669 return ins;
1672 static MonoInst*
1673 simd_intrinsic_emit_prefetch (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1675 MonoInst *ins;
1677 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1678 ins->klass = cmethod->klass;
1679 ins->sreg1 = args [0]->dreg;
1680 ins->backend.arg_info = intrinsic->flags;
1681 MONO_ADD_INS (cfg->cbb, ins);
1682 return ins;
1685 static MonoInst*
1686 simd_intrinsic_emit_const (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1688 MonoInst *ins;
1690 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1691 ins->klass = cmethod->klass;
1692 ins->type = STACK_VTYPE;
1693 ins->dreg = alloc_xreg (cfg);
1694 MONO_ADD_INS (cfg->cbb, ins);
1695 return ins;
1698 static const char *
1699 simd_version_name (guint32 version)
1701 switch (version) {
1702 case SIMD_VERSION_SSE1:
1703 return "sse1";
1704 case SIMD_VERSION_SSE2:
1705 return "sse2";
1706 case SIMD_VERSION_SSE3:
1707 return "sse3";
1708 case SIMD_VERSION_SSSE3:
1709 return "ssse3";
1710 case SIMD_VERSION_SSE41:
1711 return "sse41";
1712 case SIMD_VERSION_SSE42:
1713 return "sse42";
1714 case SIMD_VERSION_SSE4a:
1715 return "sse4a";
1717 return "n/a";
1720 static MonoInst*
1721 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsic *intrinsics, guint32 size)
1723 const SimdIntrinsic *result = (const SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
1724 if (!result) {
1725 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1726 return NULL;
1728 if (IS_DEBUG_ON (cfg)) {
1729 int i, max;
1730 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1731 max = fsig->param_count + fsig->hasthis;
1732 for (i = 0; i < max; ++i) {
1733 printf ("param %d: ", i);
1734 mono_print_ins (args [i]);
1737 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1738 if (IS_DEBUG_ON (cfg)) {
1739 int x;
1740 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1741 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1742 if (result->simd_version_flags & (1 << x))
1743 printf ("%s ", simd_version_name (1 << x));
1745 printf ("\n");
1747 return NULL;
1750 switch (result->simd_emit_mode) {
1751 case SIMD_EMIT_BINARY:
1752 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1753 case SIMD_EMIT_UNARY:
1754 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1755 case SIMD_EMIT_SETTER:
1756 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1757 case SIMD_EMIT_GETTER:
1758 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1759 case SIMD_EMIT_GETTER_QWORD:
1760 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1761 case SIMD_EMIT_CTOR:
1762 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1763 case SIMD_EMIT_CAST:
1764 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1765 case SIMD_EMIT_SHUFFLE:
1766 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1767 case SIMD_EMIT_SHIFT:
1768 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1769 case SIMD_EMIT_EQUALITY:
1770 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1771 case SIMD_EMIT_LOAD_ALIGNED:
1772 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1773 case SIMD_EMIT_STORE:
1774 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1775 case SIMD_EMIT_EXTRACT_MASK:
1776 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1777 case SIMD_EMIT_PREFETCH:
1778 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1780 g_assert_not_reached ();
1783 static int
1784 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1786 MonoInst *ins;
1787 guint32 size;
1788 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1790 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1791 mult_reg = alloc_preg (cfg);
1792 array_reg = arr->dreg;
1793 index_reg = index->dreg;
1795 #if SIZEOF_VOID_P == 8
1796 /* The array reg is 64 bits but the index reg is only 32 */
1797 index2_reg = alloc_preg (cfg);
1798 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1799 #else
1800 index2_reg = index_reg;
1801 #endif
1802 index3_reg = alloc_preg (cfg);
1804 if (check_bounds) {
1805 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1806 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1807 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1810 add_reg = alloc_preg (cfg);
1812 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1813 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1814 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1815 ins->type = STACK_PTR;
1816 MONO_ADD_INS (cfg->cbb, ins);
1818 return add_reg;
1821 static MonoInst*
1822 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1824 if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1825 MonoInst *load;
1826 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1828 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1829 load->klass = cmethod->klass;
1830 load->sreg1 = addr;
1831 load->type = STACK_VTYPE;
1832 load->dreg = alloc_ireg (cfg);
1833 MONO_ADD_INS (cfg->cbb, load);
1835 return load;
1837 if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
1838 MonoInst *store;
1839 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1840 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1842 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1843 store->klass = cmethod->klass;
1844 store->dreg = addr;
1845 store->sreg1 = vreg;
1846 MONO_ADD_INS (cfg->cbb, store);
1848 return store;
1850 if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
1851 MonoInst *ins;
1852 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1854 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1855 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1856 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1857 MONO_ADD_INS (cfg->cbb, ins);
1859 return ins;
1861 return NULL;
1864 static MonoInst*
1865 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1867 if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1868 MonoInst *ins;
1869 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1870 return ins;
1872 return NULL;
1875 static gboolean
1876 is_sys_numerics_assembly (MonoAssembly *assembly)
1878 return !strcmp ("System.Numerics", assembly->aname.name);
1881 static gboolean
1882 is_sys_numerics_vectors_assembly (MonoAssembly *assembly)
1884 return !strcmp ("System.Numerics.Vectors", assembly->aname.name);
1887 MonoInst*
1888 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1890 const char *class_name;
1892 if (is_sys_numerics_assembly (cmethod->klass->image->assembly))
1893 return emit_sys_numerics_intrinsics (cfg, cmethod, fsig, args);
1895 if (is_sys_numerics_vectors_assembly (cmethod->klass->image->assembly))
1896 return emit_sys_numerics_vectors_intrinsics (cfg, cmethod, fsig, args);
1898 if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1899 strcmp ("Mono.Simd", cmethod->klass->name_space))
1900 return NULL;
1902 class_name = cmethod->klass->name;
1903 if (!strcmp ("SimdRuntime", class_name))
1904 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1906 if (!strcmp ("ArrayExtensions", class_name))
1907 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
1909 if (!strcmp ("VectorOperations", class_name)) {
1910 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
1911 return NULL;
1912 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
1913 } else if (!cmethod->klass->simd_type)
1914 return NULL;
1916 cfg->uses_simd_intrinsics = 1;
1917 if (!strcmp ("Vector2d", class_name))
1918 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsic));
1919 if (!strcmp ("Vector4f", class_name))
1920 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsic));
1921 if (!strcmp ("Vector2ul", class_name))
1922 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsic));
1923 if (!strcmp ("Vector2l", class_name))
1924 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsic));
1925 if (!strcmp ("Vector4ui", class_name))
1926 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsic));
1927 if (!strcmp ("Vector4i", class_name))
1928 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsic));
1929 if (!strcmp ("Vector8us", class_name))
1930 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsic));
1931 if (!strcmp ("Vector8s", class_name))
1932 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsic));
1933 if (!strcmp ("Vector16b", class_name))
1934 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsic));
1935 if (!strcmp ("Vector16sb", class_name))
1936 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsic));
1938 return NULL;
1941 // The entries should be ordered by name
1942 // System.Numerics.Vector2/Vector3/Vector4
1943 static const SimdIntrinsic vector2_intrinsics[] = {
1944 { SN_ctor, OP_EXPAND_R4 },
1945 { SN_Abs },
1946 { SN_Dot, OP_DPPS },
1947 { SN_Equals, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
1948 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
1949 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
1950 { SN_SquareRoot, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
1951 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
1952 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
1953 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
1954 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
1957 static MonoInst*
1958 emit_vector_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1960 const SimdIntrinsic *intrins;
1961 MonoMethodSignature *sig = mono_method_signature (cmethod);
1964 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
1966 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector2_intrinsics, sizeof (vector2_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
1967 if (!intrins) {
1968 //printf ("%s\n", mono_method_full_name (cmethod, 1));
1969 return NULL;
1972 if (cfg->verbose_level > 1) {
1973 char *name = mono_method_full_name (cmethod, TRUE);
1974 printf (" SIMD intrinsic %s\n", name);
1975 g_free (name);
1978 switch (intrins->name) {
1979 case SN_ctor:
1980 return simd_intrinsic_emit_ctor (intrins, cfg, cmethod, args);
1981 break;
1982 case SN_Equals:
1983 return simd_intrinsic_emit_equality (intrins, cfg, cmethod, args);
1984 break;
1985 case SN_SquareRoot:
1986 return simd_intrinsic_emit_unary (intrins, cfg, cmethod, args);
1987 break;
1988 case SN_Dot:
1989 if (COMPILE_LLVM (cfg)) {
1990 MonoInst *ins;
1992 ins = simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
1993 /* The end result is in the lowest element */
1994 return simd_intrinsic_emit_getter_op (cfg, 0, cmethod->klass, mono_method_signature (cmethod)->ret, ins);
1996 break;
1997 case SN_Abs: {
1998 // abs(x) = max(x, sub(0,x))
1999 MonoInst *sub;
2000 MonoInst *zero;
2002 MONO_INST_NEW (cfg, zero, OP_XZERO);
2003 zero->dreg = alloc_xreg (cfg);
2004 zero->klass = cmethod->klass;
2005 MONO_ADD_INS (cfg->cbb, zero);
2007 sub = simd_intrinsic_emit_binary_op (cfg, OP_SUBPS, 0, cmethod->klass, sig->params [0], sig->params [0], zero, args [0]);
2008 return simd_intrinsic_emit_binary_op (cfg, OP_MAXPS, 0, cmethod->klass, sig->params [0], sig->params [0], args [0], sub);
2010 case SN_Max:
2011 case SN_Min:
2012 case SN_op_Addition:
2013 case SN_op_Division:
2014 case SN_op_Multiply:
2015 case SN_op_Subtraction:
2016 return simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2017 default:
2018 break;
2021 return NULL;
2024 static const SimdIntrinsic vector_t_intrinsics[] = {
2025 { SN_ctor },
2026 { SN_Abs },
2027 { SN_CopyTo },
2028 { SN_Equals },
2029 { SN_GreaterThan },
2030 { SN_GreaterThanOrEqual },
2031 { SN_LessThan },
2032 { SN_LessThanOrEqual },
2033 { SN_get_AllOnes, OP_XONES },
2034 { SN_get_Count },
2035 { SN_get_Item },
2036 { SN_get_Zero, OP_XZERO },
2037 { SN_op_Addition },
2038 { SN_op_Division },
2039 { SN_op_Explicit },
2040 { SN_op_Multiply },
2041 { SN_op_Subtraction }
2044 static MonoInst*
2045 emit_vector_t_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2047 const SimdIntrinsic *intrins;
2048 MonoType *etype;
2049 MonoInst *ins;
2050 int size, len, index;
2052 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector_t_intrinsics, sizeof (vector_t_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2053 if (!intrins) {
2054 //printf ("%s\n", mono_method_full_name (cmethod, 1));
2055 return NULL;
2058 etype = mono_class_get_context (cmethod->klass)->class_inst->type_argv [0];
2059 size = mono_class_value_size (mono_class_from_mono_type (etype), NULL);
2060 g_assert (size);
2061 len = 16 / size;
2063 if (!MONO_TYPE_IS_PRIMITIVE (etype))
2064 return NULL;
2066 if (cfg->verbose_level > 1) {
2067 char *name = mono_method_full_name (cmethod, TRUE);
2068 printf (" SIMD intrinsic %s\n", name);
2069 g_free (name);
2072 switch (intrins->name) {
2073 case SN_get_Count:
2074 EMIT_NEW_ICONST (cfg, ins, len);
2075 return ins;
2076 case SN_get_AllOnes:
2077 case SN_get_Zero:
2078 return simd_intrinsic_emit_const (intrins, cfg, cmethod, args);
2079 case SN_get_Item:
2080 g_assert (fsig->param_count == 1);
2081 if (args [1]->opcode != OP_ICONST)
2082 return NULL;
2083 index = args [1]->inst_c0;
2084 if (index < 0 || index >= len)
2085 return NULL;
2086 return simd_intrinsic_emit_getter_op (cfg, index, cmethod->klass, etype, args [0]);
2087 case SN_ctor:
2088 if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
2089 return simd_intrinsic_emit_ctor (NULL, cfg, cmethod, args);
2090 if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) {
2091 MonoInst *array_ins = args [1];
2092 MonoInst *index_ins;
2093 MonoInst *ldelema_ins;
2094 MonoInst *var;
2095 int end_index_reg;
2097 if (args [0]->opcode != OP_LDADDR)
2098 return NULL;
2100 /* .ctor (T[]) or .ctor (T[], index) */
2102 if (fsig->param_count == 2) {
2103 index_ins = args [2];
2104 } else {
2105 EMIT_NEW_ICONST (cfg, index_ins, 0);
2108 /* Emit index check for the end (index + len - 1 < array length) */
2109 end_index_reg = alloc_ireg (cfg);
2110 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2111 MONO_EMIT_BOUNDS_CHECK (cfg, array_ins->dreg, MonoArray, max_length, end_index_reg);
2113 /* Load the array slice into the simd reg */
2114 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2115 g_assert (args [0]->opcode == OP_LDADDR);
2116 var = args [0]->inst_p0;
2117 EMIT_NEW_LOAD_MEMBASE (cfg, ins, OP_LOADX_MEMBASE, var->dreg, ldelema_ins->dreg, 0);
2118 ins->klass = cmethod->klass;
2119 return args [0];
2121 break;
2122 case SN_op_Explicit:
2123 return simd_intrinsic_emit_cast (intrins, cfg, cmethod, args);
2124 case SN_Equals:
2125 if (fsig->param_count == 1)
2126 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, type_to_comp_op (etype), SIMD_COMP_EQ);
2127 if (fsig->param_count == 2)
2128 return simd_intrinsic_emit_binary_op (cfg, type_to_comp_op (etype), 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2129 break;
2131 case SN_GreaterThan:
2132 case SN_GreaterThanOrEqual:
2133 case SN_LessThan: {
2134 MonoInst *cmp1, *cmp2;
2135 int eq_op, gt_op;
2137 switch (etype->type) {
2138 case MONO_TYPE_I1:
2139 case MONO_TYPE_I2:
2140 case MONO_TYPE_I4:
2141 case MONO_TYPE_I8:
2142 break;
2143 default:
2144 return NULL;
2147 eq_op = type_to_comp_op (etype);
2148 gt_op = type_to_gt_op (etype);
2150 switch (intrins->name) {
2151 case SN_GreaterThan:
2152 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2153 case SN_LessThanOrEqual:
2154 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2155 case SN_GreaterThanOrEqual:
2156 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2157 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2158 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2159 case SN_LessThan:
2160 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2161 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2162 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2163 default:
2164 g_assert_not_reached ();
2165 break;
2168 case SN_Abs:
2169 /* Vector<T>.Abs */
2170 switch (etype->type) {
2171 case MONO_TYPE_U1:
2172 case MONO_TYPE_U2:
2173 case MONO_TYPE_U4:
2174 case MONO_TYPE_U8: {
2175 MonoInst *ins;
2177 /* No-op */
2178 MONO_INST_NEW (cfg, ins, OP_XMOVE);
2179 ins->klass = cmethod->klass;
2180 ins->type = STACK_VTYPE;
2181 ins->sreg1 = args [0]->dreg;
2182 ins->dreg = alloc_xreg (cfg);
2183 MONO_ADD_INS (cfg->cbb, ins);
2184 return ins;
2186 default:
2187 break;
2189 break;
2190 case SN_op_Addition: {
2191 int op = type_to_padd_op (etype);
2192 if (op != -1)
2193 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
2194 break;
2196 case SN_op_Subtraction: {
2197 int op = type_to_psub_op (etype);
2198 if (op != -1)
2199 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
2200 break;
2202 case SN_op_Multiply: {
2203 int op = type_to_pmul_op (etype);
2204 if (op != -1)
2205 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
2206 break;
2208 case SN_op_Division: {
2209 int op = type_to_pdiv_op (etype);
2210 if (op != -1)
2211 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
2212 break;
2214 case SN_CopyTo: {
2215 MonoInst *array_ins = args [1];
2216 MonoInst *index_ins = args [2];
2217 MonoInst *ldelema_ins;
2218 MonoInst *var;
2219 int end_index_reg;
2221 if (args [0]->opcode != OP_LDADDR)
2222 return NULL;
2224 /* Emit index check for the end (index + len - 1 < array length) */
2225 end_index_reg = alloc_ireg (cfg);
2226 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2228 int length_reg = alloc_ireg (cfg);
2229 MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg, OP_LOADI4_MEMBASE, length_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length));
2230 MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, length_reg, end_index_reg);
2231 MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "ArgumentException");
2233 /* Load the simd reg into the array slice */
2234 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2235 g_assert (args [0]->opcode == OP_LDADDR);
2236 var = args [0]->inst_p0;
2237 EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, var->dreg);
2238 ins->klass = cmethod->klass;
2239 return args [0];
2240 break;
2242 default:
2243 break;
2246 return NULL;
2250 * emit_sys_numerics_intrinsics:
2252 * Emit intrinsics for the System.Numerics assembly.
2254 static MonoInst*
2255 emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2257 const char *nspace = cmethod->klass->name_space;
2258 const char *class_name = cmethod->klass->name;
2260 if (cfg->r4fp)
2261 // FIXME:
2262 return NULL;
2264 if (!strcmp ("Vector2", class_name) || !strcmp ("Vector4", class_name) || !strcmp ("Vector3", class_name))
2265 return emit_vector_intrinsics (cfg, cmethod, fsig, args);
2267 if (!strcmp ("Vector`1", class_name))
2268 return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
2270 if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2271 if (!strcmp (cmethod->name, "get_IsHardwareAccelerated")) {
2272 MonoInst *ins;
2274 if (simd_supported_versions)
2275 EMIT_NEW_ICONST (cfg, ins, 1);
2276 else
2277 EMIT_NEW_ICONST (cfg, ins, 0);
2278 ins->type = STACK_I4;
2279 return ins;
2283 return NULL;
2286 static MonoInst*
2287 emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2289 const char *class_name = cmethod->klass->name;
2291 if (cfg->r4fp)
2292 // FIXME:
2293 return NULL;
2295 if (!strcmp (class_name, "Vector`1"))
2296 return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
2297 return NULL;
2300 MonoInst*
2301 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2303 if (cfg->r4fp)
2304 // FIXME:
2305 return NULL;
2307 if (is_sys_numerics_assembly (field->parent->image->assembly)) {
2308 int index = -1;
2310 if (!strcmp (field->parent->name, "Vector2") ||
2311 !strcmp (field->parent->name, "Vector3") ||
2312 !strcmp (field->parent->name, "Vector4")) {
2313 if (!strcmp (field->name, "X"))
2314 index = 0;
2315 else if (!strcmp (field->name, "Y"))
2316 index = 1;
2317 else if (!strcmp (field->name, "Z"))
2318 index = 2;
2319 else if (!strcmp (field->name, "W"))
2320 index = 3;
2323 if (index != -1) {
2324 if (cfg->verbose_level > 1)
2325 printf (" SIMD intrinsic field access: %s\n", field->name);
2327 return simd_intrinsic_emit_getter_op (cfg, index, field->parent, mono_field_get_type (field), addr);
2330 return NULL;
2333 #endif /* DISABLE_JIT */
2335 #else
2337 MonoInst*
2338 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2340 return NULL;
2343 #endif /* MONO_ARCH_SIMD_INTRINSICS */