1 /*
2 * simd-intrinsics.c: simd support for intrinsics
4 * Author:
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
8 */
10 #include <config.h>
11 #include <stdio.h>
13 #include "mini.h"
14 #include "ir-emit.h"
17 General notes on SIMD intrinsics
19 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
23 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing an XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing an xmm value as an argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16 bytes) valuetypes
33 TODO passing simd args byval to a non-intrinsic method causes some useless local var loads/stores to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
38 TODO extend the bounds checking code to support range checking.
40 General notes for SIMD intrinsics.
42 -Bad extractor and constructor performance
43 Extracting a float from an XMM register is a complete disaster if you are passing it as an argument.
44 It will be loaded onto the FP stack just to be pushed onto the call stack.
46 A similar thing happens with the Vector4f constructor, which requires its float arguments to take the same trip through the FP stack.
48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.
51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
52 for simd and fp.
55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and enable further optimizations.
58 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
59 without an OP_LDADDR.
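/* A concrete example of the extractor cost described above: reading a float
   component such as Vector4f.get_X goes through simd_intrinsic_emit_getter
   further down, which emits an OP_EXTRACT_I4 followed by an OP_ICONV_TO_R8_RAW
   that bounces the value through the int-to-float spill slot -- i.e. through
   the FP stack -- even when the result is only going to be pushed as a call
   argument. */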
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
64 //#define IS_DEBUG_ON(cfg) (0)
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
68 enum {
69 SIMD_EMIT_BINARY,
70 SIMD_EMIT_UNARY,
71 SIMD_EMIT_SETTER,
72 SIMD_EMIT_GETTER,
73 SIMD_EMIT_GETTER_QWORD,
74 SIMD_EMIT_CTOR,
75 SIMD_EMIT_CAST,
76 SIMD_EMIT_SHUFFLE,
77 SIMD_EMIT_SHIFT,
78 SIMD_EMIT_EQUALITY,
79 SIMD_EMIT_LOAD_ALIGNED,
80 SIMD_EMIT_STORE,
81 SIMD_EMIT_EXTRACT_MASK,
82 SIMD_EMIT_PREFETCH
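/* Method name handling: when HAVE_ARRAY_ELEM_INIT is available, every name
   from simd-methods.h is packed into a single struct of char arrays so that an
   intrinsic can refer to its name with a compact 16-bit offset (see
   method_name ()); otherwise a plain array of string pointers indexed by an
   enum is used. */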
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
91 #undef SIMD_METHOD
92 } method_names = {
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
95 #undef SIMD_METHOD
98 enum {
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
102 #define method_name(idx) ((const char*)&method_names + (idx))
104 #else
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
108 NULL
110 #undef SIMD_METHOD
111 #define SIMD_METHOD(str,name) name,
112 enum {
113 #include "simd-methods.h"
114 SN_LAST
117 #define method_name(idx) (method_names [(idx)])
119 #endif
121 typedef struct {
122 guint16 name;
123 guint16 opcode;
124 guint8 simd_version_flags;
125 guint8 simd_emit_mode : 4;
126 guint8 flags : 4;
127 } SimdIntrinsc;
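/* name is the method-name id from simd-methods.h; opcode is either the mini
   opcode to emit or, for getters/setters, the element index; simd_version_flags
   gates the entry on CPU support; simd_emit_mode selects the emit helper below;
   flags carries extra data such as the SIMD_COMP_* condition or the prefetch
   mode. The tables that follow are kept sorted by method name, which the
   strcmp-based simd_intrinsic_compare_by_name relies on (presumably for a
   binary search in the lookup code). */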
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
132 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
133 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
151 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
152 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
153 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
159 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
160 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
161 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
162 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
163 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
164 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
166 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
167 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
168 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
169 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
170 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
171 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
172 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
173 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
174 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
175 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
176 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
177 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
180 static const SimdIntrinsc vector2d_intrinsics[] = {
181 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
182 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
183 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
185 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
186 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
187 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
188 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
189 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
190 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
191 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
192 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
193 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
194 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
195 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
196 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
197 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
198 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
199 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
200 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
201 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
202 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
203 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
204 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
205 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
206 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
207 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
208 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
209 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
210 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
211 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
212 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
213 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
214 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
215 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
217 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
220 static const SimdIntrinsc vector2ul_intrinsics[] = {
221 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
222 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
223 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
224 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
225 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
226 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
227 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
228 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
229 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
232 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
233 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
234 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
235 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
236 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
237 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
238 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
239 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
240 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
241 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
242 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
243 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
246 static const SimdIntrinsc vector2l_intrinsics[] = {
247 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
248 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
249 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
250 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
251 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
252 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
253 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
254 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
255 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
256 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
257 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
258 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
259 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
260 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
261 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
263 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
264 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
265 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
266 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
267 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
268 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
269 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
270 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
273 static const SimdIntrinsc vector4ui_intrinsics[] = {
274 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
275 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
276 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
277 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
278 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
279 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
280 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
281 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
282 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
283 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
284 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
285 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
287 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
288 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
289 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
290 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
291 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
292 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
293 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
294 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
296 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
297 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
298 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
299 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
300 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
301 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
302 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
303 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
304 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
305 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
306 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
307 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
308 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
311 static const SimdIntrinsc vector4i_intrinsics[] = {
312 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
313 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
314 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
315 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
316 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
317 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
318 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
319 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
320 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
321 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
322 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
323 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
324 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
325 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
326 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
327 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
328 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
329 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
330 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
331 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
332 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
333 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
334 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
335 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
336 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
337 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
338 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
339 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
340 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
341 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
342 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
343 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
344 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
345 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
346 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
347 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
350 static const SimdIntrinsc vector8us_intrinsics[] = {
351 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
352 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
353 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
354 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
355 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
356 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
357 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
358 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
359 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
360 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
361 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
362 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
363 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
364 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
365 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
366 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
367 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
368 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
369 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
370 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
371 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
372 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
373 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
374 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
375 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
376 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
377 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
378 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
379 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
380 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
381 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
384 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
385 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
386 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
387 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
388 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
389 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
390 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
391 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
392 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
393 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
394 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
395 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
396 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
397 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
398 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
401 static const SimdIntrinsc vector8s_intrinsics[] = {
402 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
403 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
404 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
406 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
407 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
408 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
409 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
410 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
411 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
412 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
414 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
415 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
416 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
417 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
418 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
419 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
420 { SN_SubtractWithSaturation, OP_PSUBW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
421 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
424 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
425 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
426 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
427 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
428 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
429 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
430 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
431 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
435 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
437 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
438 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
439 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
440 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
441 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
442 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
443 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
444 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
445 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
446 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
447 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
448 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
449 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
452 static const SimdIntrinsc vector16b_intrinsics[] = {
453 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
454 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
455 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
457 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
458 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
459 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
460 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
461 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
462 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
463 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
464 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
465 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
466 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
467 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
468 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
469 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
470 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
471 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
472 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
473 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
474 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
475 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
476 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
477 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
478 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
479 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
480 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
481 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
482 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
483 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
484 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
485 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
486 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
487 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
488 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
489 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
490 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
492 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
493 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
494 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
495 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
496 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
497 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
498 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
499 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
500 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
501 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
502 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
503 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
504 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
505 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
506 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
507 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
508 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
509 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
513 Missing:
514 setters
516 static const SimdIntrinsc vector16sb_intrinsics[] = {
517 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
518 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
519 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
520 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
521 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
522 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
523 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
524 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
525 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
526 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
527 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
528 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
529 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
530 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
531 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
533 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
534 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
535 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
536 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
537 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
538 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
539 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
540 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
541 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
542 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
543 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
544 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
545 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
546 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
550 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
551 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
552 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
553 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
554 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
555 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
556 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
557 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
558 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
559 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
560 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
561 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
562 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
563 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
564 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
565 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
566 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
567 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
568 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
569 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
570 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
575 static guint32 simd_supported_versions;
577 /*TODO match using number of parameters as well*/
578 static int
579 simd_intrinsic_compare_by_name (const void *key, const void *value)
581 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
584 typedef enum {
585 VREG_USED = 0x01,
586 VREG_HAS_XZERO_BB0 = 0x02,
587 VREG_HAS_OTHER_OP_BB0 = 0x04,
588 VREG_SINGLE_BB_USE = 0x08,
589 VREG_MANY_BB_USE = 0x10,
590 } KillFlags;
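/* Per-vreg state for mono_simd_simplify_indirection: VREG_HAS_XZERO_BB0 means
   the var's only definition in the first bb is an OP_XZERO,
   VREG_HAS_OTHER_OP_BB0 means some other bb0 instruction touches it, and
   VREG_SINGLE_BB_USE/VREG_MANY_BB_USE track whether all later uses sit in a
   single basic block -- the case where the initial OP_XZERO can be sunk. */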
592 void
593 mono_simd_intrinsics_init (void)
595 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
596 /*TODO log the supported flags*/
599 static inline gboolean
600 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
602 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
603 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
604 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
605 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
606 return TRUE;
608 return FALSE;
611 static inline gboolean
612 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
614 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
615 return FALSE;
617 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
618 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
619 vreg_flags [reg] |= VREG_MANY_BB_USE;
620 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
621 return TRUE;
622 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
623 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
624 target_bb [reg] = bb;
625 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
626 return TRUE;
628 return FALSE;
632 This pass recalculates which vars need MONO_INST_INDIRECT.
634 We cannot do this for non-SIMD vars since code like mono_get_vtable_var
635 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
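/* In outline the pass: (1) clears MONO_INST_INDIRECT on every SIMD var and
   sets it back only where the address is really taken through OP_LDADDR;
   (2) finds vars whose sole bb0 definition is an OP_XZERO; (3) if all
   remaining uses of such a var sit in one later bb, re-emits the OP_XZERO
   right before the first use that reads the zeroed value (or drops it when
   the first use overwrites the var) and nullifies the original OP_XZERO in
   bb0. */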
637 void
638 mono_simd_simplify_indirection (MonoCompile *cfg)
640 int i, max_vreg = 0;
641 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
642 MonoInst *ins;
643 char *vreg_flags;
645 for (i = 0; i < cfg->num_varinfo; i++) {
646 MonoInst *var = cfg->varinfo [i];
647 if (var->klass->simd_type) {
648 var->flags &= ~MONO_INST_INDIRECT;
649 max_vreg = MAX (var->dreg, max_vreg);
653 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
654 if (!first_bb && bb->code)
655 first_bb = bb;
656 for (ins = bb->code; ins; ins = ins->next) {
657 if (ins->opcode == OP_LDADDR) {
658 MonoInst *var = (MonoInst*)ins->inst_p0;
659 if (var->klass->simd_type) {
660 var->flags |= MONO_INST_INDIRECT;
666 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
667 vreg_flags = g_malloc0 (max_vreg + 1);
668 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
670 for (i = 0; i < cfg->num_varinfo; i++) {
671 MonoInst *var = cfg->varinfo [i];
672 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
673 vreg_flags [var->dreg] = VREG_USED;
674 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
678 /*Scan the first basic block looking for xzeros whose dreg is not otherwise used in bb0*/
679 for (ins = first_bb->code; ins; ins = ins->next) {
680 int num_sregs;
681 int sregs [MONO_MAX_SRC_REGS];
683 if (ins->opcode == OP_XZERO) {
684 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
685 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
686 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
688 continue;
690 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
691 continue;
692 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
693 continue;
694 num_sregs = mono_inst_get_src_registers (ins, sregs);
695 for (i = 0; i < num_sregs; ++i) {
696 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
697 break;
701 if (IS_DEBUG_ON (cfg)) {
702 for (i = 0; i < cfg->num_varinfo; i++) {
703 MonoInst *var = cfg->varinfo [i];
704 if (var->klass->simd_type) {
705 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
706 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
707 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
708 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
713 /*TODO stop here if no var is xzero only*/
716 Scan all other bbs and check whether each var has only one other use
717 Ideally this would be done after an extended bb formation pass
719 FIXME This pass could use dominator information to properly
720 place the XZERO on the bb that dominates all uses of the var,
721 but this will have zero effect with the current local reg alloc
723 TODO simplify the use of flags.
726 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
727 for (ins = bb->code; ins; ins = ins->next) {
728 int num_sregs;
729 int sregs [MONO_MAX_SRC_REGS];
731 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
732 continue;
733 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
734 continue;
735 num_sregs = mono_inst_get_src_registers (ins, sregs);
736 for (i = 0; i < num_sregs; ++i) {
737 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
738 max_vreg, vreg_flags, target_bb))
739 continue;
744 for (i = 0; i < cfg->num_varinfo; i++) {
745 MonoInst *var = cfg->varinfo [i];
746 if (!var->klass->simd_type)
747 continue;
748 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
749 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
750 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
751 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
753 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
754 continue;
755 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
756 int num_sregs, j;
757 int sregs [MONO_MAX_SRC_REGS];
758 gboolean found = FALSE;
760 num_sregs = mono_inst_get_src_registers (ins, sregs);
761 for (j = 0; j < num_sregs; ++j) {
762 if (sregs [j] == var->dreg)
763 found = TRUE;
765 /*We can avoid inserting the XZERO if the first use doesn't depend on the zeroed value.*/
766 if (ins->dreg == var->dreg && !found) {
767 break;
768 } else if (found) {
769 MonoInst *tmp;
770 MONO_INST_NEW (cfg, tmp, OP_XZERO);
771 tmp->dreg = var->dreg;
772 tmp->type = STACK_VTYPE;
773 tmp->klass = var->klass;
774 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
775 break;
780 for (ins = first_bb->code; ins; ins = ins->next) {
781 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
782 NULLIFY_INS (ins);
785 g_free (vreg_flags);
786 g_free (target_bb);
790 * This function expects src to be a value.
792 static int
793 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
795 if (src->opcode == OP_XMOVE) {
796 return src->sreg1;
797 } else if (src->type == STACK_VTYPE) {
798 return src->dreg;
800 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
801 mono_print_ins (src);
802 g_assert_not_reached ();
806 * This function will load the value if needed.
808 static int
809 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
811 if (indirect)
812 *indirect = FALSE;
813 if (src->opcode == OP_XMOVE) {
814 return src->sreg1;
815 } else if (src->opcode == OP_LDADDR) {
816 int res = ((MonoInst*)src->inst_p0)->dreg;
817 NULLIFY_INS (src);
818 return res;
819 } else if (src->type == STACK_VTYPE) {
820 return src->dreg;
821 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
822 MonoInst *ins;
823 if (indirect)
824 *indirect = TRUE;
826 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
827 ins->klass = cmethod->klass;
828 ins->sreg1 = src->dreg;
829 ins->type = STACK_VTYPE;
830 ins->dreg = alloc_ireg (cfg);
831 MONO_ADD_INS (cfg->cbb, ins);
832 return ins->dreg;
834 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
835 mono_print_ins (src);
836 g_assert_not_reached ();
839 static MonoInst*
840 get_int_to_float_spill_area (MonoCompile *cfg)
842 if (!cfg->iconv_raw_var) {
843 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
844 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
846 return cfg->iconv_raw_var;
849 /*We share the var with fconv_to_r8_x to save some stack space.*/
850 static MonoInst*
851 get_double_spill_area (MonoCompile *cfg)
853 if (!cfg->fconv_to_r8_x_var) {
854 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
855 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
857 return cfg->fconv_to_r8_x_var;
859 static MonoInst*
860 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
862 if (!cfg->simd_ctor_var) {
863 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
864 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
866 return cfg->simd_ctor_var;
869 static int
870 mono_type_to_expand_op (MonoType *type)
872 switch (type->type) {
873 case MONO_TYPE_I1:
874 case MONO_TYPE_U1:
875 return OP_EXPAND_I1;
876 case MONO_TYPE_I2:
877 case MONO_TYPE_U2:
878 return OP_EXPAND_I2;
879 case MONO_TYPE_I4:
880 case MONO_TYPE_U4:
881 return OP_EXPAND_I4;
882 case MONO_TYPE_I8:
883 case MONO_TYPE_U8:
884 return OP_EXPAND_I8;
885 case MONO_TYPE_R4:
886 return OP_EXPAND_R4;
887 case MONO_TYPE_R8:
888 return OP_EXPAND_R8;
890 g_assert_not_reached ();
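/* The helper below returns the simd vreg for argument `position' of a
   two-parameter intrinsic; when that parameter is a scalar (the
   op_Addition (Vector4f, float) case from the TODO list above) the value is
   first broadcast into an xmm register with the matching OP_EXPAND_* opcode,
   going through a spill slot for R4/R8 operands. */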
893 static int
894 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
896 MonoInst *ins;
897 MonoMethodSignature *sig = mono_method_signature (cmethod);
898 int expand_op;
900 g_assert (sig->param_count == 2);
901 g_assert (position == 0 || position == 1);
903 if (mono_class_from_mono_type (sig->params [position])->simd_type)
904 return get_simd_vreg (cfg, cmethod, src);
906 expand_op = mono_type_to_expand_op (sig->params [position]);
907 MONO_INST_NEW (cfg, ins, expand_op);
908 ins->klass = cmethod->klass;
909 ins->sreg1 = src->dreg;
910 ins->type = STACK_VTYPE;
911 ins->dreg = alloc_ireg (cfg);
912 MONO_ADD_INS (cfg->cbb, ins);
914 if (expand_op == OP_EXPAND_R4)
915 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
916 else if (expand_op == OP_EXPAND_R8)
917 ins->backend.spill_var = get_double_spill_area (cfg);
919 return ins->dreg;
922 static MonoInst*
923 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
925 MonoInst* ins;
926 int left_vreg, right_vreg;
928 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
929 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
932 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
933 ins->klass = cmethod->klass;
934 ins->sreg1 = left_vreg;
935 ins->sreg2 = right_vreg;
936 ins->type = STACK_VTYPE;
937 ins->dreg = alloc_ireg (cfg);
938 ins->inst_c0 = intrinsic->flags;
939 MONO_ADD_INS (cfg->cbb, ins);
940 return ins;
943 static MonoInst*
944 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
946 MonoInst* ins;
947 int vreg;
949 vreg = get_simd_vreg (cfg, cmethod, args [0]);
951 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
952 ins->klass = cmethod->klass;
953 ins->sreg1 = vreg;
954 ins->type = STACK_VTYPE;
955 ins->dreg = alloc_ireg (cfg);
956 MONO_ADD_INS (cfg->cbb, ins);
957 return ins;
960 static int
961 mono_type_to_extract_op (MonoType *type)
963 switch (type->type) {
964 case MONO_TYPE_I1:
965 return OP_EXTRACT_I1;
966 case MONO_TYPE_U1:
967 return OP_EXTRACT_U1;
968 case MONO_TYPE_I2:
969 return OP_EXTRACT_I2;
970 case MONO_TYPE_U2:
971 return OP_EXTRACT_U2;
972 case MONO_TYPE_I4:
973 case MONO_TYPE_U4:
974 case MONO_TYPE_R4:
975 return OP_EXTRACT_I4;
977 g_assert_not_reached ();
980 /*Returns the number of bits to shift an element index right to get the dword it belongs to.*/
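/* E.g. byte elements pack 4 per dword, so a byte index is shifted right by 2;
   word indexes by 1; dword/float indexes by 0. */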
981 static int
982 mono_type_elements_shift_bits (MonoType *type)
984 switch (type->type) {
985 case MONO_TYPE_I1:
986 case MONO_TYPE_U1:
987 return 2;
988 case MONO_TYPE_I2:
989 case MONO_TYPE_U2:
990 return 1;
991 case MONO_TYPE_I4:
992 case MONO_TYPE_U4:
993 case MONO_TYPE_R4:
994 return 0;
996 g_assert_not_reached ();
999 static int
1000 mono_type_to_slow_insert_op (MonoType *type)
1002 switch (type->type) {
1003 case MONO_TYPE_I1:
1004 case MONO_TYPE_U1:
1005 return OP_INSERTX_U1_SLOW;
1006 case MONO_TYPE_I2:
1007 case MONO_TYPE_U2:
1008 return OP_INSERT_I2;
1009 case MONO_TYPE_I4:
1010 case MONO_TYPE_U4:
1011 return OP_INSERTX_I4_SLOW;
1012 case MONO_TYPE_I8:
1013 case MONO_TYPE_U8:
1014 return OP_INSERTX_I8_SLOW;
1015 case MONO_TYPE_R4:
1016 return OP_INSERTX_R4_SLOW;
1017 case MONO_TYPE_R8:
1018 return OP_INSERTX_R8_SLOW;
1020 g_assert_not_reached ();
1023 static MonoInst*
1024 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1026 MonoInst *ins;
1027 MonoMethodSignature *sig = mono_method_signature (cmethod);
1028 int size, align;
1029 gboolean indirect;
1030 int dreg;
1032 size = mono_type_size (sig->params [0], &align);
1034 if (size == 2 || size == 4 || size == 8) {
1035 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1036 ins->klass = cmethod->klass;
1037 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1038 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1039 ins->sreg2 = args [1]->dreg;
1040 ins->inst_c0 = intrinsic->opcode;
1041 if (sig->params [0]->type == MONO_TYPE_R4)
1042 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1043 else if (sig->params [0]->type == MONO_TYPE_R8)
1044 ins->backend.spill_var = get_double_spill_area (cfg);
1045 MONO_ADD_INS (cfg->cbb, ins);
1046 } else {
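/* Byte-sized element: presumably because a direct byte insert needs SSE4.1,
   the code instead extracts the 16-bit word containing the target byte
   (OP_EXTRACTX_U2 at word index opcode / 2) and then lets OP_INSERTX_U1_SLOW
   merge the new byte into it and write the result back into the vector. */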
1047 int vreg, sreg;
1049 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1050 ins->klass = cmethod->klass;
1051 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1052 ins->type = STACK_I4;
1053 ins->dreg = vreg = alloc_ireg (cfg);
1054 ins->inst_c0 = intrinsic->opcode / 2;
1055 MONO_ADD_INS (cfg->cbb, ins);
1057 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1058 ins->klass = cmethod->klass;
1059 ins->sreg1 = vreg;
1060 ins->sreg2 = args [1]->dreg;
1061 ins->dreg = sreg;
1062 ins->inst_c0 = intrinsic->opcode;
1063 MONO_ADD_INS (cfg->cbb, ins);
1066 if (indirect) {
1067 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1068 ins->klass = cmethod->klass;
1069 ins->dreg = args [0]->dreg;
1070 ins->sreg1 = dreg;
1071 MONO_ADD_INS (cfg->cbb, ins);
1073 return ins;
1076 static MonoInst*
1077 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1079 MonoInst *ins;
1080 MonoMethodSignature *sig = mono_method_signature (cmethod);
1081 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1083 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
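/* For getters the table `opcode' is the element index. When the element is
   not in the first dword (and we are not compiling with LLVM), an OP_PSHUFLED
   with control = index >> shift_bits first moves the containing dword into
   lane 0; the extract below then only needs the within-dword part of the
   index (opcode & ((1 << shift_bits) - 1)). */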
1085 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1086 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1087 ins->klass = cmethod->klass;
1088 ins->sreg1 = vreg;
1089 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1090 ins->type = STACK_VTYPE;
1091 ins->dreg = vreg = alloc_ireg (cfg);
1092 MONO_ADD_INS (cfg->cbb, ins);
1095 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1096 ins->klass = cmethod->klass;
1097 ins->sreg1 = vreg;
1098 ins->type = STACK_I4;
1099 ins->dreg = vreg = alloc_ireg (cfg);
1100 if (cfg->compile_llvm)
1101 ins->inst_c0 = intrinsic->opcode;
1102 else
1103 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1104 MONO_ADD_INS (cfg->cbb, ins);
1106 if (sig->ret->type == MONO_TYPE_R4) {
1107 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1108 ins->klass = mono_defaults.single_class;
1109 ins->sreg1 = vreg;
1110 ins->type = STACK_R8;
1111 ins->dreg = alloc_freg (cfg);
1112 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1113 MONO_ADD_INS (cfg->cbb, ins);
1115 return ins;
1118 static MonoInst*
1119 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1121 MonoInst *ins;
1122 int vreg;
1123 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1125 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1127 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1128 ins->klass = cmethod->klass;
1129 ins->sreg1 = vreg;
1130 ins->inst_c0 = intrinsic->opcode;
1131 if (is_r8) {
1132 ins->type = STACK_R8;
1133 ins->dreg = alloc_freg (cfg);
1134 ins->backend.spill_var = get_double_spill_area (cfg);
1135 } else {
1136 ins->type = STACK_I8;
1137 ins->dreg = alloc_lreg (cfg);
1139 MONO_ADD_INS (cfg->cbb, ins);
1141 return ins;
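/* Ctor strategy: a single-argument .ctor expands the scalar across the whole
   register with the OP_EXPAND_* opcode from the table; a multi-argument .ctor
   simply stores the scalars one by one to memory -- either the destination
   itself or, when the target was an OP_LDADDR'ed local, a dedicated spill
   area that is then reloaded into the local with OP_LOADX_MEMBASE. This is
   the sequence the "looks very fragile" TODO at the top of the file refers
   to. */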
1144 static MonoInst*
1145 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1147 MonoInst *ins = NULL;
1148 int i, addr_reg;
1149 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1150 MonoMethodSignature *sig = mono_method_signature (cmethod);
1151 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1152 int arg_size = mono_type_size (sig->params [0], &i);
1154 if (sig->param_count == 1) {
1155 int dreg;
1157 if (is_ldaddr) {
1158 dreg = args [0]->inst_i0->dreg;
1159 NULLIFY_INS (args [0]);
1160 } else {
1161 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1162 dreg = alloc_ireg (cfg);
1165 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1166 ins->klass = cmethod->klass;
1167 ins->sreg1 = args [1]->dreg;
1168 ins->type = STACK_VTYPE;
1169 ins->dreg = dreg;
1171 MONO_ADD_INS (cfg->cbb, ins);
1172 if (sig->params [0]->type == MONO_TYPE_R4)
1173 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1174 else if (sig->params [0]->type == MONO_TYPE_R8)
1175 ins->backend.spill_var = get_double_spill_area (cfg);
1177 if (!is_ldaddr) {
1178 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1179 ins->dreg = args [0]->dreg;
1180 ins->sreg1 = dreg;
1181 MONO_ADD_INS (cfg->cbb, ins);
1183 return ins;
1186 if (is_ldaddr) {
1187 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1188 MONO_ADD_INS (cfg->cbb, ins);
1189 addr_reg = ins->dreg;
1190 } else {
1191 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1192 addr_reg = args [0]->dreg;
1195 for (i = sig->param_count - 1; i >= 0; --i) {
1196 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1199 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1200 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1201 NULLIFY_INS (args [0]);
1203 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1204 ins->klass = cmethod->klass;
1205 ins->sreg1 = addr_reg;
1206 ins->type = STACK_VTYPE;
1207 ins->dreg = vreg;
1208 MONO_ADD_INS (cfg->cbb, ins);
1210 return ins;
1213 static MonoInst*
1214 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1216 MonoInst *ins;
1217 int vreg;
1219 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1221 //TODO macroize this
1222 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1223 ins->klass = cmethod->klass;
1224 ins->type = STACK_VTYPE;
1225 ins->sreg1 = vreg;
1226 ins->dreg = alloc_ireg (cfg);
1227 MONO_ADD_INS (cfg->cbb, ins);
1228 return ins;
1231 static MonoInst*
1232 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1234 MonoInst *ins;
1235 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1237 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1239 if (args [1]->opcode != OP_ICONST) {
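/* A non-constant shift count can't be encoded as an immediate, and the SSE
   shift-by-register forms take the count from an xmm register, so the int32
   count is first moved into one with OP_ICONV_TO_X; the ++opcode below picks
   the register variant of the shift. */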
1240 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1241 ins->klass = mono_defaults.int32_class;
1242 ins->sreg1 = args [1]->dreg;
1243 ins->type = STACK_I4;
1244 ins->dreg = vreg2 = alloc_ireg (cfg);
1245 MONO_ADD_INS (cfg->cbb, ins);
1247 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1250 MONO_INST_NEW (cfg, ins, opcode);
1251 ins->klass = cmethod->klass;
1252 ins->sreg1 = vreg;
1253 ins->sreg2 = vreg2;
1255 if (args [1]->opcode == OP_ICONST) {
1256 ins->inst_imm = args [1]->inst_c0;
1257 NULLIFY_INS (args [1]);
1260 ins->type = STACK_VTYPE;
1261 ins->dreg = alloc_ireg (cfg);
1262 MONO_ADD_INS (cfg->cbb, ins);
1263 return ins;
1266 static inline gboolean
1267 mono_op_is_packed_compare (int op)
1269 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
static MonoInst*
simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int left_vreg, right_vreg, tmp_vreg;

	left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
	right_vreg = get_simd_vreg (cfg, cmethod, args [1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = left_vreg;
	ins->sreg2 = right_vreg;
	ins->type = STACK_VTYPE;
	ins->dreg = tmp_vreg = alloc_ireg (cfg);
	ins->inst_c0 = intrinsic->flags;
	MONO_ADD_INS (cfg->cbb, ins);

	/* FIXME: the next ops are SSE specific. */
	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
	ins->klass = cmethod->klass;
	ins->sreg1 = tmp_vreg;
	ins->type = STACK_I4;
	ins->dreg = tmp_vreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	/* FP compares also come in a not-equal form, which means we must test their results with OR semantics. */
	if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
		NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
	} else {
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
		NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
	}
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

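/*
 * simd_intrinsic_emit_shuffle:
 *
 *   Emits the shuffle opcode with the selector from args [1] as the immediate
 * control (inst_c0). Only compile time constant selectors can be encoded; for
 * anything else we return NULL so the call is compiled as a regular call.
 */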
static MonoInst*
simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	/* TODO: Exposing shuffle is not a good idea as it's non-obvious. We should come up with better abstractions. */

	if (args [1]->opcode != OP_ICONST) {
		/* TODO: Shuffling with non-literal selectors is not yet supported. */
		return NULL;
	}
	vreg = get_simd_vreg (cfg, cmethod, args [0]);
	NULLIFY_INS (args [1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->inst_c0 = args [1]->inst_c0;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

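/*
 * simd_intrinsic_emit_load_aligned:
 *
 *   Emits OP_LOADX_ALIGNED_MEMBASE from the address in args [0]. The aligned
 * load assumes a 16 byte aligned source; it is up to the caller (the managed
 * code using the intrinsic) to guarantee that.
 */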
static MonoInst*
simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
	ins->klass = cmethod->klass;
	ins->sreg1 = args [0]->dreg;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

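/*
 * simd_intrinsic_emit_store:
 *
 *   Emits the store opcode carried by the intrinsic table entry, writing the
 * vector in args [1] to the address in args [0].
 */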
static MonoInst*
simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->dreg = args [0]->dreg;
	ins->sreg1 = vreg;
	ins->type = STACK_VTYPE;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

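/*
 * simd_intrinsic_emit_extract_mask:
 *
 *   Emits OP_EXTRACT_MASK, which packs the top bit of each vector element into
 * a scalar integer mask (typically PMOVMSKB on SSE).
 */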
static MonoInst*
simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->type = STACK_I4;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	return ins;
}

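/*
 * simd_intrinsic_emit_prefetch:
 *
 *   Emits OP_PREFETCH_MEMBASE on the address in args [0]; the prefetch hint is
 * taken from intrinsic->flags and passed to the backend through
 * backend.arg_info.
 */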
static MonoInst*
simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
	ins->klass = cmethod->klass;
	ins->sreg1 = args [0]->dreg;
	ins->backend.arg_info = intrinsic->flags;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static const char *
simd_version_name (guint32 version)
{
	switch (version) {
	case SIMD_VERSION_SSE1:
		return "sse1";
	case SIMD_VERSION_SSE2:
		return "sse2";
	case SIMD_VERSION_SSE3:
		return "sse3";
	case SIMD_VERSION_SSSE3:
		return "ssse3";
	case SIMD_VERSION_SSE41:
		return "sse41";
	case SIMD_VERSION_SSE42:
		return "sse42";
	case SIMD_VERSION_SSE4a:
		return "sse4a";
	}
	return "n/a";
}

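/*
 * emit_intrinsics:
 *
 *   Looks up cmethod->name in the given intrinsic table (which must be sorted
 * by name, since bsearch is used), bails out if the intrinsic needs a SIMD
 * instruction set that isn't available, and otherwise dispatches to the
 * emitter selected by simd_emit_mode.
 */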
static MonoInst*
emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
{
	const SimdIntrinsc *result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
	if (!result) {
		DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
		return NULL;
	}
	if (IS_DEBUG_ON (cfg)) {
		int i, max;
		printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
		max = fsig->param_count + fsig->hasthis;
		for (i = 0; i < max; ++i) {
			printf ("param %d: ", i);
			mono_print_ins (args [i]);
		}
	}
	if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
		if (IS_DEBUG_ON (cfg)) {
			int x;
			printf ("function %s::%s/%d requires one of the following unsupported SIMD instruction sets: ", cmethod->klass->name, cmethod->name, fsig->param_count);
			for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
				if (result->simd_version_flags & (1 << x))
					printf ("%s ", simd_version_name (1 << x));
			printf ("\n");
		}
		return NULL;
	}

	switch (result->simd_emit_mode) {
	case SIMD_EMIT_BINARY:
		return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
	case SIMD_EMIT_UNARY:
		return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
	case SIMD_EMIT_SETTER:
		return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
	case SIMD_EMIT_GETTER:
		return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
	case SIMD_EMIT_GETTER_QWORD:
		return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
	case SIMD_EMIT_CTOR:
		return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
	case SIMD_EMIT_CAST:
		return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
	case SIMD_EMIT_SHUFFLE:
		return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
	case SIMD_EMIT_SHIFT:
		return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
	case SIMD_EMIT_EQUALITY:
		return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
	case SIMD_EMIT_LOAD_ALIGNED:
		return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
	case SIMD_EMIT_STORE:
		return simd_intrinsic_emit_store (result, cfg, cmethod, args);
	case SIMD_EMIT_EXTRACT_MASK:
		return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
	case SIMD_EMIT_PREFETCH:
		return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
	}
	g_assert_not_reached ();
}

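/*
 * mono_emit_vector_ldelema:
 *
 *   Computes the address of the array element that a vector load/store will
 * touch and returns the register holding it. When check_bounds is set, both
 * the first element and the last element covered by the 16 byte access
 * (index + 16 / element_size - 1) are bounds checked.
 */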
static int
mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
{
	MonoInst *ins;
	guint32 size;
	int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;

	size = mono_array_element_size (mono_class_from_mono_type (array_type));
	mult_reg = alloc_preg (cfg);
	array_reg = arr->dreg;
	index_reg = index->dreg;

#if SIZEOF_VOID_P == 8
	/* The array reg is 64 bits but the index reg is only 32 */
	index2_reg = alloc_preg (cfg);
	MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
#else
	index2_reg = index_reg;
#endif
	index3_reg = alloc_preg (cfg);

	if (check_bounds) {
		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
	}

	add_reg = alloc_preg (cfg);

	MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
	MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
	NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
	ins->type = STACK_PTR;
	MONO_ADD_INS (cfg->cbb, ins);

	return add_reg;
}

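/*
 * emit_array_extension_intrinsics:
 *
 *   Handles the Mono.Simd ArrayExtensions helpers: GetVector/GetVectorAligned
 * load a vector from an array element, SetVector/SetVectorAligned store one,
 * and IsAligned checks whether the element address is on a 16 byte boundary.
 */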
static MonoInst*
emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
		MonoInst *load;
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);

		MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE);
		load->klass = cmethod->klass;
		load->sreg1 = addr;
		load->type = STACK_VTYPE;
		load->dreg = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, load);

		return load;
	}
	if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
		MonoInst *store;
		int vreg = get_simd_vreg (cfg, cmethod, args [1]);
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);

		MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
		store->klass = cmethod->klass;
		store->dreg = addr;
		store->sreg1 = vreg;
		MONO_ADD_INS (cfg->cbb, store);

		return store;
	}
	if (!strcmp ("IsAligned", cmethod->name)) {
		MonoInst *ins;
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);

		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
		NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
		MONO_ADD_INS (cfg->cbb, ins);

		return ins;
	}
	return NULL;
}

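/*
 * emit_simd_runtime_intrinsics:
 *
 *   Handles SimdRuntime.get_AccelMode by emitting the simd_supported_versions
 * bitmask as an integer constant.
 */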
static MonoInst*
emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	if (!strcmp ("get_AccelMode", cmethod->name)) {
		MonoInst *ins;
		EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
		return ins;
	}
	return NULL;
}

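/*
 * mono_emit_simd_intrinsics:
 *
 *   Entry point for SIMD intrinsics: called for calls into the Mono.Simd
 * namespace, it dispatches on the declaring class (SimdRuntime,
 * ArrayExtensions, VectorOperations or one of the vector types) to the
 * matching intrinsic table. Returns NULL when the call can't be intrinsified.
 */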
MonoInst*
mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const char *class_name;

	if (strcmp ("Mono.Simd", cmethod->klass->name_space))
		return NULL;

	class_name = cmethod->klass->name;
	if (!strcmp ("SimdRuntime", class_name))
		return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);

	if (!strcmp ("ArrayExtensions", class_name))
		return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);

	if (!strcmp ("VectorOperations", class_name)) {
		if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
			return NULL;
		class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
	} else if (!cmethod->klass->simd_type)
		return NULL;

	cfg->uses_simd_intrinsics = 1;
	if (!strcmp ("Vector2d", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector4f", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector2ul", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector2l", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector4ui", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector4i", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector8us", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector8s", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector16b", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector16sb", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));

	return NULL;
}

#endif