1 /*
2 * simd-intrinsics.c: simd support for intrinsics
4 * Author:
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
8 */
10 #include <config.h>
11 #include <stdio.h>
13 #include "mini.h"
14 #include "ir-emit.h"
17 /* General notes on SIMD intrinsics
19 TODO handle operands with non-SIMD args, such as op_Addition (Vector4f, float)
20 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
21 TODO extend op_to_op_dest_membase to handle simd ops
22 TODO add support for indexed versions of simd ops
23 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
24 TODO make sure locals, arguments and spills are properly aligned.
25 TODO add support for fusing an XMOVE into a simd op in mono_spill_global_vars.
26 TODO add stuff to man pages
27 TODO document this under /docs
28 TODO make passing an xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
29 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
30 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
31 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
32 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
33 TODO passing simd args byval to a non-intrinsic method causes some useless local var loads/stores to happen.
34 TODO check if we need to init the SSE control word with better precision.
35 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
36 TODO make SimdRuntime.get_AccelMode work under AOT
37 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
38 TODO extend the bounds checking code to support range checking.

40 General notes for SIMD intrinsics.

42 -Bad extractor and constructor performance
43 Extracting a float from an XMM register is a complete disaster if you are passing it as an argument.
44 It will be loaded in the FP stack just to be pushed on the call stack.

46 A similar thing happens with the Vector4f constructor, which requires float vars to be

48 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
49 trip to the FP stack is desirable.

51 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
52 for simd and fp registers.

55 -Promote OP_EXTRACT_I4 to a STORE op
56 The advantage of this change is that it could have a _membase version and enable further optimizations.

58 -Create a MONO_INST_DONT_REGALLOC flag and use it in all places where MONO_INST_INDIRECT is used
59 without an OP_LDADDR. */
62 #ifdef MONO_ARCH_SIMD_INTRINSICS
64 //#define IS_DEBUG_ON(cfg) (0)
66 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
67 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
68 enum {
69 SIMD_EMIT_BINARY,
70 SIMD_EMIT_UNARY,
71 SIMD_EMIT_SETTER,
72 SIMD_EMIT_GETTER,
73 SIMD_EMIT_GETTER_QWORD,
74 SIMD_EMIT_CTOR,
75 SIMD_EMIT_CAST,
76 SIMD_EMIT_SHUFFLE,
77 SIMD_EMIT_SHIFT,
78 SIMD_EMIT_EQUALITY,
79 SIMD_EMIT_LOAD_ALIGNED,
80 SIMD_EMIT_STORE,
81 SIMD_EMIT_EXTRACT_MASK,
82 SIMD_EMIT_PREFETCH
83 };
85 #ifdef HAVE_ARRAY_ELEM_INIT
86 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
87 #define MSGSTRFIELD1(line) str##line
88 static const struct msgstr_t {
89 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
90 #include "simd-methods.h"
91 #undef SIMD_METHOD
92 } method_names = {
93 #define SIMD_METHOD(str,name) str,
94 #include "simd-methods.h"
95 #undef SIMD_METHOD
96 };
98 enum {
99 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
100 #include "simd-methods.h"
101 };
102 #define method_name(idx) ((const char*)&method_names + (idx))
104 #else
105 #define SIMD_METHOD(str,name) str,
106 static const char * const method_names [] = {
107 #include "simd-methods.h"
108 NULL
109 };
110 #undef SIMD_METHOD
111 #define SIMD_METHOD(str,name) name,
112 enum {
113 #include "simd-methods.h"
114 SN_LAST
115 };
117 #define method_name(idx) (method_names [(idx)])
119 #endif
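/*
 * Quick sketch of how the name table above resolves, using SN_AddSub as the
 * example (its exact SIMD_METHOD line lives in simd-methods.h, so the string
 * shown here is an assumption):
 *
 *   - with HAVE_ARRAY_ELEM_INIT, msgstr_t grows one char array per method
 *     name, initialized with the string literal, and the enum value SN_AddSub
 *     becomes the offsetof of that array inside method_names;
 *   - method_name (SN_AddSub) is then (const char*)&method_names + SN_AddSub,
 *     a pointer into a single contiguous rodata blob, e.g.:
 *
 *         g_assert (!strcmp (method_name (SN_AddSub), "AddSub"));
 *
 *   - without HAVE_ARRAY_ELEM_INIT, SN_AddSub is just an index into the
 *     method_names pointer array and method_name (idx) indexes it directly.
 *
 * Either way the intrinsic tables below only store the 16-bit name value and
 * the actual string comparison happens at lookup time (see
 * simd_intrinsic_compare_by_name below).
 */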
121 typedef struct {
122 guint16 name;
123 guint16 opcode;
124 guint8 simd_version_flags;
125 guint8 simd_emit_mode : 4;
126 guint8 flags : 4;
127 } SimdIntrinsc;
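/*
 * Reading the tables below: each one covers a single Mono.Simd vector type
 * and each entry maps a managed method name to the opcode and emit shape used
 * for it.  For example, the vector4f entry
 *
 *   { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT }
 *
 * means Vector4f.CompareLessThan is emitted as a binary OP_COMPPS whose
 * inst_c0 (the flags column) is SIMD_COMP_LT, and is only meant to be used
 * when the CPU reports SSE1.  For getters and setters the "opcode" column is
 * the element index rather than a real opcode.  Lookup is by method name, so
 * every table must stay sorted.
 */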
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
132 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
133 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
141 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
142 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
143 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
144 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
145 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
146 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
147 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
148 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
149 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
150 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
151 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
152 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
153 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
154 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
155 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
156 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
157 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
158 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
159 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
160 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
161 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
162 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
163 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
164 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
165 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
166 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
167 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
168 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
169 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
170 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
171 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
172 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
173 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
174 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
175 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
176 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
177 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
178 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
179 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
180 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
183 static const SimdIntrinsc vector2d_intrinsics[] = {
184 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
185 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
186 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
187 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
188 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
189 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
190 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
191 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
192 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
193 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
194 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
195 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
196 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
197 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
198 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
199 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
200 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
201 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
202 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
203 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
204 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
205 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
206 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
207 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
208 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
209 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
210 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
211 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
212 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
213 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
214 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
215 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
217 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
220 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
221 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
222 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
223 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
224 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
227 static const SimdIntrinsc vector2ul_intrinsics[] = {
228 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
229 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
230 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
231 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
232 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
233 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
234 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
235 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
236 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
237 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
238 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
239 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
240 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
241 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
242 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
243 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
244 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
245 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
246 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
247 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
248 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
249 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
250 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
251 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
254 static const SimdIntrinsc vector2l_intrinsics[] = {
255 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
256 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
257 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
258 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
259 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
261 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
262 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
263 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
264 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
265 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
266 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
267 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
268 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
269 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
270 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
271 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
272 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
273 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
274 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
275 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
276 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
277 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
278 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
279 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
282 static const SimdIntrinsc vector4ui_intrinsics[] = {
283 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
284 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
285 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
287 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
288 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
289 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
290 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
291 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
292 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
293 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
294 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
295 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
296 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
297 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
298 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
299 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
300 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
301 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
302 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
303 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
304 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
305 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
306 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
307 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
308 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
309 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
310 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
311 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
312 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
313 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
314 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
315 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
316 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
317 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
320 static const SimdIntrinsc vector4i_intrinsics[] = {
321 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
322 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
323 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
324 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
325 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
326 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
327 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
328 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
329 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
330 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
331 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
332 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
333 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
334 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
335 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
336 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
337 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
338 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
339 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
340 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
341 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
342 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
343 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
344 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
345 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
346 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
347 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
348 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
349 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
350 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
351 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
352 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
353 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
354 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
355 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
356 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
357 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
358 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
361 static const SimdIntrinsc vector8us_intrinsics[] = {
362 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
363 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
364 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
365 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
366 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
367 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
368 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
369 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
370 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
371 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
372 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
373 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
374 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
375 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
376 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
377 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
378 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
380 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
381 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
382 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
383 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
384 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
385 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
386 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
387 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
388 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
389 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
390 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
391 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
393 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
395 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
397 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
398 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
399 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
400 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
401 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
402 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
403 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
404 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
405 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
406 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
407 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
408 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
409 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
412 static const SimdIntrinsc vector8s_intrinsics[] = {
413 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
414 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
415 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
416 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
417 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
418 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
419 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
420 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
421 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
422 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
423 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
424 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
425 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
426 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
427 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
428 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
429 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
430 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
431 { SN_SubtractWithSaturation, OP_PSUBW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
432 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
435 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
436 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
437 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
438 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
439 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
440 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
441 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
442 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
443 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
444 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
446 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
448 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
449 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
450 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
451 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
452 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
453 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
454 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
455 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
456 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
457 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
458 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
459 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
460 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
463 static const SimdIntrinsc vector16b_intrinsics[] = {
464 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
465 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
466 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
467 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
468 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
469 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
470 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
471 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
472 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
473 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
474 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
475 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
476 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
477 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
478 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
480 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
481 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
482 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
483 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
484 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
485 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
486 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
487 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
488 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
489 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
490 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
491 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
492 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
493 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
494 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
498 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
499 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
500 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
501 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
502 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
503 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
504 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
505 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
506 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
507 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
508 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
509 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
510 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
511 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
512 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
513 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
514 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
515 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
516 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
517 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
518 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
524 /* Missing:
525    setters */
527 static const SimdIntrinsc vector16sb_intrinsics[] = {
528 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
529 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
530 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
531 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
532 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
533 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
534 { SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
535 { SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
536 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
537 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
538 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
539 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
540 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
541 { SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
542 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
543 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
544 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
545 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
546 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
547 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
548 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
549 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
550 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
551 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
552 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
553 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
554 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
555 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
556 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
557 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
558 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
559 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
560 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
561 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
562 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
563 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
564 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
565 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
566 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
567 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
568 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
569 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
570 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
571 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
572 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
573 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
574 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
575 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
576 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
577 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
578 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
579 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
580 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
581 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
582 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
583 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
586 static guint32 simd_supported_versions;
588 /*TODO match using number of parameters as well*/
589 static int
590 simd_intrinsic_compare_by_name (const void *key, const void *value)
592 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
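/*
 * A minimal sketch of the intended lookup (the real call site is in the emit
 * path further down the file; the table chosen here is just an example):
 *
 *   const SimdIntrinsc *intrins = bsearch (cmethod->name,
 *       vector4f_intrinsics,
 *       sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc),
 *       sizeof (SimdIntrinsc),
 *       simd_intrinsic_compare_by_name);
 *
 * This is why every table above has to be kept sorted by method name, and why
 * the TODO above suggests also matching on the parameter count.
 */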
595 typedef enum {
596 VREG_USED = 0x01,
597 VREG_HAS_XZERO_BB0 = 0x02,
598 VREG_HAS_OTHER_OP_BB0 = 0x04,
599 VREG_SINGLE_BB_USE = 0x08,
600 VREG_MANY_BB_USE = 0x10,
601 } KillFlags;
603 void
604 mono_simd_intrinsics_init (void)
606 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
607 /*TODO log the supported flags*/
610 static inline gboolean
611 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
613 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
614 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
615 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
616 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
617 return TRUE;
619 return FALSE;
622 static inline gboolean
623 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
625 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
626 return FALSE;
628 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
629 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
630 vreg_flags [reg] |= VREG_MANY_BB_USE;
631 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
632 return TRUE;
633 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
634 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
635 target_bb [reg] = bb;
636 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
637 return TRUE;
639 return FALSE;
643 /* This pass recalculates which vars need MONO_INST_INDIRECT.
645 We cannot do this for non-SIMD vars since code like mono_get_vtable_var
646 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated. */
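/*
 * Rough before/after sketch of the XZERO part of this pass (vreg and bb
 * numbers are made up): a SIMD local that is zeroed in bb0 but only read in
 * one later block, e.g.
 *
 *   bb0: XZERO R10            bb3: R12 <- ADDPS R10 R11
 *
 * ends up as
 *
 *   bb3: XZERO R10
 *        R12 <- ADDPS R10 R11
 *
 * with the original XZERO in bb0 nullified, so vars that are always fully
 * initialized before use don't pay for the zeroing on every path.
 */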
648 void
649 mono_simd_simplify_indirection (MonoCompile *cfg)
651 int i, max_vreg = 0;
652 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
653 MonoInst *ins;
654 char *vreg_flags;
656 for (i = 0; i < cfg->num_varinfo; i++) {
657 MonoInst *var = cfg->varinfo [i];
658 if (var->klass->simd_type) {
659 var->flags &= ~MONO_INST_INDIRECT;
660 max_vreg = MAX (var->dreg, max_vreg);
664 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
665 if (!first_bb && bb->code)
666 first_bb = bb;
667 for (ins = bb->code; ins; ins = ins->next) {
668 if (ins->opcode == OP_LDADDR) {
669 MonoInst *var = (MonoInst*)ins->inst_p0;
670 if (var->klass->simd_type) {
671 var->flags |= MONO_INST_INDIRECT;
677 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
678 vreg_flags = g_malloc0 (max_vreg + 1);
679 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
681 for (i = 0; i < cfg->num_varinfo; i++) {
682 MonoInst *var = cfg->varinfo [i];
683 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
684 vreg_flags [var->dreg] = VREG_USED;
685 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
689 /*Scan the first basic block looking for unused xzeros*/
690 for (ins = first_bb->code; ins; ins = ins->next) {
691 int num_sregs;
692 int sregs [MONO_MAX_SRC_REGS];
694 if (ins->opcode == OP_XZERO) {
695 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
696 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
697 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
699 continue;
701 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
702 continue;
703 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
704 continue;
705 num_sregs = mono_inst_get_src_registers (ins, sregs);
706 for (i = 0; i < num_sregs; ++i) {
707 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
708 break;
712 if (IS_DEBUG_ON (cfg)) {
713 for (i = 0; i < cfg->num_varinfo; i++) {
714 MonoInst *var = cfg->varinfo [i];
715 if (var->klass->simd_type) {
716 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
717 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
718 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
719 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
724 /*TODO stop here if no var is xzero only*/
727 /* Scan all other bbs and check if each var has only one other use.
728 Ideally this would be done after an extended bb formation pass.
730 FIXME: This pass could use dominator information to properly
731 place the XZERO on the bb that dominates all uses of the var,
732 but this will have zero effect with the current local reg alloc.
734 TODO simplify the use of flags. */
737 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
738 for (ins = bb->code; ins; ins = ins->next) {
739 int num_sregs;
740 int sregs [MONO_MAX_SRC_REGS];
742 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
743 continue;
744 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
745 continue;
746 num_sregs = mono_inst_get_src_registers (ins, sregs);
747 for (i = 0; i < num_sregs; ++i) {
748 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
749 max_vreg, vreg_flags, target_bb))
750 continue;
755 for (i = 0; i < cfg->num_varinfo; i++) {
756 MonoInst *var = cfg->varinfo [i];
757 if (!var->klass->simd_type)
758 continue;
759 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
760 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
761 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
762 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
764 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
765 continue;
766 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
767 int num_sregs, j;
768 int sregs [MONO_MAX_SRC_REGS];
769 gboolean found = FALSE;
771 num_sregs = mono_inst_get_src_registers (ins, sregs);
772 for (j = 0; j < num_sregs; ++j) {
773 if (sregs [j] == var->dreg)
774 found = TRUE;
776 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
777 if (ins->dreg == var->dreg && !found) {
778 break;
779 } else if (found) {
780 MonoInst *tmp;
781 MONO_INST_NEW (cfg, tmp, OP_XZERO);
782 tmp->dreg = var->dreg;
783 tmp->type = STACK_VTYPE;
784 tmp->klass = var->klass;
785 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
786 break;
791 for (ins = first_bb->code; ins; ins = ins->next) {
792 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
793 NULLIFY_INS (ins);
796 g_free (vreg_flags);
797 g_free (target_bb);
801 /* This function expects that src is a value. */
803 static int
804 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
806 if (src->opcode == OP_XMOVE) {
807 return src->sreg1;
808 } else if (src->type == STACK_VTYPE) {
809 return src->dreg;
811 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
812 mono_print_ins (src);
813 g_assert_not_reached ();
817 /* This function will load the value if needed. */
819 static int
820 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
822 if (indirect)
823 *indirect = FALSE;
824 if (src->opcode == OP_XMOVE) {
825 return src->sreg1;
826 } else if (src->opcode == OP_LDADDR) {
827 int res = ((MonoInst*)src->inst_p0)->dreg;
828 NULLIFY_INS (src);
829 return res;
830 } else if (src->type == STACK_VTYPE) {
831 return src->dreg;
832 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
833 MonoInst *ins;
834 if (indirect)
835 *indirect = TRUE;
837 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
838 ins->klass = cmethod->klass;
839 ins->sreg1 = src->dreg;
840 ins->type = STACK_VTYPE;
841 ins->dreg = alloc_ireg (cfg);
842 MONO_ADD_INS (cfg->cbb, ins);
843 return ins->dreg;
845 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
846 mono_print_ins (src);
847 g_assert_not_reached ();
850 static MonoInst*
851 get_int_to_float_spill_area (MonoCompile *cfg)
853 if (!cfg->iconv_raw_var) {
854 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
855 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
857 return cfg->iconv_raw_var;
860 /*We share the var with fconv_to_r8_x to save some stack space.*/
861 static MonoInst*
862 get_double_spill_area (MonoCompile *cfg)
864 if (!cfg->fconv_to_r8_x_var) {
865 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
866 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
868 return cfg->fconv_to_r8_x_var;
870 static MonoInst*
871 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
873 if (!cfg->simd_ctor_var) {
874 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
875 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
877 return cfg->simd_ctor_var;
880 static int
881 mono_type_to_expand_op (MonoType *type)
883 switch (type->type) {
884 case MONO_TYPE_I1:
885 case MONO_TYPE_U1:
886 return OP_EXPAND_I1;
887 case MONO_TYPE_I2:
888 case MONO_TYPE_U2:
889 return OP_EXPAND_I2;
890 case MONO_TYPE_I4:
891 case MONO_TYPE_U4:
892 return OP_EXPAND_I4;
893 case MONO_TYPE_I8:
894 case MONO_TYPE_U8:
895 return OP_EXPAND_I8;
896 case MONO_TYPE_R4:
897 return OP_EXPAND_R4;
898 case MONO_TYPE_R8:
899 return OP_EXPAND_R8;
900 default:
901 g_assert_not_reached ();
905 static int
906 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, int position)
908 MonoInst *ins;
909 MonoMethodSignature *sig = mono_method_signature (cmethod);
910 int expand_op;
912 g_assert (sig->param_count == 2);
913 g_assert (position == 0 || position == 1);
915 if (mono_class_from_mono_type (sig->params [position])->simd_type)
916 return get_simd_vreg (cfg, cmethod, src);
918 expand_op = mono_type_to_expand_op (sig->params [position]);
919 MONO_INST_NEW (cfg, ins, expand_op);
920 ins->klass = cmethod->klass;
921 ins->sreg1 = src->dreg;
922 ins->type = STACK_VTYPE;
923 ins->dreg = alloc_ireg (cfg);
924 MONO_ADD_INS (cfg->cbb, ins);
926 if (expand_op == OP_EXPAND_R4)
927 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
928 else if (expand_op == OP_EXPAND_R8)
929 ins->backend.spill_var = get_double_spill_area (cfg);
931 return ins->dreg;
934 static MonoInst*
935 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
937 MonoInst* ins;
938 int left_vreg, right_vreg;
940 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [0], 0);
941 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, cmethod, args [1], 1);
944 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
945 ins->klass = cmethod->klass;
946 ins->sreg1 = left_vreg;
947 ins->sreg2 = right_vreg;
948 ins->type = STACK_VTYPE;
949 ins->dreg = alloc_ireg (cfg);
950 ins->inst_c0 = intrinsic->flags;
951 MONO_ADD_INS (cfg->cbb, ins);
952 return ins;
955 static MonoInst*
956 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
958 MonoInst* ins;
959 int vreg;
961 vreg = get_simd_vreg (cfg, cmethod, args [0]);
963 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
964 ins->klass = cmethod->klass;
965 ins->sreg1 = vreg;
966 ins->type = STACK_VTYPE;
967 ins->dreg = alloc_ireg (cfg);
968 MONO_ADD_INS (cfg->cbb, ins);
969 return ins;
972 static int
973 mono_type_to_extract_op (MonoType *type)
975 switch (type->type) {
976 case MONO_TYPE_I1:
977 return OP_EXTRACT_I1;
978 case MONO_TYPE_U1:
979 return OP_EXTRACT_U1;
980 case MONO_TYPE_I2:
981 return OP_EXTRACT_I2;
982 case MONO_TYPE_U2:
983 return OP_EXTRACT_U2;
984 case MONO_TYPE_I4:
985 case MONO_TYPE_U4:
986 case MONO_TYPE_R4:
987 return OP_EXTRACT_I4;
988 default:
989 g_assert_not_reached ();
993 /*Returns the amount to shift the element index to get the dword it belongs to*/
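/*
 * Example (the concrete numbers are only illustrative): element 5 of a
 * Vector8us lives in dword 5 >> 1 == 2, element 13 of a Vector16b in dword
 * 13 >> 2 == 3.  simd_intrinsic_emit_getter below uses the shifted value as
 * the OP_PSHUFLED selector to move that dword into position 0, and the
 * masked-off low bits pick the sub-element handled by the op returned from
 * mono_type_to_extract_op.
 */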
994 static int
995 mono_type_elements_shift_bits (MonoType *type)
997 switch (type->type) {
998 case MONO_TYPE_I1:
999 case MONO_TYPE_U1:
1000 return 2;
1001 case MONO_TYPE_I2:
1002 case MONO_TYPE_U2:
1003 return 1;
1004 case MONO_TYPE_I4:
1005 case MONO_TYPE_U4:
1006 case MONO_TYPE_R4:
1007 return 0;
1008 default:
1009 g_assert_not_reached ();
1013 static G_GNUC_UNUSED int
1014 mono_type_to_insert_op (MonoType *type)
1016 switch (type->type) {
1017 case MONO_TYPE_I1:
1018 case MONO_TYPE_U1:
1019 return OP_INSERT_I1;
1020 case MONO_TYPE_I2:
1021 case MONO_TYPE_U2:
1022 return OP_INSERT_I2;
1023 case MONO_TYPE_I4:
1024 case MONO_TYPE_U4:
1025 return OP_INSERT_I4;
1026 case MONO_TYPE_I8:
1027 case MONO_TYPE_U8:
1028 return OP_INSERT_I8;
1029 case MONO_TYPE_R4:
1030 return OP_INSERT_R4;
1031 case MONO_TYPE_R8:
1032 return OP_INSERT_R8;
1033 default:
1034 g_assert_not_reached ();
1038 static int
1039 mono_type_to_slow_insert_op (MonoType *type)
1041 switch (type->type) {
1042 case MONO_TYPE_I1:
1043 case MONO_TYPE_U1:
1044 return OP_INSERTX_U1_SLOW;
1045 case MONO_TYPE_I2:
1046 case MONO_TYPE_U2:
1047 return OP_INSERT_I2;
1048 case MONO_TYPE_I4:
1049 case MONO_TYPE_U4:
1050 return OP_INSERTX_I4_SLOW;
1051 case MONO_TYPE_I8:
1052 case MONO_TYPE_U8:
1053 return OP_INSERTX_I8_SLOW;
1054 case MONO_TYPE_R4:
1055 return OP_INSERTX_R4_SLOW;
1056 case MONO_TYPE_R8:
1057 return OP_INSERTX_R8_SLOW;
1058 default:
1059 g_assert_not_reached ();
1063 static MonoInst*
1064 simd_intrinsic_emit_setter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1066 MonoInst *ins;
1067 MonoMethodSignature *sig = mono_method_signature (cmethod);
1068 int size, align;
1069 gboolean indirect;
1070 int dreg;
1072 size = mono_type_size (sig->params [0], &align);
1074 if (COMPILE_LLVM (cfg)) {
1075 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1076 ins->klass = cmethod->klass;
1077 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1078 ins->sreg2 = args [1]->dreg;
1079 ins->inst_c0 = intrinsic->opcode;
1080 MONO_ADD_INS (cfg->cbb, ins);
1081 } else if (size == 2 || size == 4 || size == 8) {
1082 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1083 ins->klass = cmethod->klass;
1084 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1085 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1086 ins->sreg2 = args [1]->dreg;
1087 ins->inst_c0 = intrinsic->opcode;
1088 if (sig->params [0]->type == MONO_TYPE_R4)
1089 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1090 else if (sig->params [0]->type == MONO_TYPE_R8)
1091 ins->backend.spill_var = get_double_spill_area (cfg);
1092 MONO_ADD_INS (cfg->cbb, ins);
1093 } else {
1094 int vreg, sreg;
1096 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1097 ins->klass = cmethod->klass;
1098 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1099 ins->type = STACK_I4;
1100 ins->dreg = vreg = alloc_ireg (cfg);
1101 ins->inst_c0 = intrinsic->opcode / 2;
1102 MONO_ADD_INS (cfg->cbb, ins);
1104 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1105 ins->klass = cmethod->klass;
1106 ins->sreg1 = vreg;
1107 ins->sreg2 = args [1]->dreg;
1108 ins->dreg = sreg;
1109 ins->inst_c0 = intrinsic->opcode;
1110 MONO_ADD_INS (cfg->cbb, ins);
1113 if (indirect) {
1114 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1115 ins->klass = cmethod->klass;
1116 ins->dreg = args [0]->dreg;
1117 ins->sreg1 = dreg;
1118 MONO_ADD_INS (cfg->cbb, ins);
1120 return ins;
1123 static MonoInst*
1124 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1126 MonoInst *ins;
1127 MonoMethodSignature *sig = mono_method_signature (cmethod);
1128 int vreg, shift_bits = mono_type_elements_shift_bits (sig->ret);
1130 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1132 if ((intrinsic->opcode >> shift_bits) && !cfg->compile_llvm) {
1133 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1134 ins->klass = cmethod->klass;
1135 ins->sreg1 = vreg;
1136 ins->inst_c0 = intrinsic->opcode >> shift_bits;
1137 ins->type = STACK_VTYPE;
1138 ins->dreg = vreg = alloc_ireg (cfg);
1139 MONO_ADD_INS (cfg->cbb, ins);
1142 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (sig->ret));
1143 ins->klass = cmethod->klass;
1144 ins->sreg1 = vreg;
1145 ins->type = STACK_I4;
1146 ins->dreg = vreg = alloc_ireg (cfg);
1147 if (cfg->compile_llvm)
1148 ins->inst_c0 = intrinsic->opcode;
1149 else
1150 ins->inst_c0 = intrinsic->opcode & ((1 << shift_bits) - 1);
1151 MONO_ADD_INS (cfg->cbb, ins);
1153 if (sig->ret->type == MONO_TYPE_R4) {
1154 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
1155 ins->klass = mono_defaults.single_class;
1156 ins->sreg1 = vreg;
1157 ins->type = STACK_R8;
1158 ins->dreg = alloc_freg (cfg);
1159 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1160 MONO_ADD_INS (cfg->cbb, ins);
1162 return ins;
1165 static MonoInst*
1166 simd_intrinsic_emit_long_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1168 MonoInst *ins;
1169 int vreg;
1170 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1172 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1174 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1175 ins->klass = cmethod->klass;
1176 ins->sreg1 = vreg;
1177 ins->inst_c0 = intrinsic->opcode;
1178 if (is_r8) {
1179 ins->type = STACK_R8;
1180 ins->dreg = alloc_freg (cfg);
1181 ins->backend.spill_var = get_double_spill_area (cfg);
1182 } else {
1183 ins->type = STACK_I8;
1184 ins->dreg = alloc_lreg (cfg);
1186 MONO_ADD_INS (cfg->cbb, ins);
1188 return ins;
1191 static MonoInst*
1192 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1194 MonoInst *ins = NULL;
1195 int i, addr_reg;
1196 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1197 MonoMethodSignature *sig = mono_method_signature (cmethod);
1198 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1199 int arg_size = mono_type_size (sig->params [0], &i);
1201 if (sig->param_count == 1) {
1202 int dreg;
1204 if (is_ldaddr) {
1205 dreg = args [0]->inst_i0->dreg;
1206 NULLIFY_INS (args [0]);
1207 } else {
1208 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1209 dreg = alloc_ireg (cfg);
1212 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1213 ins->klass = cmethod->klass;
1214 ins->sreg1 = args [1]->dreg;
1215 ins->type = STACK_VTYPE;
1216 ins->dreg = dreg;
1218 MONO_ADD_INS (cfg->cbb, ins);
1219 if (sig->params [0]->type == MONO_TYPE_R4)
1220 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
1221 else if (sig->params [0]->type == MONO_TYPE_R8)
1222 ins->backend.spill_var = get_double_spill_area (cfg);
1224 if (!is_ldaddr) {
1225 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1226 ins->dreg = args [0]->dreg;
1227 ins->sreg1 = dreg;
1228 MONO_ADD_INS (cfg->cbb, ins);
1230 return ins;
1233 if (is_ldaddr) {
1234 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1235 MONO_ADD_INS (cfg->cbb, ins);
1236 addr_reg = ins->dreg;
1237 } else {
1238 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1239 addr_reg = args [0]->dreg;
1242 for (i = sig->param_count - 1; i >= 0; --i) {
1243 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1246 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1247 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1248 NULLIFY_INS (args [0]);
1250 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1251 ins->klass = cmethod->klass;
1252 ins->sreg1 = addr_reg;
1253 ins->type = STACK_VTYPE;
1254 ins->dreg = vreg;
1255 MONO_ADD_INS (cfg->cbb, ins);
1257 return ins;
static MonoInst*
simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	//TODO macroize this
	MONO_INST_NEW (cfg, ins, OP_XMOVE);
	ins->klass = cmethod->klass;
	ins->type = STACK_VTYPE;
	ins->sreg1 = vreg;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

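/*
 * Emit a packed shift. A constant shift count uses the immediate form of the opcode
 * from the table; a variable count is first moved into an XMM register with
 * OP_ICONV_TO_X and the register form of the shift (the next opcode) is used instead.
 */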
static MonoInst*
simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg, vreg2 = -1, opcode = intrinsic->opcode;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	if (args [1]->opcode != OP_ICONST) {
		MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
		ins->klass = mono_defaults.int32_class;
		ins->sreg1 = args [1]->dreg;
		ins->type = STACK_I4;
		ins->dreg = vreg2 = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, ins);

		++opcode; /* The shift_reg version of the op is always +1 from the regular one. */
	}

	MONO_INST_NEW (cfg, ins, opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->sreg2 = vreg2;

	if (args [1]->opcode == OP_ICONST) {
		ins->inst_imm = args [1]->inst_c0;
		NULLIFY_INS (args [1]);
	}

	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static inline gboolean
mono_op_is_packed_compare (int op)
{
	return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
}

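/*
 * Emit op_Equality/op_Inequality. The packed compare yields an all-ones/all-zeros
 * per-element mask, OP_EXTRACT_MASK collapses it into a 16 bit scalar (PMOVMSKB or
 * similar on SSE), and that scalar is then tested. Vector4f equality, for example,
 * comes out roughly as:
 *
 *   cmpeqps  xmm0, xmm1   ; packed compare
 *   pmovmskb eax, xmm0    ; per-byte sign mask
 *   cmp      eax, 0xffff  ; all 16 bytes equal?
 *   sete     al
 *
 * FP compares also have a not-equal form, so FP inequality is instead tested with
 * OR semantics: the extracted mask only has to be non-zero.
 */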
static MonoInst*
simd_intrinsic_emit_equality (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int left_vreg, right_vreg, tmp_vreg;

	left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
	right_vreg = get_simd_vreg (cfg, cmethod, args [1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = left_vreg;
	ins->sreg2 = right_vreg;
	ins->type = STACK_VTYPE;
	ins->dreg = tmp_vreg = alloc_ireg (cfg);
	ins->inst_c0 = intrinsic->flags;
	MONO_ADD_INS (cfg->cbb, ins);

	/* FIXME: the next ops are SSE specific */
	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
	ins->klass = cmethod->klass;
	ins->sreg1 = tmp_vreg;
	ins->type = STACK_I4;
	ins->dreg = tmp_vreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	/* FP ops have a not-equal instruction, which means that we must test the results with OR semantics. */
	if (mono_op_is_packed_compare (intrinsic->opcode) || intrinsic->flags == SIMD_COMP_EQ) {
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
		NEW_UNALU (cfg, ins, intrinsic->flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
	} else {
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
		NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
	}
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

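/*
 * Emit a shuffle. The shuffle mask must be an OP_ICONST; with a runtime mask we
 * return NULL and let the call go through the managed implementation. When two
 * vector sources are involved and the table opcode is OP_PSHUFLED, the opcode is
 * rewritten to OP_SHUFPS.
 */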
static MonoInst*
simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg, vreg2 = -1;
	int param_count = mono_method_signature (cmethod)->param_count;

	if (args [param_count - 1]->opcode != OP_ICONST) {
		/* TODO Shuffle with a non-literal mask is not yet supported */
		return NULL;
	}

	vreg = get_simd_vreg (cfg, cmethod, args [0]);
	if (param_count == 3)
		vreg2 = get_simd_vreg (cfg, cmethod, args [1]);

	NULLIFY_INS (args [param_count - 1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->sreg2 = vreg2;
	ins->inst_c0 = args [param_count - 1]->inst_c0;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	if (param_count == 3 && ins->opcode == OP_PSHUFLED)
		ins->opcode = OP_SHUFPS;
	return ins;
}

static MonoInst*
simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
	ins->klass = cmethod->klass;
	ins->sreg1 = args [0]->dreg;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static MonoInst*
simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->dreg = args [0]->dreg;
	ins->sreg1 = vreg;
	ins->type = STACK_VTYPE;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static MonoInst*
simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->type = STACK_I4;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	return ins;
}

static MonoInst*
simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
	ins->klass = cmethod->klass;
	ins->sreg1 = args [0]->dreg;
	ins->backend.arg_info = intrinsic->flags;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static const char *
simd_version_name (guint32 version)
{
	switch (version) {
	case SIMD_VERSION_SSE1:
		return "sse1";
	case SIMD_VERSION_SSE2:
		return "sse2";
	case SIMD_VERSION_SSE3:
		return "sse3";
	case SIMD_VERSION_SSSE3:
		return "ssse3";
	case SIMD_VERSION_SSE41:
		return "sse41";
	case SIMD_VERSION_SSE42:
		return "sse42";
	case SIMD_VERSION_SSE4a:
		return "sse4a";
	}
	return "n/a";
}

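/*
 * Look up cmethod by name in a per-type intrinsic table (the table must be sorted
 * by method name, since bsearch is used), bail out if the entry requires a SIMD
 * instruction set that is not in simd_supported_versions, and dispatch to the
 * emitter selected by simd_emit_mode. Returning NULL makes the JIT emit a regular
 * call instead.
 */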
static MonoInst*
emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
{
	const SimdIntrinsc *result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
	if (!result) {
		DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
		return NULL;
	}
	if (IS_DEBUG_ON (cfg)) {
		int i, max;
		printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
		max = fsig->param_count + fsig->hasthis;
		for (i = 0; i < max; ++i) {
			printf ("param %d: ", i);
			mono_print_ins (args [i]);
		}
	}
	if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
		if (IS_DEBUG_ON (cfg)) {
			int x;
			printf ("function %s::%s/%d requires one of unsupported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
			for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
				if (result->simd_version_flags & (1 << x))
					printf ("%s ", simd_version_name (1 << x));

			printf ("\n");
		}
		return NULL;
	}

	switch (result->simd_emit_mode) {
	case SIMD_EMIT_BINARY:
		return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
	case SIMD_EMIT_UNARY:
		return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
	case SIMD_EMIT_SETTER:
		return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
	case SIMD_EMIT_GETTER:
		return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
	case SIMD_EMIT_GETTER_QWORD:
		return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
	case SIMD_EMIT_CTOR:
		return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
	case SIMD_EMIT_CAST:
		return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
	case SIMD_EMIT_SHUFFLE:
		return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
	case SIMD_EMIT_SHIFT:
		return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
	case SIMD_EMIT_EQUALITY:
		return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
	case SIMD_EMIT_LOAD_ALIGNED:
		return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
	case SIMD_EMIT_STORE:
		return simd_intrinsic_emit_store (result, cfg, cmethod, args);
	case SIMD_EMIT_EXTRACT_MASK:
		return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
	case SIMD_EMIT_PREFETCH:
		return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
	}
	g_assert_not_reached ();
}

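/*
 * Emit the address computation for a vector-sized access to arr [index]:
 *
 *   addr = arr + G_STRUCT_OFFSET (MonoArray, vector) + index * elem_size
 *
 * When check_bounds is set, both index and the index of the last scalar touched
 * (index + 16 / elem_size - 1) are bounds checked, so a 16 byte access starting at
 * the returned address stays inside the array.
 */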
static int
mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
{
	MonoInst *ins;
	guint32 size;
	int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;

	size = mono_array_element_size (mono_class_from_mono_type (array_type));
	mult_reg = alloc_preg (cfg);
	array_reg = arr->dreg;
	index_reg = index->dreg;

#if SIZEOF_VOID_P == 8
	/* The array reg is 64 bits but the index reg is only 32 */
	index2_reg = alloc_preg (cfg);
	MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
#else
	index2_reg = index_reg;
#endif
	index3_reg = alloc_preg (cfg);

	if (check_bounds) {
		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
	}

	add_reg = alloc_preg (cfg);

	MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
	MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
	NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, G_STRUCT_OFFSET (MonoArray, vector));
	ins->type = STACK_PTR;
	MONO_ADD_INS (cfg->cbb, ins);

	return add_reg;
}

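/*
 * Intrinsics for Mono.Simd.ArrayExtensions: GetVector/GetVectorAligned load 16 bytes
 * from arr [index] with OP_LOADX_MEMBASE/OP_LOADX_ALIGNED_MEMBASE,
 * SetVector/SetVectorAligned store them with OP_STOREX_MEMBASE/OP_STOREX_ALIGNED_MEMBASE_REG,
 * and IsAligned simply tests whether the element address is 16 byte aligned
 * ((addr & 15) == 0), without any bounds check.
 */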
static MonoInst*
emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	if (!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) {
		MonoInst *load;
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);

		MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE);
		load->klass = cmethod->klass;
		load->sreg1 = addr;
		load->type = STACK_VTYPE;
		load->dreg = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, load);

		return load;
	}
	if (!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) {
		MonoInst *store;
		int vreg = get_simd_vreg (cfg, cmethod, args [1]);
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);

		MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
		store->klass = cmethod->klass;
		store->dreg = addr;
		store->sreg1 = vreg;
		MONO_ADD_INS (cfg->cbb, store);

		return store;
	}
	if (!strcmp ("IsAligned", cmethod->name)) {
		MonoInst *ins;
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);

		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
		NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
		MONO_ADD_INS (cfg->cbb, ins);

		return ins;
	}
	return NULL;
}

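/*
 * SimdRuntime.get_AccelMode folds to an integer constant: the simd_supported_versions
 * bitmask the runtime computed from the host CPU features.
 */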
static MonoInst*
emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	if (!strcmp ("get_AccelMode", cmethod->name)) {
		MonoInst *ins;
		EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
		return ins;
	}
	return NULL;
}

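/*
 * Entry point for SIMD intrinsic expansion. Only methods in the Mono.Simd namespace
 * are considered: SimdRuntime and ArrayExtensions get dedicated handlers, static
 * VectorOperations methods are dispatched on the type of their first parameter, and
 * anything else must be one of the known vector types. Returns the emitted
 * instruction, or NULL to fall back to a normal managed call.
 */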
MonoInst*
mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const char *class_name;

	if (strcmp ("Mono.Simd", cmethod->klass->name_space))
		return NULL;

	class_name = cmethod->klass->name;
	if (!strcmp ("SimdRuntime", class_name))
		return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);

	if (!strcmp ("ArrayExtensions", class_name))
		return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);

	if (!strcmp ("VectorOperations", class_name)) {
		if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
			return NULL;
		class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
	} else if (!cmethod->klass->simd_type)
		return NULL;

	cfg->uses_simd_intrinsics = 1;
	if (!strcmp ("Vector2d", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector4f", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector2ul", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector2l", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector4ui", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector4i", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector8us", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector8s", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector16b", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
	if (!strcmp ("Vector16sb", class_name))
		return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));

	return NULL;
}

#endif