2009-01-07 Zoltan Varga <vargaz@gmail.com>
[mono-project.git] / mono / mini / simd-intrinsics.c
blob7369c9322fdfed1868ab73042a339117110a2dc6
1 /*
2 * simd-intrinsics.c: simd support for intrinsics
4 * Author:
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
8 */
10 #include <config.h>
11 #include <stdio.h>
13 #define NEW_IR
14 #include "mini.h"
15 #include "ir-emit.h"
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
24 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
34 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
35 TODO check if we need to init the SSE control word with better precision.
36 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
37 TODO make SimdRuntime.get_AccelMode work under AOT
39 General notes for SIMD intrinsics.
41 -Bad extractor and constructor performance
42 Extracting a float from an XMM register is a complete disaster if you are passing it as an argument.
43 It will be loaded in the FP stack just to be pushed on the call stack.
45 A similar thing happens with Vector4f constructor that require float vars to be
47 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
48 trip to the FP stack is desirable.
50 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
51 for simd and fp.
54 -Promote OP_EXTRACT_I4 to a STORE op
55 The advantage of this change is that it could have a _membase version and promote further optimizations.
57 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
58 without a OP_LDADDR.
61 #ifdef MONO_ARCH_SIMD_INTRINSICS
63 //#define IS_DEBUG_ON(cfg) (0)
65 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
66 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
/*
 * Emission strategies. Each row of the intrinsic tables in this file names
 * one of these in its third column (the simd_emit_mode field).
 */
enum {
	SIMD_EMIT_BINARY       = 0,
	SIMD_EMIT_UNARY        = 1,
	SIMD_EMIT_GETTER       = 2,
	SIMD_EMIT_CTOR         = 3,
	SIMD_EMIT_CAST         = 4,
	SIMD_EMIT_SHUFFLE      = 5,
	SIMD_EMIT_SHIFT        = 6,
	SIMD_EMIT_LOAD_ALIGNED = 7,
	SIMD_EMIT_STORE        = 8,
	SIMD_EMIT_EXTRACT_MASK = 9,
	SIMD_EMIT_PREFETCH     = 10
};
/*
 * Method-name storage and the SN_* identifiers used by the intrinsic tables.
 * simd-methods.h is included several times, each time with a different
 * definition of SIMD_METHOD(str,name), to generate the strings and the ids
 * from a single list.
 *
 * HAVE_ARRAY_ELEM_INIT variant: every string becomes a char array member of
 * one struct (member names are built from __LINE__), each SN_* enumerator is
 * the offsetof () its member, and method_name (idx) adds that offset to the
 * address of the struct.
 * NOTE(review): presumably done to avoid a table of pointers -- confirm.
 */
81 #ifdef HAVE_ARRAY_ELEM_INIT
82 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
83 #define MSGSTRFIELD1(line) str##line
84 static const struct msgstr_t {
85 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
86 #include "simd-methods.h"
87 #undef SIMD_METHOD
88 } method_names = {
89 #define SIMD_METHOD(str,name) str,
90 #include "simd-methods.h"
91 #undef SIMD_METHOD
94 enum {
95 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
96 #include "simd-methods.h"
98 #define method_name(idx) ((const char*)&method_names + (idx))
/*
 * Fallback variant: method_names is a NULL terminated array of string
 * pointers, the SN_* enumerators are consecutive indexes into it (SN_LAST
 * follows the last method) and method_name (idx) is a plain array lookup.
 */
100 #else
101 #define SIMD_METHOD(str,name) str,
102 static const char * const method_names [] = {
103 #include "simd-methods.h"
104 NULL
106 #undef SIMD_METHOD
107 #define SIMD_METHOD(str,name) name,
108 enum {
109 #include "simd-methods.h"
110 SN_LAST
113 #define method_name(idx) (method_names [(idx)])
115 #endif
117 typedef struct {
118 guint16 name;
119 guint16 opcode;
120 guint8 simd_emit_mode : 4;
121 guint8 simd_version : 4;
122 guint8 flags;
123 } SimdIntrinsc;
126 Missing:
127 setters
129 static const SimdIntrinsc vector4f_intrinsics[] = {
130 { SN_ctor, 0, SIMD_EMIT_CTOR },
131 { SN_AddSub, OP_ADDSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
132 { SN_AndNot, OP_ANDNPS, SIMD_EMIT_BINARY },
133 { SN_CompareEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
134 { SN_CompareLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
135 { SN_CompareLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
136 { SN_CompareNotEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
137 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
138 { SN_CompareNotLessThan, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
139 { SN_CompareOrdered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
140 { SN_CompareUnordered, OP_COMPPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
141 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
142 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
143 { SN_HorizontalAdd, OP_HADDPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
144 { SN_HorizontalSub, OP_HSUBPS, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
145 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_EMIT_BINARY },
146 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_EMIT_BINARY },
147 { SN_InvSqrt, OP_RSQRTPS, SIMD_EMIT_UNARY },
148 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
149 { SN_Max, OP_MAXPS, SIMD_EMIT_BINARY },
150 { SN_Min, OP_MINPS, SIMD_EMIT_BINARY },
151 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
152 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
153 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
154 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
155 { SN_Reciprocal, OP_RCPPS, SIMD_EMIT_UNARY },
156 { SN_Shuffle, OP_SHUFLEPS, SIMD_EMIT_SHUFFLE },
157 { SN_Sqrt, OP_SQRTPS, SIMD_EMIT_UNARY },
158 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
159 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_EMIT_STORE },
160 { SN_get_W, 3, SIMD_EMIT_GETTER },
161 { SN_get_X, 0, SIMD_EMIT_GETTER },
162 { SN_get_Y, 1, SIMD_EMIT_GETTER },
163 { SN_get_Z, 2, SIMD_EMIT_GETTER },
164 { SN_op_Addition, OP_ADDPS, SIMD_EMIT_BINARY },
165 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_EMIT_BINARY },
166 { SN_op_BitwiseOr, OP_ORPS, SIMD_EMIT_BINARY },
167 { SN_op_Division, OP_DIVPS, SIMD_EMIT_BINARY },
168 { SN_op_ExclusiveOr, OP_XORPS, SIMD_EMIT_BINARY },
169 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
170 { SN_op_Multiply, OP_MULPS, SIMD_EMIT_BINARY },
171 { SN_op_Subtraction, OP_SUBPS, SIMD_EMIT_BINARY },
175 Missing:
176 .ctor
177 getters
178 setters
180 static const SimdIntrinsc vector2d_intrinsics[] = {
181 { SN_AddSub, OP_ADDSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
182 { SN_AndNot, OP_ANDNPD, SIMD_EMIT_BINARY },
183 { SN_CompareEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_EQ },
184 { SN_CompareLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LE },
185 { SN_CompareLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_LT },
186 { SN_CompareNotEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NEQ },
187 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLE },
188 { SN_CompareNotLessThan, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_NLT },
189 { SN_CompareOrdered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_ORD },
190 { SN_CompareUnordered, OP_COMPPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1, SIMD_COMP_UNORD },
191 { SN_Duplicate, OP_DUPPD, SIMD_EMIT_UNARY, SIMD_VERSION_SSE3 },
192 { SN_HorizontalAdd, OP_HADDPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
193 { SN_HorizontalSub, OP_HSUBPD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE3 },
194 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_EMIT_BINARY },
195 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_EMIT_BINARY },
196 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
197 { SN_Max, OP_MAXPD, SIMD_EMIT_BINARY },
198 { SN_Min, OP_MINPD, SIMD_EMIT_BINARY },
199 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
200 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
201 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
202 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
203 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
204 { SN_op_Addition, OP_ADDPD, SIMD_EMIT_BINARY },
205 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_EMIT_BINARY },
206 { SN_op_BitwiseOr, OP_ORPD, SIMD_EMIT_BINARY },
207 { SN_op_Division, OP_DIVPD, SIMD_EMIT_BINARY },
208 { SN_op_ExclusiveOr, OP_XORPD, SIMD_EMIT_BINARY },
209 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
210 { SN_op_Multiply, OP_MULPD, SIMD_EMIT_BINARY },
211 { SN_op_Subtraction, OP_SUBPD, SIMD_EMIT_BINARY },
215 Missing:
216 .ctor
217 getters
218 setters
220 static const SimdIntrinsc vector2ul_intrinsics[] = {
221 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
222 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
223 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
224 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
225 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
226 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
227 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
228 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
229 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
230 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
231 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
232 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
233 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
234 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
235 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
236 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
237 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
238 { SN_op_RightShift, OP_PSHRQ, SIMD_EMIT_SHIFT },
239 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
243 Missing:
244 .ctor
245 getters
246 setters
248 static const SimdIntrinsc vector2l_intrinsics[] = {
249 { SN_CompareEqual, OP_PCMPEQQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
250 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_EMIT_BINARY, SIMD_VERSION_SSE42 },
251 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
252 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
253 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
254 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
255 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
256 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
257 { SN_ShiftRightLogic, OP_PSHRQ, SIMD_EMIT_SHIFT },
258 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
259 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_EMIT_BINARY },
260 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_EMIT_BINARY },
261 { SN_op_Addition, OP_PADDQ, SIMD_EMIT_BINARY },
262 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
263 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
264 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
265 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
266 { SN_op_LeftShift, OP_PSHLQ, SIMD_EMIT_SHIFT },
267 { SN_op_Multiply, OP_PMULQ, SIMD_EMIT_BINARY },
268 { SN_op_Subtraction, OP_PSUBQ, SIMD_EMIT_BINARY },
272 Missing:
273 .ctor
274 getters
275 setters
277 static const SimdIntrinsc vector4ui_intrinsics[] = {
278 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
279 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
280 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
281 { SN_Max, OP_PMAXD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
282 { SN_Min, OP_PMIND_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
283 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
284 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
285 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
286 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
287 { SN_ShiftRightArithmetic, OP_PSARD, SIMD_EMIT_SHIFT },
288 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
289 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
290 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
291 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
292 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
293 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
294 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
295 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
296 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
297 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
298 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
299 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
300 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
301 { SN_op_RightShift, OP_PSHRD, SIMD_EMIT_SHIFT },
302 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
306 Missing:
307 .ctor
308 getters
309 setters
311 static const SimdIntrinsc vector4i_intrinsics[] = {
312 { SN_CompareEqual, OP_PCMPEQD, SIMD_EMIT_BINARY },
313 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_EMIT_BINARY },
314 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
315 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
316 { SN_Max, OP_PMAXD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
317 { SN_Min, OP_PMIND, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
318 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_EMIT_BINARY },
319 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
320 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
321 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
322 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
323 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
324 { SN_ShiftRightLogic, OP_PSHRD, SIMD_EMIT_SHIFT },
325 { SN_Shuffle, OP_PSHUFLED, SIMD_EMIT_SHUFFLE },
326 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
327 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_EMIT_BINARY },
328 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_EMIT_BINARY },
329 { SN_op_Addition, OP_PADDD, SIMD_EMIT_BINARY },
330 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
331 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
332 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
333 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
334 { SN_op_LeftShift, OP_PSHLD, SIMD_EMIT_SHIFT },
335 { SN_op_Multiply, OP_PMULD, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
336 { SN_op_RightShift, OP_PSARD, SIMD_EMIT_SHIFT },
337 { SN_op_Subtraction, OP_PSUBD, SIMD_EMIT_BINARY },
341 Missing:
342 .ctor
343 getters
344 setters
346 static const SimdIntrinsc vector8us_intrinsics[] = {
347 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_EMIT_BINARY },
348 { SN_Average, OP_PAVGW_UN, SIMD_EMIT_BINARY },
349 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
350 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
351 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
352 { SN_Max, OP_PMAXW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
353 { SN_Min, OP_PMINW_UN, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
354 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_EMIT_BINARY },
355 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
356 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
357 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
358 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
359 { SN_ShiftRightArithmetic, OP_PSARW, SIMD_EMIT_SHIFT },
360 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
361 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
362 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
363 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
364 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
365 { SN_SubWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
366 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
367 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
368 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
369 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
370 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
371 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
372 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
373 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
374 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
375 { SN_op_RightShift, OP_PSHRW, SIMD_EMIT_SHIFT },
376 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
380 Missing:
381 .ctor
382 getters
383 setters
385 static const SimdIntrinsc vector8s_intrinsics[] = {
386 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_EMIT_BINARY },
387 { SN_CompareEqual, OP_PCMPEQW, SIMD_EMIT_BINARY },
388 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_EMIT_BINARY },
389 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
390 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
391 { SN_Max, OP_PMAXW, SIMD_EMIT_BINARY },
392 { SN_Min, OP_PMINW, SIMD_EMIT_BINARY },
393 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_EMIT_BINARY },
394 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_EMIT_BINARY },
395 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_EMIT_BINARY },
396 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
397 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
398 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
399 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
400 { SN_ShiftRightLogic, OP_PSHRW, SIMD_EMIT_SHIFT },
401 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_EMIT_SHUFFLE },
402 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_EMIT_SHUFFLE },
403 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
404 { SN_SubWithSaturation, OP_PSUBW_SAT_UN, SIMD_EMIT_BINARY },
405 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_EMIT_BINARY },
406 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_EMIT_BINARY },
407 { SN_op_Addition, OP_PADDW, SIMD_EMIT_BINARY },
408 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
409 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
410 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
411 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
412 { SN_op_LeftShift, OP_PSHLW, SIMD_EMIT_SHIFT },
413 { SN_op_Multiply, OP_PMULW, SIMD_EMIT_BINARY },
414 { SN_op_RightShift, OP_PSARW, SIMD_EMIT_SHIFT },
415 { SN_op_Subtraction, OP_PSUBW, SIMD_EMIT_BINARY },
419 Missing:
420 .ctor
421 getters
422 setters
424 static const SimdIntrinsc vector16b_intrinsics[] = {
425 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_EMIT_BINARY },
426 { SN_Average, OP_PAVGB_UN, SIMD_EMIT_BINARY },
427 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
428 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
429 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
430 { SN_Max, OP_PMAXB_UN, SIMD_EMIT_BINARY },
431 { SN_Min, OP_PMINB_UN, SIMD_EMIT_BINARY },
432 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
433 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
434 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
435 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
436 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
437 { SN_SubWithSaturation, OP_PSUBB_SAT_UN, SIMD_EMIT_BINARY },
438 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_EMIT_BINARY },
439 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
440 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
441 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
442 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
443 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
444 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
445 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
446 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
450 Missing:
451 .ctor
452 getters
453 setters
455 static const SimdIntrinsc vector16sb_intrinsics[] = {
456 { SN_AddWithSaturation, OP_PADDB_SAT, SIMD_EMIT_BINARY },
457 { SN_CompareEqual, OP_PCMPEQB, SIMD_EMIT_BINARY },
458 { SN_CompareGreaterThan, OP_PCMPGTB, SIMD_EMIT_BINARY },
459 { SN_ExtractByteMask, 0, SIMD_EMIT_EXTRACT_MASK },
460 { SN_LoadAligned, 0, SIMD_EMIT_LOAD_ALIGNED },
461 { SN_Max, OP_PMAXB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
462 { SN_Min, OP_PMINB, SIMD_EMIT_BINARY, SIMD_VERSION_SSE41 },
463 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_0 },
464 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_1 },
465 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_2 },
466 { SN_PrefetchNonTemporal, 0, SIMD_EMIT_PREFETCH, SIMD_VERSION_SSE1, SIMD_PREFETCH_MODE_NTA },
467 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_EMIT_STORE },
468 { SN_SubWithSaturation, OP_PSUBB_SAT, SIMD_EMIT_BINARY },
469 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_EMIT_BINARY },
470 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_EMIT_BINARY },
471 { SN_op_Addition, OP_PADDB, SIMD_EMIT_BINARY },
472 { SN_op_BitwiseAnd, OP_PAND, SIMD_EMIT_BINARY },
473 { SN_op_BitwiseOr, OP_POR, SIMD_EMIT_BINARY },
474 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY },
475 { SN_op_Explicit, 0, SIMD_EMIT_CAST },
476 { SN_op_Subtraction, OP_PSUBB, SIMD_EMIT_BINARY },
479 static guint32 simd_supported_versions;
481 /*TODO match using number of parameters as well*/
482 static int
483 simd_intrinsic_compare_by_name (const void *key, const void *value)
485 return strcmp (key, method_name (((SimdIntrinsc *)value)->name));
/*
 * Per vreg state bits used by mono_simd_simplify_indirection ().
 */
typedef enum {
	/* the vreg belongs to a SIMD var that is neither indirect nor volatile */
	VREG_USED             = 1 << 0,
	/* an OP_XZERO targeting the vreg was seen in the first basic block */
	VREG_HAS_XZERO_BB0    = 1 << 1,
	/* some other instruction of the first basic block touches the vreg */
	VREG_HAS_OTHER_OP_BB0 = 1 << 2,
	/* exactly one later basic block touches the vreg so far */
	VREG_SINGLE_BB_USE    = 1 << 3,
	/* more than one later basic block touches the vreg */
	VREG_MANY_BB_USE      = 1 << 4,
} KillFlags;
496 void
497 mono_simd_intrinsics_init (void)
499 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
500 /*TODO log the supported flags*/
503 static inline gboolean
504 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
506 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
507 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
508 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
509 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
510 return TRUE;
512 return FALSE;
515 static inline gboolean
516 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
518 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
519 return FALSE;
521 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
522 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
523 vreg_flags [reg] |= VREG_MANY_BB_USE;
524 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
525 return TRUE;
526 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
527 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
528 target_bb [reg] = bb;
529 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
530 return TRUE;
532 return FALSE;
535 This pass recalculate which vars need MONO_INST_INDIRECT.
537 We cannot do this for non SIMD vars since code like mono_get_vtable_var
538 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
540 void
541 mono_simd_simplify_indirection (MonoCompile *cfg)
543 int i, max_vreg = 0;
544 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
545 MonoInst *ins;
546 char *vreg_flags;
548 for (i = 0; i < cfg->num_varinfo; i++) {
549 MonoInst *var = cfg->varinfo [i];
550 if (var->klass->simd_type) {
551 var->flags &= ~MONO_INST_INDIRECT;
552 max_vreg = MAX (var->dreg, max_vreg);
556 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
557 if (!first_bb && bb->code)
558 first_bb = bb;
559 for (ins = bb->code; ins; ins = ins->next) {
560 if (ins->opcode == OP_LDADDR) {
561 MonoInst *var = (MonoInst*)ins->inst_p0;
562 if (var->klass->simd_type) {
563 var->flags |= MONO_INST_INDIRECT;
569 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
570 vreg_flags = g_malloc0 (max_vreg + 1);
571 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
573 for (i = 0; i < cfg->num_varinfo; i++) {
574 MonoInst *var = cfg->varinfo [i];
575 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
576 vreg_flags [var->dreg] = VREG_USED;
577 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
581 /*Scan the first basic block looking xzeros not used*/
582 for (ins = first_bb->code; ins; ins = ins->next) {
583 if (ins->opcode == OP_XZERO) {
584 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
585 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
586 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
588 continue;
590 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
591 continue;
593 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
594 continue;
595 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg1, max_vreg, vreg_flags))
596 continue;
597 if (apply_vreg_first_block_interference (cfg, ins, ins->sreg2, max_vreg, vreg_flags))
598 continue;
601 if (IS_DEBUG_ON (cfg)) {
602 for (i = 0; i < cfg->num_varinfo; i++) {
603 MonoInst *var = cfg->varinfo [i];
604 if (var->klass->simd_type) {
605 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
606 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
607 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
608 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
613 /*TODO stop here if no var is xzero only*/
616 Scan all other bb and check if it has only one other use
617 Ideally this would be done after an extended bb formation pass
619 FIXME This pass could use dominator information to properly
620 place the XZERO on the bb that dominates all uses of the var,
621 but this will have zero effect with the current local reg alloc
623 TODO simply the use of flags.
626 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
627 for (ins = bb->code; ins; ins = ins->next) {
629 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
630 continue;
631 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
632 continue;
633 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg1, bb, max_vreg, vreg_flags, target_bb))
634 continue;
635 if (apply_vreg_following_block_interference (cfg, ins, ins->sreg2, bb, max_vreg, vreg_flags, target_bb))
636 continue;
640 for (i = 0; i < cfg->num_varinfo; i++) {
641 MonoInst *var = cfg->varinfo [i];
642 if (!var->klass->simd_type)
643 continue;
644 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
645 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
646 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
647 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
649 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
650 continue;
651 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
652 /*We can, pretty much kill it.*/
653 if (ins->dreg == var->dreg) {
654 break;
655 } else if (ins->sreg1 == var->dreg || ins->sreg2 == var->dreg) {
656 MonoInst *tmp;
657 MONO_INST_NEW (cfg, tmp, OP_XZERO);
658 tmp->dreg = var->dreg;
659 tmp->type = STACK_VTYPE;
660 tmp->klass = var->klass;
661 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
662 break;
667 for (ins = first_bb->code; ins; ins = ins->next) {
668 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE))
669 NULLIFY_INS (ins);
672 g_free (vreg_flags);
673 g_free (target_bb);
677 * This function expect that src be a value.
679 static int
680 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
682 if (src->opcode == OP_XMOVE) {
683 return src->sreg1;
684 } else if (src->type == STACK_VTYPE) {
685 return src->dreg;
687 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
688 mono_print_ins (src);
689 g_assert_not_reached ();
693 * This function will load the value if needed.
695 static int
696 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
698 if (src->opcode == OP_XMOVE) {
699 return src->sreg1;
700 } else if (src->opcode == OP_LDADDR) {
701 int res = ((MonoInst*)src->inst_p0)->dreg;
702 NULLIFY_INS (src);
703 return res;
704 } else if (src->type == STACK_VTYPE) {
705 return src->dreg;
706 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
707 MonoInst *ins;
709 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
710 ins->klass = cmethod->klass;
711 ins->sreg1 = src->dreg;
712 ins->type = STACK_VTYPE;
713 ins->dreg = alloc_ireg (cfg);
714 MONO_ADD_INS (cfg->cbb, ins);
715 return ins->dreg;
717 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
718 mono_print_ins (src);
719 g_assert_not_reached ();
722 static MonoInst*
723 get_int_to_float_spill_area (MonoCompile *cfg)
725 if (!cfg->iconv_raw_var) {
726 cfg->iconv_raw_var = mono_compile_create_var (cfg, &mono_defaults.int32_class->byval_arg, OP_LOCAL);
727 cfg->iconv_raw_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
729 return cfg->iconv_raw_var;
732 static MonoInst*
733 simd_intrinsic_emit_binary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
735 MonoInst* ins;
736 int left_vreg, right_vreg;
738 left_vreg = get_simd_vreg (cfg, cmethod, args [0]);
739 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
742 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
743 ins->klass = cmethod->klass;
744 ins->sreg1 = left_vreg;
745 ins->sreg2 = right_vreg;
746 ins->type = STACK_VTYPE;
747 ins->klass = cmethod->klass;
748 ins->dreg = alloc_ireg (cfg);
749 ins->inst_c0 = intrinsic->flags;
750 MONO_ADD_INS (cfg->cbb, ins);
751 return ins;
754 static MonoInst*
755 simd_intrinsic_emit_unary (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
757 MonoInst* ins;
758 int vreg;
760 vreg = get_simd_vreg (cfg, cmethod, args [0]);
762 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
763 ins->klass = cmethod->klass;
764 ins->sreg1 = vreg;
765 ins->type = STACK_VTYPE;
766 ins->dreg = alloc_ireg (cfg);
767 MONO_ADD_INS (cfg->cbb, ins);
768 return ins;
771 static MonoInst*
772 simd_intrinsic_emit_getter (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
774 MonoInst *tmp, *ins;
775 int vreg;
777 vreg = load_simd_vreg (cfg, cmethod, args [0]);
779 if (intrinsic->opcode) {
780 MONO_INST_NEW (cfg, ins, OP_SHUFLEPS);
781 ins->klass = cmethod->klass;
782 ins->sreg1 = vreg;
783 ins->inst_c0 = intrinsic->opcode;
784 ins->type = STACK_VTYPE;
785 ins->dreg = vreg = alloc_ireg (cfg);
786 MONO_ADD_INS (cfg->cbb, ins);
789 MONO_INST_NEW (cfg, tmp, OP_EXTRACT_I4);
790 tmp->klass = cmethod->klass;
791 tmp->sreg1 = vreg;
792 tmp->type = STACK_I4;
793 tmp->dreg = alloc_ireg (cfg);
794 MONO_ADD_INS (cfg->cbb, tmp);
796 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_R8_RAW);
797 ins->klass = mono_defaults.single_class;
798 ins->sreg1 = tmp->dreg;
799 ins->type = STACK_R8;
800 ins->dreg = alloc_freg (cfg);
801 ins->backend.spill_var = get_int_to_float_spill_area (cfg);
802 MONO_ADD_INS (cfg->cbb, ins);
803 return ins;
806 static MonoInst*
807 simd_intrinsic_emit_ctor (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
809 MonoInst *ins;
810 int i;
812 for (i = 1; i < 5; ++i) {
813 MONO_INST_NEW (cfg, ins, OP_PUSH_R4);
814 ins->sreg1 = args [5 - i]->dreg;
815 ins->klass = args [5 - i]->klass;
816 MONO_ADD_INS (cfg->cbb, ins);
819 if (args [0]->opcode == OP_LDADDR) { /*Eliminate LDADDR if it's initing a local var*/
820 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
821 NULLIFY_INS (args [0]);
823 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
824 ins->klass = cmethod->klass;
825 ins->type = STACK_VTYPE;
826 ins->dreg = vreg;
827 MONO_ADD_INS (cfg->cbb, ins);
828 } else {
829 int vreg = alloc_ireg (cfg);
831 MONO_INST_NEW (cfg, ins, OP_LOADX_STACK);
832 ins->klass = cmethod->klass;
833 ins->type = STACK_VTYPE;
834 ins->dreg = vreg;
835 MONO_ADD_INS (cfg->cbb, ins);
837 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE_REG);
838 ins->klass = cmethod->klass;
839 ins->dreg = args [0]->dreg;
840 ins->sreg1 = vreg;
841 MONO_ADD_INS (cfg->cbb, ins);
843 return ins;
846 static MonoInst*
847 simd_intrinsic_emit_cast (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
849 MonoInst *ins;
850 int vreg;
852 vreg = get_simd_vreg (cfg, cmethod, args [0]);
854 //TODO macroize this
855 MONO_INST_NEW (cfg, ins, OP_XMOVE);
856 ins->klass = cmethod->klass;
857 ins->type = STACK_VTYPE;
858 ins->sreg1 = vreg;
859 ins->dreg = alloc_ireg (cfg);
860 MONO_ADD_INS (cfg->cbb, ins);
861 return ins;
864 static MonoInst*
866 simd_intrinsic_emit_shift (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
868 MonoInst *ins;
869 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
871 vreg = get_simd_vreg (cfg, cmethod, args [0]);
873 if (args [1]->opcode != OP_ICONST) {
874 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
875 ins->klass = mono_defaults.int32_class;
876 ins->sreg1 = args [1]->dreg;
877 ins->type = STACK_I4;
878 ins->dreg = vreg2 = alloc_ireg (cfg);
879 MONO_ADD_INS (cfg->cbb, ins);
881 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
884 MONO_INST_NEW (cfg, ins, opcode);
885 ins->klass = cmethod->klass;
886 ins->sreg1 = vreg;
887 ins->sreg2 = vreg2;
889 if (args [1]->opcode == OP_ICONST) {
890 ins->inst_imm = args [1]->inst_c0;
891 NULLIFY_INS (args [1]);
894 ins->type = STACK_VTYPE;
895 ins->dreg = alloc_ireg (cfg);
896 MONO_ADD_INS (cfg->cbb, ins);
897 return ins;
901 static MonoInst*
902 simd_intrinsic_emit_shuffle (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
904 MonoInst *ins;
905 int vreg;
907 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
909 if (args [1]->opcode != OP_ICONST) {
910 g_warning ("Shuffle with non literals is not yet supported");
911 g_assert_not_reached ();
913 vreg = get_simd_vreg (cfg, cmethod, args [0]);
914 NULLIFY_INS (args [1]);
916 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
917 ins->klass = cmethod->klass;
918 ins->sreg1 = vreg;
919 ins->inst_c0 = args [1]->inst_c0;
920 ins->type = STACK_VTYPE;
921 ins->dreg = alloc_ireg (cfg);
922 MONO_ADD_INS (cfg->cbb, ins);
923 return ins;
926 static MonoInst*
927 simd_intrinsic_emit_load_aligned (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
929 MonoInst *ins;
931 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
932 ins->klass = cmethod->klass;
933 ins->sreg1 = args [0]->dreg;
934 ins->type = STACK_VTYPE;
935 ins->dreg = alloc_ireg (cfg);
936 MONO_ADD_INS (cfg->cbb, ins);
937 return ins;
940 static MonoInst*
941 simd_intrinsic_emit_store (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
943 MonoInst *ins;
944 int vreg;
946 vreg = get_simd_vreg (cfg, cmethod, args [1]);
948 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
949 ins->klass = cmethod->klass;
950 ins->dreg = args [0]->dreg;
951 ins->sreg1 = vreg;
952 ins->type = STACK_VTYPE;
953 MONO_ADD_INS (cfg->cbb, ins);
954 return ins;
957 static MonoInst*
958 simd_intrinsic_emit_extract_mask (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
960 MonoInst *ins;
961 int vreg;
963 vreg = get_simd_vreg (cfg, cmethod, args [0]);
965 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
966 ins->klass = cmethod->klass;
967 ins->sreg1 = vreg;
968 ins->type = STACK_I4;
969 ins->dreg = alloc_ireg (cfg);
970 MONO_ADD_INS (cfg->cbb, ins);
972 return ins;
975 static MonoInst*
976 simd_intrinsic_emit_prefetch (const SimdIntrinsc *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
978 MonoInst *ins;
980 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
981 ins->klass = cmethod->klass;
982 ins->sreg1 = args [0]->dreg;
983 ins->backend.arg_info = intrinsic->flags;
984 MONO_ADD_INS (cfg->cbb, ins);
985 return ins;
988 static const char *
989 simd_version_name (guint32 version)
991 switch (version) {
992 case SIMD_VERSION_SSE1:
993 return "sse1";
994 case SIMD_VERSION_SSE2:
995 return "sse2";
996 case SIMD_VERSION_SSE3:
997 return "sse3";
998 case SIMD_VERSION_SSSE3:
999 return "ssse3";
1000 case SIMD_VERSION_SSE41:
1001 return "sse41";
1002 case SIMD_VERSION_SSE42:
1003 return "sse42";
1004 case SIMD_VERSION_SSE4a:
1005 return "sse4a";
1007 return "n/a";
1010 static MonoInst*
1011 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsc *intrinsics, guint32 size)
1013 const SimdIntrinsc * result = bsearch (cmethod->name, intrinsics, size, sizeof (SimdIntrinsc), &simd_intrinsic_compare_by_name);
1014 if (!result) {
1015 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1016 return NULL;
1018 if (IS_DEBUG_ON (cfg)) {
1019 int i, max;
1020 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1021 max = fsig->param_count + fsig->hasthis;
1022 for (i = 0; i < max; ++i) {
1023 printf ("param %d: ", i);
1024 mono_print_ins (args [i]);
1027 if (result->simd_version && !((1 << result->simd_version) & simd_supported_versions)) {
1028 if (IS_DEBUG_ON (cfg))
1029 printf ("function %s::%s/%d requires unsuported SIMD instruction set %s \n", cmethod->klass->name, cmethod->name, fsig->param_count, simd_version_name (result->simd_version));
1030 return NULL;
1033 switch (result->simd_emit_mode) {
1034 case SIMD_EMIT_BINARY:
1035 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1036 case SIMD_EMIT_UNARY:
1037 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1038 case SIMD_EMIT_GETTER:
1039 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1040 case SIMD_EMIT_CTOR:
1041 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1042 case SIMD_EMIT_CAST:
1043 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1044 case SIMD_EMIT_SHUFFLE:
1045 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1046 case SIMD_EMIT_SHIFT:
1047 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1048 case SIMD_EMIT_LOAD_ALIGNED:
1049 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1050 case SIMD_EMIT_STORE:
1051 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1052 case SIMD_EMIT_EXTRACT_MASK:
1053 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1054 case SIMD_EMIT_PREFETCH:
1055 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1057 g_assert_not_reached ();
1060 static MonoInst*
1061 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1063 if (!strcmp ("get_AccelMode", cmethod->name)) {
1064 MonoInst *ins;
1065 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1066 return ins;
1068 return NULL;
1071 MonoInst*
1072 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1074 if (!strcmp ("Mono.Simd", cmethod->klass->name_space) && !strcmp ("SimdRuntime", cmethod->klass->name))
1075 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
1076 if (!cmethod->klass->simd_type)
1077 return NULL;
1078 cfg->uses_simd_intrinsics = 1;
1079 if (!strcmp ("Vector2d", cmethod->klass->name))
1080 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsc));
1081 if (!strcmp ("Vector4f", cmethod->klass->name))
1082 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsc));
1083 if (!strcmp ("Vector2ul", cmethod->klass->name))
1084 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsc));
1085 if (!strcmp ("Vector2l", cmethod->klass->name))
1086 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsc));
1087 if (!strcmp ("Vector4ui", cmethod->klass->name))
1088 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsc));
1089 if (!strcmp ("Vector4i", cmethod->klass->name))
1090 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsc));
1091 if (!strcmp ("Vector8us", cmethod->klass->name))
1092 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsc));
1093 if (!strcmp ("Vector8s", cmethod->klass->name))
1094 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsc));
1095 if (!strcmp ("Vector16b", cmethod->klass->name))
1096 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsc));
1097 if (!strcmp ("Vector16sb", cmethod->klass->name))
1098 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsc));
1100 return NULL;
1103 #endif