2 * simd-intrinsics.c: simd support for intrinsics
5 * Rodrigo Kumpera (rkumpera@novell.com)
7 * (C) 2008 Novell, Inc.
18 General notes on SIMD intrinsics
20 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
21 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
22 TODO extend op_to_op_dest_membase to handle simd ops
23 TODO add support for indexed versions of simd ops
24 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
25 TODO make sure locals, arguments and spills are properly aligned.
26 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
27 TODO add stuff to man pages
28 TODO document this under /docs
29 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
30 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like iconv_to_r8_raw. (or just pinst sse ops)
31 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
32 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
33 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
34 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
35 TODO check if we need to init the SSE control word with better precision.
36 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
37 TODO make SimdRuntime.get_AccelMode work under AOT
39 General notes for SIMD intrinsics.
41 -Bad extractor and constructor performance
42 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
43 It will be loaded in the FP stack just to be pushed on the call stack.
45 A similar thing happens with Vector4f constructor that require float vars to be
47 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
48 trip to the FP stack is desirable.
50 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
54 -Promote OP_EXTRACT_I4 to a STORE op
55 The advantage of this change is that it could have a _membase version and promote further optimizations.
57 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
61 #ifdef MONO_ARCH_SIMD_INTRINSICS
63 //#define IS_DEBUG_ON(cfg) (0)
65 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
66 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
75 SIMD_EMIT_LOAD_ALIGNED
,
77 SIMD_EMIT_EXTRACT_MASK
,
81 #ifdef HAVE_ARRAY_ELEM_INIT
82 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
83 #define MSGSTRFIELD1(line) str##line
84 static const struct msgstr_t
{
85 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
86 #include "simd-methods.h"
89 #define SIMD_METHOD(str,name) str,
90 #include "simd-methods.h"
95 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
96 #include "simd-methods.h"
98 #define method_name(idx) ((const char*)&method_names + (idx))
101 #define SIMD_METHOD(str,name) str,
102 static const char * const method_names
[] = {
103 #include "simd-methods.h"
107 #define SIMD_METHOD(str,name) name,
109 #include "simd-methods.h"
113 #define method_name(idx) (method_names [(idx)])
120 guint8 simd_emit_mode
: 4;
121 guint8 simd_version
: 4;
129 static const SimdIntrinsc vector4f_intrinsics
[] = {
130 { SN_ctor
, 0, SIMD_EMIT_CTOR
},
131 { SN_AddSub
, OP_ADDSUBPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE3
},
132 { SN_AndNot
, OP_ANDNPS
, SIMD_EMIT_BINARY
},
133 { SN_CompareEqual
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_EQ
},
134 { SN_CompareLessEqual
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_LE
},
135 { SN_CompareLessThan
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_LT
},
136 { SN_CompareNotEqual
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_NEQ
},
137 { SN_CompareNotLessEqual
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_NLE
},
138 { SN_CompareNotLessThan
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_NLT
},
139 { SN_CompareOrdered
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_ORD
},
140 { SN_CompareUnordered
, OP_COMPPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_UNORD
},
141 { SN_DuplicateHigh
, OP_DUPPS_HIGH
, SIMD_EMIT_UNARY
, SIMD_VERSION_SSE3
},
142 { SN_DuplicateLow
, OP_DUPPS_LOW
, SIMD_EMIT_UNARY
, SIMD_VERSION_SSE3
},
143 { SN_HorizontalAdd
, OP_HADDPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE3
},
144 { SN_HorizontalSub
, OP_HSUBPS
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE3
},
145 { SN_InterleaveHigh
, OP_UNPACK_HIGHPS
, SIMD_EMIT_BINARY
},
146 { SN_InterleaveLow
, OP_UNPACK_LOWPS
, SIMD_EMIT_BINARY
},
147 { SN_InvSqrt
, OP_RSQRTPS
, SIMD_EMIT_UNARY
},
148 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
149 { SN_Max
, OP_MAXPS
, SIMD_EMIT_BINARY
},
150 { SN_Min
, OP_MINPS
, SIMD_EMIT_BINARY
},
151 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
152 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
153 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
154 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
155 { SN_Reciprocal
, OP_RCPPS
, SIMD_EMIT_UNARY
},
156 { SN_Shuffle
, OP_SHUFLEPS
, SIMD_EMIT_SHUFFLE
},
157 { SN_Sqrt
, OP_SQRTPS
, SIMD_EMIT_UNARY
},
158 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
159 { SN_StoreNonTemporal
, OP_STOREX_NTA_MEMBASE_REG
, SIMD_EMIT_STORE
},
160 { SN_get_W
, 3, SIMD_EMIT_GETTER
},
161 { SN_get_X
, 0, SIMD_EMIT_GETTER
},
162 { SN_get_Y
, 1, SIMD_EMIT_GETTER
},
163 { SN_get_Z
, 2, SIMD_EMIT_GETTER
},
164 { SN_op_Addition
, OP_ADDPS
, SIMD_EMIT_BINARY
},
165 { SN_op_BitwiseAnd
, OP_ANDPS
, SIMD_EMIT_BINARY
},
166 { SN_op_BitwiseOr
, OP_ORPS
, SIMD_EMIT_BINARY
},
167 { SN_op_Division
, OP_DIVPS
, SIMD_EMIT_BINARY
},
168 { SN_op_ExclusiveOr
, OP_XORPS
, SIMD_EMIT_BINARY
},
169 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
170 { SN_op_Multiply
, OP_MULPS
, SIMD_EMIT_BINARY
},
171 { SN_op_Subtraction
, OP_SUBPS
, SIMD_EMIT_BINARY
},
180 static const SimdIntrinsc vector2d_intrinsics
[] = {
181 { SN_AddSub
, OP_ADDSUBPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE3
},
182 { SN_AndNot
, OP_ANDNPD
, SIMD_EMIT_BINARY
},
183 { SN_CompareEqual
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_EQ
},
184 { SN_CompareLessEqual
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_LE
},
185 { SN_CompareLessThan
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_LT
},
186 { SN_CompareNotEqual
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_NEQ
},
187 { SN_CompareNotLessEqual
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_NLE
},
188 { SN_CompareNotLessThan
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_NLT
},
189 { SN_CompareOrdered
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_ORD
},
190 { SN_CompareUnordered
, OP_COMPPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
, SIMD_COMP_UNORD
},
191 { SN_Duplicate
, OP_DUPPD
, SIMD_EMIT_UNARY
, SIMD_VERSION_SSE3
},
192 { SN_HorizontalAdd
, OP_HADDPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE3
},
193 { SN_HorizontalSub
, OP_HSUBPD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE3
},
194 { SN_InterleaveHigh
, OP_UNPACK_HIGHPD
, SIMD_EMIT_BINARY
},
195 { SN_InterleaveLow
, OP_UNPACK_LOWPD
, SIMD_EMIT_BINARY
},
196 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
197 { SN_Max
, OP_MAXPD
, SIMD_EMIT_BINARY
},
198 { SN_Min
, OP_MINPD
, SIMD_EMIT_BINARY
},
199 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
200 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
201 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
202 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
203 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
204 { SN_op_Addition
, OP_ADDPD
, SIMD_EMIT_BINARY
},
205 { SN_op_BitwiseAnd
, OP_ANDPD
, SIMD_EMIT_BINARY
},
206 { SN_op_BitwiseOr
, OP_ORPD
, SIMD_EMIT_BINARY
},
207 { SN_op_Division
, OP_DIVPD
, SIMD_EMIT_BINARY
},
208 { SN_op_ExclusiveOr
, OP_XORPD
, SIMD_EMIT_BINARY
},
209 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
210 { SN_op_Multiply
, OP_MULPD
, SIMD_EMIT_BINARY
},
211 { SN_op_Subtraction
, OP_SUBPD
, SIMD_EMIT_BINARY
},
220 static const SimdIntrinsc vector2ul_intrinsics
[] = {
221 { SN_CompareEqual
, OP_PCMPEQQ
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
222 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
223 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
224 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
225 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
226 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
227 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
228 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
229 { SN_UnpackHigh
, OP_UNPACK_HIGHQ
, SIMD_EMIT_BINARY
},
230 { SN_UnpackLow
, OP_UNPACK_LOWQ
, SIMD_EMIT_BINARY
},
231 { SN_op_Addition
, OP_PADDQ
, SIMD_EMIT_BINARY
},
232 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
233 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
234 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
235 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
236 { SN_op_LeftShift
, OP_PSHLQ
, SIMD_EMIT_SHIFT
},
237 { SN_op_Multiply
, OP_PMULQ
, SIMD_EMIT_BINARY
},
238 { SN_op_RightShift
, OP_PSHRQ
, SIMD_EMIT_SHIFT
},
239 { SN_op_Subtraction
, OP_PSUBQ
, SIMD_EMIT_BINARY
},
248 static const SimdIntrinsc vector2l_intrinsics
[] = {
249 { SN_CompareEqual
, OP_PCMPEQQ
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
250 { SN_CompareGreaterThan
, OP_PCMPGTQ
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE42
},
251 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
252 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
253 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
254 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
255 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
256 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
257 { SN_ShiftRightLogic
, OP_PSHRQ
, SIMD_EMIT_SHIFT
},
258 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
259 { SN_UnpackHigh
, OP_UNPACK_HIGHQ
, SIMD_EMIT_BINARY
},
260 { SN_UnpackLow
, OP_UNPACK_LOWQ
, SIMD_EMIT_BINARY
},
261 { SN_op_Addition
, OP_PADDQ
, SIMD_EMIT_BINARY
},
262 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
263 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
264 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
265 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
266 { SN_op_LeftShift
, OP_PSHLQ
, SIMD_EMIT_SHIFT
},
267 { SN_op_Multiply
, OP_PMULQ
, SIMD_EMIT_BINARY
},
268 { SN_op_Subtraction
, OP_PSUBQ
, SIMD_EMIT_BINARY
},
277 static const SimdIntrinsc vector4ui_intrinsics
[] = {
278 { SN_CompareEqual
, OP_PCMPEQD
, SIMD_EMIT_BINARY
},
279 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
280 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
281 { SN_Max
, OP_PMAXD_UN
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
282 { SN_Min
, OP_PMIND_UN
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
283 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
284 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
285 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
286 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
287 { SN_ShiftRightArithmetic
, OP_PSARD
, SIMD_EMIT_SHIFT
},
288 { SN_Shuffle
, OP_PSHUFLED
, SIMD_EMIT_SHUFFLE
},
289 { SN_SignedPackWithSignedSaturation
, OP_PACKD
, SIMD_EMIT_BINARY
},
290 { SN_SignedPackWithUnsignedSaturation
, OP_PACKD_UN
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
291 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
292 { SN_UnpackHigh
, OP_UNPACK_HIGHD
, SIMD_EMIT_BINARY
},
293 { SN_UnpackLow
, OP_UNPACK_LOWD
, SIMD_EMIT_BINARY
},
294 { SN_op_Addition
, OP_PADDD
, SIMD_EMIT_BINARY
},
295 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
296 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
297 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
298 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
299 { SN_op_LeftShift
, OP_PSHLD
, SIMD_EMIT_SHIFT
},
300 { SN_op_Multiply
, OP_PMULD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
301 { SN_op_RightShift
, OP_PSHRD
, SIMD_EMIT_SHIFT
},
302 { SN_op_Subtraction
, OP_PSUBD
, SIMD_EMIT_BINARY
},
311 static const SimdIntrinsc vector4i_intrinsics
[] = {
312 { SN_CompareEqual
, OP_PCMPEQD
, SIMD_EMIT_BINARY
},
313 { SN_CompareGreaterThan
, OP_PCMPGTD
, SIMD_EMIT_BINARY
},
314 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
315 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
316 { SN_Max
, OP_PMAXD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
317 { SN_Min
, OP_PMIND
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
318 { SN_PackWithSignedSaturation
, OP_PACKD
, SIMD_EMIT_BINARY
},
319 { SN_PackWithUnsignedSaturation
, OP_PACKD_UN
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
320 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
321 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
322 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
323 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
324 { SN_ShiftRightLogic
, OP_PSHRD
, SIMD_EMIT_SHIFT
},
325 { SN_Shuffle
, OP_PSHUFLED
, SIMD_EMIT_SHUFFLE
},
326 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
327 { SN_UnpackHigh
, OP_UNPACK_HIGHD
, SIMD_EMIT_BINARY
},
328 { SN_UnpackLow
, OP_UNPACK_LOWD
, SIMD_EMIT_BINARY
},
329 { SN_op_Addition
, OP_PADDD
, SIMD_EMIT_BINARY
},
330 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
331 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
332 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
333 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
334 { SN_op_LeftShift
, OP_PSHLD
, SIMD_EMIT_SHIFT
},
335 { SN_op_Multiply
, OP_PMULD
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
336 { SN_op_RightShift
, OP_PSARD
, SIMD_EMIT_SHIFT
},
337 { SN_op_Subtraction
, OP_PSUBD
, SIMD_EMIT_BINARY
},
346 static const SimdIntrinsc vector8us_intrinsics
[] = {
347 { SN_AddWithSaturation
, OP_PADDW_SAT_UN
, SIMD_EMIT_BINARY
},
348 { SN_Average
, OP_PAVGW_UN
, SIMD_EMIT_BINARY
},
349 { SN_CompareEqual
, OP_PCMPEQW
, SIMD_EMIT_BINARY
},
350 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
351 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
352 { SN_Max
, OP_PMAXW_UN
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
353 { SN_Min
, OP_PMINW_UN
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
354 { SN_MultiplyStoreHigh
, OP_PMULW_HIGH_UN
, SIMD_EMIT_BINARY
},
355 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
356 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
357 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
358 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
359 { SN_ShiftRightArithmetic
, OP_PSARW
, SIMD_EMIT_SHIFT
},
360 { SN_ShuffleHigh
, OP_PSHUFLEW_HIGH
, SIMD_EMIT_SHUFFLE
},
361 { SN_ShuffleLow
, OP_PSHUFLEW_LOW
, SIMD_EMIT_SHUFFLE
},
362 { SN_SignedPackWithSignedSaturation
, OP_PACKW
, SIMD_EMIT_BINARY
},
363 { SN_SignedPackWithUnsignedSaturation
, OP_PACKW_UN
, SIMD_EMIT_BINARY
},
364 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
365 { SN_SubWithSaturation
, OP_PSUBW_SAT_UN
, SIMD_EMIT_BINARY
},
366 { SN_UnpackHigh
, OP_UNPACK_HIGHW
, SIMD_EMIT_BINARY
},
367 { SN_UnpackLow
, OP_UNPACK_LOWW
, SIMD_EMIT_BINARY
},
368 { SN_op_Addition
, OP_PADDW
, SIMD_EMIT_BINARY
},
369 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
370 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
371 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
372 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
373 { SN_op_LeftShift
, OP_PSHLW
, SIMD_EMIT_SHIFT
},
374 { SN_op_Multiply
, OP_PMULW
, SIMD_EMIT_BINARY
},
375 { SN_op_RightShift
, OP_PSHRW
, SIMD_EMIT_SHIFT
},
376 { SN_op_Subtraction
, OP_PSUBW
, SIMD_EMIT_BINARY
},
385 static const SimdIntrinsc vector8s_intrinsics
[] = {
386 { SN_AddWithSaturation
, OP_PADDW_SAT
, SIMD_EMIT_BINARY
},
387 { SN_CompareEqual
, OP_PCMPEQW
, SIMD_EMIT_BINARY
},
388 { SN_CompareGreaterThan
, OP_PCMPGTW
, SIMD_EMIT_BINARY
},
389 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
390 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
391 { SN_Max
, OP_PMAXW
, SIMD_EMIT_BINARY
},
392 { SN_Min
, OP_PMINW
, SIMD_EMIT_BINARY
},
393 { SN_MultiplyStoreHigh
, OP_PMULW_HIGH
, SIMD_EMIT_BINARY
},
394 { SN_PackWithSignedSaturation
, OP_PACKW
, SIMD_EMIT_BINARY
},
395 { SN_PackWithUnsignedSaturation
, OP_PACKW_UN
, SIMD_EMIT_BINARY
},
396 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
397 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
398 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
399 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
400 { SN_ShiftRightLogic
, OP_PSHRW
, SIMD_EMIT_SHIFT
},
401 { SN_ShuffleHigh
, OP_PSHUFLEW_HIGH
, SIMD_EMIT_SHUFFLE
},
402 { SN_ShuffleLow
, OP_PSHUFLEW_LOW
, SIMD_EMIT_SHUFFLE
},
403 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
404 { SN_SubWithSaturation
, OP_PSUBW_SAT_UN
, SIMD_EMIT_BINARY
},
405 { SN_UnpackHigh
, OP_UNPACK_HIGHW
, SIMD_EMIT_BINARY
},
406 { SN_UnpackLow
, OP_UNPACK_LOWW
, SIMD_EMIT_BINARY
},
407 { SN_op_Addition
, OP_PADDW
, SIMD_EMIT_BINARY
},
408 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
409 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
410 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
411 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
412 { SN_op_LeftShift
, OP_PSHLW
, SIMD_EMIT_SHIFT
},
413 { SN_op_Multiply
, OP_PMULW
, SIMD_EMIT_BINARY
},
414 { SN_op_RightShift
, OP_PSARW
, SIMD_EMIT_SHIFT
},
415 { SN_op_Subtraction
, OP_PSUBW
, SIMD_EMIT_BINARY
},
424 static const SimdIntrinsc vector16b_intrinsics
[] = {
425 { SN_AddWithSaturation
, OP_PADDB_SAT_UN
, SIMD_EMIT_BINARY
},
426 { SN_Average
, OP_PAVGB_UN
, SIMD_EMIT_BINARY
},
427 { SN_CompareEqual
, OP_PCMPEQB
, SIMD_EMIT_BINARY
},
428 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
429 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
430 { SN_Max
, OP_PMAXB_UN
, SIMD_EMIT_BINARY
},
431 { SN_Min
, OP_PMINB_UN
, SIMD_EMIT_BINARY
},
432 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
433 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
434 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
435 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
436 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
437 { SN_SubWithSaturation
, OP_PSUBB_SAT_UN
, SIMD_EMIT_BINARY
},
438 { SN_SumOfAbsoluteDifferences
, OP_PSUM_ABS_DIFF
, SIMD_EMIT_BINARY
},
439 { SN_UnpackHigh
, OP_UNPACK_HIGHB
, SIMD_EMIT_BINARY
},
440 { SN_UnpackLow
, OP_UNPACK_LOWB
, SIMD_EMIT_BINARY
},
441 { SN_op_Addition
, OP_PADDB
, SIMD_EMIT_BINARY
},
442 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
443 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
444 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
445 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
446 { SN_op_Subtraction
, OP_PSUBB
, SIMD_EMIT_BINARY
},
455 static const SimdIntrinsc vector16sb_intrinsics
[] = {
456 { SN_AddWithSaturation
, OP_PADDB_SAT
, SIMD_EMIT_BINARY
},
457 { SN_CompareEqual
, OP_PCMPEQB
, SIMD_EMIT_BINARY
},
458 { SN_CompareGreaterThan
, OP_PCMPGTB
, SIMD_EMIT_BINARY
},
459 { SN_ExtractByteMask
, 0, SIMD_EMIT_EXTRACT_MASK
},
460 { SN_LoadAligned
, 0, SIMD_EMIT_LOAD_ALIGNED
},
461 { SN_Max
, OP_PMAXB
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
462 { SN_Min
, OP_PMINB
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE41
},
463 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_0
},
464 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_1
},
465 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_2
},
466 { SN_PrefetchNonTemporal
, 0, SIMD_EMIT_PREFETCH
, SIMD_VERSION_SSE1
, SIMD_PREFETCH_MODE_NTA
},
467 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_EMIT_STORE
},
468 { SN_SubWithSaturation
, OP_PSUBB_SAT
, SIMD_EMIT_BINARY
},
469 { SN_UnpackHigh
, OP_UNPACK_HIGHB
, SIMD_EMIT_BINARY
},
470 { SN_UnpackLow
, OP_UNPACK_LOWB
, SIMD_EMIT_BINARY
},
471 { SN_op_Addition
, OP_PADDB
, SIMD_EMIT_BINARY
},
472 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_EMIT_BINARY
},
473 { SN_op_BitwiseOr
, OP_POR
, SIMD_EMIT_BINARY
},
474 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
},
475 { SN_op_Explicit
, 0, SIMD_EMIT_CAST
},
476 { SN_op_Subtraction
, OP_PSUBB
, SIMD_EMIT_BINARY
},
479 static guint32 simd_supported_versions
;
481 /*TODO match using number of parameters as well*/
483 simd_intrinsic_compare_by_name (const void *key
, const void *value
)
485 return strcmp (key
, method_name (((SimdIntrinsc
*)value
)->name
));
490 VREG_HAS_XZERO_BB0
= 0x02,
491 VREG_HAS_OTHER_OP_BB0
= 0x04,
492 VREG_SINGLE_BB_USE
= 0x08,
493 VREG_MANY_BB_USE
= 0x10,
497 mono_simd_intrinsics_init (void)
499 simd_supported_versions
= mono_arch_cpu_enumerate_simd_versions ();
500 /*TODO log the supported flags*/
503 static inline gboolean
504 apply_vreg_first_block_interference (MonoCompile
*cfg
, MonoInst
*ins
, int reg
, int max_vreg
, char *vreg_flags
)
506 if (reg
!= -1 && reg
<= max_vreg
&& vreg_flags
[reg
]) {
507 vreg_flags
[reg
] &= ~VREG_HAS_XZERO_BB0
;
508 vreg_flags
[reg
] |= VREG_HAS_OTHER_OP_BB0
;
509 DEBUG (printf ("[simd-simplify] R%d used: ", reg
); mono_print_ins(ins
));
515 static inline gboolean
516 apply_vreg_following_block_interference (MonoCompile
*cfg
, MonoInst
*ins
, int reg
, MonoBasicBlock
*bb
, int max_vreg
, char *vreg_flags
, MonoBasicBlock
**target_bb
)
518 if (reg
== -1 || reg
> max_vreg
|| !(vreg_flags
[reg
] & VREG_HAS_XZERO_BB0
) || target_bb
[reg
] == bb
)
521 if (vreg_flags
[reg
] & VREG_SINGLE_BB_USE
) {
522 vreg_flags
[reg
] &= ~VREG_SINGLE_BB_USE
;
523 vreg_flags
[reg
] |= VREG_MANY_BB_USE
;
524 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg
); mono_print_ins(ins
));
526 } else if (!(vreg_flags
[reg
] & VREG_MANY_BB_USE
)) {
527 vreg_flags
[reg
] |= VREG_SINGLE_BB_USE
;
528 target_bb
[reg
] = bb
;
529 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg
); mono_print_ins(ins
));
535 This pass recalculate which vars need MONO_INST_INDIRECT.
537 We cannot do this for non SIMD vars since code like mono_get_vtable_var
538 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
541 mono_simd_simplify_indirection (MonoCompile
*cfg
)
544 MonoBasicBlock
*bb
, *first_bb
= NULL
, **target_bb
;
548 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
549 MonoInst
*var
= cfg
->varinfo
[i
];
550 if (var
->klass
->simd_type
) {
551 var
->flags
&= ~MONO_INST_INDIRECT
;
552 max_vreg
= MAX (var
->dreg
, max_vreg
);
556 for (bb
= cfg
->bb_entry
; bb
; bb
= bb
->next_bb
) {
557 if (!first_bb
&& bb
->code
)
559 for (ins
= bb
->code
; ins
; ins
= ins
->next
) {
560 if (ins
->opcode
== OP_LDADDR
) {
561 MonoInst
*var
= (MonoInst
*)ins
->inst_p0
;
562 if (var
->klass
->simd_type
) {
563 var
->flags
|= MONO_INST_INDIRECT
;
569 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg
));
570 vreg_flags
= g_malloc0 (max_vreg
+ 1);
571 target_bb
= g_new0 (MonoBasicBlock
*, max_vreg
+ 1);
573 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
574 MonoInst
*var
= cfg
->varinfo
[i
];
575 if (var
->klass
->simd_type
&& !(var
->flags
& (MONO_INST_INDIRECT
|MONO_INST_VOLATILE
))) {
576 vreg_flags
[var
->dreg
] = VREG_USED
;
577 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i
, var
->dreg
));
581 /*Scan the first basic block looking xzeros not used*/
582 for (ins
= first_bb
->code
; ins
; ins
= ins
->next
) {
583 if (ins
->opcode
== OP_XZERO
) {
584 if (!(vreg_flags
[ins
->dreg
] & VREG_HAS_OTHER_OP_BB0
)) {
585 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins
->dreg
); mono_print_ins(ins
));
586 vreg_flags
[ins
->dreg
] |= VREG_HAS_XZERO_BB0
;
590 if (ins
->opcode
== OP_LDADDR
&& apply_vreg_first_block_interference (cfg
, ins
, ((MonoInst
*)ins
->inst_p0
)->dreg
, max_vreg
, vreg_flags
))
593 if (apply_vreg_first_block_interference (cfg
, ins
, ins
->dreg
, max_vreg
, vreg_flags
))
595 if (apply_vreg_first_block_interference (cfg
, ins
, ins
->sreg1
, max_vreg
, vreg_flags
))
597 if (apply_vreg_first_block_interference (cfg
, ins
, ins
->sreg2
, max_vreg
, vreg_flags
))
601 if (IS_DEBUG_ON (cfg
)) {
602 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
603 MonoInst
*var
= cfg
->varinfo
[i
];
604 if (var
->klass
->simd_type
) {
605 if ((vreg_flags
[var
->dreg
] & VREG_HAS_XZERO_BB0
))
606 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var
->dreg
));
607 if ((vreg_flags
[var
->dreg
] & VREG_HAS_OTHER_OP_BB0
))
608 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var
->dreg
));
613 /*TODO stop here if no var is xzero only*/
616 Scan all other bb and check if it has only one other use
617 Ideally this would be done after an extended bb formation pass
619 FIXME This pass could use dominator information to properly
620 place the XZERO on the bb that dominates all uses of the var,
621 but this will have zero effect with the current local reg alloc
623 TODO simply the use of flags.
626 for (bb
= first_bb
->next_bb
; bb
; bb
= bb
->next_bb
) {
627 for (ins
= bb
->code
; ins
; ins
= ins
->next
) {
629 if (ins
->opcode
== OP_LDADDR
&& apply_vreg_following_block_interference (cfg
, ins
, ((MonoInst
*)ins
->inst_p0
)->dreg
, bb
, max_vreg
, vreg_flags
, target_bb
))
631 if (apply_vreg_following_block_interference (cfg
, ins
, ins
->dreg
, bb
, max_vreg
, vreg_flags
, target_bb
))
633 if (apply_vreg_following_block_interference (cfg
, ins
, ins
->sreg1
, bb
, max_vreg
, vreg_flags
, target_bb
))
635 if (apply_vreg_following_block_interference (cfg
, ins
, ins
->sreg2
, bb
, max_vreg
, vreg_flags
, target_bb
))
640 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
641 MonoInst
*var
= cfg
->varinfo
[i
];
642 if (!var
->klass
->simd_type
)
644 if ((vreg_flags
[var
->dreg
] & VREG_SINGLE_BB_USE
))
645 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var
->dreg
));
646 if ((vreg_flags
[var
->dreg
] & VREG_MANY_BB_USE
))
647 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var
->dreg
));
649 if (!(vreg_flags
[var
->dreg
] & VREG_SINGLE_BB_USE
))
651 for (ins
= target_bb
[var
->dreg
]->code
; ins
; ins
= ins
->next
) {
652 /*We can, pretty much kill it.*/
653 if (ins
->dreg
== var
->dreg
) {
655 } else if (ins
->sreg1
== var
->dreg
|| ins
->sreg2
== var
->dreg
) {
657 MONO_INST_NEW (cfg
, tmp
, OP_XZERO
);
658 tmp
->dreg
= var
->dreg
;
659 tmp
->type
= STACK_VTYPE
;
660 tmp
->klass
= var
->klass
;
661 mono_bblock_insert_before_ins (target_bb
[var
->dreg
], ins
, tmp
);
667 for (ins
= first_bb
->code
; ins
; ins
= ins
->next
) {
668 if (ins
->opcode
== OP_XZERO
&& (vreg_flags
[ins
->dreg
] & VREG_SINGLE_BB_USE
))
677 * This function expect that src be a value.
680 get_simd_vreg (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
*src
)
682 if (src
->opcode
== OP_XMOVE
) {
684 } else if (src
->type
== STACK_VTYPE
) {
687 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
688 mono_print_ins (src
);
689 g_assert_not_reached ();
693 * This function will load the value if needed.
696 load_simd_vreg (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
*src
)
698 if (src
->opcode
== OP_XMOVE
) {
700 } else if (src
->opcode
== OP_LDADDR
) {
701 int res
= ((MonoInst
*)src
->inst_p0
)->dreg
;
704 } else if (src
->type
== STACK_VTYPE
) {
706 } else if (src
->type
== STACK_PTR
|| src
->type
== STACK_MP
) {
709 MONO_INST_NEW (cfg
, ins
, OP_LOADX_MEMBASE
);
710 ins
->klass
= cmethod
->klass
;
711 ins
->sreg1
= src
->dreg
;
712 ins
->type
= STACK_VTYPE
;
713 ins
->dreg
= alloc_ireg (cfg
);
714 MONO_ADD_INS (cfg
->cbb
, ins
);
717 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src
->type
);
718 mono_print_ins (src
);
719 g_assert_not_reached ();
723 get_int_to_float_spill_area (MonoCompile
*cfg
)
725 if (!cfg
->iconv_raw_var
) {
726 cfg
->iconv_raw_var
= mono_compile_create_var (cfg
, &mono_defaults
.int32_class
->byval_arg
, OP_LOCAL
);
727 cfg
->iconv_raw_var
->flags
|= MONO_INST_VOLATILE
; /*FIXME, use the don't regalloc flag*/
729 return cfg
->iconv_raw_var
;
733 simd_intrinsic_emit_binary (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
736 int left_vreg
, right_vreg
;
738 left_vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
739 right_vreg
= get_simd_vreg (cfg
, cmethod
, args
[1]);
742 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
743 ins
->klass
= cmethod
->klass
;
744 ins
->sreg1
= left_vreg
;
745 ins
->sreg2
= right_vreg
;
746 ins
->type
= STACK_VTYPE
;
747 ins
->klass
= cmethod
->klass
;
748 ins
->dreg
= alloc_ireg (cfg
);
749 ins
->inst_c0
= intrinsic
->flags
;
750 MONO_ADD_INS (cfg
->cbb
, ins
);
755 simd_intrinsic_emit_unary (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
760 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
762 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
763 ins
->klass
= cmethod
->klass
;
765 ins
->type
= STACK_VTYPE
;
766 ins
->dreg
= alloc_ireg (cfg
);
767 MONO_ADD_INS (cfg
->cbb
, ins
);
772 simd_intrinsic_emit_getter (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
777 vreg
= load_simd_vreg (cfg
, cmethod
, args
[0]);
779 if (intrinsic
->opcode
) {
780 MONO_INST_NEW (cfg
, ins
, OP_SHUFLEPS
);
781 ins
->klass
= cmethod
->klass
;
783 ins
->inst_c0
= intrinsic
->opcode
;
784 ins
->type
= STACK_VTYPE
;
785 ins
->dreg
= vreg
= alloc_ireg (cfg
);
786 MONO_ADD_INS (cfg
->cbb
, ins
);
789 MONO_INST_NEW (cfg
, tmp
, OP_EXTRACT_I4
);
790 tmp
->klass
= cmethod
->klass
;
792 tmp
->type
= STACK_I4
;
793 tmp
->dreg
= alloc_ireg (cfg
);
794 MONO_ADD_INS (cfg
->cbb
, tmp
);
796 MONO_INST_NEW (cfg
, ins
, OP_ICONV_TO_R8_RAW
);
797 ins
->klass
= mono_defaults
.single_class
;
798 ins
->sreg1
= tmp
->dreg
;
799 ins
->type
= STACK_R8
;
800 ins
->dreg
= alloc_freg (cfg
);
801 ins
->backend
.spill_var
= get_int_to_float_spill_area (cfg
);
802 MONO_ADD_INS (cfg
->cbb
, ins
);
807 simd_intrinsic_emit_ctor (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
812 for (i
= 1; i
< 5; ++i
) {
813 MONO_INST_NEW (cfg
, ins
, OP_PUSH_R4
);
814 ins
->sreg1
= args
[5 - i
]->dreg
;
815 ins
->klass
= args
[5 - i
]->klass
;
816 MONO_ADD_INS (cfg
->cbb
, ins
);
819 if (args
[0]->opcode
== OP_LDADDR
) { /*Eliminate LDADDR if it's initing a local var*/
820 int vreg
= ((MonoInst
*)args
[0]->inst_p0
)->dreg
;
821 NULLIFY_INS (args
[0]);
823 MONO_INST_NEW (cfg
, ins
, OP_LOADX_STACK
);
824 ins
->klass
= cmethod
->klass
;
825 ins
->type
= STACK_VTYPE
;
827 MONO_ADD_INS (cfg
->cbb
, ins
);
829 int vreg
= alloc_ireg (cfg
);
831 MONO_INST_NEW (cfg
, ins
, OP_LOADX_STACK
);
832 ins
->klass
= cmethod
->klass
;
833 ins
->type
= STACK_VTYPE
;
835 MONO_ADD_INS (cfg
->cbb
, ins
);
837 MONO_INST_NEW (cfg
, ins
, OP_STOREX_MEMBASE_REG
);
838 ins
->klass
= cmethod
->klass
;
839 ins
->dreg
= args
[0]->dreg
;
841 MONO_ADD_INS (cfg
->cbb
, ins
);
847 simd_intrinsic_emit_cast (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
852 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
855 MONO_INST_NEW (cfg
, ins
, OP_XMOVE
);
856 ins
->klass
= cmethod
->klass
;
857 ins
->type
= STACK_VTYPE
;
859 ins
->dreg
= alloc_ireg (cfg
);
860 MONO_ADD_INS (cfg
->cbb
, ins
);
866 simd_intrinsic_emit_shift (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
869 int vreg
, vreg2
= -1, opcode
= intrinsic
->opcode
;
871 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
873 if (args
[1]->opcode
!= OP_ICONST
) {
874 MONO_INST_NEW (cfg
, ins
, OP_ICONV_TO_X
);
875 ins
->klass
= mono_defaults
.int32_class
;
876 ins
->sreg1
= args
[1]->dreg
;
877 ins
->type
= STACK_I4
;
878 ins
->dreg
= vreg2
= alloc_ireg (cfg
);
879 MONO_ADD_INS (cfg
->cbb
, ins
);
881 ++opcode
; /*The shift_reg version op is always +1 from the regular one.*/
884 MONO_INST_NEW (cfg
, ins
, opcode
);
885 ins
->klass
= cmethod
->klass
;
889 if (args
[1]->opcode
== OP_ICONST
) {
890 ins
->inst_imm
= args
[1]->inst_c0
;
891 NULLIFY_INS (args
[1]);
894 ins
->type
= STACK_VTYPE
;
895 ins
->dreg
= alloc_ireg (cfg
);
896 MONO_ADD_INS (cfg
->cbb
, ins
);
902 simd_intrinsic_emit_shuffle (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
907 /*TODO Exposing shuffle is not a good thing as it's non obvious. We should come up with better abstractions*/
909 if (args
[1]->opcode
!= OP_ICONST
) {
910 g_warning ("Shuffle with non literals is not yet supported");
911 g_assert_not_reached ();
913 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
914 NULLIFY_INS (args
[1]);
916 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
917 ins
->klass
= cmethod
->klass
;
919 ins
->inst_c0
= args
[1]->inst_c0
;
920 ins
->type
= STACK_VTYPE
;
921 ins
->dreg
= alloc_ireg (cfg
);
922 MONO_ADD_INS (cfg
->cbb
, ins
);
927 simd_intrinsic_emit_load_aligned (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
931 MONO_INST_NEW (cfg
, ins
, OP_LOADX_ALIGNED_MEMBASE
);
932 ins
->klass
= cmethod
->klass
;
933 ins
->sreg1
= args
[0]->dreg
;
934 ins
->type
= STACK_VTYPE
;
935 ins
->dreg
= alloc_ireg (cfg
);
936 MONO_ADD_INS (cfg
->cbb
, ins
);
941 simd_intrinsic_emit_store (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
946 vreg
= get_simd_vreg (cfg
, cmethod
, args
[1]);
948 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
949 ins
->klass
= cmethod
->klass
;
950 ins
->dreg
= args
[0]->dreg
;
952 ins
->type
= STACK_VTYPE
;
953 MONO_ADD_INS (cfg
->cbb
, ins
);
958 simd_intrinsic_emit_extract_mask (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
963 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
965 MONO_INST_NEW (cfg
, ins
, OP_EXTRACT_MASK
);
966 ins
->klass
= cmethod
->klass
;
968 ins
->type
= STACK_I4
;
969 ins
->dreg
= alloc_ireg (cfg
);
970 MONO_ADD_INS (cfg
->cbb
, ins
);
976 simd_intrinsic_emit_prefetch (const SimdIntrinsc
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
980 MONO_INST_NEW (cfg
, ins
, OP_PREFETCH_MEMBASE
);
981 ins
->klass
= cmethod
->klass
;
982 ins
->sreg1
= args
[0]->dreg
;
983 ins
->backend
.arg_info
= intrinsic
->flags
;
984 MONO_ADD_INS (cfg
->cbb
, ins
);
989 simd_version_name (guint32 version
)
992 case SIMD_VERSION_SSE1
:
994 case SIMD_VERSION_SSE2
:
996 case SIMD_VERSION_SSE3
:
998 case SIMD_VERSION_SSSE3
:
1000 case SIMD_VERSION_SSE41
:
1002 case SIMD_VERSION_SSE42
:
1004 case SIMD_VERSION_SSE4a
:
1011 emit_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
, const SimdIntrinsc
*intrinsics
, guint32 size
)
1013 const SimdIntrinsc
* result
= bsearch (cmethod
->name
, intrinsics
, size
, sizeof (SimdIntrinsc
), &simd_intrinsic_compare_by_name
);
1015 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod
->klass
->name
, cmethod
->name
, fsig
->param_count
));
1018 if (IS_DEBUG_ON (cfg
)) {
1020 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod
->klass
->name
, cmethod
->name
, fsig
->param_count
, method_name (result
->name
));
1021 max
= fsig
->param_count
+ fsig
->hasthis
;
1022 for (i
= 0; i
< max
; ++i
) {
1023 printf ("param %d: ", i
);
1024 mono_print_ins (args
[i
]);
1027 if (result
->simd_version
&& !((1 << result
->simd_version
) & simd_supported_versions
)) {
1028 if (IS_DEBUG_ON (cfg
))
1029 printf ("function %s::%s/%d requires unsuported SIMD instruction set %s \n", cmethod
->klass
->name
, cmethod
->name
, fsig
->param_count
, simd_version_name (result
->simd_version
));
1033 switch (result
->simd_emit_mode
) {
1034 case SIMD_EMIT_BINARY
:
1035 return simd_intrinsic_emit_binary (result
, cfg
, cmethod
, args
);
1036 case SIMD_EMIT_UNARY
:
1037 return simd_intrinsic_emit_unary (result
, cfg
, cmethod
, args
);
1038 case SIMD_EMIT_GETTER
:
1039 return simd_intrinsic_emit_getter (result
, cfg
, cmethod
, args
);
1040 case SIMD_EMIT_CTOR
:
1041 return simd_intrinsic_emit_ctor (result
, cfg
, cmethod
, args
);
1042 case SIMD_EMIT_CAST
:
1043 return simd_intrinsic_emit_cast (result
, cfg
, cmethod
, args
);
1044 case SIMD_EMIT_SHUFFLE
:
1045 return simd_intrinsic_emit_shuffle (result
, cfg
, cmethod
, args
);
1046 case SIMD_EMIT_SHIFT
:
1047 return simd_intrinsic_emit_shift (result
, cfg
, cmethod
, args
);
1048 case SIMD_EMIT_LOAD_ALIGNED
:
1049 return simd_intrinsic_emit_load_aligned (result
, cfg
, cmethod
, args
);
1050 case SIMD_EMIT_STORE
:
1051 return simd_intrinsic_emit_store (result
, cfg
, cmethod
, args
);
1052 case SIMD_EMIT_EXTRACT_MASK
:
1053 return simd_intrinsic_emit_extract_mask (result
, cfg
, cmethod
, args
);
1054 case SIMD_EMIT_PREFETCH
:
1055 return simd_intrinsic_emit_prefetch (result
, cfg
, cmethod
, args
);
1057 g_assert_not_reached ();
1061 emit_simd_runtime_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
1063 if (!strcmp ("get_AccelMode", cmethod
->name
)) {
1065 EMIT_NEW_ICONST (cfg
, ins
, simd_supported_versions
);
1072 mono_emit_simd_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
1074 if (!strcmp ("Mono.Simd", cmethod
->klass
->name_space
) && !strcmp ("SimdRuntime", cmethod
->klass
->name
))
1075 return emit_simd_runtime_intrinsics (cfg
, cmethod
, fsig
, args
);
1076 if (!cmethod
->klass
->simd_type
)
1078 cfg
->uses_simd_intrinsics
= 1;
1079 if (!strcmp ("Vector2d", cmethod
->klass
->name
))
1080 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector2d_intrinsics
, sizeof (vector2d_intrinsics
) / sizeof (SimdIntrinsc
));
1081 if (!strcmp ("Vector4f", cmethod
->klass
->name
))
1082 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector4f_intrinsics
, sizeof (vector4f_intrinsics
) / sizeof (SimdIntrinsc
));
1083 if (!strcmp ("Vector2ul", cmethod
->klass
->name
))
1084 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector2ul_intrinsics
, sizeof (vector2ul_intrinsics
) / sizeof (SimdIntrinsc
));
1085 if (!strcmp ("Vector2l", cmethod
->klass
->name
))
1086 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector2l_intrinsics
, sizeof (vector2l_intrinsics
) / sizeof (SimdIntrinsc
));
1087 if (!strcmp ("Vector4ui", cmethod
->klass
->name
))
1088 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector4ui_intrinsics
, sizeof (vector4ui_intrinsics
) / sizeof (SimdIntrinsc
));
1089 if (!strcmp ("Vector4i", cmethod
->klass
->name
))
1090 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector4i_intrinsics
, sizeof (vector4i_intrinsics
) / sizeof (SimdIntrinsc
));
1091 if (!strcmp ("Vector8us", cmethod
->klass
->name
))
1092 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector8us_intrinsics
, sizeof (vector8us_intrinsics
) / sizeof (SimdIntrinsc
));
1093 if (!strcmp ("Vector8s", cmethod
->klass
->name
))
1094 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector8s_intrinsics
, sizeof (vector8s_intrinsics
) / sizeof (SimdIntrinsc
));
1095 if (!strcmp ("Vector16b", cmethod
->klass
->name
))
1096 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector16b_intrinsics
, sizeof (vector16b_intrinsics
) / sizeof (SimdIntrinsc
));
1097 if (!strcmp ("Vector16sb", cmethod
->klass
->name
))
1098 return emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector16sb_intrinsics
, sizeof (vector16sb_intrinsics
) / sizeof (SimdIntrinsc
));