/*
 * simd support for intrinsics
 *
 * Rodrigo Kumpera (rkumpera@novell.com)
 *
 * (C) 2008 Novell, Inc.
 */
#include <mono/metadata/abi-details.h>
#include <mono/metadata/reflection-internals.h>
#include <mono/utils/mono-compiler.h>
#include <mono/utils/bsearch.h>
/*
General notes on SIMD intrinsics

TODO handle operands with non-SIMD args, such as op_Addition (Vector4f, float)
TODO optimize r4const in .ctor so it doesn't go into the FP stack first
TODO extend op_to_op_dest_membase to handle simd ops
TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
TODO make sure locals, arguments and spills are properly aligned.
TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
TODO add stuff to man pages
TODO document this under /docs
TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
TODO check if we need to init the SSE control word with better precision.
TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
TODO make SimdRuntime.get_AccelMode work under AOT
TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
TODO extend the bounds checking code to support range checking.

General notes for SIMD intrinsics.

-Bad extractor and constructor performance
Extracting a float from an XMM register is a complete disaster if you are passing it as an argument.
It will be loaded into the FP stack just to be pushed onto the call stack.

A similar thing happens with the Vector4f constructor, which requires float vars to be routed
through the FP stack as well.

The fix for this issue is similar to the one required for r4const as method args. Avoiding the
trip to the FP stack is desirable.

-Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
for scalar FP and SIMD values, which is not how amd64 works.

-Promote OP_EXTRACT_I4 to a STORE op
The advantage of this change is that it could have a _membase version and enable further optimizations.

-Create a MONO_INST_DONT_REGALLOC flag and use it in all places where MONO_INST_INDIRECT is used.
*/
#if defined (MONO_ARCH_SIMD_INTRINSICS) && !defined(ENABLE_NETCORE)

#if defined (DISABLE_JIT)

void
mono_simd_intrinsics_init (void)
{
}

#else

//#define IS_DEBUG_ON(cfg) (0)

#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
#define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
	SIMD_EMIT_GETTER_QWORD,
	SIMD_EMIT_LOAD_ALIGNED,
	SIMD_EMIT_EXTRACT_MASK,
// Using this struct instead of an array of pointers optimizes away a pointer and a relocation per string.
#define MSGSTRFIELD(line) MSGSTRFIELD1(line)
#define MSGSTRFIELD1(line) str##line
static const struct msgstr_t {
#define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
#include "simd-methods.h"
#undef SIMD_METHOD
} method_names = {
#define SIMD_METHOD(str,name) str,
#include "simd-methods.h"
#undef SIMD_METHOD
};

enum {
#define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
#include "simd-methods.h"
};

#define method_name(idx) ((const char*)&method_names + (idx))
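/*
 * Illustrative sketch (kept out of the build with #if 0): the same string-table idiom with two
 * made-up entries, showing why each enum value doubles as a byte offset into a single blob, so
 * a small integer 'name' field can store an offset instead of a pointer. All names here are
 * hypothetical and exist only for this example.
 */
#if 0
#include <stdio.h>
#include <stddef.h>

static const struct example_msgstr_t {
	char str1 [sizeof ("Add")];      /* starts at offset 0 */
	char str2 [sizeof ("Multiply")]; /* starts at offset 4, right after "Add\0" */
} example_names = { "Add", "Multiply" };

enum {
	EXAMPLE_SN_Add = offsetof (struct example_msgstr_t, str1),
	EXAMPLE_SN_Multiply = offsetof (struct example_msgstr_t, str2)
};

#define example_method_name(idx) ((const char*)&example_names + (idx))

int
main (void)
{
	/* Prints "Add Multiply": one data blob, no pointer table, no per-string relocation. */
	printf ("%s %s\n", example_method_name (EXAMPLE_SN_Add), example_method_name (EXAMPLE_SN_Multiply));
	return 0;
}
#endif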
	guint8 simd_version_flags;
	guint8 simd_emit_mode : 4;
static const SimdIntrinsic vector4f_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
	{ SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
	{ SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
	{ SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
	{ SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
	{ SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
	{ SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
	{ SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
	{ SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
	{ SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
	{ SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
	{ SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
	{ SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
	{ SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
};
static const SimdIntrinsic vector2d_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
	{ SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
	{ SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
	{ SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
	{ SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
	{ SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
	{ SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
	{ SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
	{ SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
	{ SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
	{ SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
	{ SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
	{ SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector2ul_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector2l_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
	{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector4ui_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector4i_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector8us_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector8s_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
	{ SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector16b_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static const SimdIntrinsic vector16sb_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
static guint32 simd_supported_versions;

static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
/*TODO match using number of parameters as well*/
static int
simd_intrinsic_compare_by_name (const void *key, const void *value)
{
	return strcmp ((const char*)key, method_name (((SimdIntrinsic *)value)->name));
}
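/*
 * Illustrative sketch (kept out of the build with #if 0): how a name-sorted table is searched
 * with a comparer like the one above. This toy version uses plain bsearch () and invented
 * entries; the JIT applies the same idea to the vector*_intrinsics tables defined earlier,
 * using the bsearch helper included at the top of the file.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { const char *name; int opcode; } ToyIntrinsic;

/* Must stay sorted by name, just like the intrinsic tables above. */
static const ToyIntrinsic toy_table [] = {
	{ "Max", 1 },
	{ "Min", 2 },
	{ "Sqrt", 3 },
};

static int
toy_compare_by_name (const void *key, const void *value)
{
	return strcmp ((const char *)key, ((const ToyIntrinsic *)value)->name);
}

int
main (void)
{
	const ToyIntrinsic *hit = (const ToyIntrinsic *)bsearch ("Min", toy_table,
		sizeof (toy_table) / sizeof (toy_table [0]), sizeof (toy_table [0]), toy_compare_by_name);
	printf ("%d\n", hit ? hit->opcode : -1); /* prints 2 */
	return 0;
}
#endif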
enum {
	VREG_USED = 0x01,
	VREG_HAS_XZERO_BB0 = 0x02,
	VREG_HAS_OTHER_OP_BB0 = 0x04,
	VREG_SINGLE_BB_USE = 0x08,
	VREG_MANY_BB_USE = 0x10,
};
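/*
 * Illustrative sketch (kept out of the build with #if 0): a minimal model of how the flags above
 * are meant to combine for a single vreg. The helper and its parameters are invented for the
 * example; the real bookkeeping is done per instruction by the apply_vreg_*_interference ()
 * helpers and mono_simd_simplify_indirection () below.
 */
#if 0
static void
toy_track_vreg (char *flags, int saw_xzero_in_bb0, int saw_other_op_in_bb0, int n_other_blocks_using_it)
{
	*flags = VREG_USED;
	if (saw_xzero_in_bb0)
		*flags |= VREG_HAS_XZERO_BB0;
	if (saw_other_op_in_bb0) {
		/* Any other def/use in bb0 disqualifies the vreg: the XZERO cannot be sunk. */
		*flags &= ~VREG_HAS_XZERO_BB0;
		*flags |= VREG_HAS_OTHER_OP_BB0;
	}
	if (n_other_blocks_using_it == 1)
		*flags |= VREG_SINGLE_BB_USE;	/* the XZERO can be moved into that block */
	else if (n_other_blocks_using_it > 1)
		*flags |= VREG_MANY_BB_USE;	/* leave the XZERO where it is */
}
#endif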
void
mono_simd_intrinsics_init (void)
{
	simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
	/*TODO log the supported flags*/
}
static gboolean
apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
{
	if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
		vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
		vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
		DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins (ins));
		return TRUE;
	}
	return FALSE;
}
static gboolean
apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
{
	if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
		return FALSE;

	if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
		vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
		vreg_flags [reg] |= VREG_MANY_BB_USE;
		DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins (ins));
		return TRUE;
	} else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
		vreg_flags [reg] |= VREG_SINGLE_BB_USE;
		target_bb [reg] = bb;
		DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins (ins));
		return TRUE;
	}
	return FALSE;
}
/*
This pass recalculates which vars need MONO_INST_INDIRECT.

We cannot do this for non-SIMD vars since code like mono_get_vtable_var
uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
*/
void
mono_simd_simplify_indirection (MonoCompile *cfg)
{
	int i, max_vreg = 0;
	MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
	MonoInst *ins;
	char *vreg_flags;

	for (i = 0; i < cfg->num_varinfo; i++) {
		MonoInst *var = cfg->varinfo [i];
		if (m_class_is_simd_type (var->klass)) {
			var->flags &= ~MONO_INST_INDIRECT;
			max_vreg = MAX (var->dreg, max_vreg);
		}
	}

	for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
		if (!first_bb && bb->code)
			first_bb = bb;
		for (ins = bb->code; ins; ins = ins->next) {
			if (ins->opcode == OP_LDADDR) {
				MonoInst *var = (MonoInst*)ins->inst_p0;
				if (m_class_is_simd_type (var->klass)) {
					var->flags |= MONO_INST_INDIRECT;
				}
			}
		}
	}

	DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
	vreg_flags = (char *)g_malloc0 (max_vreg + 1);
	target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);

	for (i = 0; i < cfg->num_varinfo; i++) {
		MonoInst *var = cfg->varinfo [i];
		if (m_class_is_simd_type (var->klass) && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
			vreg_flags [var->dreg] = VREG_USED;
			DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
		}
	}

	/*Scan the first basic block looking for xzeros that are not otherwise used*/
	for (ins = first_bb->code; ins; ins = ins->next) {
		int num_sregs;
		int sregs [MONO_MAX_SRC_REGS];

		if (ins->opcode == OP_XZERO) {
			if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
				DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
				vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
			}
			continue;
		}
		if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
			continue;
		if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
			continue;
		num_sregs = mono_inst_get_src_registers (ins, sregs);
		for (i = 0; i < num_sregs; ++i) {
			if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
				break;
		}
	}

	if (IS_DEBUG_ON (cfg)) {
		for (i = 0; i < cfg->num_varinfo; i++) {
			MonoInst *var = cfg->varinfo [i];
			if (m_class_is_simd_type (var->klass)) {
				if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
					DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
				if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
					DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
			}
		}
	}

	/*TODO stop here if no var is xzero only*/

	/*
	Scan all other bbs and check whether each var has only one other use.
	Ideally this would be done after an extended bb formation pass.

	FIXME This pass could use dominator information to properly
	place the XZERO on the bb that dominates all uses of the var,
	but this will have zero effect with the current local reg alloc.

	TODO simplify the use of flags.
	*/

	for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
		for (ins = bb->code; ins; ins = ins->next) {
			int num_sregs;
			int sregs [MONO_MAX_SRC_REGS];

			if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
				continue;
			if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
				continue;
			num_sregs = mono_inst_get_src_registers (ins, sregs);
			for (i = 0; i < num_sregs; ++i) {
				if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
						max_vreg, vreg_flags, target_bb))
					continue;
			}
		}
	}

	for (i = 0; i < cfg->num_varinfo; i++) {
		MonoInst *var = cfg->varinfo [i];
		if (!m_class_is_simd_type (var->klass))
			continue;
		if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
			DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
		if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
			DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));

		if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
			continue;

		for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
			MonoInst *tmp;
			int num_sregs, j;
			int sregs [MONO_MAX_SRC_REGS];
			gboolean found = FALSE;

			num_sregs = mono_inst_get_src_registers (ins, sregs);
			for (j = 0; j < num_sregs; ++j) {
				if (sregs [j] == var->dreg)
					found = TRUE;
			}
			/*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
			if (ins->dreg == var->dreg && !found) {
				DEBUG (printf ("[simd-simplify] IGNORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
				break;
			}

			DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );

			MONO_INST_NEW (cfg, tmp, OP_XZERO);
			tmp->dreg = var->dreg;
			tmp->type = STACK_VTYPE;
			tmp->klass = var->klass;
			mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
			break;
		}
	}

	for (ins = first_bb->code; ins; ins = ins->next) {
		if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
			DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
			NULLIFY_INS (ins);
		}
	}

	g_free (vreg_flags);
	g_free (target_bb);
}
/*
 * Windows x64 value type ABI uses reg/stack references (ArgValuetypeAddrInIReg/ArgValuetypeAddrOnStack)
 * for function arguments. When using SIMD intrinsics, arguments optimized into OP_ARG need to be decomposed
 * into corresponding SIMD LOADX/STOREX instructions.
 */
#if defined(TARGET_WIN32) && defined(TARGET_AMD64)
static gboolean
decompose_vtype_opt_uses_simd_intrinsics (MonoCompile *cfg, MonoInst *ins)
{
	if (cfg->uses_simd_intrinsics & MONO_CFG_USES_SIMD_INTRINSICS_DECOMPOSE_VTYPE)
		return TRUE;

	switch (ins->opcode) {
	case OP_LOADX_MEMBASE:
	case OP_LOADX_ALIGNED_MEMBASE:
	case OP_STOREX_MEMBASE:
	case OP_STOREX_ALIGNED_MEMBASE_REG:
		return TRUE;
	default:
		return FALSE;
	}
}
static void
decompose_vtype_opt_load_arg (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins, gint32 *sreg_int32)
{
	guint32 *sreg = (guint32*)sreg_int32;
	MonoInst *src_var = get_vreg_to_inst (cfg, *sreg);
	if (src_var && src_var->opcode == OP_ARG && src_var->klass && MONO_CLASS_IS_SIMD (cfg, src_var->klass)) {
		MonoInst *varload_ins, *load_ins;
		NEW_VARLOADA (cfg, varload_ins, src_var, src_var->inst_vtype);
		mono_bblock_insert_before_ins (bb, ins, varload_ins);
		MONO_INST_NEW (cfg, load_ins, OP_LOADX_MEMBASE);
		load_ins->klass = src_var->klass;
		load_ins->type = STACK_VTYPE;
		load_ins->sreg1 = varload_ins->dreg;
		load_ins->dreg = alloc_xreg (cfg);
		mono_bblock_insert_after_ins (bb, varload_ins, load_ins);
		*sreg = load_ins->dreg;
	}
}
void
mono_simd_decompose_intrinsic (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins)
{
	if (cfg->opt & MONO_OPT_SIMD && decompose_vtype_opt_uses_simd_intrinsics (cfg, ins)) {
		decompose_vtype_opt_load_arg (cfg, bb, ins, &(ins->sreg1));
		decompose_vtype_opt_load_arg (cfg, bb, ins, &(ins->sreg2));
		decompose_vtype_opt_load_arg (cfg, bb, ins, &(ins->sreg3));
		MonoInst *dest_var = get_vreg_to_inst (cfg, ins->dreg);
		if (dest_var && dest_var->opcode == OP_ARG && dest_var->klass && MONO_CLASS_IS_SIMD (cfg, dest_var->klass)) {
			MonoInst *varload_ins, *store_ins;
			ins->dreg = alloc_xreg (cfg);
			NEW_VARLOADA (cfg, varload_ins, dest_var, dest_var->inst_vtype);
			mono_bblock_insert_after_ins (bb, ins, varload_ins);
			MONO_INST_NEW (cfg, store_ins, OP_STOREX_MEMBASE);
			store_ins->klass = dest_var->klass;
			store_ins->type = STACK_VTYPE;
			store_ins->sreg1 = ins->dreg;
			store_ins->dreg = varload_ins->dreg;
			mono_bblock_insert_after_ins (bb, varload_ins, store_ins);
		}
	}
}
void
mono_simd_decompose_intrinsics (MonoCompile *cfg)
{
	MonoBasicBlock *bb;
	MonoInst *ins;

	for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
		for (ins = bb->code; ins; ins = ins->next) {
			mono_simd_decompose_intrinsic (cfg, bb, ins);
		}
	}
}
#else
void
mono_simd_decompose_intrinsic (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins)
{
}

void
mono_simd_decompose_intrinsics (MonoCompile *cfg)
{
}
#endif /*defined(TARGET_WIN32) && defined(TARGET_AMD64)*/
/*
 * This function expects src to be a value.
 */
static int
get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
{
	const char *spec = INS_INFO (src->opcode);

	if (src->opcode == OP_XMOVE) {
		return src->sreg1;
	} else if (spec [MONO_INST_DEST] == 'x') {
		return src->dreg;
	} else if (src->opcode == OP_VCALL || src->opcode == OP_VCALL_MEMBASE) {
		return src->dreg;
	}

	g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
	mono_print_ins (src);
	g_assert_not_reached ();
}
/*
 * This function will load the value if needed.
 */
static int
load_simd_vreg_class (MonoCompile *cfg, MonoClass *klass, MonoInst *src, gboolean *indirect)
{
	const char *spec = INS_INFO (src->opcode);

	if (indirect)
		*indirect = FALSE;
	if (src->opcode == OP_XMOVE) {
		return src->sreg1;
	} else if (src->opcode == OP_LDADDR) {
		int res = ((MonoInst*)src->inst_p0)->dreg;
		return res;
	} else if (spec [MONO_INST_DEST] == 'x') {
		return src->dreg;
	} else if (src->type == STACK_PTR || src->type == STACK_MP) {
		MonoInst *ins;
		if (indirect)
			*indirect = TRUE;

		MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
		ins->klass = klass;
		ins->sreg1 = src->dreg;
		ins->type = STACK_VTYPE;
		ins->dreg = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, ins);
		return ins->dreg;
	}
	g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
	mono_print_ins (src);
	g_assert_not_reached ();
}

static int
load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
{
	return load_simd_vreg_class (cfg, cmethod->klass, src, indirect);
}
/*We share the var with fconv_to_r8_x to save some stack space.*/
static MonoInst*
get_double_spill_area (MonoCompile *cfg)
{
	if (!cfg->fconv_to_r8_x_var) {
		cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, m_class_get_byval_arg (mono_defaults.double_class), OP_LOCAL);
		cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
	}
	return cfg->fconv_to_r8_x_var;
}

static MonoInst*
get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
{
	if (!cfg->simd_ctor_var) {
		cfg->simd_ctor_var = mono_compile_create_var (cfg, m_class_get_byval_arg (avector_klass), OP_LOCAL);
		cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
	}
	return cfg->simd_ctor_var;
}
static int
mono_type_to_expand_op (MonoType *type)
{
	switch (type->type) {
	g_assert_not_reached ();

static int
type_to_comp_op (MonoType *t)
	g_assert_not_reached ();

static int
type_to_gt_op (MonoType *t)

static int
type_to_padd_op (MonoType *t)

static int
type_to_psub_op (MonoType *t)

static int
type_to_pmul_op (MonoType *t)
	/* PMULQ multiplies two 32 bit numbers into a 64 bit one */

static int
type_to_pdiv_op (MonoType *t)

static int
type_to_pxor_op (MonoType *t)
	/*
	 * These opcodes have the same semantics, but using the
	 * correctly typed version is better for performance.
	 */

static int
type_to_pand_op (MonoType *t)

static int
type_to_por_op (MonoType *t)

static int
type_to_pmin_op (MonoType *t)

static int
type_to_pmax_op (MonoType *t)
static int
get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoClass *klass, MonoType *param_type, MonoInst *src)
{
	MonoInst *ins;
	int expand_op;

	if (m_class_is_simd_type (mono_class_from_mono_type_internal (param_type)))
		return get_simd_vreg (cfg, NULL, src);

	expand_op = mono_type_to_expand_op (param_type);
	MONO_INST_NEW (cfg, ins, expand_op);
	ins->klass = klass;
	ins->sreg1 = src->dreg;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	if (expand_op == OP_EXPAND_R4)
		ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
	else if (expand_op == OP_EXPAND_R8)
		ins->backend.spill_var = get_double_spill_area (cfg);

	return ins->dreg;
}
/*
 * simd_intrinsic_emit_binary_op:
 *
 *   Emit a binary SIMD opcode.
 * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
 * expanded to the SIMD type.
 */
static MonoInst*
simd_intrinsic_emit_binary_op (MonoCompile *cfg, int opcode, int flags, MonoClass *klass, MonoType *lhs_type, MonoType *rhs_type, MonoInst *lhs, MonoInst *rhs)
{
	MonoInst *ins;
	int left_vreg, right_vreg;

	left_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, lhs_type, lhs);
	right_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, rhs_type, rhs);

	MONO_INST_NEW (cfg, ins, opcode);
	ins->klass = klass;
	ins->sreg1 = left_vreg;
	ins->sreg2 = right_vreg;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	ins->inst_c0 = flags;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}
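/*
 * Illustrative sketch (kept out of the build with #if 0): what expanding a scalar operand means
 * at the value level, e.g. for op_Addition (Vector4f, float). The JIT expresses the broadcast
 * with OP_EXPAND_R4 and the add with OP_ADDPS; this toy helper just does the same thing in
 * plain C and is not part of the JIT.
 */
#if 0
static void
toy_add_vector4f_scalar (const float lhs [4], float rhs, float res [4])
{
	int i;
	float expanded [4] = { rhs, rhs, rhs, rhs };	/* OP_EXPAND_R4 equivalent */
	for (i = 0; i < 4; ++i)
		res [i] = lhs [i] + expanded [i];	/* OP_ADDPS equivalent */
}
#endif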
static MonoInst*
simd_intrinsic_emit_binary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoMethodSignature *sig = mono_method_signature_internal (cmethod);

	g_assert (sig->param_count == 2);

	return simd_intrinsic_emit_binary_op (cfg, intrinsic->opcode, intrinsic->flags, cmethod->klass, sig->params [0], sig->params [1], args [0], args [1]);
}
static MonoInst*
simd_intrinsic_emit_unary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}
static int
mono_type_to_extract_op (MonoType *type)
{
	switch (type->type) {
	case MONO_TYPE_I1:
		return OP_EXTRACT_I1;
	case MONO_TYPE_U1:
		return OP_EXTRACT_U1;
	case MONO_TYPE_I2:
		return OP_EXTRACT_I2;
	case MONO_TYPE_U2:
		return OP_EXTRACT_U2;
	case MONO_TYPE_I4:
	case MONO_TYPE_U4:
	case MONO_TYPE_R4:
		return OP_EXTRACT_I4;
	default:
		g_assert_not_reached ();
	}
}
/*Returns the amount to shift the element index to get the dword it belongs to*/
static int
mono_type_elements_shift_bits (MonoType *type)
{
	switch (type->type) {
	g_assert_not_reached ();
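/*
 * Illustrative sketch (kept out of the build with #if 0): the index arithmetic implied by the
 * shift returned above. 4-byte elements shift by 0, 2-byte elements by 1 (two per dword),
 * 1-byte elements by 2 (four per dword). The helper name is invented for the example; the
 * getter emitter below performs exactly this split.
 */
#if 0
static void
toy_split_element_index (int index, int shift_bits, int *dword_index, int *index_in_dword)
{
	*dword_index = index >> shift_bits;			/* which 32-bit lane to shuffle into position 0 */
	*index_in_dword = index & ((1 << shift_bits) - 1);	/* which element to extract inside that lane */
}
/* e.g. element 5 of a Vector8us: shift_bits == 1, so dword 2, element 1 within it. */
#endif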
static G_GNUC_UNUSED int
mono_type_to_insert_op (MonoType *type)
{
	switch (type->type) {
	case MONO_TYPE_I1:
	case MONO_TYPE_U1:
		return OP_INSERT_I1;
	case MONO_TYPE_I2:
	case MONO_TYPE_U2:
		return OP_INSERT_I2;
	case MONO_TYPE_I4:
	case MONO_TYPE_U4:
		return OP_INSERT_I4;
	case MONO_TYPE_I8:
	case MONO_TYPE_U8:
		return OP_INSERT_I8;
	case MONO_TYPE_R4:
		return OP_INSERT_R4;
	case MONO_TYPE_R8:
		return OP_INSERT_R8;
	default:
		g_assert_not_reached ();
	}
}
static int
mono_type_to_slow_insert_op (MonoType *type)
{
	switch (type->type) {
	case MONO_TYPE_I1:
	case MONO_TYPE_U1:
		return OP_INSERTX_U1_SLOW;
	case MONO_TYPE_I2:
	case MONO_TYPE_U2:
		return OP_INSERT_I2;
	case MONO_TYPE_I4:
	case MONO_TYPE_U4:
		return OP_INSERTX_I4_SLOW;
	case MONO_TYPE_I8:
	case MONO_TYPE_U8:
		return OP_INSERTX_I8_SLOW;
	case MONO_TYPE_R4:
		return OP_INSERTX_R4_SLOW;
	case MONO_TYPE_R8:
		return OP_INSERTX_R8_SLOW;
	default:
		g_assert_not_reached ();
	}
}
static MonoInst*
simd_intrinsic_emit_setter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	MonoMethodSignature *sig = mono_method_signature_internal (cmethod);
	int size, align;
	gboolean indirect;
	int dreg;

	size = mono_type_size (sig->params [0], &align);

	if (COMPILE_LLVM (cfg)) {
		MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
		ins->klass = cmethod->klass;
		ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
		ins->sreg2 = args [1]->dreg;
		ins->inst_c0 = intrinsic->opcode;
		MONO_ADD_INS (cfg->cbb, ins);
	} else if (size == 2 || size == 4 || size == 8) {
		MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
		ins->klass = cmethod->klass;
		/*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
		ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
		ins->sreg2 = args [1]->dreg;
		ins->inst_c0 = intrinsic->opcode;
		if (sig->params [0]->type == MONO_TYPE_R4)
			ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
		else if (sig->params [0]->type == MONO_TYPE_R8)
			ins->backend.spill_var = get_double_spill_area (cfg);
		MONO_ADD_INS (cfg->cbb, ins);
	} else {
		int sreg, vreg;

		MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
		ins->klass = cmethod->klass;
		ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
		ins->type = STACK_I4;
		ins->dreg = vreg = alloc_ireg (cfg);
		ins->inst_c0 = intrinsic->opcode / 2;
		MONO_ADD_INS (cfg->cbb, ins);

		MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
		ins->klass = cmethod->klass;
		ins->sreg1 = vreg;
		ins->sreg2 = args [1]->dreg;
		ins->dreg = dreg;
		ins->inst_c0 = intrinsic->opcode;
		MONO_ADD_INS (cfg->cbb, ins);
	}

	if (indirect) {
		MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
		ins->klass = cmethod->klass;
		ins->dreg = args [0]->dreg;
		ins->sreg1 = dreg;
		MONO_ADD_INS (cfg->cbb, ins);
	}
	return ins;
}

/*
 * simd_intrinsic_emit_getter_op:
 *
 *   Emit IR for loading an element of a SIMD value.
 *
 * @klass is the simd type, @type is the element type.
 */
static MonoInst*
simd_intrinsic_emit_getter_op (MonoCompile *cfg, int index, MonoClass *klass, MonoType *type, MonoInst *arg)
{
	MonoInst *ins;
	int vreg, shift_bits;

	vreg = load_simd_vreg_class (cfg, klass, arg, NULL);

	if (type->type == MONO_TYPE_I8 || type->type == MONO_TYPE_U8 || type->type == MONO_TYPE_R8) {
		gboolean is_r8 = type->type == MONO_TYPE_R8;

		MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
		ins->klass = klass;
		ins->sreg1 = vreg;
		ins->inst_c0 = index;
		if (is_r8) {
			ins->type = STACK_R8;
			ins->dreg = alloc_freg (cfg);
			ins->backend.spill_var = get_double_spill_area (cfg);
		} else {
			ins->type = STACK_I8;
			ins->dreg = alloc_lreg (cfg);
		}
		MONO_ADD_INS (cfg->cbb, ins);
		return ins;
	}

	shift_bits = mono_type_elements_shift_bits (type);

	if ((index >> shift_bits) && !cfg->compile_llvm) {
		MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
		ins->klass = klass;
		ins->sreg1 = vreg;
		ins->inst_c0 = index >> shift_bits;
		ins->type = STACK_VTYPE;
		ins->dreg = vreg = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, ins);
	}

	MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (type));
	ins->klass = klass;
	ins->sreg1 = vreg;
	ins->type = STACK_I4;
	ins->dreg = vreg = alloc_ireg (cfg);
	if (cfg->compile_llvm)
		ins->inst_c0 = index;
	else
		ins->inst_c0 = index & ((1 << shift_bits) - 1);
	MONO_ADD_INS (cfg->cbb, ins);

	if (type->type == MONO_TYPE_R4) {
		MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
		ins->klass = mono_defaults.single_class;
		ins->sreg1 = vreg;
		ins->type = cfg->r4_stack_type;
		ins->dreg = alloc_freg (cfg);
		ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
		MONO_ADD_INS (cfg->cbb, ins);
	}
	return ins;
}
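
/*
 * Sketch of the IR produced above (illustrative, not from the original source):
 * wide elements (I8/U8/R8) go through OP_EXTRACT_I8/OP_EXTRACT_R8 directly; narrower
 * elements may first be rotated into the low slot with OP_PSHUFLED and then read with
 * the extract op returned by mono_type_to_extract_op (), and R4 results are finally
 * moved from the integer reg back into a float reg through the spill area.
 */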

static MonoInst*
simd_intrinsic_emit_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoMethodSignature *sig = mono_method_signature_internal (cmethod);

	return simd_intrinsic_emit_getter_op (cfg, intrinsic->opcode, cmethod->klass, sig->ret, args [0]);
}

static MonoInst*
simd_intrinsic_emit_long_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;
	gboolean is_r8 = mono_method_signature_internal (cmethod)->ret->type == MONO_TYPE_R8;

	vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);

	MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->inst_c0 = intrinsic->opcode;
	if (is_r8) {
		ins->type = STACK_R8;
		ins->dreg = alloc_freg (cfg);
		ins->backend.spill_var = get_double_spill_area (cfg);
	} else {
		ins->type = STACK_I8;
		ins->dreg = alloc_lreg (cfg);
	}
	MONO_ADD_INS (cfg->cbb, ins);

	return ins;
}

static MonoInst*
simd_intrinsic_emit_ctor (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins = NULL;
	int i, addr_reg;
	gboolean is_ldaddr = (args [0]->opcode == OP_LDADDR && args [0]->inst_left->opcode != OP_ARG);
	MonoMethodSignature *sig = mono_method_signature_internal (cmethod);
	int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
	int arg_size = mono_type_size (sig->params [0], &i);
	int opcode;

	if (sig->param_count == 1) {
		int dreg;

		if (is_ldaddr) {
			dreg = args [0]->inst_i0->dreg;
			NULLIFY_INS (args [0]);
		} else {
			g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
			dreg = alloc_ireg (cfg);
		}

		if (intrinsic)
			opcode = intrinsic->opcode;
		else
			opcode = mono_type_to_expand_op (sig->params [0]);
		MONO_INST_NEW (cfg, ins, opcode);
		ins->klass = cmethod->klass;
		ins->sreg1 = args [1]->dreg;
		ins->type = STACK_VTYPE;
		ins->dreg = dreg;
		MONO_ADD_INS (cfg->cbb, ins);
		if (sig->params [0]->type == MONO_TYPE_R4)
			ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
		else if (sig->params [0]->type == MONO_TYPE_R8)
			ins->backend.spill_var = get_double_spill_area (cfg);

		if (!is_ldaddr) {
			MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
			ins->dreg = args [0]->dreg;
			ins->sreg1 = dreg;
			MONO_ADD_INS (cfg->cbb, ins);
		}
		return ins;
	}

	if (is_ldaddr) {
		NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
		MONO_ADD_INS (cfg->cbb, ins);
		addr_reg = ins->dreg;
	} else {
		g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
		addr_reg = args [0]->dreg;
	}

	for (i = sig->param_count - 1; i >= 0; --i) {
		EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
	}

	if (sig->param_count * arg_size < 16) {
		/* If there are not enough arguments, fill the rest with 0s */
		for (i = sig->param_count; i < 16 / arg_size; ++i) {
			switch (arg_size) {
			case 4:
				MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg, OP_STOREI4_MEMBASE_IMM, addr_reg, i * arg_size, 0);
				break;
			default:
				g_assert_not_reached ();
			}
		}
	}

	if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
		int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
		NULLIFY_INS (args [0]);

		MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
		ins->klass = cmethod->klass;
		ins->sreg1 = addr_reg;
		ins->type = STACK_VTYPE;
		ins->dreg = vreg;
		MONO_ADD_INS (cfg->cbb, ins);
	}
	return ins;
}

static MonoInst*
simd_intrinsic_emit_cast (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	MonoClass *klass;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	if (cmethod->is_inflated)
		klass = mono_class_from_mono_type_internal (mono_method_signature_internal (cmethod)->ret);
	else
		klass = cmethod->klass;

	MONO_INST_NEW (cfg, ins, OP_XMOVE);
	ins->klass = klass;
	ins->type = STACK_VTYPE;
	ins->sreg1 = vreg;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static MonoInst*
simd_intrinsic_emit_shift (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg, vreg2 = -1, opcode = intrinsic->opcode;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	if (args [1]->opcode != OP_ICONST) {
		MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
		ins->klass = mono_defaults.int32_class;
		ins->sreg1 = args [1]->dreg;
		ins->type = STACK_I4;
		ins->dreg = vreg2 = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, ins);

		++opcode; /*The shift_reg version op is always +1 from the regular one.*/
	}

	MONO_INST_NEW (cfg, ins, opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->sreg2 = vreg2;

	if (args [1]->opcode == OP_ICONST) {
		ins->inst_imm = args [1]->inst_c0;
		NULLIFY_INS (args [1]);
	}

	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static gboolean
mono_op_is_packed_compare (int op)
{
	return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
}

static MonoInst*
simd_intrinsic_emit_equality_op (MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args, int opcode, int flags)
{
	MonoInst *ins;
	int left_vreg, right_vreg, tmp_vreg;

	left_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
	right_vreg = get_simd_vreg (cfg, cmethod, args [1]);

	MONO_INST_NEW (cfg, ins, opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = left_vreg;
	ins->sreg2 = right_vreg;
	ins->type = STACK_VTYPE;
	ins->klass = cmethod->klass;
	ins->dreg = tmp_vreg = alloc_ireg (cfg);
	ins->inst_c0 = flags;
	MONO_ADD_INS (cfg->cbb, ins);

	/*FIXME the next ops are SSE specific*/
	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
	ins->klass = cmethod->klass;
	ins->sreg1 = tmp_vreg;
	ins->type = STACK_I4;
	ins->dreg = tmp_vreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	/*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
	if (mono_op_is_packed_compare (opcode) || flags == SIMD_COMP_EQ) {
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
		NEW_UNALU (cfg, ins, flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
	} else {
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
		NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
	}
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}
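
/*
 * Note on the helper above (descriptive comment, not from the original source):
 * OP_EXTRACT_MASK turns the packed compare result into a bit mask, 0xFFFF when every
 * lane matched.  Packed integer compares and the SIMD_COMP_EQ case therefore test the
 * mask against 0xFFFF, while FP compares already encode "not equal" per lane, so a
 * non-zero mask is enough there, which is the OR-semantics case mentioned in the code.
 */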

static MonoInst*
simd_intrinsic_emit_equality (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	return simd_intrinsic_emit_equality_op (cfg, cmethod, args, intrinsic->opcode, intrinsic->flags);
}

static MonoInst*
simd_intrinsic_emit_shuffle (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg, vreg2 = -1;
	int param_count = mono_method_signature_internal (cmethod)->param_count;

	if (args [param_count - 1]->opcode != OP_ICONST) {
		/*TODO Shuffle with non literals is not yet supported */
		return NULL;
	}

	vreg = get_simd_vreg (cfg, cmethod, args [0]);
	if (param_count == 3)
		vreg2 = get_simd_vreg (cfg, cmethod, args [1]);

	NULLIFY_INS (args [param_count - 1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->sreg2 = vreg2;
	ins->inst_c0 = args [param_count - 1]->inst_c0;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	if (param_count == 3 && ins->opcode == OP_PSHUFLED)
		ins->opcode = OP_SHUFPS;
	return ins;
}

static MonoInst*
simd_intrinsic_emit_load_aligned (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
	ins->klass = cmethod->klass;
	ins->sreg1 = args [0]->dreg;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static MonoInst*
simd_intrinsic_emit_store (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [1]);

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->dreg = args [0]->dreg;
	ins->sreg1 = vreg;
	ins->type = STACK_VTYPE;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static MonoInst*
simd_intrinsic_emit_extract_mask (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;
	int vreg;

	vreg = get_simd_vreg (cfg, cmethod, args [0]);

	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
	ins->klass = cmethod->klass;
	ins->sreg1 = vreg;
	ins->type = STACK_I4;
	ins->dreg = alloc_ireg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);

	return ins;
}

static MonoInst*
simd_intrinsic_emit_prefetch (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
	ins->klass = cmethod->klass;
	ins->sreg1 = args [0]->dreg;
	ins->backend.arg_info = intrinsic->flags;
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static MonoInst*
simd_intrinsic_emit_const (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
{
	MonoInst *ins;

	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
	ins->klass = cmethod->klass;
	ins->type = STACK_VTYPE;
	ins->dreg = alloc_xreg (cfg);
	MONO_ADD_INS (cfg->cbb, ins);
	return ins;
}

static const char *
simd_version_name (guint32 version)
{
	switch (version) {
	case SIMD_VERSION_SSE1:
		return "sse1";
	case SIMD_VERSION_SSE2:
		return "sse2";
	case SIMD_VERSION_SSE3:
		return "sse3";
	case SIMD_VERSION_SSSE3:
		return "ssse3";
	case SIMD_VERSION_SSE41:
		return "sse41";
	case SIMD_VERSION_SSE42:
		return "sse42";
	case SIMD_VERSION_SSE4a:
		return "sse4a";
	}
	return "n/a";
}

static MonoInst*
emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsic *intrinsics, guint32 size)
{
	const SimdIntrinsic *result = (const SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
	if (!result) {
		DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", m_class_get_name (cmethod->klass), cmethod->name, fsig->param_count));
		return NULL;
	}
	if (IS_DEBUG_ON (cfg)) {
		int i, max;
		printf ("found call to intrinsic %s::%s/%d -> %s\n", m_class_get_name (cmethod->klass), cmethod->name, fsig->param_count, method_name (result->name));
		max = fsig->param_count + fsig->hasthis;
		for (i = 0; i < max; ++i) {
			printf ("param %d: ", i);
			mono_print_ins (args [i]);
		}
	}
	if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
		if (IS_DEBUG_ON (cfg)) {
			int x;
			printf ("function %s::%s/%d requires one of unsupported SIMD instruction set(s): ", m_class_get_name (cmethod->klass), cmethod->name, fsig->param_count);
			for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
				if (result->simd_version_flags & (1 << x))
					printf ("%s ", simd_version_name (1 << x));
			printf ("\n");
		}
		return NULL;
	}

	switch (result->simd_emit_mode) {
	case SIMD_EMIT_BINARY:
		return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
	case SIMD_EMIT_UNARY:
		return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
	case SIMD_EMIT_SETTER:
		return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
	case SIMD_EMIT_GETTER:
		return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
	case SIMD_EMIT_GETTER_QWORD:
		return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
	case SIMD_EMIT_CTOR:
		return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
	case SIMD_EMIT_CAST:
		return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
	case SIMD_EMIT_SHUFFLE:
		return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
	case SIMD_EMIT_SHIFT:
		return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
	case SIMD_EMIT_EQUALITY:
		return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
	case SIMD_EMIT_LOAD_ALIGNED:
		return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
	case SIMD_EMIT_STORE:
		return simd_intrinsic_emit_store (result, cfg, cmethod, args);
	case SIMD_EMIT_EXTRACT_MASK:
		return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
	case SIMD_EMIT_PREFETCH:
		return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
	}
	g_assert_not_reached ();
}
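
/*
 * emit_intrinsics () above looks the method up with mono_binary_search () keyed on the
 * method name, so every intrinsic table passed to it (vector4f_intrinsics,
 * vector2d_intrinsics, ...) must stay sorted by method name, as the comments on the
 * tables below repeat.
 */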

static int
mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
{
	MonoInst *ins;
	guint32 size;
	int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;

	size = mono_array_element_size (mono_class_from_mono_type_internal (array_type));
	mult_reg = alloc_preg (cfg);
	array_reg = arr->dreg;
	index_reg = index->dreg;

#if TARGET_SIZEOF_VOID_P == 8
	/* The array reg is 64 bits but the index reg is only 32 */
	index2_reg = alloc_preg (cfg);
	MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
#else
	index2_reg = index_reg;
#endif
	index3_reg = alloc_preg (cfg);

	if (check_bounds) {
		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
	}

	add_reg = alloc_preg (cfg);

	MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
	MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
	NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
	ins->type = STACK_PTR;
	MONO_ADD_INS (cfg->cbb, ins);

	return add_reg;
}
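
/*
 * mono_emit_vector_ldelema () checks bounds twice: once for the first element and once
 * for the last element of the 16 byte slice (index + 16/size - 1), so a vector load or
 * store can never touch memory past the end of the array.
 */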

static MonoInst*
emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
		MonoInst *load;
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);

		MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE);
		load->klass = cmethod->klass;
		load->sreg1 = addr;
		load->type = STACK_VTYPE;
		load->dreg = alloc_ireg (cfg);
		MONO_ADD_INS (cfg->cbb, load);

		return load;
	}
	if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
		MonoInst *store;
		int vreg = get_simd_vreg (cfg, cmethod, args [1]);
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);

		MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
		store->klass = cmethod->klass;
		store->dreg = addr;
		store->sreg1 = vreg;
		MONO_ADD_INS (cfg->cbb, store);

		return store;
	}
	if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
		MonoInst *ins;
		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);

		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
		NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
		MONO_ADD_INS (cfg->cbb, ins);

		return ins;
	}
	return NULL;
}

static MonoInst*
emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
		MonoInst *ins;
		EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
		return ins;
	}
	return NULL;
}

static gboolean
is_sys_numerics_assembly (MonoAssembly *assembly)
{
	return !strcmp ("System.Numerics", assembly->aname.name);
}

static gboolean
is_sys_numerics_vectors_assembly (MonoAssembly *assembly)
{
	return !strcmp ("System.Numerics.Vectors", assembly->aname.name);
}

MonoInst*
mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const char *class_name;
	MonoInst *simd_inst = NULL;

	if (is_sys_numerics_assembly (m_class_get_image (cmethod->klass)->assembly)) {
		simd_inst = emit_sys_numerics_intrinsics (cfg, cmethod, fsig, args);
		goto on_exit;
	}

	if (is_sys_numerics_vectors_assembly (m_class_get_image (cmethod->klass)->assembly)) {
		simd_inst = emit_sys_numerics_vectors_intrinsics (cfg, cmethod, fsig, args);
		goto on_exit;
	}

	if (strcmp ("Mono.Simd", m_class_get_image (cmethod->klass)->assembly->aname.name) ||
		strcmp ("Mono.Simd", m_class_get_name_space (cmethod->klass))) {
		goto on_exit;
	}

	class_name = m_class_get_name (cmethod->klass);
	if (!strcmp ("SimdRuntime", class_name)) {
		simd_inst = emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
		goto on_exit;
	}

	if (!strcmp ("ArrayExtensions", class_name)) {
		simd_inst = emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
		goto on_exit;
	}

	if (!strcmp ("VectorOperations", class_name)) {
		if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
			goto on_exit;
		class_name = m_class_get_name (mono_class_from_mono_type_internal (mono_method_signature_internal (cmethod)->params [0]));
	} else if (!m_class_is_simd_type (cmethod->klass))
		goto on_exit;

	cfg->uses_simd_intrinsics |= MONO_CFG_USES_SIMD_INTRINSICS_SIMPLIFY_INDIRECTION;
	if (!strcmp ("Vector2d", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector4f", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector2ul", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector2l", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector4ui", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector4i", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector8us", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector8s", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector16b", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}
	if (!strcmp ("Vector16sb", class_name)) {
		simd_inst = emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsic));
		goto on_exit;
	}

on_exit:

	if (simd_inst != NULL) {
		cfg->uses_simd_intrinsics |= MONO_CFG_USES_SIMD_INTRINSICS;
		cfg->uses_simd_intrinsics |= MONO_CFG_USES_SIMD_INTRINSICS_DECOMPOSE_VTYPE;
	}

	return simd_inst;
}

static void
assert_handled (MonoCompile *cfg, MonoMethod *method)
{
	MonoCustomAttrInfo *cattr;
	ERROR_DECL (error);

	if (cfg->verbose_level > 1) {
		cattr = mono_custom_attrs_from_method_checked (method, error);

		if (cattr) {
			gboolean has_attr = FALSE;
			for (int i = 0; i < cattr->num_attrs; ++i)
				if (cattr->attrs [i].ctor && (!strcmp (m_class_get_name (cattr->attrs [i].ctor->klass), "JitIntrinsicAttribute")))
					has_attr = TRUE;
			if (has_attr) {
				printf ("SIMD intrinsic unhandled: %s\n", mono_method_get_name_full (method, TRUE, TRUE, MONO_TYPE_NAME_FORMAT_IL));
				fflush (stdout);
				//g_assert_not_reached ();
			}
			mono_custom_attrs_free (cattr);
		}
	}
}

// The entries should be ordered by name
// System.Numerics.Vector2/Vector3/Vector4
static const SimdIntrinsic vector2_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R4 },
	{ SN_Abs },
	{ SN_Dot, OP_DPPS },
	{ SN_Equals, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SquareRoot, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
};
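
/*
 * Reading the table above: an entry pairs a managed method name with the SIMD opcode
 * and emit mode used for it, e.g. op_Addition is lowered to OP_ADDPS through
 * simd_intrinsic_emit_binary () when SSE1 is available; entries without an opcode,
 * such as Abs, are handled specially in emit_vector_intrinsics () below.
 */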

static MonoInst*
emit_vector_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const SimdIntrinsic *intrins;
	MonoMethodSignature *sig = mono_method_signature_internal (cmethod);
	MonoType *type = m_class_get_byval_arg (cmethod->klass);

	if (!m_class_is_simd_type (cmethod->klass))
		return NULL;

	/*
	 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
	 */
	intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector2_intrinsics, sizeof (vector2_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
	if (!intrins) {
		assert_handled (cfg, cmethod);
		return NULL;
	}

	if (cfg->verbose_level > 1) {
		char *name = mono_method_full_name (cmethod, TRUE);
		printf ("  SIMD intrinsic %s\n", name);
		g_free (name);
	}

	switch (intrins->name) {
	case SN_ctor: {
		gboolean match = TRUE;
		for (int i = 0; i < fsig->param_count; ++i)
			if (fsig->params [i]->type != MONO_TYPE_R4)
				match = FALSE;
		if (!match)
			break;
		return simd_intrinsic_emit_ctor (intrins, cfg, cmethod, args);
	}
	case SN_Equals:
		if (!(fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && fsig->params [0] == type))
			break;
		return simd_intrinsic_emit_equality (intrins, cfg, cmethod, args);
	case SN_SquareRoot:
		if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
			break;
		return simd_intrinsic_emit_unary (intrins, cfg, cmethod, args);
	case SN_Dot:
		if (!(fsig->param_count == 2 && fsig->ret->type == MONO_TYPE_R4 && fsig->params [0] == type && fsig->params [1] == type))
			break;
		if (COMPILE_LLVM (cfg)) {
			MonoInst *ins;

			ins = simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
			/* The end result is in the lowest element */
			return simd_intrinsic_emit_getter_op (cfg, 0, cmethod->klass, mono_method_signature_internal (cmethod)->ret, ins);
		}
		break;
	case SN_Abs: {
		// abs(x) = max(x, sub(0,x))
		MonoInst *zero;
		MonoInst *sub;

		if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
			break;

		MONO_INST_NEW (cfg, zero, OP_XZERO);
		zero->dreg = alloc_xreg (cfg);
		zero->klass = cmethod->klass;
		MONO_ADD_INS (cfg->cbb, zero);

		sub = simd_intrinsic_emit_binary_op (cfg, OP_SUBPS, 0, cmethod->klass, sig->params [0], sig->params [0], zero, args [0]);
		return simd_intrinsic_emit_binary_op (cfg, OP_MAXPS, 0, cmethod->klass, sig->params [0], sig->params [0], args [0], sub);
	}
	case SN_Max:
	case SN_Min:
	case SN_op_Addition:
	case SN_op_Division:
	case SN_op_Multiply:
	case SN_op_Subtraction:
		if (!(fsig->param_count == 2 && fsig->ret == type && (fsig->params [0] == type || fsig->params [0]->type == MONO_TYPE_R4) && (fsig->params [1] == type || fsig->params [1]->type == MONO_TYPE_R4)))
			break;
		return simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
	default:
		break;
	}

	assert_handled (cfg, cmethod);

	if (cfg->verbose_level > 1) {
		char *name = mono_method_full_name (cmethod, TRUE);
		printf ("  SIMD method %s not handled.\n", name);
		g_free (name);
	}
	return NULL;
}
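
/*
 * Worked example for the Abs case above (illustrative, not from the original source):
 * for 4 x r4 the sequence is zero = OP_XZERO, sub = 0 - x (OP_SUBPS), result =
 * max (x, sub) (OP_MAXPS), which equals abs(x) for floats because one of x and -x is
 * always non-negative (ignoring NaN corner cases).
 */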

static MonoInst*
emit_vector_is_hardware_accelerated_intrinsic (MonoCompile *cfg)
{
	MonoInst *ins;

	if (simd_supported_versions)
		EMIT_NEW_ICONST (cfg, ins, 1);
	else
		EMIT_NEW_ICONST (cfg, ins, 0);
	ins->type = STACK_I4;
	return ins;
}

/* These should be ordered by name */
static const SimdIntrinsic vector_t_intrinsics[] = {
	{ SN_ctor },
	{ SN_Abs },
	{ SN_CopyTo },
	{ SN_Equals },
	{ SN_GreaterThan },
	{ SN_GreaterThanOrEqual },
	{ SN_LessThan },
	{ SN_LessThanOrEqual },
	{ SN_Max },
	{ SN_Min },
	{ SN_get_AllOnes, OP_XONES },
	{ SN_get_Count },
	{ SN_get_Item },
	{ SN_get_Zero, OP_XZERO },
	{ SN_op_Addition },
	{ SN_op_BitwiseAnd },
	{ SN_op_BitwiseOr },
	{ SN_op_Division },
	{ SN_op_ExclusiveOr },
	{ SN_op_Explicit },
	{ SN_op_Multiply },
	{ SN_op_Subtraction }
};
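
/*
 * Most entries above carry no opcode on purpose: for Vector<T> the element type is
 * only known at emit time, so emit_vector_t_intrinsics () below picks the opcode with
 * the type_to_*_op () helpers instead of storing it in the table.
 */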

static MonoInst*
emit_vector_t_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const SimdIntrinsic *intrins;
	MonoType *type, *etype;
	MonoInst *ins;
	int size, len, index;

	intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector_t_intrinsics, sizeof (vector_t_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
	if (!intrins) {
		assert_handled (cfg, cmethod);
		return NULL;
	}

	type = m_class_get_byval_arg (cmethod->klass);
	etype = mono_class_get_context (cmethod->klass)->class_inst->type_argv [0];
	size = mono_class_value_size (mono_class_from_mono_type_internal (etype), NULL);
	g_assert (size);
	len = 16 / size;

	if (!MONO_TYPE_IS_PRIMITIVE (etype))
		return NULL;

	if (cfg->verbose_level > 1) {
		char *name = mono_method_full_name (cmethod, TRUE);
		printf ("  SIMD intrinsic %s\n", name);
		g_free (name);
	}

	switch (intrins->name) {
	case SN_get_Count:
		if (!(fsig->param_count == 0 && fsig->ret->type == MONO_TYPE_I4))
			break;
		EMIT_NEW_ICONST (cfg, ins, len);
		return ins;
	case SN_get_AllOnes:
	case SN_get_Zero:
		if (!(fsig->param_count == 0 && mono_metadata_type_equal (fsig->ret, type)))
			break;
		return simd_intrinsic_emit_const (intrins, cfg, cmethod, args);
	case SN_get_Item:
		g_assert (fsig->param_count == 1);
		if (args [1]->opcode != OP_ICONST)
			return NULL;
		index = args [1]->inst_c0;
		if (index < 0 || index >= len)
			return NULL;
		return simd_intrinsic_emit_getter_op (cfg, index, cmethod->klass, etype, args [0]);
	case SN_ctor:
		if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
			return simd_intrinsic_emit_ctor (NULL, cfg, cmethod, args);
		if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) {
			MonoInst *array_ins = args [1];
			MonoInst *index_ins;
			MonoInst *ldelema_ins;
			MonoInst *var;
			int end_index_reg;

			if (args [0]->opcode != OP_LDADDR)
				return NULL;

			/* .ctor (T[]) or .ctor (T[], index) */

			if (fsig->param_count == 2) {
				index_ins = args [2];
			} else {
				EMIT_NEW_ICONST (cfg, index_ins, 0);
			}

			/* Emit index check for the end (index + len - 1 < array length) */
			end_index_reg = alloc_ireg (cfg);
			EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
			MONO_EMIT_BOUNDS_CHECK (cfg, array_ins->dreg, MonoArray, max_length, end_index_reg);

			/* Load the array slice into the simd reg */
			ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type_internal (etype), array_ins, index_ins, TRUE);
			g_assert (args [0]->opcode == OP_LDADDR);
			var = (MonoInst*)args [0]->inst_p0;
			EMIT_NEW_LOAD_MEMBASE (cfg, ins, OP_LOADX_MEMBASE, var->dreg, ldelema_ins->dreg, 0);
			ins->klass = cmethod->klass;
			return args [0];
		}
		break;
	case SN_op_Explicit:
		return simd_intrinsic_emit_cast (intrins, cfg, cmethod, args);
	case SN_Equals:
		if (fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && mono_metadata_type_equal (fsig->params [0], type))
			return simd_intrinsic_emit_equality_op (cfg, cmethod, args, type_to_comp_op (etype), SIMD_COMP_EQ);
		if (fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type))
			return simd_intrinsic_emit_binary_op (cfg, type_to_comp_op (etype), 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
		break;
	case SN_GreaterThan:
	case SN_GreaterThanOrEqual:
	case SN_LessThan:
	case SN_LessThanOrEqual: {
		MonoInst *cmp1, *cmp2;
		int eq_op, gt_op;

		switch (etype->type) {
		case MONO_TYPE_I1:
		case MONO_TYPE_I2:
		case MONO_TYPE_I4:
		case MONO_TYPE_I8:
			break;
		default:
			return NULL;
		}

		eq_op = type_to_comp_op (etype);
		gt_op = type_to_gt_op (etype);

		switch (intrins->name) {
		case SN_GreaterThan:
			return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
		case SN_LessThan:
			return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
		case SN_LessThanOrEqual:
			cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
			cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
			return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
		case SN_GreaterThanOrEqual:
			cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
			cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
			return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
		default:
			g_assert_not_reached ();
		}
		break;
	}
	case SN_Abs:
		switch (etype->type) {
		case MONO_TYPE_U1:
		case MONO_TYPE_U2:
		case MONO_TYPE_U4:
		case MONO_TYPE_U8: {
			/* The argument is already unsigned, so return it */
			MONO_INST_NEW (cfg, ins, OP_XMOVE);
			ins->klass = cmethod->klass;
			ins->type = STACK_VTYPE;
			ins->sreg1 = args [0]->dreg;
			ins->dreg = alloc_xreg (cfg);
			MONO_ADD_INS (cfg->cbb, ins);
			return ins;
		}
		default:
			break;
		}
		break;
	case SN_op_Addition:
	case SN_op_Subtraction:
	case SN_op_Multiply:
	case SN_op_Division:
	case SN_op_ExclusiveOr:
	case SN_op_BitwiseAnd:
	case SN_op_BitwiseOr:
	case SN_Min:
	case SN_Max: {
		if (!(fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, fsig->params [0]) && mono_metadata_type_equal (fsig->params [0], fsig->params [1])))
			break;
		int op = 0;
		switch (intrins->name) {
		case SN_op_Addition:
			op = type_to_padd_op (etype);
			break;
		case SN_op_Subtraction:
			op = type_to_psub_op (etype);
			break;
		case SN_op_Multiply:
			op = type_to_pmul_op (etype);
			break;
		case SN_op_Division:
			op = type_to_pdiv_op (etype);
			break;
		case SN_op_ExclusiveOr:
			op = type_to_pxor_op (etype);
			break;
		case SN_op_BitwiseAnd:
			op = type_to_pand_op (etype);
			break;
		case SN_op_BitwiseOr:
			op = type_to_por_op (etype);
			break;
		case SN_Min:
			op = type_to_pmin_op (etype);
			break;
		case SN_Max:
			op = type_to_pmax_op (etype);
			break;
		default:
			g_assert_not_reached ();
		}

		return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
	}
	case SN_CopyTo: {
		MonoInst *array_ins = args [1];
		MonoInst *index_ins = args [2];
		MonoInst *ldelema_ins;
		MonoInst *var;
		int end_index_reg;

		if (args [0]->opcode != OP_LDADDR)
			return NULL;

		/* Emit index check for the end (index + len - 1 < array length) */
		end_index_reg = alloc_ireg (cfg);
		EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);

		int length_reg = alloc_ireg (cfg);
		MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg, OP_LOADI4_MEMBASE, length_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length));
		MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, length_reg, end_index_reg);
		MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "ArgumentException");

		/* Load the simd reg into the array slice */
		ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type_internal (etype), array_ins, index_ins, TRUE);
		g_assert (args [0]->opcode == OP_LDADDR);
		var = (MonoInst*)args [0]->inst_p0;
		EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, var->dreg);
		ins->klass = cmethod->klass;
		return args [0];
	}
	default:
		break;
	}

	assert_handled (cfg, cmethod);

	if (cfg->verbose_level > 1) {
		char *name = mono_method_full_name (cmethod, TRUE);
		printf ("  SIMD method %s not handled.\n", name);
		g_free (name);
	}

	return NULL;
}
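
/*
 * Illustrative mapping (assumed, not from the original source): for Vector<int> the
 * op_Addition case above resolves type_to_padd_op (etype) to the packed 32 bit integer
 * add, while Vector<float> resolves it to the packed single precision add, so a single
 * table entry covers every element type.
 */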

/*
 * emit_sys_numerics_intrinsics:
 *
 *   Emit intrinsics for the System.Numerics assembly.
 */
static MonoInst*
emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const char *nspace = m_class_get_name_space (cmethod->klass);
	const char *class_name = m_class_get_name (cmethod->klass);

	if (!strcmp ("Vector2", class_name) || !strcmp ("Vector4", class_name) || !strcmp ("Vector3", class_name))
		return emit_vector_intrinsics (cfg, cmethod, fsig, args);

	if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
		if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
			return emit_vector_is_hardware_accelerated_intrinsic (cfg);
	}

	return NULL;
}

static MonoInst*
emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
{
	const char *nspace = m_class_get_name_space (cmethod->klass);
	const char *class_name = m_class_get_name (cmethod->klass);

	if (!strcmp (class_name, "Vector`1"))
		return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);

	if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
		if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
			return emit_vector_is_hardware_accelerated_intrinsic (cfg);
	}

	return NULL;
}

MonoInst*
mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
{
	MonoInst *simd_inst = NULL;

	if (is_sys_numerics_assembly (m_class_get_image (field->parent)->assembly)) {
		int index = -1;

		const char *parent_name = m_class_get_name (field->parent);
		if (!strcmp (parent_name, "Vector2") ||
			!strcmp (parent_name, "Vector3") ||
			!strcmp (parent_name, "Vector4")) {
			if (!strcmp (field->name, "X"))
				index = 0;
			else if (!strcmp (field->name, "Y"))
				index = 1;
			else if (!strcmp (field->name, "Z"))
				index = 2;
			else if (!strcmp (field->name, "W"))
				index = 3;
		}

		if (index != -1) {
			if (cfg->verbose_level > 1)
				printf ("  SIMD intrinsic field access: %s\n", field->name);

			simd_inst = simd_intrinsic_emit_getter_op (cfg, index, field->parent, mono_field_get_type_internal (field), addr);
			goto on_exit;
		}
	}

on_exit:

	if (simd_inst != NULL) {
		cfg->uses_simd_intrinsics |= MONO_CFG_USES_SIMD_INTRINSICS;
		cfg->uses_simd_intrinsics |= MONO_CFG_USES_SIMD_INTRINSICS_DECOMPOSE_VTYPE;
	}

	return simd_inst;
}
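
/*
 * mono_emit_simd_field_load () maps the X/Y/Z/W fields of System.Numerics.Vector2/3/4
 * to element indexes 0-3 and reuses simd_intrinsic_emit_getter_op (), so a direct
 * field read compiles to the same extract sequence as a property getter call.
 */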

#endif /* DISABLE_JIT */
#endif /* MONO_ARCH_SIMD_INTRINSICS */