3 * simd support for intrinsics
6 * Rodrigo Kumpera (rkumpera@novell.com)
8 * (C) 2008 Novell, Inc.
16 #include "mono/utils/bsearch.h"
17 #include <mono/metadata/abi-details.h>
18 #include <mono/metadata/reflection-internals.h>
21 General notes on SIMD intrinsics
23 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
24 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
25 TODO extend op_to_op_dest_membase to handle simd ops
26 TODO add support for indexed versions of simd ops
27 TODO do an amd64 port and figure out how to properly handle extractors/.ctor
28 TODO make sure locals, arguments and spills are properly aligned.
29 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
30 TODO add stuff to man pages
31 TODO document this under /docs
32 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
33 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
34 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
35 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
36 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
37 TODO passing simd args byval to a non-intrinsic method causes some useless local var load/store to happen.
38 TODO check if we need to init the SSE control word with better precision.
39 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
40 TODO make SimdRuntime.get_AccelMode work under AOT
41 TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding a indirection reduction pass after the dce pass.
42 TODO extend bounds checking code to support range checking.
44 General notes for SIMD intrinsics.
46 -Bad extractor and constructor performance
47 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
48 It will be loaded in the FP stack just to be pushed on the call stack.
50 A similar thing happens with the Vector4f constructor, which requires float vars to be
52 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
53 trip to the FP stack is desirable.
55 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
59 -Promote OP_EXTRACT_I4 to a STORE op
60 The advantage of this change is that it could have a _membase version and promote further optimizations.
62 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
66 #if defined (MONO_ARCH_SIMD_INTRINSICS)
68 #if defined (DISABLE_JIT)
71 mono_simd_intrinsics_init (void)
77 //#define IS_DEBUG_ON(cfg) (0)
79 #define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
80 #define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
86 SIMD_EMIT_GETTER_QWORD
,
92 SIMD_EMIT_LOAD_ALIGNED
,
94 SIMD_EMIT_EXTRACT_MASK
,
98 // This, instead of an array of pointers, to optimize away a pointer and a relocation per string.
99 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
100 #define MSGSTRFIELD1(line) str##line
101 static const struct msgstr_t
{
102 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
103 #include "simd-methods.h"
106 #define SIMD_METHOD(str,name) str,
107 #include "simd-methods.h"
112 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
113 #include "simd-methods.h"
115 #define method_name(idx) ((const char*)&method_names + (idx))
120 guint8 simd_version_flags
;
121 guint8 simd_emit_mode
: 4;
125 static const SimdIntrinsic vector4f_intrinsics
[] = {
126 { SN_ctor
, OP_EXPAND_R4
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
127 { SN_AddSub
, OP_ADDSUBPS
, SIMD_VERSION_SSE3
, SIMD_EMIT_BINARY
},
128 { SN_AndNot
, OP_ANDNPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
129 { SN_CompareEqual
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_EQ
},
130 { SN_CompareLessEqual
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_LE
},
131 { SN_CompareLessThan
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_LT
},
132 { SN_CompareNotEqual
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_NEQ
},
133 { SN_CompareNotLessEqual
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_NLE
},
134 { SN_CompareNotLessThan
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_NLT
},
135 { SN_CompareOrdered
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_ORD
},
136 { SN_CompareUnordered
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_UNORD
},
137 { SN_ConvertToDouble
, OP_CVTPS2PD
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
138 { SN_ConvertToInt
, OP_CVTPS2DQ
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
139 { SN_ConvertToIntTruncated
, OP_CVTTPS2DQ
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
140 { SN_DuplicateHigh
, OP_DUPPS_HIGH
, SIMD_VERSION_SSE3
, SIMD_EMIT_UNARY
},
141 { SN_DuplicateLow
, OP_DUPPS_LOW
, SIMD_VERSION_SSE3
, SIMD_EMIT_UNARY
},
142 { SN_HorizontalAdd
, OP_HADDPS
, SIMD_VERSION_SSE3
, SIMD_EMIT_BINARY
},
143 { SN_HorizontalSub
, OP_HSUBPS
, SIMD_VERSION_SSE3
, SIMD_EMIT_BINARY
},
144 { SN_InterleaveHigh
, OP_UNPACK_HIGHPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
145 { SN_InterleaveLow
, OP_UNPACK_LOWPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
146 { SN_InvSqrt
, OP_RSQRTPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_UNARY
},
147 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
148 { SN_Max
, OP_MAXPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
149 { SN_Min
, OP_MINPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
150 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
151 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
152 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
153 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
154 { SN_Reciprocal
, OP_RCPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_UNARY
},
155 { SN_Shuffle
, OP_PSHUFLED
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
156 { SN_Sqrt
, OP_SQRTPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_UNARY
},
157 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
158 { SN_StoreNonTemporal
, OP_STOREX_NTA_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
159 { SN_get_W
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
160 { SN_get_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
161 { SN_get_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
162 { SN_get_Z
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
163 { SN_op_Addition
, OP_ADDPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
164 { SN_op_BitwiseAnd
, OP_ANDPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
165 { SN_op_BitwiseOr
, OP_ORPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
166 { SN_op_Division
, OP_DIVPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
167 { SN_op_Equality
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
168 { SN_op_ExclusiveOr
, OP_XORPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
169 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
170 { SN_op_Inequality
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
171 { SN_op_Multiply
, OP_MULPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
172 { SN_op_Subtraction
, OP_SUBPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
173 { SN_set_W
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
174 { SN_set_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
175 { SN_set_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
176 { SN_set_Z
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
}
179 static const SimdIntrinsic vector2d_intrinsics
[] = {
180 { SN_ctor
, OP_EXPAND_R8
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
181 { SN_AddSub
, OP_ADDSUBPD
, SIMD_VERSION_SSE3
, SIMD_EMIT_BINARY
,},
182 { SN_AndNot
, OP_ANDNPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
183 { SN_CompareEqual
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_EQ
},
184 { SN_CompareLessEqual
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_LE
},
185 { SN_CompareLessThan
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_LT
},
186 { SN_CompareNotEqual
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_NEQ
},
187 { SN_CompareNotLessEqual
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_NLE
},
188 { SN_CompareNotLessThan
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_NLT
},
189 { SN_CompareOrdered
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_ORD
},
190 { SN_CompareUnordered
, OP_COMPPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_COMP_UNORD
},
191 { SN_ConvertToFloat
, OP_CVTPD2PS
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
192 { SN_ConvertToInt
, OP_CVTPD2DQ
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
193 { SN_ConvertToIntTruncated
, OP_CVTTPD2DQ
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
194 { SN_Duplicate
, OP_DUPPD
, SIMD_VERSION_SSE3
, SIMD_EMIT_UNARY
},
195 { SN_HorizontalAdd
, OP_HADDPD
, SIMD_VERSION_SSE3
, SIMD_EMIT_BINARY
},
196 { SN_HorizontalSub
, OP_HSUBPD
, SIMD_VERSION_SSE3
, SIMD_EMIT_BINARY
},
197 { SN_InterleaveHigh
, OP_UNPACK_HIGHPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
198 { SN_InterleaveLow
, OP_UNPACK_LOWPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
199 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
200 { SN_Max
, OP_MAXPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
201 { SN_Min
, OP_MINPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
202 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
203 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
204 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
205 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
206 { SN_Shuffle
, OP_SHUFPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
207 { SN_Sqrt
, OP_SQRTPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_UNARY
},
208 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
209 { SN_get_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER_QWORD
},
210 { SN_get_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER_QWORD
},
211 { SN_op_Addition
, OP_ADDPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
212 { SN_op_BitwiseAnd
, OP_ANDPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
213 { SN_op_BitwiseOr
, OP_ORPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
214 { SN_op_Division
, OP_DIVPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
215 { SN_op_ExclusiveOr
, OP_XORPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
216 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
217 { SN_op_Multiply
, OP_MULPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
218 { SN_op_Subtraction
, OP_SUBPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
219 { SN_set_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
220 { SN_set_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
223 static const SimdIntrinsic vector2ul_intrinsics
[] = {
224 { SN_ctor
, OP_EXPAND_I8
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
225 { SN_CompareEqual
, OP_PCMPEQQ
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
226 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
227 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
228 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
229 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
230 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
231 { SN_Shuffle
, OP_SHUFPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
232 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
233 { SN_UnpackHigh
, OP_UNPACK_HIGHQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
234 { SN_UnpackLow
, OP_UNPACK_LOWQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
235 { SN_get_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER_QWORD
},
236 { SN_get_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER_QWORD
},
237 { SN_op_Addition
, OP_PADDQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
238 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
239 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
240 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
},
241 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
242 { SN_op_LeftShift
, OP_PSHLQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
243 { SN_op_Multiply
, OP_PMULQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
244 { SN_op_RightShift
, OP_PSHRQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
245 { SN_op_Subtraction
, OP_PSUBQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
246 { SN_set_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
247 { SN_set_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
250 static const SimdIntrinsic vector2l_intrinsics
[] = {
251 { SN_ctor
, OP_EXPAND_I8
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
252 { SN_CompareEqual
, OP_PCMPEQQ
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
253 { SN_CompareGreaterThan
, OP_PCMPGTQ
, SIMD_VERSION_SSE42
, SIMD_EMIT_BINARY
},
254 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
255 { SN_LogicalRightShift
, OP_PSHRQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
256 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
257 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
258 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
259 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
260 { SN_Shuffle
, OP_SHUFPD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
261 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
262 { SN_UnpackHigh
, OP_UNPACK_HIGHQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
263 { SN_UnpackLow
, OP_UNPACK_LOWQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
264 { SN_get_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER_QWORD
},
265 { SN_get_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER_QWORD
},
266 { SN_op_Addition
, OP_PADDQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
267 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
268 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
269 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
270 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
271 { SN_op_LeftShift
, OP_PSHLQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
272 { SN_op_Multiply
, OP_PMULQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
273 { SN_op_Subtraction
, OP_PSUBQ
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
274 { SN_set_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
275 { SN_set_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
278 static const SimdIntrinsic vector4ui_intrinsics
[] = {
279 { SN_ctor
, OP_EXPAND_I4
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
280 { SN_ArithmeticRightShift
, OP_PSARD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
281 { SN_CompareEqual
, OP_PCMPEQD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
282 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
283 { SN_Max
, OP_PMAXD_UN
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
284 { SN_Min
, OP_PMIND_UN
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
285 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
286 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
287 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
288 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
289 { SN_Shuffle
, OP_PSHUFLED
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
290 { SN_SignedPackWithSignedSaturation
, OP_PACKD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
291 { SN_SignedPackWithUnsignedSaturation
, OP_PACKD_UN
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
292 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
293 { SN_UnpackHigh
, OP_UNPACK_HIGHD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
294 { SN_UnpackLow
, OP_UNPACK_LOWD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
295 { SN_get_W
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
296 { SN_get_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
297 { SN_get_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
298 { SN_get_Z
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
299 { SN_op_Addition
, OP_PADDD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
300 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
301 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
302 { SN_op_Equality
, OP_PCMPEQD
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
303 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
304 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
305 { SN_op_Inequality
, OP_PCMPEQD
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
306 { SN_op_LeftShift
, OP_PSHLD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
307 { SN_op_Multiply
, OP_PMULD
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
308 { SN_op_RightShift
, OP_PSHRD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
309 { SN_op_Subtraction
, OP_PSUBD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
310 { SN_set_W
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
311 { SN_set_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
312 { SN_set_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
313 { SN_set_Z
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
316 static const SimdIntrinsic vector4i_intrinsics
[] = {
317 { SN_ctor
, OP_EXPAND_I4
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
318 { SN_CompareEqual
, OP_PCMPEQD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
319 { SN_CompareGreaterThan
, OP_PCMPGTD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
320 { SN_ConvertToDouble
, OP_CVTDQ2PD
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
321 { SN_ConvertToFloat
, OP_CVTDQ2PS
, SIMD_VERSION_SSE2
, SIMD_EMIT_UNARY
},
322 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
323 { SN_LogicalRightShift
, OP_PSHRD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
324 { SN_Max
, OP_PMAXD
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
325 { SN_Min
, OP_PMIND
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
326 { SN_PackWithSignedSaturation
, OP_PACKD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
327 { SN_PackWithUnsignedSaturation
, OP_PACKD_UN
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
328 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
329 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
330 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
331 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
332 { SN_Shuffle
, OP_PSHUFLED
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
333 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
334 { SN_UnpackHigh
, OP_UNPACK_HIGHD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
335 { SN_UnpackLow
, OP_UNPACK_LOWD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
336 { SN_get_W
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
337 { SN_get_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
338 { SN_get_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
339 { SN_get_Z
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
340 { SN_op_Addition
, OP_PADDD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
341 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
342 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
343 { SN_op_Equality
, OP_PCMPEQD
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
344 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
345 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
346 { SN_op_Inequality
, OP_PCMPEQD
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
347 { SN_op_LeftShift
, OP_PSHLD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
348 { SN_op_Multiply
, OP_PMULD
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
349 { SN_op_RightShift
, OP_PSARD
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
350 { SN_op_Subtraction
, OP_PSUBD
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
351 { SN_set_W
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
352 { SN_set_X
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
353 { SN_set_Y
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
354 { SN_set_Z
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
357 static const SimdIntrinsic vector8us_intrinsics
[] = {
358 { SN_ctor
, OP_EXPAND_I2
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
359 { SN_AddWithSaturation
, OP_PADDW_SAT_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
360 { SN_ArithmeticRightShift
, OP_PSARW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
361 { SN_Average
, OP_PAVGW_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
362 { SN_CompareEqual
, OP_PCMPEQW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
, SIMD_VERSION_SSE1
},
363 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
364 { SN_Max
, OP_PMAXW_UN
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
365 { SN_Min
, OP_PMINW_UN
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
366 { SN_MultiplyStoreHigh
, OP_PMULW_HIGH_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
367 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
368 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
369 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
370 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
371 { SN_ShuffleHigh
, OP_PSHUFLEW_HIGH
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
372 { SN_ShuffleLow
, OP_PSHUFLEW_LOW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
373 { SN_SignedPackWithSignedSaturation
, OP_PACKW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
374 { SN_SignedPackWithUnsignedSaturation
, OP_PACKW_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
375 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
376 { SN_SubtractWithSaturation
, OP_PSUBW_SAT_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
377 { SN_UnpackHigh
, OP_UNPACK_HIGHW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
378 { SN_UnpackLow
, OP_UNPACK_LOWW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
379 { SN_get_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
380 { SN_get_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
381 { SN_get_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
382 { SN_get_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
383 { SN_get_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
384 { SN_get_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
385 { SN_get_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
386 { SN_get_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
387 { SN_op_Addition
, OP_PADDW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
388 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
389 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
390 { SN_op_Equality
, OP_PCMPEQW
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
391 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
392 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
393 { SN_op_Inequality
, OP_PCMPEQW
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
394 { SN_op_LeftShift
, OP_PSHLW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
395 { SN_op_Multiply
, OP_PMULW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
396 { SN_op_RightShift
, OP_PSHRW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
397 { SN_op_Subtraction
, OP_PSUBW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
398 { SN_set_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
399 { SN_set_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
400 { SN_set_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
401 { SN_set_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
402 { SN_set_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
403 { SN_set_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
404 { SN_set_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
405 { SN_set_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
408 static const SimdIntrinsic vector8s_intrinsics
[] = {
409 { SN_ctor
, OP_EXPAND_I2
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
410 { SN_AddWithSaturation
, OP_PADDW_SAT
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
411 { SN_CompareEqual
, OP_PCMPEQW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
412 { SN_CompareGreaterThan
, OP_PCMPGTW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
413 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
414 { SN_LogicalRightShift
, OP_PSHRW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
415 { SN_Max
, OP_PMAXW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
416 { SN_Min
, OP_PMINW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
417 { SN_MultiplyStoreHigh
, OP_PMULW_HIGH
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
418 { SN_PackWithSignedSaturation
, OP_PACKW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
419 { SN_PackWithUnsignedSaturation
, OP_PACKW_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
420 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
421 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
422 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
423 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
424 { SN_ShuffleHigh
, OP_PSHUFLEW_HIGH
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
425 { SN_ShuffleLow
, OP_PSHUFLEW_LOW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHUFFLE
},
426 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
427 { SN_SubtractWithSaturation
, OP_PSUBW_SAT_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
428 { SN_UnpackHigh
, OP_UNPACK_HIGHW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
429 { SN_UnpackLow
, OP_UNPACK_LOWW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
430 { SN_get_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
431 { SN_get_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
432 { SN_get_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
433 { SN_get_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
434 { SN_get_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
435 { SN_get_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
436 { SN_get_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
437 { SN_get_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
438 { SN_op_Addition
, OP_PADDW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
439 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
440 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
441 { SN_op_Equality
, OP_PCMPEQW
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
442 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
443 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
444 { SN_op_Inequality
, OP_PCMPEQW
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
445 { SN_op_LeftShift
, OP_PSHLW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
446 { SN_op_Multiply
, OP_PMULW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
447 { SN_op_RightShift
, OP_PSARW
, SIMD_VERSION_SSE1
, SIMD_EMIT_SHIFT
},
448 { SN_op_Subtraction
, OP_PSUBW
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
449 { SN_set_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
450 { SN_set_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
451 { SN_set_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
452 { SN_set_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
453 { SN_set_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
454 { SN_set_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
455 { SN_set_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
456 { SN_set_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
459 static const SimdIntrinsic vector16b_intrinsics
[] = {
460 { SN_ctor
, OP_EXPAND_I1
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
461 { SN_AddWithSaturation
, OP_PADDB_SAT_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
462 { SN_Average
, OP_PAVGB_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
463 { SN_CompareEqual
, OP_PCMPEQB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
464 { SN_ExtractByteMask
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_EXTRACT_MASK
},
465 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
466 { SN_Max
, OP_PMAXB_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
467 { SN_Min
, OP_PMINB_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
468 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
469 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
470 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
471 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
472 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
473 { SN_SubtractWithSaturation
, OP_PSUBB_SAT_UN
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
474 { SN_SumOfAbsoluteDifferences
, OP_PSUM_ABS_DIFF
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
475 { SN_UnpackHigh
, OP_UNPACK_HIGHB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
476 { SN_UnpackLow
, OP_UNPACK_LOWB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
477 { SN_get_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
478 { SN_get_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
479 { SN_get_V10
, 10, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
480 { SN_get_V11
, 11, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
481 { SN_get_V12
, 12, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
482 { SN_get_V13
, 13, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
483 { SN_get_V14
, 14, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
484 { SN_get_V15
, 15, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
485 { SN_get_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
486 { SN_get_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
487 { SN_get_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
488 { SN_get_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
489 { SN_get_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
490 { SN_get_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
491 { SN_get_V8
, 8, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
492 { SN_get_V9
, 9, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
493 { SN_op_Addition
, OP_PADDB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
494 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
495 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
496 { SN_op_Equality
, OP_PCMPEQB
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
497 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
498 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
499 { SN_op_Inequality
, OP_PCMPEQB
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
500 { SN_op_Subtraction
, OP_PSUBB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
501 { SN_set_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
502 { SN_set_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
503 { SN_set_V10
, 10, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
504 { SN_set_V11
, 11, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
505 { SN_set_V12
, 12, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
506 { SN_set_V13
, 13, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
507 { SN_set_V14
, 14, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
508 { SN_set_V15
, 15, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
509 { SN_set_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
510 { SN_set_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
511 { SN_set_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
512 { SN_set_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
513 { SN_set_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
514 { SN_set_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
515 { SN_set_V8
, 8, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
516 { SN_set_V9
, 9, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
523 static const SimdIntrinsic vector16sb_intrinsics
[] = {
524 { SN_ctor
, OP_EXPAND_I1
, SIMD_VERSION_SSE1
, SIMD_EMIT_CTOR
},
525 { SN_AddWithSaturation
, OP_PADDB_SAT
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
526 { SN_CompareEqual
, OP_PCMPEQB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
527 { SN_CompareGreaterThan
, OP_PCMPGTB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
528 { SN_ExtractByteMask
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_EXTRACT_MASK
},
529 { SN_LoadAligned
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_LOAD_ALIGNED
},
530 { SN_Max
, OP_PMAXB
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
531 { SN_Min
, OP_PMINB
, SIMD_VERSION_SSE41
, SIMD_EMIT_BINARY
},
532 { SN_PrefetchTemporalAllCacheLevels
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_0
},
533 { SN_PrefetchTemporal1stLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_1
},
534 { SN_PrefetchTemporal2ndLevelCache
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_2
},
535 { SN_PrefetchNonTemporal
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_PREFETCH
, SIMD_PREFETCH_MODE_NTA
},
536 { SN_StoreAligned
, OP_STOREX_ALIGNED_MEMBASE_REG
, SIMD_VERSION_SSE1
, SIMD_EMIT_STORE
},
537 { SN_SubtractWithSaturation
, OP_PSUBB_SAT
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
538 { SN_UnpackHigh
, OP_UNPACK_HIGHB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
539 { SN_UnpackLow
, OP_UNPACK_LOWB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
540 { SN_get_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
541 { SN_get_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
542 { SN_get_V10
, 10, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
543 { SN_get_V11
, 11, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
544 { SN_get_V12
, 12, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
545 { SN_get_V13
, 13, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
546 { SN_get_V14
, 14, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
547 { SN_get_V15
, 15, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
548 { SN_get_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
549 { SN_get_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
550 { SN_get_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
551 { SN_get_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
552 { SN_get_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
553 { SN_get_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
554 { SN_get_V8
, 8, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
555 { SN_get_V9
, 9, SIMD_VERSION_SSE1
, SIMD_EMIT_GETTER
},
556 { SN_op_Addition
, OP_PADDB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
557 { SN_op_BitwiseAnd
, OP_PAND
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
558 { SN_op_BitwiseOr
, OP_POR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
559 { SN_op_Equality
, OP_PCMPEQB
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
560 { SN_op_ExclusiveOr
, OP_PXOR
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
561 { SN_op_Explicit
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_CAST
},
562 { SN_op_Inequality
, OP_PCMPEQB
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_NEQ
},
563 { SN_op_Subtraction
, OP_PSUBB
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
564 { SN_set_V0
, 0, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
565 { SN_set_V1
, 1, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
566 { SN_set_V10
, 10, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
567 { SN_set_V11
, 11, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
568 { SN_set_V12
, 12, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
569 { SN_set_V13
, 13, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
570 { SN_set_V14
, 14, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
571 { SN_set_V15
, 15, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
572 { SN_set_V2
, 2, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
573 { SN_set_V3
, 3, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
574 { SN_set_V4
, 4, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
575 { SN_set_V5
, 5, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
576 { SN_set_V6
, 6, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
577 { SN_set_V7
, 7, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
578 { SN_set_V8
, 8, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
579 { SN_set_V9
, 9, SIMD_VERSION_SSE1
, SIMD_EMIT_SETTER
},
/* Bitmask of the SIMD instruction-set versions available at runtime; filled in
 * once by mono_simd_intrinsics_init via mono_arch_cpu_enumerate_simd_versions. */
static guint32 simd_supported_versions;
/* Forward declarations: emitters for the System.Numerics and
 * System.Numerics.Vectors managed intrinsics (defined later in this file). */
static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
587 /*TODO match using number of parameters as well*/
589 simd_intrinsic_compare_by_name (const void *key
, const void *value
)
591 return strcmp ((const char*)key
, method_name (((SimdIntrinsic
*)value
)->name
));
/* Per-vreg flags used by mono_simd_simplify_indirection to track how each SIMD
 * variable is defined and used across basic blocks. */
enum {
	VREG_USED             = 0x01,
	VREG_HAS_XZERO_BB0    = 0x02,
	VREG_HAS_OTHER_OP_BB0 = 0x04,
	VREG_SINGLE_BB_USE    = 0x08,
	VREG_MANY_BB_USE      = 0x10,
};
603 mono_simd_intrinsics_init (void)
605 simd_supported_versions
= mono_arch_cpu_enumerate_simd_versions ();
606 /*TODO log the supported flags*/
609 static inline gboolean
610 apply_vreg_first_block_interference (MonoCompile
*cfg
, MonoInst
*ins
, int reg
, int max_vreg
, char *vreg_flags
)
612 if (reg
!= -1 && reg
<= max_vreg
&& vreg_flags
[reg
]) {
613 vreg_flags
[reg
] &= ~VREG_HAS_XZERO_BB0
;
614 vreg_flags
[reg
] |= VREG_HAS_OTHER_OP_BB0
;
615 DEBUG (printf ("[simd-simplify] R%d used: ", reg
); mono_print_ins(ins
));
621 static inline gboolean
622 apply_vreg_following_block_interference (MonoCompile
*cfg
, MonoInst
*ins
, int reg
, MonoBasicBlock
*bb
, int max_vreg
, char *vreg_flags
, MonoBasicBlock
**target_bb
)
624 if (reg
== -1 || reg
> max_vreg
|| !(vreg_flags
[reg
] & VREG_HAS_XZERO_BB0
) || target_bb
[reg
] == bb
)
627 if (vreg_flags
[reg
] & VREG_SINGLE_BB_USE
) {
628 vreg_flags
[reg
] &= ~VREG_SINGLE_BB_USE
;
629 vreg_flags
[reg
] |= VREG_MANY_BB_USE
;
630 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg
); mono_print_ins(ins
));
632 } else if (!(vreg_flags
[reg
] & VREG_MANY_BB_USE
)) {
633 vreg_flags
[reg
] |= VREG_SINGLE_BB_USE
;
634 target_bb
[reg
] = bb
;
635 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg
); mono_print_ins(ins
));
642 This pass recalculate which vars need MONO_INST_INDIRECT.
644 We cannot do this for non SIMD vars since code like mono_get_vtable_var
645 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
648 mono_simd_simplify_indirection (MonoCompile
*cfg
)
651 MonoBasicBlock
*bb
, *first_bb
= NULL
, **target_bb
;
655 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
656 MonoInst
*var
= cfg
->varinfo
[i
];
657 if (m_class_is_simd_type (var
->klass
)) {
658 var
->flags
&= ~MONO_INST_INDIRECT
;
659 max_vreg
= MAX (var
->dreg
, max_vreg
);
663 for (bb
= cfg
->bb_entry
; bb
; bb
= bb
->next_bb
) {
664 if (!first_bb
&& bb
->code
)
666 for (ins
= bb
->code
; ins
; ins
= ins
->next
) {
667 if (ins
->opcode
== OP_LDADDR
) {
668 MonoInst
*var
= (MonoInst
*)ins
->inst_p0
;
669 if (m_class_is_simd_type (var
->klass
)) {
670 var
->flags
|= MONO_INST_INDIRECT
;
676 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg
));
677 vreg_flags
= (char *)g_malloc0 (max_vreg
+ 1);
678 target_bb
= g_new0 (MonoBasicBlock
*, max_vreg
+ 1);
680 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
681 MonoInst
*var
= cfg
->varinfo
[i
];
682 if (m_class_is_simd_type (var
->klass
) && !(var
->flags
& (MONO_INST_INDIRECT
|MONO_INST_VOLATILE
))) {
683 vreg_flags
[var
->dreg
] = VREG_USED
;
684 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i
, var
->dreg
));
688 /*Scan the first basic block looking xzeros not used*/
689 for (ins
= first_bb
->code
; ins
; ins
= ins
->next
) {
691 int sregs
[MONO_MAX_SRC_REGS
];
693 if (ins
->opcode
== OP_XZERO
) {
694 if (!(vreg_flags
[ins
->dreg
] & VREG_HAS_OTHER_OP_BB0
)) {
695 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins
->dreg
); mono_print_ins(ins
));
696 vreg_flags
[ins
->dreg
] |= VREG_HAS_XZERO_BB0
;
700 if (ins
->opcode
== OP_LDADDR
&& apply_vreg_first_block_interference (cfg
, ins
, ((MonoInst
*)ins
->inst_p0
)->dreg
, max_vreg
, vreg_flags
))
702 if (apply_vreg_first_block_interference (cfg
, ins
, ins
->dreg
, max_vreg
, vreg_flags
))
704 num_sregs
= mono_inst_get_src_registers (ins
, sregs
);
705 for (i
= 0; i
< num_sregs
; ++i
) {
706 if (apply_vreg_first_block_interference (cfg
, ins
, sregs
[i
], max_vreg
, vreg_flags
))
711 if (IS_DEBUG_ON (cfg
)) {
712 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
713 MonoInst
*var
= cfg
->varinfo
[i
];
714 if (m_class_is_simd_type (var
->klass
)) {
715 if ((vreg_flags
[var
->dreg
] & VREG_HAS_XZERO_BB0
))
716 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var
->dreg
));
717 if ((vreg_flags
[var
->dreg
] & VREG_HAS_OTHER_OP_BB0
))
718 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var
->dreg
));
723 /*TODO stop here if no var is xzero only*/
726 Scan all other bb and check if it has only one other use
727 Ideally this would be done after an extended bb formation pass
729 FIXME This pass could use dominator information to properly
730 place the XZERO on the bb that dominates all uses of the var,
731 but this will have zero effect with the current local reg alloc
733 TODO simply the use of flags.
736 for (bb
= first_bb
->next_bb
; bb
; bb
= bb
->next_bb
) {
737 for (ins
= bb
->code
; ins
; ins
= ins
->next
) {
739 int sregs
[MONO_MAX_SRC_REGS
];
741 if (ins
->opcode
== OP_LDADDR
&& apply_vreg_following_block_interference (cfg
, ins
, ((MonoInst
*)ins
->inst_p0
)->dreg
, bb
, max_vreg
, vreg_flags
, target_bb
))
743 if (apply_vreg_following_block_interference (cfg
, ins
, ins
->dreg
, bb
, max_vreg
, vreg_flags
, target_bb
))
745 num_sregs
= mono_inst_get_src_registers (ins
, sregs
);
746 for (i
= 0; i
< num_sregs
; ++i
) {
747 if (apply_vreg_following_block_interference (cfg
, ins
, sregs
[i
], bb
,
748 max_vreg
, vreg_flags
, target_bb
))
754 for (i
= 0; i
< cfg
->num_varinfo
; i
++) {
755 MonoInst
*var
= cfg
->varinfo
[i
];
756 if (!m_class_is_simd_type (var
->klass
))
758 if ((vreg_flags
[var
->dreg
] & VREG_SINGLE_BB_USE
))
759 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var
->dreg
));
760 if ((vreg_flags
[var
->dreg
] & VREG_MANY_BB_USE
))
761 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var
->dreg
));
763 if (!(vreg_flags
[var
->dreg
] & VREG_SINGLE_BB_USE
))
765 for (ins
= target_bb
[var
->dreg
]->code
; ins
; ins
= ins
->next
) {
767 int sregs
[MONO_MAX_SRC_REGS
];
768 gboolean found
= FALSE
;
770 num_sregs
= mono_inst_get_src_registers (ins
, sregs
);
771 for (j
= 0; j
< num_sregs
; ++j
) {
772 if (sregs
[j
] == var
->dreg
)
775 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
776 if (ins
->dreg
== var
->dreg
&& !found
) {
777 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i
, target_bb
[var
->dreg
]->block_num
););
780 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i
, target_bb
[var
->dreg
]->block_num
); );
782 MONO_INST_NEW (cfg
, tmp
, OP_XZERO
);
783 tmp
->dreg
= var
->dreg
;
784 tmp
->type
= STACK_VTYPE
;
785 tmp
->klass
= var
->klass
;
786 mono_bblock_insert_before_ins (target_bb
[var
->dreg
], ins
, tmp
);
792 for (ins
= first_bb
->code
; ins
; ins
= ins
->next
) {
793 if (ins
->opcode
== OP_XZERO
&& (vreg_flags
[ins
->dreg
] & VREG_SINGLE_BB_USE
)) {
794 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins
->dreg
); mono_print_ins(ins
));
 * Windows x64 value type ABI uses reg/stack references (ArgValuetypeAddrInIReg/ArgValuetypeAddrOnStack)
 * for function arguments. When using SIMD intrinsics, arguments optimized into OP_ARG need to be decomposed
 * into corresponding SIMD LOADX/STOREX instructions.
808 #if defined(TARGET_WIN32) && defined(TARGET_AMD64)
810 decompose_vtype_opt_uses_simd_intrinsics (MonoCompile
*cfg
, MonoInst
*ins
)
812 if (cfg
->uses_simd_intrinsics
& MONO_CFG_USES_SIMD_INTRINSICS_DECOMPOSE_VTYPE
)
815 switch (ins
->opcode
) {
818 case OP_LOADX_MEMBASE
:
819 case OP_LOADX_ALIGNED_MEMBASE
:
820 case OP_STOREX_MEMBASE
:
821 case OP_STOREX_ALIGNED_MEMBASE_REG
:
829 decompose_vtype_opt_load_arg (MonoCompile
*cfg
, MonoBasicBlock
*bb
, MonoInst
*ins
, gint32
*sreg_int32
)
831 guint32
*sreg
= (guint32
*)sreg_int32
;
832 MonoInst
*src_var
= get_vreg_to_inst (cfg
, *sreg
);
833 if (src_var
&& src_var
->opcode
== OP_ARG
&& src_var
->klass
&& MONO_CLASS_IS_SIMD (cfg
, src_var
->klass
)) {
834 MonoInst
*varload_ins
, *load_ins
;
835 NEW_VARLOADA (cfg
, varload_ins
, src_var
, src_var
->inst_vtype
);
836 mono_bblock_insert_before_ins (bb
, ins
, varload_ins
);
837 MONO_INST_NEW (cfg
, load_ins
, OP_LOADX_MEMBASE
);
838 load_ins
->klass
= src_var
->klass
;
839 load_ins
->type
= STACK_VTYPE
;
840 load_ins
->sreg1
= varload_ins
->dreg
;
841 load_ins
->dreg
= alloc_xreg (cfg
);
842 mono_bblock_insert_after_ins (bb
, varload_ins
, load_ins
);
843 *sreg
= load_ins
->dreg
;
848 mono_simd_decompose_intrinsic (MonoCompile
*cfg
, MonoBasicBlock
*bb
, MonoInst
*ins
)
850 if (cfg
->opt
& MONO_OPT_SIMD
&& decompose_vtype_opt_uses_simd_intrinsics (cfg
, ins
)) {
851 decompose_vtype_opt_load_arg (cfg
, bb
, ins
, &(ins
->sreg1
));
852 decompose_vtype_opt_load_arg (cfg
, bb
, ins
, &(ins
->sreg2
));
853 decompose_vtype_opt_load_arg (cfg
, bb
, ins
, &(ins
->sreg3
));
854 MonoInst
*dest_var
= get_vreg_to_inst (cfg
, ins
->dreg
);
855 if (dest_var
&& dest_var
->opcode
== OP_ARG
&& dest_var
->klass
&& MONO_CLASS_IS_SIMD (cfg
, dest_var
->klass
)) {
856 MonoInst
*varload_ins
, *store_ins
;
857 ins
->dreg
= alloc_xreg (cfg
);
858 NEW_VARLOADA (cfg
, varload_ins
, dest_var
, dest_var
->inst_vtype
);
859 mono_bblock_insert_after_ins (bb
, ins
, varload_ins
);
860 MONO_INST_NEW (cfg
, store_ins
, OP_STOREX_MEMBASE
);
861 store_ins
->klass
= dest_var
->klass
;
862 store_ins
->type
= STACK_VTYPE
;
863 store_ins
->sreg1
= ins
->dreg
;
864 store_ins
->dreg
= varload_ins
->dreg
;
865 mono_bblock_insert_after_ins (bb
, varload_ins
, store_ins
);
871 mono_simd_decompose_intrinsics (MonoCompile
*cfg
)
876 for (bb
= cfg
->bb_entry
; bb
; bb
= bb
->next_bb
) {
877 for (ins
= bb
->code
; ins
; ins
= ins
->next
) {
878 mono_simd_decompose_intrinsic (cfg
, bb
, ins
);
884 mono_simd_decompose_intrinsic (MonoCompile
*cfg
, MonoBasicBlock
*bb
, MonoInst
*ins
)
889 mono_simd_decompose_intrinsics (MonoCompile
*cfg
)
892 #endif /*defined(TARGET_WIN32) && defined(TARGET_AMD64)*/
895 * This function expect that src be a value.
898 get_simd_vreg (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
*src
)
900 const char *spec
= INS_INFO (src
->opcode
);
902 if (src
->opcode
== OP_XMOVE
) {
904 } else if (spec
[MONO_INST_DEST
] == 'x') {
906 } else if (src
->opcode
== OP_VCALL
|| src
->opcode
== OP_VCALL_MEMBASE
) {
910 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
911 mono_print_ins (src
);
912 g_assert_not_reached ();
916 * This function will load the value if needed.
919 load_simd_vreg_class (MonoCompile
*cfg
, MonoClass
*klass
, MonoInst
*src
, gboolean
*indirect
)
921 const char *spec
= INS_INFO (src
->opcode
);
925 if (src
->opcode
== OP_XMOVE
) {
927 } else if (src
->opcode
== OP_LDADDR
) {
928 int res
= ((MonoInst
*)src
->inst_p0
)->dreg
;
930 } else if (spec
[MONO_INST_DEST
] == 'x') {
932 } else if (src
->type
== STACK_PTR
|| src
->type
== STACK_MP
) {
937 MONO_INST_NEW (cfg
, ins
, OP_LOADX_MEMBASE
);
939 ins
->sreg1
= src
->dreg
;
940 ins
->type
= STACK_VTYPE
;
941 ins
->dreg
= alloc_ireg (cfg
);
942 MONO_ADD_INS (cfg
->cbb
, ins
);
945 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src
->type
);
946 mono_print_ins (src
);
947 g_assert_not_reached ();
951 load_simd_vreg (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
*src
, gboolean
*indirect
)
953 return load_simd_vreg_class (cfg
, cmethod
->klass
, src
, indirect
);
956 /*We share the var with fconv_to_r8_x to save some stack space.*/
958 get_double_spill_area (MonoCompile
*cfg
)
960 if (!cfg
->fconv_to_r8_x_var
) {
961 cfg
->fconv_to_r8_x_var
= mono_compile_create_var (cfg
, m_class_get_byval_arg (mono_defaults
.double_class
), OP_LOCAL
);
962 cfg
->fconv_to_r8_x_var
->flags
|= MONO_INST_VOLATILE
; /*FIXME, use the don't regalloc flag*/
964 return cfg
->fconv_to_r8_x_var
;
967 get_simd_ctor_spill_area (MonoCompile
*cfg
, MonoClass
*avector_klass
)
969 if (!cfg
->simd_ctor_var
) {
970 cfg
->simd_ctor_var
= mono_compile_create_var (cfg
, m_class_get_byval_arg (avector_klass
), OP_LOCAL
);
971 cfg
->simd_ctor_var
->flags
|= MONO_INST_VOLATILE
; /*FIXME, use the don't regalloc flag*/
973 return cfg
->simd_ctor_var
;
977 mono_type_to_expand_op (MonoType
*type
)
979 switch (type
->type
) {
997 g_assert_not_reached ();
1002 type_to_comp_op (MonoType
*t
)
1022 g_assert_not_reached ();
1028 type_to_gt_op (MonoType
*t
)
1045 type_to_padd_op (MonoType
*t
)
1071 type_to_psub_op (MonoType
*t
)
1097 type_to_pmul_op (MonoType
*t
)
1111 /* PMULQ multiplies two 32 bit numbers into a 64 bit one */
1122 type_to_pdiv_op (MonoType
*t
)
1136 type_to_pxor_op (MonoType
*t
)
1139 * These opcodes have the same semantics, but using the
1140 * correctly typed version is better for performance.
1153 type_to_pand_op (MonoType
*t
)
1166 type_to_por_op (MonoType
*t
)
1179 type_to_pmin_op (MonoType
*t
)
1204 type_to_pmax_op (MonoType
*t
)
1229 get_simd_vreg_or_expanded_scalar (MonoCompile
*cfg
, MonoClass
*klass
, MonoType
*param_type
, MonoInst
*src
)
1234 if (m_class_is_simd_type (mono_class_from_mono_type_internal (param_type
)))
1235 return get_simd_vreg (cfg
, NULL
, src
);
1237 expand_op
= mono_type_to_expand_op (param_type
);
1238 MONO_INST_NEW (cfg
, ins
, expand_op
);
1240 ins
->sreg1
= src
->dreg
;
1241 ins
->type
= STACK_VTYPE
;
1242 ins
->dreg
= alloc_ireg (cfg
);
1243 MONO_ADD_INS (cfg
->cbb
, ins
);
1245 if (expand_op
== OP_EXPAND_R4
)
1246 ins
->backend
.spill_var
= mini_get_int_to_float_spill_area (cfg
);
1247 else if (expand_op
== OP_EXPAND_R8
)
1248 ins
->backend
.spill_var
= get_double_spill_area (cfg
);
1254 * simd_intrinsic_emit_binary_op:
1256 * Emit a binary SIMD opcode.
1257 * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
1258 * expanded to the SIMD type.
1261 simd_intrinsic_emit_binary_op (MonoCompile
*cfg
, int opcode
, int flags
, MonoClass
*klass
, MonoType
*lhs_type
, MonoType
*rhs_type
, MonoInst
*lhs
, MonoInst
*rhs
)
1264 int left_vreg
, right_vreg
;
1266 left_vreg
= get_simd_vreg_or_expanded_scalar (cfg
, klass
, lhs_type
, lhs
);
1267 right_vreg
= get_simd_vreg_or_expanded_scalar (cfg
, klass
, rhs_type
, rhs
);
1269 MONO_INST_NEW (cfg
, ins
, opcode
);
1271 ins
->sreg1
= left_vreg
;
1272 ins
->sreg2
= right_vreg
;
1273 ins
->type
= STACK_VTYPE
;
1274 ins
->dreg
= alloc_ireg (cfg
);
1275 ins
->inst_c0
= flags
;
1276 MONO_ADD_INS (cfg
->cbb
, ins
);
1281 simd_intrinsic_emit_binary (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1283 MonoMethodSignature
*sig
= mono_method_signature_internal (cmethod
);
1285 g_assert (sig
->param_count
== 2);
1287 return simd_intrinsic_emit_binary_op (cfg
, intrinsic
->opcode
, intrinsic
->flags
, cmethod
->klass
, sig
->params
[0], sig
->params
[1], args
[0], args
[1]);
1291 simd_intrinsic_emit_unary (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1296 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
1298 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
1299 ins
->klass
= cmethod
->klass
;
1301 ins
->type
= STACK_VTYPE
;
1302 ins
->dreg
= alloc_ireg (cfg
);
1303 MONO_ADD_INS (cfg
->cbb
, ins
);
1308 mono_type_to_extract_op (MonoType
*type
)
1310 switch (type
->type
) {
1312 return OP_EXTRACT_I1
;
1314 return OP_EXTRACT_U1
;
1316 return OP_EXTRACT_I2
;
1318 return OP_EXTRACT_U2
;
1322 return OP_EXTRACT_I4
;
1324 g_assert_not_reached ();
1328 /*Returns the amount to shift the element index to get the dword it belongs to*/
1330 mono_type_elements_shift_bits (MonoType
*type
)
1332 switch (type
->type
) {
1344 g_assert_not_reached ();
1348 static G_GNUC_UNUSED
int
1349 mono_type_to_insert_op (MonoType
*type
)
1351 switch (type
->type
) {
1354 return OP_INSERT_I1
;
1357 return OP_INSERT_I2
;
1360 return OP_INSERT_I4
;
1363 return OP_INSERT_I8
;
1365 return OP_INSERT_R4
;
1367 return OP_INSERT_R8
;
1369 g_assert_not_reached ();
1374 mono_type_to_slow_insert_op (MonoType
*type
)
1376 switch (type
->type
) {
1379 return OP_INSERTX_U1_SLOW
;
1382 return OP_INSERT_I2
;
1385 return OP_INSERTX_I4_SLOW
;
1388 return OP_INSERTX_I8_SLOW
;
1390 return OP_INSERTX_R4_SLOW
;
1392 return OP_INSERTX_R8_SLOW
;
1394 g_assert_not_reached ();
1399 simd_intrinsic_emit_setter (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1402 MonoMethodSignature
*sig
= mono_method_signature_internal (cmethod
);
1407 size
= mono_type_size (sig
->params
[0], &align
);
1409 if (COMPILE_LLVM (cfg
)) {
1410 MONO_INST_NEW (cfg
, ins
, mono_type_to_insert_op (sig
->params
[0]));
1411 ins
->klass
= cmethod
->klass
;
1412 ins
->dreg
= ins
->sreg1
= dreg
= load_simd_vreg (cfg
, cmethod
, args
[0], &indirect
);
1413 ins
->sreg2
= args
[1]->dreg
;
1414 ins
->inst_c0
= intrinsic
->opcode
;
1415 MONO_ADD_INS (cfg
->cbb
, ins
);
1416 } else if (size
== 2 || size
== 4 || size
== 8) {
1417 MONO_INST_NEW (cfg
, ins
, mono_type_to_slow_insert_op (sig
->params
[0]));
1418 ins
->klass
= cmethod
->klass
;
1419 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1420 ins
->dreg
= ins
->sreg1
= dreg
= load_simd_vreg (cfg
, cmethod
, args
[0], &indirect
);
1421 ins
->sreg2
= args
[1]->dreg
;
1422 ins
->inst_c0
= intrinsic
->opcode
;
1423 if (sig
->params
[0]->type
== MONO_TYPE_R4
)
1424 ins
->backend
.spill_var
= mini_get_int_to_float_spill_area (cfg
);
1425 else if (sig
->params
[0]->type
== MONO_TYPE_R8
)
1426 ins
->backend
.spill_var
= get_double_spill_area (cfg
);
1427 MONO_ADD_INS (cfg
->cbb
, ins
);
1431 MONO_INST_NEW (cfg
, ins
, OP_EXTRACTX_U2
);
1432 ins
->klass
= cmethod
->klass
;
1433 ins
->sreg1
= sreg
= dreg
= load_simd_vreg (cfg
, cmethod
, args
[0], &indirect
);
1434 ins
->type
= STACK_I4
;
1435 ins
->dreg
= vreg
= alloc_ireg (cfg
);
1436 ins
->inst_c0
= intrinsic
->opcode
/ 2;
1437 MONO_ADD_INS (cfg
->cbb
, ins
);
1439 MONO_INST_NEW (cfg
, ins
, OP_INSERTX_U1_SLOW
);
1440 ins
->klass
= cmethod
->klass
;
1442 ins
->sreg2
= args
[1]->dreg
;
1444 ins
->inst_c0
= intrinsic
->opcode
;
1445 MONO_ADD_INS (cfg
->cbb
, ins
);
1449 MONO_INST_NEW (cfg
, ins
, OP_STOREX_MEMBASE
);
1450 ins
->klass
= cmethod
->klass
;
1451 ins
->dreg
= args
[0]->dreg
;
1453 MONO_ADD_INS (cfg
->cbb
, ins
);
1459 * simd_intrinsic_emit_getter_op:
1461 * Emit IR for loading an element of a SIMD value.
1463 * @klass is the simd type, @type is the element type.
1466 simd_intrinsic_emit_getter_op (MonoCompile
*cfg
, int index
, MonoClass
*klass
, MonoType
*type
, MonoInst
*arg
)
1469 int vreg
, shift_bits
;
1471 vreg
= load_simd_vreg_class (cfg
, klass
, arg
, NULL
);
1473 if (type
->type
== MONO_TYPE_I8
|| type
->type
== MONO_TYPE_U8
|| type
->type
== MONO_TYPE_R8
) {
1475 gboolean is_r8
= type
->type
== MONO_TYPE_R8
;
1477 MONO_INST_NEW (cfg
, ins
, is_r8
? OP_EXTRACT_R8
: OP_EXTRACT_I8
);
1480 ins
->inst_c0
= index
;
1482 ins
->type
= STACK_R8
;
1483 ins
->dreg
= alloc_freg (cfg
);
1484 ins
->backend
.spill_var
= get_double_spill_area (cfg
);
1486 ins
->type
= STACK_I8
;
1487 ins
->dreg
= alloc_lreg (cfg
);
1489 MONO_ADD_INS (cfg
->cbb
, ins
);
1493 shift_bits
= mono_type_elements_shift_bits (type
);
1495 if ((index
>> shift_bits
) && !cfg
->compile_llvm
) {
1496 MONO_INST_NEW (cfg
, ins
, OP_PSHUFLED
);
1499 ins
->inst_c0
= index
>> shift_bits
;
1500 ins
->type
= STACK_VTYPE
;
1501 ins
->dreg
= vreg
= alloc_ireg (cfg
);
1502 MONO_ADD_INS (cfg
->cbb
, ins
);
1505 MONO_INST_NEW (cfg
, ins
, mono_type_to_extract_op (type
));
1508 ins
->type
= STACK_I4
;
1509 ins
->dreg
= vreg
= alloc_ireg (cfg
);
1510 if (cfg
->compile_llvm
)
1511 ins
->inst_c0
= index
;
1513 ins
->inst_c0
= index
& ((1 << shift_bits
) - 1);
1514 MONO_ADD_INS (cfg
->cbb
, ins
);
1516 if (type
->type
== MONO_TYPE_R4
) {
1517 MONO_INST_NEW (cfg
, ins
, cfg
->r4fp
? OP_ICONV_TO_R4_RAW
: OP_MOVE_I4_TO_F
);
1518 ins
->klass
= mono_defaults
.single_class
;
1520 ins
->type
= cfg
->r4_stack_type
;
1521 ins
->dreg
= alloc_freg (cfg
);
1522 ins
->backend
.spill_var
= mini_get_int_to_float_spill_area (cfg
);
1523 MONO_ADD_INS (cfg
->cbb
, ins
);
1529 simd_intrinsic_emit_getter (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1531 MonoMethodSignature
*sig
= mono_method_signature_internal (cmethod
);
1533 return simd_intrinsic_emit_getter_op (cfg
, intrinsic
->opcode
, cmethod
->klass
, sig
->ret
, args
[0]);
1537 simd_intrinsic_emit_long_getter (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1541 gboolean is_r8
= mono_method_signature_internal (cmethod
)->ret
->type
== MONO_TYPE_R8
;
1543 vreg
= load_simd_vreg (cfg
, cmethod
, args
[0], NULL
);
1545 MONO_INST_NEW (cfg
, ins
, is_r8
? OP_EXTRACT_R8
: OP_EXTRACT_I8
);
1546 ins
->klass
= cmethod
->klass
;
1548 ins
->inst_c0
= intrinsic
->opcode
;
1550 ins
->type
= STACK_R8
;
1551 ins
->dreg
= alloc_freg (cfg
);
1552 ins
->backend
.spill_var
= get_double_spill_area (cfg
);
1554 ins
->type
= STACK_I8
;
1555 ins
->dreg
= alloc_lreg (cfg
);
1557 MONO_ADD_INS (cfg
->cbb
, ins
);
1563 simd_intrinsic_emit_ctor (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1565 MonoInst
*ins
= NULL
;
1567 gboolean is_ldaddr
= (args
[0]->opcode
== OP_LDADDR
&& args
[0]->inst_left
->opcode
!= OP_ARG
);
1568 MonoMethodSignature
*sig
= mono_method_signature_internal (cmethod
);
1569 int store_op
= mono_type_to_store_membase (cfg
, sig
->params
[0]);
1570 int arg_size
= mono_type_size (sig
->params
[0], &i
);
1573 if (sig
->param_count
== 1) {
1577 dreg
= args
[0]->inst_i0
->dreg
;
1578 NULLIFY_INS (args
[0]);
1580 g_assert (args
[0]->type
== STACK_MP
|| args
[0]->type
== STACK_PTR
);
1581 dreg
= alloc_ireg (cfg
);
1585 opcode
= intrinsic
->opcode
;
1587 opcode
= mono_type_to_expand_op (sig
->params
[0]);
1588 MONO_INST_NEW (cfg
, ins
, opcode
);
1589 ins
->klass
= cmethod
->klass
;
1590 ins
->sreg1
= args
[1]->dreg
;
1591 ins
->type
= STACK_VTYPE
;
1593 MONO_ADD_INS (cfg
->cbb
, ins
);
1594 if (sig
->params
[0]->type
== MONO_TYPE_R4
)
1595 ins
->backend
.spill_var
= mini_get_int_to_float_spill_area (cfg
);
1596 else if (sig
->params
[0]->type
== MONO_TYPE_R8
)
1597 ins
->backend
.spill_var
= get_double_spill_area (cfg
);
1600 MONO_INST_NEW (cfg
, ins
, OP_STOREX_MEMBASE
);
1601 ins
->dreg
= args
[0]->dreg
;
1603 MONO_ADD_INS (cfg
->cbb
, ins
);
1609 NEW_VARLOADA (cfg
, ins
, get_simd_ctor_spill_area (cfg
, cmethod
->klass
), &cmethod
->klass
->byref_arg
);
1610 MONO_ADD_INS (cfg
->cbb
, ins
);
1611 addr_reg
= ins
->dreg
;
1613 g_assert (args
[0]->type
== STACK_MP
|| args
[0]->type
== STACK_PTR
);
1614 addr_reg
= args
[0]->dreg
;
1617 for (i
= sig
->param_count
- 1; i
>= 0; --i
) {
1618 EMIT_NEW_STORE_MEMBASE (cfg
, ins
, store_op
, addr_reg
, i
* arg_size
, args
[i
+ 1]->dreg
);
1621 if (sig
->param_count
* arg_size
< 16) {
1622 /* If there are not enough arguments, fill the rest with 0s */
1623 for (i
= sig
->param_count
; i
< 16 / arg_size
; ++i
) {
1626 MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg
, OP_STOREI4_MEMBASE_IMM
, addr_reg
, i
* arg_size
, 0);
1629 g_assert_not_reached ();
1635 if (is_ldaddr
) { /*Eliminate LDADDR if it's initing a local var*/
1636 int vreg
= ((MonoInst
*)args
[0]->inst_p0
)->dreg
;
1637 NULLIFY_INS (args
[0]);
1639 MONO_INST_NEW (cfg
, ins
, OP_LOADX_MEMBASE
);
1640 ins
->klass
= cmethod
->klass
;
1641 ins
->sreg1
= addr_reg
;
1642 ins
->type
= STACK_VTYPE
;
1644 MONO_ADD_INS (cfg
->cbb
, ins
);
1650 simd_intrinsic_emit_cast (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1656 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
1658 if (cmethod
->is_inflated
)
1660 klass
= mono_class_from_mono_type_internal (mono_method_signature_internal (cmethod
)->ret
);
1662 klass
= cmethod
->klass
;
1664 MONO_INST_NEW (cfg
, ins
, OP_XMOVE
);
1666 ins
->type
= STACK_VTYPE
;
1668 ins
->dreg
= alloc_ireg (cfg
);
1669 MONO_ADD_INS (cfg
->cbb
, ins
);
1674 simd_intrinsic_emit_shift (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1677 int vreg
, vreg2
= -1, opcode
= intrinsic
->opcode
;
1679 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
1681 if (args
[1]->opcode
!= OP_ICONST
) {
1682 MONO_INST_NEW (cfg
, ins
, OP_ICONV_TO_X
);
1683 ins
->klass
= mono_defaults
.int32_class
;
1684 ins
->sreg1
= args
[1]->dreg
;
1685 ins
->type
= STACK_I4
;
1686 ins
->dreg
= vreg2
= alloc_ireg (cfg
);
1687 MONO_ADD_INS (cfg
->cbb
, ins
);
1689 ++opcode
; /*The shift_reg version op is always +1 from the regular one.*/
1692 MONO_INST_NEW (cfg
, ins
, opcode
);
1693 ins
->klass
= cmethod
->klass
;
1697 if (args
[1]->opcode
== OP_ICONST
) {
1698 ins
->inst_imm
= args
[1]->inst_c0
;
1699 NULLIFY_INS (args
[1]);
1702 ins
->type
= STACK_VTYPE
;
1703 ins
->dreg
= alloc_ireg (cfg
);
1704 MONO_ADD_INS (cfg
->cbb
, ins
);
1708 static inline gboolean
1709 mono_op_is_packed_compare (int op
)
1711 return op
>= OP_PCMPEQB
&& op
<= OP_PCMPEQQ
;
1715 simd_intrinsic_emit_equality_op (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
, int opcode
, int flags
)
1718 int left_vreg
, right_vreg
, tmp_vreg
;
1720 left_vreg
= load_simd_vreg (cfg
, cmethod
, args
[0], NULL
);
1721 right_vreg
= get_simd_vreg (cfg
, cmethod
, args
[1]);
1723 MONO_INST_NEW (cfg
, ins
, opcode
);
1724 ins
->klass
= cmethod
->klass
;
1725 ins
->sreg1
= left_vreg
;
1726 ins
->sreg2
= right_vreg
;
1727 ins
->type
= STACK_VTYPE
;
1728 ins
->klass
= cmethod
->klass
;
1729 ins
->dreg
= tmp_vreg
= alloc_ireg (cfg
);
1730 ins
->inst_c0
= flags
;
1731 MONO_ADD_INS (cfg
->cbb
, ins
);
1733 /*FIXME the next ops are SSE specific*/
1734 MONO_INST_NEW (cfg
, ins
, OP_EXTRACT_MASK
);
1735 ins
->klass
= cmethod
->klass
;
1736 ins
->sreg1
= tmp_vreg
;
1737 ins
->type
= STACK_I4
;
1738 ins
->dreg
= tmp_vreg
= alloc_ireg (cfg
);
1739 MONO_ADD_INS (cfg
->cbb
, ins
);
1741 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1742 if (mono_op_is_packed_compare (opcode
) || flags
== SIMD_COMP_EQ
) {
1743 MONO_EMIT_NEW_BIALU_IMM (cfg
, OP_COMPARE_IMM
, -1, tmp_vreg
, 0xFFFF);
1744 NEW_UNALU (cfg
, ins
, flags
== SIMD_COMP_EQ
? OP_CEQ
: OP_CLT_UN
, tmp_vreg
, -1);
1746 MONO_EMIT_NEW_BIALU_IMM (cfg
, OP_COMPARE_IMM
, -1, tmp_vreg
, 0);
1747 NEW_UNALU (cfg
, ins
, OP_CGT_UN
, tmp_vreg
, -1);
1749 MONO_ADD_INS (cfg
->cbb
, ins
);
1754 simd_intrinsic_emit_equality (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1756 return simd_intrinsic_emit_equality_op (cfg
, cmethod
, args
, intrinsic
->opcode
, intrinsic
->flags
);
1760 simd_intrinsic_emit_shuffle (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1763 int vreg
, vreg2
= -1;
1764 int param_count
= mono_method_signature_internal (cmethod
)->param_count
;
1766 if (args
[param_count
- 1]->opcode
!= OP_ICONST
) {
1767 /*TODO Shuffle with non literals is not yet supported */
1771 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
1772 if (param_count
== 3)
1773 vreg2
= get_simd_vreg (cfg
, cmethod
, args
[1]);
1775 NULLIFY_INS (args
[param_count
- 1]);
1778 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
1779 ins
->klass
= cmethod
->klass
;
1782 ins
->inst_c0
= args
[param_count
- 1]->inst_c0
;
1783 ins
->type
= STACK_VTYPE
;
1784 ins
->dreg
= alloc_ireg (cfg
);
1785 MONO_ADD_INS (cfg
->cbb
, ins
);
1787 if (param_count
== 3 && ins
->opcode
== OP_PSHUFLED
)
1788 ins
->opcode
= OP_SHUFPS
;
1793 simd_intrinsic_emit_load_aligned (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1797 MONO_INST_NEW (cfg
, ins
, OP_LOADX_ALIGNED_MEMBASE
);
1798 ins
->klass
= cmethod
->klass
;
1799 ins
->sreg1
= args
[0]->dreg
;
1800 ins
->type
= STACK_VTYPE
;
1801 ins
->dreg
= alloc_ireg (cfg
);
1802 MONO_ADD_INS (cfg
->cbb
, ins
);
1807 simd_intrinsic_emit_store (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1812 vreg
= get_simd_vreg (cfg
, cmethod
, args
[1]);
1814 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
1815 ins
->klass
= cmethod
->klass
;
1816 ins
->dreg
= args
[0]->dreg
;
1818 ins
->type
= STACK_VTYPE
;
1819 MONO_ADD_INS (cfg
->cbb
, ins
);
1824 simd_intrinsic_emit_extract_mask (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1829 vreg
= get_simd_vreg (cfg
, cmethod
, args
[0]);
1831 MONO_INST_NEW (cfg
, ins
, OP_EXTRACT_MASK
);
1832 ins
->klass
= cmethod
->klass
;
1834 ins
->type
= STACK_I4
;
1835 ins
->dreg
= alloc_ireg (cfg
);
1836 MONO_ADD_INS (cfg
->cbb
, ins
);
1842 simd_intrinsic_emit_prefetch (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1846 MONO_INST_NEW (cfg
, ins
, OP_PREFETCH_MEMBASE
);
1847 ins
->klass
= cmethod
->klass
;
1848 ins
->sreg1
= args
[0]->dreg
;
1849 ins
->backend
.arg_info
= intrinsic
->flags
;
1850 MONO_ADD_INS (cfg
->cbb
, ins
);
1855 simd_intrinsic_emit_const (const SimdIntrinsic
*intrinsic
, MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoInst
**args
)
1859 MONO_INST_NEW (cfg
, ins
, intrinsic
->opcode
);
1860 ins
->klass
= cmethod
->klass
;
1861 ins
->type
= STACK_VTYPE
;
1862 ins
->dreg
= alloc_xreg (cfg
);
1863 MONO_ADD_INS (cfg
->cbb
, ins
);
1868 simd_version_name (guint32 version
)
1871 case SIMD_VERSION_SSE1
:
1873 case SIMD_VERSION_SSE2
:
1875 case SIMD_VERSION_SSE3
:
1877 case SIMD_VERSION_SSSE3
:
1879 case SIMD_VERSION_SSE41
:
1881 case SIMD_VERSION_SSE42
:
1883 case SIMD_VERSION_SSE4a
:
1890 emit_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
, const SimdIntrinsic
*intrinsics
, guint32 size
)
1892 const SimdIntrinsic
*result
= (const SimdIntrinsic
*)mono_binary_search (cmethod
->name
, intrinsics
, size
, sizeof (SimdIntrinsic
), &simd_intrinsic_compare_by_name
);
1894 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", m_class_get_name (cmethod
->klass
), cmethod
->name
, fsig
->param_count
));
1897 if (IS_DEBUG_ON (cfg
)) {
1899 printf ("found call to intrinsic %s::%s/%d -> %s\n", m_class_get_name (cmethod
->klass
), cmethod
->name
, fsig
->param_count
, method_name (result
->name
));
1900 max
= fsig
->param_count
+ fsig
->hasthis
;
1901 for (i
= 0; i
< max
; ++i
) {
1902 printf ("param %d: ", i
);
1903 mono_print_ins (args
[i
]);
1906 if (result
->simd_version_flags
&& !(result
->simd_version_flags
& simd_supported_versions
)) {
1907 if (IS_DEBUG_ON (cfg
)) {
1909 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", m_class_get_name (cmethod
->klass
), cmethod
->name
, fsig
->param_count
);
1910 for (x
= 1; x
<= SIMD_VERSION_INDEX_END
; x
++)
1911 if (result
->simd_version_flags
& (1 << x
))
1912 printf ("%s ", simd_version_name (1 << x
));
1919 switch (result
->simd_emit_mode
) {
1920 case SIMD_EMIT_BINARY
:
1921 return simd_intrinsic_emit_binary (result
, cfg
, cmethod
, args
);
1922 case SIMD_EMIT_UNARY
:
1923 return simd_intrinsic_emit_unary (result
, cfg
, cmethod
, args
);
1924 case SIMD_EMIT_SETTER
:
1925 return simd_intrinsic_emit_setter (result
, cfg
, cmethod
, args
);
1926 case SIMD_EMIT_GETTER
:
1927 return simd_intrinsic_emit_getter (result
, cfg
, cmethod
, args
);
1928 case SIMD_EMIT_GETTER_QWORD
:
1929 return simd_intrinsic_emit_long_getter (result
, cfg
, cmethod
, args
);
1930 case SIMD_EMIT_CTOR
:
1931 return simd_intrinsic_emit_ctor (result
, cfg
, cmethod
, args
);
1932 case SIMD_EMIT_CAST
:
1933 return simd_intrinsic_emit_cast (result
, cfg
, cmethod
, args
);
1934 case SIMD_EMIT_SHUFFLE
:
1935 return simd_intrinsic_emit_shuffle (result
, cfg
, cmethod
, args
);
1936 case SIMD_EMIT_SHIFT
:
1937 return simd_intrinsic_emit_shift (result
, cfg
, cmethod
, args
);
1938 case SIMD_EMIT_EQUALITY
:
1939 return simd_intrinsic_emit_equality (result
, cfg
, cmethod
, args
);
1940 case SIMD_EMIT_LOAD_ALIGNED
:
1941 return simd_intrinsic_emit_load_aligned (result
, cfg
, cmethod
, args
);
1942 case SIMD_EMIT_STORE
:
1943 return simd_intrinsic_emit_store (result
, cfg
, cmethod
, args
);
1944 case SIMD_EMIT_EXTRACT_MASK
:
1945 return simd_intrinsic_emit_extract_mask (result
, cfg
, cmethod
, args
);
1946 case SIMD_EMIT_PREFETCH
:
1947 return simd_intrinsic_emit_prefetch (result
, cfg
, cmethod
, args
);
1949 g_assert_not_reached ();
1953 mono_emit_vector_ldelema (MonoCompile
*cfg
, MonoType
*array_type
, MonoInst
*arr
, MonoInst
*index
, gboolean check_bounds
)
1957 int mult_reg
, add_reg
, array_reg
, index_reg
, index2_reg
, index3_reg
;
1959 size
= mono_array_element_size (mono_class_from_mono_type_internal (array_type
));
1960 mult_reg
= alloc_preg (cfg
);
1961 array_reg
= arr
->dreg
;
1962 index_reg
= index
->dreg
;
1964 #if TARGET_SIZEOF_VOID_P == 8
1965 /* The array reg is 64 bits but the index reg is only 32 */
1966 index2_reg
= alloc_preg (cfg
);
1967 MONO_EMIT_NEW_UNALU (cfg
, OP_SEXT_I4
, index2_reg
, index_reg
);
1969 index2_reg
= index_reg
;
1971 index3_reg
= alloc_preg (cfg
);
1974 MONO_EMIT_BOUNDS_CHECK (cfg
, array_reg
, MonoArray
, max_length
, index2_reg
);
1975 MONO_EMIT_NEW_BIALU_IMM (cfg
, OP_PADD_IMM
, index3_reg
, index2_reg
, 16 / size
- 1);
1976 MONO_EMIT_BOUNDS_CHECK (cfg
, array_reg
, MonoArray
, max_length
, index3_reg
);
1979 add_reg
= alloc_preg (cfg
);
1981 MONO_EMIT_NEW_BIALU_IMM (cfg
, OP_MUL_IMM
, mult_reg
, index2_reg
, size
);
1982 MONO_EMIT_NEW_BIALU (cfg
, OP_PADD
, add_reg
, array_reg
, mult_reg
);
1983 NEW_BIALU_IMM (cfg
, ins
, OP_PADD_IMM
, add_reg
, add_reg
, MONO_STRUCT_OFFSET (MonoArray
, vector
));
1984 ins
->type
= STACK_PTR
;
1985 MONO_ADD_INS (cfg
->cbb
, ins
);
1991 emit_array_extension_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
1993 if ((!strcmp ("GetVector", cmethod
->name
) || !strcmp ("GetVectorAligned", cmethod
->name
)) && fsig
->param_count
== 2) {
1995 int addr
= mono_emit_vector_ldelema (cfg
, fsig
->params
[0], args
[0], args
[1], TRUE
);
1997 MONO_INST_NEW (cfg
, load
, !strcmp ("GetVectorAligned", cmethod
->name
) ? OP_LOADX_ALIGNED_MEMBASE
: OP_LOADX_MEMBASE
);
1998 load
->klass
= cmethod
->klass
;
2000 load
->type
= STACK_VTYPE
;
2001 load
->dreg
= alloc_ireg (cfg
);
2002 MONO_ADD_INS (cfg
->cbb
, load
);
2006 if ((!strcmp ("SetVector", cmethod
->name
) || !strcmp ("SetVectorAligned", cmethod
->name
)) && fsig
->param_count
== 3) {
2008 int vreg
= get_simd_vreg (cfg
, cmethod
, args
[1]);
2009 int addr
= mono_emit_vector_ldelema (cfg
, fsig
->params
[0], args
[0], args
[2], TRUE
);
2011 MONO_INST_NEW (cfg
, store
, !strcmp ("SetVectorAligned", cmethod
->name
) ? OP_STOREX_ALIGNED_MEMBASE_REG
: OP_STOREX_MEMBASE
);
2012 store
->klass
= cmethod
->klass
;
2014 store
->sreg1
= vreg
;
2015 MONO_ADD_INS (cfg
->cbb
, store
);
2019 if (!strcmp ("IsAligned", cmethod
->name
) && fsig
->param_count
== 2) {
2021 int addr
= mono_emit_vector_ldelema (cfg
, fsig
->params
[0], args
[0], args
[1], FALSE
);
2023 MONO_EMIT_NEW_BIALU_IMM (cfg
, OP_AND_IMM
, addr
, addr
, 15);
2024 MONO_EMIT_NEW_BIALU_IMM (cfg
, OP_COMPARE_IMM
, -1, addr
, 0);
2025 NEW_UNALU (cfg
, ins
, OP_CEQ
, addr
, -1);
2026 MONO_ADD_INS (cfg
->cbb
, ins
);
2034 emit_simd_runtime_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
2036 if (!strcmp ("get_AccelMode", cmethod
->name
) && fsig
->param_count
== 0) {
2038 EMIT_NEW_ICONST (cfg
, ins
, simd_supported_versions
);
2045 is_sys_numerics_assembly (MonoAssembly
*assembly
)
2047 return !strcmp ("System.Numerics", assembly
->aname
.name
);
2051 is_sys_numerics_vectors_assembly (MonoAssembly
*assembly
)
2053 return !strcmp ("System.Numerics.Vectors", assembly
->aname
.name
);
2057 mono_emit_simd_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
2059 const char *class_name
;
2060 MonoInst
*simd_inst
= NULL
;
2062 if (is_sys_numerics_assembly (m_class_get_image (cmethod
->klass
)->assembly
)) {
2063 simd_inst
= emit_sys_numerics_intrinsics (cfg
, cmethod
, fsig
, args
);
2067 if (is_sys_numerics_vectors_assembly (m_class_get_image (cmethod
->klass
)->assembly
)) {
2068 simd_inst
= emit_sys_numerics_vectors_intrinsics (cfg
, cmethod
, fsig
, args
);
2072 if (strcmp ("Mono.Simd", m_class_get_image (cmethod
->klass
)->assembly
->aname
.name
) ||
2073 strcmp ("Mono.Simd", m_class_get_name_space (cmethod
->klass
))) {
2077 class_name
= m_class_get_name (cmethod
->klass
);
2078 if (!strcmp ("SimdRuntime", class_name
)) {
2079 simd_inst
= emit_simd_runtime_intrinsics (cfg
, cmethod
, fsig
, args
);
2083 if (!strcmp ("ArrayExtensions", class_name
)) {
2084 simd_inst
= emit_array_extension_intrinsics (cfg
, cmethod
, fsig
, args
);
2088 if (!strcmp ("VectorOperations", class_name
)) {
2089 if (!(cmethod
->flags
& METHOD_ATTRIBUTE_STATIC
))
2091 class_name
= m_class_get_name (mono_class_from_mono_type_internal (mono_method_signature_internal (cmethod
)->params
[0]));
2092 } else if (!m_class_is_simd_type (cmethod
->klass
))
2095 cfg
->uses_simd_intrinsics
|= MONO_CFG_USES_SIMD_INTRINSICS_SIMPLIFY_INDIRECTION
;
2096 if (!strcmp ("Vector2d", class_name
)) {
2097 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector2d_intrinsics
, sizeof (vector2d_intrinsics
) / sizeof (SimdIntrinsic
));
2100 if (!strcmp ("Vector4f", class_name
)) {
2101 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector4f_intrinsics
, sizeof (vector4f_intrinsics
) / sizeof (SimdIntrinsic
));
2104 if (!strcmp ("Vector2ul", class_name
)) {
2105 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector2ul_intrinsics
, sizeof (vector2ul_intrinsics
) / sizeof (SimdIntrinsic
));
2108 if (!strcmp ("Vector2l", class_name
)) {
2109 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector2l_intrinsics
, sizeof (vector2l_intrinsics
) / sizeof (SimdIntrinsic
));
2112 if (!strcmp ("Vector4ui", class_name
)) {
2113 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector4ui_intrinsics
, sizeof (vector4ui_intrinsics
) / sizeof (SimdIntrinsic
));
2116 if (!strcmp ("Vector4i", class_name
)) {
2117 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector4i_intrinsics
, sizeof (vector4i_intrinsics
) / sizeof (SimdIntrinsic
));
2120 if (!strcmp ("Vector8us", class_name
)) {
2121 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector8us_intrinsics
, sizeof (vector8us_intrinsics
) / sizeof (SimdIntrinsic
));
2124 if (!strcmp ("Vector8s", class_name
)) {
2125 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector8s_intrinsics
, sizeof (vector8s_intrinsics
) / sizeof (SimdIntrinsic
));
2128 if (!strcmp ("Vector16b", class_name
)) {
2129 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector16b_intrinsics
, sizeof (vector16b_intrinsics
) / sizeof (SimdIntrinsic
));
2132 if (!strcmp ("Vector16sb", class_name
)) {
2133 simd_inst
= emit_intrinsics (cfg
, cmethod
, fsig
, args
, vector16sb_intrinsics
, sizeof (vector16sb_intrinsics
) / sizeof (SimdIntrinsic
));
2138 if (simd_inst
!= NULL
) {
2139 cfg
->uses_simd_intrinsics
|= MONO_CFG_USES_SIMD_INTRINSICS
;
2140 cfg
->uses_simd_intrinsics
|= MONO_CFG_USES_SIMD_INTRINSICS_DECOMPOSE_VTYPE
;
2147 assert_handled (MonoCompile
*cfg
, MonoMethod
*method
)
2149 MonoCustomAttrInfo
*cattr
;
2152 if (cfg
->verbose_level
> 1) {
2153 cattr
= mono_custom_attrs_from_method_checked (method
, error
);
2156 gboolean has_attr
= FALSE
;
2157 for (int i
= 0; i
< cattr
->num_attrs
; ++i
)
2158 if (cattr
->attrs
[i
].ctor
&& (!strcmp (m_class_get_name (cattr
->attrs
[i
].ctor
->klass
), "JitIntrinsicAttribute")))
2161 printf ("SIMD intrinsic unhandled: %s\n", mono_method_get_name_full (method
, TRUE
, TRUE
, MONO_TYPE_NAME_FORMAT_IL
));
2163 //g_assert_not_reached ();
2165 mono_custom_attrs_free (cattr
);
2170 // The entries should be ordered by name
2171 // System.Numerics.Vector2/Vector3/Vector4
2172 static const SimdIntrinsic vector2_intrinsics
[] = {
2173 { SN_ctor
, OP_EXPAND_R4
},
2175 { SN_Dot
, OP_DPPS
},
2176 { SN_Equals
, OP_COMPPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_EQUALITY
, SIMD_COMP_EQ
},
2177 { SN_Max
, OP_MAXPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
2178 { SN_Min
, OP_MINPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
2179 { SN_SquareRoot
, OP_SQRTPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_UNARY
},
2180 { SN_op_Addition
, OP_ADDPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
2181 { SN_op_Division
, OP_DIVPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
2182 { SN_op_Multiply
, OP_MULPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
2183 { SN_op_Subtraction
, OP_SUBPS
, SIMD_VERSION_SSE1
, SIMD_EMIT_BINARY
},
2187 emit_vector_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
2189 const SimdIntrinsic
*intrins
;
2190 MonoMethodSignature
*sig
= mono_method_signature_internal (cmethod
);
2191 MonoType
*type
= m_class_get_byval_arg (cmethod
->klass
);
2193 if (!m_class_is_simd_type (cmethod
->klass
))
2197 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
2199 intrins
= (const SimdIntrinsic
*)mono_binary_search (cmethod
->name
, vector2_intrinsics
, sizeof (vector2_intrinsics
) / sizeof (SimdIntrinsic
), sizeof (SimdIntrinsic
), &simd_intrinsic_compare_by_name
);
2201 assert_handled (cfg
, cmethod
);
2205 if (cfg
->verbose_level
> 1) {
2206 char *name
= mono_method_full_name (cmethod
, TRUE
);
2207 printf (" SIMD intrinsic %s\n", name
);
2211 switch (intrins
->name
) {
2213 gboolean match
= TRUE
;
2214 for (int i
= 0; i
< fsig
->param_count
; ++i
)
2215 if (fsig
->params
[i
]->type
!= MONO_TYPE_R4
)
2219 return simd_intrinsic_emit_ctor (intrins
, cfg
, cmethod
, args
);
2222 if (!(fsig
->param_count
== 1 && fsig
->ret
->type
== MONO_TYPE_BOOLEAN
&& fsig
->params
[0] == type
))
2224 return simd_intrinsic_emit_equality (intrins
, cfg
, cmethod
, args
);
2226 if (!(fsig
->param_count
== 1 && fsig
->ret
== type
&& fsig
->params
[0] == type
))
2228 return simd_intrinsic_emit_unary (intrins
, cfg
, cmethod
, args
);
2230 if (!(fsig
->param_count
== 2 && fsig
->ret
->type
== MONO_TYPE_R4
&& fsig
->params
[0] == type
&& fsig
->params
[1] == type
))
2232 if (COMPILE_LLVM (cfg
)) {
2235 ins
= simd_intrinsic_emit_binary (intrins
, cfg
, cmethod
, args
);
2236 /* The end result is in the lowest element */
2237 return simd_intrinsic_emit_getter_op (cfg
, 0, cmethod
->klass
, mono_method_signature_internal (cmethod
)->ret
, ins
);
2241 // abs(x) = max(x, sub(0,x))
2245 if (!(fsig
->param_count
== 1 && fsig
->ret
== type
&& fsig
->params
[0] == type
))
2248 MONO_INST_NEW (cfg
, zero
, OP_XZERO
);
2249 zero
->dreg
= alloc_xreg (cfg
);
2250 zero
->klass
= cmethod
->klass
;
2251 MONO_ADD_INS (cfg
->cbb
, zero
);
2253 sub
= simd_intrinsic_emit_binary_op (cfg
, OP_SUBPS
, 0, cmethod
->klass
, sig
->params
[0], sig
->params
[0], zero
, args
[0]);
2254 return simd_intrinsic_emit_binary_op (cfg
, OP_MAXPS
, 0, cmethod
->klass
, sig
->params
[0], sig
->params
[0], args
[0], sub
);
2258 case SN_op_Addition
:
2259 case SN_op_Division
:
2260 case SN_op_Multiply
:
2261 case SN_op_Subtraction
:
2262 if (!(fsig
->param_count
== 2 && fsig
->ret
== type
&& (fsig
->params
[0] == type
|| fsig
->params
[0]->type
== MONO_TYPE_R4
) && (fsig
->params
[1] == type
|| fsig
->params
[1]->type
== MONO_TYPE_R4
)))
2264 return simd_intrinsic_emit_binary (intrins
, cfg
, cmethod
, args
);
2269 assert_handled (cfg
, cmethod
);
2271 if (cfg
->verbose_level
> 1) {
2272 char *name
= mono_method_full_name (cmethod
, TRUE
);
2273 printf (" SIMD method %s not handled.\n", name
);
2280 emit_vector_is_hardware_accelerated_intrinsic (MonoCompile
*cfg
)
2284 if (simd_supported_versions
)
2285 EMIT_NEW_ICONST (cfg
, ins
, 1);
2287 EMIT_NEW_ICONST (cfg
, ins
, 0);
2288 ins
->type
= STACK_I4
;
2292 /* These should be ordered by name */
2293 static const SimdIntrinsic vector_t_intrinsics
[] = {
2299 { SN_GreaterThanOrEqual
},
2301 { SN_LessThanOrEqual
},
2304 { SN_get_AllOnes
, OP_XONES
},
2307 { SN_get_Zero
, OP_XZERO
},
2309 { SN_op_BitwiseAnd
},
2310 { SN_op_BitwiseOr
},
2312 { SN_op_ExclusiveOr
},
2315 { SN_op_Subtraction
}
2319 emit_vector_t_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
2321 const SimdIntrinsic
*intrins
;
2322 MonoType
*type
, *etype
;
2324 int size
, len
, index
;
2326 intrins
= (const SimdIntrinsic
*)mono_binary_search (cmethod
->name
, vector_t_intrinsics
, sizeof (vector_t_intrinsics
) / sizeof (SimdIntrinsic
), sizeof (SimdIntrinsic
), &simd_intrinsic_compare_by_name
);
2328 assert_handled (cfg
, cmethod
);
2332 type
= m_class_get_byval_arg (cmethod
->klass
);
2333 etype
= mono_class_get_context (cmethod
->klass
)->class_inst
->type_argv
[0];
2334 size
= mono_class_value_size (mono_class_from_mono_type_internal (etype
), NULL
);
2338 if (!MONO_TYPE_IS_PRIMITIVE (etype
))
2341 if (cfg
->verbose_level
> 1) {
2342 char *name
= mono_method_full_name (cmethod
, TRUE
);
2343 printf (" SIMD intrinsic %s\n", name
);
2347 switch (intrins
->name
) {
2349 if (!(fsig
->param_count
== 0 && fsig
->ret
->type
== MONO_TYPE_I4
))
2351 EMIT_NEW_ICONST (cfg
, ins
, len
);
2353 case SN_get_AllOnes
:
2355 if (!(fsig
->param_count
== 0 && mono_metadata_type_equal (fsig
->ret
, type
)))
2357 return simd_intrinsic_emit_const (intrins
, cfg
, cmethod
, args
);
2359 g_assert (fsig
->param_count
== 1);
2360 if (args
[1]->opcode
!= OP_ICONST
)
2362 index
= args
[1]->inst_c0
;
2363 if (index
< 0 || index
>= len
)
2365 return simd_intrinsic_emit_getter_op (cfg
, index
, cmethod
->klass
, etype
, args
[0]);
2367 if (fsig
->param_count
== 1 && mono_metadata_type_equal (fsig
->params
[0], etype
))
2368 return simd_intrinsic_emit_ctor (NULL
, cfg
, cmethod
, args
);
2369 if ((fsig
->param_count
== 1 || fsig
->param_count
== 2) && (fsig
->params
[0]->type
== MONO_TYPE_SZARRAY
)) {
2370 MonoInst
*array_ins
= args
[1];
2371 MonoInst
*index_ins
;
2372 MonoInst
*ldelema_ins
;
2376 if (args
[0]->opcode
!= OP_LDADDR
)
2379 /* .ctor (T[]) or .ctor (T[], index) */
2381 if (fsig
->param_count
== 2) {
2382 index_ins
= args
[2];
2384 EMIT_NEW_ICONST (cfg
, index_ins
, 0);
2387 /* Emit index check for the end (index + len - 1 < array length) */
2388 end_index_reg
= alloc_ireg (cfg
);
2389 EMIT_NEW_BIALU_IMM (cfg
, ins
, OP_IADD_IMM
, end_index_reg
, index_ins
->dreg
, len
- 1);
2390 MONO_EMIT_BOUNDS_CHECK (cfg
, array_ins
->dreg
, MonoArray
, max_length
, end_index_reg
);
2392 /* Load the array slice into the simd reg */
2393 ldelema_ins
= mini_emit_ldelema_1_ins (cfg
, mono_class_from_mono_type_internal (etype
), array_ins
, index_ins
, TRUE
);
2394 g_assert (args
[0]->opcode
== OP_LDADDR
);
2395 var
= (MonoInst
*)args
[0]->inst_p0
;
2396 EMIT_NEW_LOAD_MEMBASE (cfg
, ins
, OP_LOADX_MEMBASE
, var
->dreg
, ldelema_ins
->dreg
, 0);
2397 ins
->klass
= cmethod
->klass
;
2401 case SN_op_Explicit
:
2402 return simd_intrinsic_emit_cast (intrins
, cfg
, cmethod
, args
);
2404 if (fsig
->param_count
== 1 && fsig
->ret
->type
== MONO_TYPE_BOOLEAN
&& mono_metadata_type_equal (fsig
->params
[0], type
))
2405 return simd_intrinsic_emit_equality_op (cfg
, cmethod
, args
, type_to_comp_op (etype
), SIMD_COMP_EQ
);
2406 if (fsig
->param_count
== 2 && mono_metadata_type_equal (fsig
->ret
, type
) && mono_metadata_type_equal (fsig
->params
[0], type
) && mono_metadata_type_equal (fsig
->params
[1], type
))
2407 return simd_intrinsic_emit_binary_op (cfg
, type_to_comp_op (etype
), 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[0], args
[1]);
2410 case SN_GreaterThan
:
2411 case SN_GreaterThanOrEqual
:
2413 case SN_LessThanOrEqual
: {
2414 MonoInst
*cmp1
, *cmp2
;
2417 switch (etype
->type
) {
2427 eq_op
= type_to_comp_op (etype
);
2428 gt_op
= type_to_gt_op (etype
);
2430 switch (intrins
->name
) {
2431 case SN_GreaterThan
:
2432 return simd_intrinsic_emit_binary_op (cfg
, gt_op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[0], args
[1]);
2434 return simd_intrinsic_emit_binary_op (cfg
, gt_op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[1], args
[0]);
2435 case SN_LessThanOrEqual
:
2436 cmp1
= simd_intrinsic_emit_binary_op (cfg
, eq_op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[1], args
[0]);
2437 cmp2
= simd_intrinsic_emit_binary_op (cfg
, gt_op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[1], args
[0]);
2438 return simd_intrinsic_emit_binary_op (cfg
, OP_POR
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], cmp1
, cmp2
);
2439 case SN_GreaterThanOrEqual
:
2440 cmp1
= simd_intrinsic_emit_binary_op (cfg
, eq_op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[0], args
[1]);
2441 cmp2
= simd_intrinsic_emit_binary_op (cfg
, gt_op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], args
[0], args
[1]);
2442 return simd_intrinsic_emit_binary_op (cfg
, OP_POR
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[1], cmp1
, cmp2
);
2444 g_assert_not_reached ();
2450 switch (etype
->type
) {
2454 case MONO_TYPE_U8
: {
2458 MONO_INST_NEW (cfg
, ins
, OP_XMOVE
);
2459 ins
->klass
= cmethod
->klass
;
2460 ins
->type
= STACK_VTYPE
;
2461 ins
->sreg1
= args
[0]->dreg
;
2462 ins
->dreg
= alloc_xreg (cfg
);
2463 MONO_ADD_INS (cfg
->cbb
, ins
);
2470 case SN_op_Addition
:
2471 case SN_op_Subtraction
:
2472 case SN_op_Multiply
:
2473 case SN_op_Division
:
2474 case SN_op_ExclusiveOr
:
2475 case SN_op_BitwiseAnd
:
2476 case SN_op_BitwiseOr
:
2479 if (!(fsig
->param_count
== 2 && mono_metadata_type_equal (fsig
->ret
, fsig
->params
[0]) && mono_metadata_type_equal (fsig
->params
[0], fsig
->params
[1])))
2482 switch (intrins
->name
) {
2483 case SN_op_Addition
:
2484 op
= type_to_padd_op (etype
);
2486 case SN_op_Subtraction
:
2487 op
= type_to_psub_op (etype
);
2489 case SN_op_Multiply
:
2490 op
= type_to_pmul_op (etype
);
2492 case SN_op_Division
:
2493 op
= type_to_pdiv_op (etype
);
2495 case SN_op_ExclusiveOr
:
2496 op
= type_to_pxor_op (etype
);
2498 case SN_op_BitwiseAnd
:
2499 op
= type_to_pand_op (etype
);
2501 case SN_op_BitwiseOr
:
2502 op
= type_to_por_op (etype
);
2505 op
= type_to_pmin_op (etype
);
2508 op
= type_to_pmax_op (etype
);
2511 g_assert_not_reached ();
2514 return simd_intrinsic_emit_binary_op (cfg
, op
, 0, cmethod
->klass
, fsig
->params
[0], fsig
->params
[0], args
[0], args
[1]);
2518 MonoInst
*array_ins
= args
[1];
2519 MonoInst
*index_ins
= args
[2];
2520 MonoInst
*ldelema_ins
;
2524 if (args
[0]->opcode
!= OP_LDADDR
)
2527 /* Emit index check for the end (index + len - 1 < array length) */
2528 end_index_reg
= alloc_ireg (cfg
);
2529 EMIT_NEW_BIALU_IMM (cfg
, ins
, OP_IADD_IMM
, end_index_reg
, index_ins
->dreg
, len
- 1);
2531 int length_reg
= alloc_ireg (cfg
);
2532 MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg
, OP_LOADI4_MEMBASE
, length_reg
, array_ins
->dreg
, MONO_STRUCT_OFFSET (MonoArray
, max_length
));
2533 MONO_EMIT_NEW_BIALU (cfg
, OP_COMPARE
, -1, length_reg
, end_index_reg
);
2534 MONO_EMIT_NEW_COND_EXC (cfg
, LE_UN
, "ArgumentException");
2536 /* Load the simd reg into the array slice */
2537 ldelema_ins
= mini_emit_ldelema_1_ins (cfg
, mono_class_from_mono_type_internal (etype
), array_ins
, index_ins
, TRUE
);
2538 g_assert (args
[0]->opcode
== OP_LDADDR
);
2539 var
= (MonoInst
*)args
[0]->inst_p0
;
2540 EMIT_NEW_STORE_MEMBASE (cfg
, ins
, OP_STOREX_MEMBASE
, ldelema_ins
->dreg
, 0, var
->dreg
);
2541 ins
->klass
= cmethod
->klass
;
2549 assert_handled (cfg
, cmethod
);
2551 if (cfg
->verbose_level
> 1) {
2552 char *name
= mono_method_full_name (cmethod
, TRUE
);
2553 printf (" SIMD method %s not handled.\n", name
);
2561 * emit_sys_numerics_intrinsics:
2563 * Emit intrinsics for the System.Numerics assembly.
2566 emit_sys_numerics_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
2568 const char *nspace
= m_class_get_name_space (cmethod
->klass
);
2569 const char *class_name
= m_class_get_name (cmethod
->klass
);
2571 if (!strcmp ("Vector2", class_name
) || !strcmp ("Vector4", class_name
) || !strcmp ("Vector3", class_name
))
2572 return emit_vector_intrinsics (cfg
, cmethod
, fsig
, args
);
2574 if (!strcmp ("System.Numerics", nspace
) && !strcmp ("Vector", class_name
)) {
2575 if (!strcmp (cmethod
->name
, "get_IsHardwareAccelerated"))
2576 return emit_vector_is_hardware_accelerated_intrinsic (cfg
);
2583 emit_sys_numerics_vectors_intrinsics (MonoCompile
*cfg
, MonoMethod
*cmethod
, MonoMethodSignature
*fsig
, MonoInst
**args
)
2585 const char *nspace
= m_class_get_name_space (cmethod
->klass
);
2586 const char *class_name
= m_class_get_name (cmethod
->klass
);
2588 if (!strcmp (class_name
, "Vector`1"))
2589 return emit_vector_t_intrinsics (cfg
, cmethod
, fsig
, args
);
2591 if (!strcmp ("System.Numerics", nspace
) && !strcmp ("Vector", class_name
)) {
2592 if (!strcmp (cmethod
->name
, "get_IsHardwareAccelerated"))
2593 return emit_vector_is_hardware_accelerated_intrinsic (cfg
);
2600 mono_emit_simd_field_load (MonoCompile
*cfg
, MonoClassField
*field
, MonoInst
*addr
)
2602 MonoInst
* simd_inst
= NULL
;
2604 if (is_sys_numerics_assembly (m_class_get_image (field
->parent
)->assembly
)) {
2607 const char *parent_name
= m_class_get_name (field
->parent
);
2608 if (!strcmp (parent_name
, "Vector2") ||
2609 !strcmp (parent_name
, "Vector3") ||
2610 !strcmp (parent_name
, "Vector4")) {
2611 if (!strcmp (field
->name
, "X"))
2613 else if (!strcmp (field
->name
, "Y"))
2615 else if (!strcmp (field
->name
, "Z"))
2617 else if (!strcmp (field
->name
, "W"))
2622 if (cfg
->verbose_level
> 1)
2623 printf (" SIMD intrinsic field access: %s\n", field
->name
);
2625 simd_inst
= simd_intrinsic_emit_getter_op (cfg
, index
, field
->parent
, mono_field_get_type_internal (field
), addr
);
2632 if (simd_inst
!= NULL
) {
2633 cfg
->uses_simd_intrinsics
|= MONO_CFG_USES_SIMD_INTRINSICS
;
2634 cfg
->uses_simd_intrinsics
|= MONO_CFG_USES_SIMD_INTRINSICS_DECOMPOSE_VTYPE
;
2640 #endif /* DISABLE_JIT */
2641 #endif /* MONO_ARCH_SIMD_INTRINSICS */