/*---------------------------------------------------------------*/
/*--- begin                            host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2010-2017 OpenWorks GbR
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/

/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd128.h"


/* Primitive helpers always take args of the real type (signed vs
   unsigned) but return an unsigned result, so there's no conversion
   weirdness when stuffing results back in the V128 union fields,
   which are all unsigned. */
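/* Illustrative note (added remark, not in the original source): for
   example, max8S(-2, 1) below compares its arguments using signed
   semantics and returns the winner re-presented as the unsigned value
   0x01, which can then be stored directly into a V128.w8[] lane. */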
static inline UInt mul32 ( Int xx, Int yy )
{
   Long t = ((Long)xx) * ((Long)yy);
   return toUInt(t);
}

static inline UInt max32S ( Int xx, Int yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32S ( Int xx, Int yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UInt max32U ( UInt xx, UInt yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32U ( UInt xx, UInt yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UShort max16U ( UShort xx, UShort yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UShort min16U ( UShort xx, UShort yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar max8S ( Char xx, Char yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline UChar min8S ( Char xx, Char yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline ULong cmpEQ64 ( Long xx, Long yy )
{
   return (((Long)xx) == ((Long)yy))
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong cmpGT64S ( Long xx, Long yy )
{
   return (((Long)xx) > ((Long)yy))
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong sar64 ( ULong v, UInt n )
{
   return ((Long)v) >> n;
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}
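/* Added remark: in C, right-shifting a negative signed value gives an
   implementation-defined result; sar64 and sar8 assume the usual
   sign-extending behaviour, so e.g. sar8(0x80, 1) treats 0x80 as -128
   and yields 0xC0 (-64). */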
static inline UShort qnarrow32Sto16U ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < 0)     xx = 0;
   if (xx > 65535) xx = 65535;
   return (UShort)xx;
}
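/* Illustrative values (added remark): qnarrow32Sto16U(0xFFFFFFFF)
   reinterprets its argument as -1 and saturates to 0, while
   qnarrow32Sto16U(0x00010000), i.e. 65536, saturates to 65535. */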
static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}


void VEX_REGPARM(3)
     h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
{
   res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (Sar64x2, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
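/* Added remark: for example, h_generic_calc_SarN64x2 below masks nn
   with 63, so a (nominally erroneous) shift amount of 64 would behave
   like a shift of 0 rather than triggering C-level undefined
   behaviour in the scalar shift. */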
void /*not-regparm*/
     h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 64); */
   nn &= 63;
   res->w64[0] = sar64(argL->w64[0], nn);
   res->w64[1] = sar64(argL->w64[1], nn);
}

void /*not-regparm*/
     h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 8); */
   nn &= 7;
   res->w8[ 0] = sar8(argL->w8[ 0], nn);
   res->w8[ 1] = sar8(argL->w8[ 1], nn);
   res->w8[ 2] = sar8(argL->w8[ 2], nn);
   res->w8[ 3] = sar8(argL->w8[ 3], nn);
   res->w8[ 4] = sar8(argL->w8[ 4], nn);
   res->w8[ 5] = sar8(argL->w8[ 5], nn);
   res->w8[ 6] = sar8(argL->w8[ 6], nn);
   res->w8[ 7] = sar8(argL->w8[ 7], nn);
   res->w8[ 8] = sar8(argL->w8[ 8], nn);
   res->w8[ 9] = sar8(argL->w8[ 9], nn);
   res->w8[10] = sar8(argL->w8[10], nn);
   res->w8[11] = sar8(argL->w8[11], nn);
   res->w8[12] = sar8(argL->w8[12], nn);
   res->w8[13] = sar8(argL->w8[13], nn);
   res->w8[14] = sar8(argL->w8[14], nn);
   res->w8[15] = sar8(argL->w8[15], nn);
}

void VEX_REGPARM(3)
     h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
                                           V128* argL, V128* argR )
{
   res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
   res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
   res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
   res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
   res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
   res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
   res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
   res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w8[ 0] = narrow16to8(argR->w16[0]);
   res->w8[ 1] = narrow16to8(argR->w16[1]);
   res->w8[ 2] = narrow16to8(argR->w16[2]);
   res->w8[ 3] = narrow16to8(argR->w16[3]);
   res->w8[ 4] = narrow16to8(argR->w16[4]);
   res->w8[ 5] = narrow16to8(argR->w16[5]);
   res->w8[ 6] = narrow16to8(argR->w16[6]);
   res->w8[ 7] = narrow16to8(argR->w16[7]);
   res->w8[ 8] = narrow16to8(argL->w16[0]);
   res->w8[ 9] = narrow16to8(argL->w16[1]);
   res->w8[10] = narrow16to8(argL->w16[2]);
   res->w8[11] = narrow16to8(argL->w16[3]);
   res->w8[12] = narrow16to8(argL->w16[4]);
   res->w8[13] = narrow16to8(argL->w16[5]);
   res->w8[14] = narrow16to8(argL->w16[6]);
   res->w8[15] = narrow16to8(argL->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w16[0] = narrow32to16(argR->w32[0]);
   res->w16[1] = narrow32to16(argR->w32[1]);
   res->w16[2] = narrow32to16(argR->w32[2]);
   res->w16[3] = narrow32to16(argR->w32[3]);
   res->w16[4] = narrow32to16(argL->w32[0]);
   res->w16[5] = narrow32to16(argL->w32[1]);
   res->w16[6] = narrow32to16(argL->w32[2]);
   res->w16[7] = narrow32to16(argL->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
}
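/* Illustrative note (added): only the low two bits of each argR lane
   are used as a selector, so with argR->w32[] = {3,3,0,0} the result
   lanes are copies of argL lanes 3, 3, 0 and 0 respectively. */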
//void VEX_REGPARM(3)
//     h_generic_calc_PermOrZero8x16 ( /*OUT*/V128* res,
//                                     V128* argL, V128* argR )
//{
//   for (UInt i = 0; i < 16; i++) {
//      UChar ix = argR->w8[i];
//      Char zeroingMask = (Char)ix;
//      zeroingMask ^= 0x80;
//      zeroingMask >>= 7;
//      ix &= 15;
//      res->w8[i] = (argL->w8[ix] & zeroingMask) & 0xFF;
//   }
//}

UInt /*not-regparm*/
     h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
{
   /* Some serious bit twiddling going on here.  Mostly we can do it in
      parallel for the upper and lower 64 bits, assuming the processor offers
      a suitably high level of ILP. */
   w64hi &= 0x8080808080808080ULL;
   w64lo &= 0x8080808080808080ULL;
   w64hi >>= 7;
   w64lo >>= 7;
   w64hi |= (w64hi >> 7);
   w64lo |= (w64lo >> 7);
   w64hi |= (w64hi >> 14);
   w64lo |= (w64lo >> 14);
   w64hi |= (w64hi >> 28);
   w64lo |= (w64lo >> 28);
   UInt r = ((w64hi & 0xFF) << 8) | (w64lo & 0xFF);
   return r;
}
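/* Worked example (added, not in the original source): the function
   collects the top bit of each of the 16 bytes into a 16-bit result,
   with bytes of w64lo mapping to result bits 0..7 and bytes of w64hi
   to bits 8..15.  E.g. with w64hi == 0 and
   w64lo == 0x8000000000000080ULL (MSBs set only in bytes 0 and 7),
   the mask/shift/or cascade leaves 0x81 in the low byte of w64lo, and
   the function returns 0x0081. */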
/*---------------------------------------------------------------*/
/*--- end                              host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/