/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2010-2017 OpenWorks GbR

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/

/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd128.h"

/* Primitive helpers always take args of the real type (signed vs
   unsigned) but return an unsigned result, so there's no conversion
   weirdness when stuffing results back in the V128 union fields,
   which are all unsigned. */

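/* mul32: form the full signed 64-bit product and return just its low
   32 bits, which is all the Mul32x4 lanes below need. */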
static inline UInt mul32 ( Int xx, Int yy )
{
   Long t = ((Long)xx) * ((Long)yy);
   return toUInt(t);
}

static inline UInt max32S ( Int xx, Int yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32S ( Int xx, Int yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UInt max32U ( UInt xx, UInt yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32U ( UInt xx, UInt yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UShort max16U ( UShort xx, UShort yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UShort min16U ( UShort xx, UShort yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar max8S ( Char xx, Char yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline UChar min8S ( Char xx, Char yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

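/* The 64-bit comparisons return an all-ones (0xFF..FF) lane when the
   condition holds and an all-zeroes lane when it does not, matching the
   usual SIMD compare-result convention. */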
static inline ULong cmpEQ64 ( Long xx, Long yy )
{
   return (((Long)xx) == ((Long)yy))
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong cmpGT64S ( Long xx, Long yy )
{
   return (((Long)xx) > ((Long)yy))
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

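/* Arithmetic right shifts: the casts to the signed type make ">>" shift in
   copies of the sign bit.  (Strictly, right-shifting a negative value is
   implementation-defined in ISO C, but the compilers used to build Valgrind
   perform the arithmetic shift intended here.) */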
static inline ULong sar64 ( ULong v, UInt n )
{
   return ((Long)v) >> n;
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

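/* Narrowing helpers.  qnarrow32Sto16U treats its argument as a signed 32-bit
   value and saturates it into the unsigned 16-bit range [0, 65535];
   narrow32to16 and narrow16to8 simply truncate to the low half. */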
static inline UShort qnarrow32Sto16U ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < 0)     xx = 0;
   if (xx > 65535) xx = 65535;
   return (UShort)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}


void VEX_REGPARM(3)
     h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
{
   res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (Sar64x2, etc) it is an error if in
   fact we are ever given an out-of-range shift amount. */

void VEX_REGPARM(3)
     h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn )
{
   /* vassert(nn < 64); */
   nn &= 63;
   res->w64[0] = sar64(argL->w64[0], nn);
   res->w64[1] = sar64(argL->w64[1], nn);
}

void VEX_REGPARM(3)
     h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   res->w8[ 0] = sar8(argL->w8[ 0], nn);
   res->w8[ 1] = sar8(argL->w8[ 1], nn);
   res->w8[ 2] = sar8(argL->w8[ 2], nn);
   res->w8[ 3] = sar8(argL->w8[ 3], nn);
   res->w8[ 4] = sar8(argL->w8[ 4], nn);
   res->w8[ 5] = sar8(argL->w8[ 5], nn);
   res->w8[ 6] = sar8(argL->w8[ 6], nn);
   res->w8[ 7] = sar8(argL->w8[ 7], nn);
   res->w8[ 8] = sar8(argL->w8[ 8], nn);
   res->w8[ 9] = sar8(argL->w8[ 9], nn);
   res->w8[10] = sar8(argL->w8[10], nn);
   res->w8[11] = sar8(argL->w8[11], nn);
   res->w8[12] = sar8(argL->w8[12], nn);
   res->w8[13] = sar8(argL->w8[13], nn);
   res->w8[14] = sar8(argL->w8[14], nn);
   res->w8[15] = sar8(argL->w8[15], nn);
}

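/* In the narrowing binops below, argR's lanes are narrowed into the low
   half of the result and argL's lanes into the high half. */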
void VEX_REGPARM(3)
     h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
                                           V128* argL, V128* argR )
{
   res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
   res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
   res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
   res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
   res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
   res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
   res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
   res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w8[ 0] = narrow16to8(argR->w16[0]);
   res->w8[ 1] = narrow16to8(argR->w16[1]);
   res->w8[ 2] = narrow16to8(argR->w16[2]);
   res->w8[ 3] = narrow16to8(argR->w16[3]);
   res->w8[ 4] = narrow16to8(argR->w16[4]);
   res->w8[ 5] = narrow16to8(argR->w16[5]);
   res->w8[ 6] = narrow16to8(argR->w16[6]);
   res->w8[ 7] = narrow16to8(argR->w16[7]);
   res->w8[ 8] = narrow16to8(argL->w16[0]);
   res->w8[ 9] = narrow16to8(argL->w16[1]);
   res->w8[10] = narrow16to8(argL->w16[2]);
   res->w8[11] = narrow16to8(argL->w16[3]);
   res->w8[12] = narrow16to8(argL->w16[4]);
   res->w8[13] = narrow16to8(argL->w16[5]);
   res->w8[14] = narrow16to8(argL->w16[6]);
   res->w8[15] = narrow16to8(argL->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w16[0] = narrow32to16(argR->w32[0]);
   res->w16[1] = narrow32to16(argR->w32[1]);
   res->w16[2] = narrow32to16(argR->w32[2]);
   res->w16[3] = narrow32to16(argR->w32[3]);
   res->w16[4] = narrow32to16(argL->w32[0]);
   res->w16[5] = narrow32to16(argL->w32[1]);
   res->w16[6] = narrow32to16(argL->w32[2]);
   res->w16[7] = narrow32to16(argL->w32[3]);
}

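/* Perm32x4: each result lane i is a copy of whichever argL lane is selected
   by the low two bits of argR's lane i; the selector's remaining bits are
   ignored. */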
void VEX_REGPARM(3)
     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
}

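/* The PermOrZero8x16 variant below is currently commented out.  Its
   zeroingMask trick: when bit 7 of the selector byte is set, the
   xor-then-arithmetic-shift sequence yields 0x00 and the lane is zeroed;
   otherwise it yields 0xFF and the selected argL byte passes through. */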
//void VEX_REGPARM(3)
//     h_generic_calc_PermOrZero8x16 ( /*OUT*/V128* res,
//                                     V128* argL, V128* argR )
//{
//   for (UInt i = 0; i < 16; i++) {
//      UChar ix = argR->w8[i];
//      Char zeroingMask = (Char)ix;
//      zeroingMask ^= 0x80;
//      zeroingMask >>= 7;
//      ix &= 15;
//      res->w8[i] = (argL->w8[ix] & zeroingMask) & 0xFF;
//   }
//}

UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
{
   /* Some serious bit twiddling going on here.  Mostly we can do it in
      parallel for the upper and lower 64 bits, assuming the processor offers
      a suitably high level of ILP. */
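   /* Keep only the MSB of each byte, move it to the bottom of its byte, then
      fold the eight per-byte bits of each 64-bit half into that half's low
      byte with a shift-and-OR cascade (7, 14, 28).  The high half's byte
      ends up in bits 15:8 of the result and the low half's in bits 7:0. */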
   w64hi &= 0x8080808080808080ULL;
   w64lo &= 0x8080808080808080ULL;
   w64hi >>= 7;
   w64lo >>= 7;
   w64hi |= (w64hi >> 7);
   w64lo |= (w64lo >> 7);
   w64hi |= (w64hi >> 14);
   w64lo |= (w64lo >> 14);
   w64hi |= (w64hi >> 28);
   w64lo |= (w64lo >> 28);
   UInt r = ((w64hi & 0xFF) << 8) | (w64lo & 0xFF);
   return r;
}

/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/