/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2010-2017 OpenWorks GbR

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/

/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd128.h"

/* Primitive helpers always take args of the real type (signed vs
   unsigned) but return an unsigned result, so there's no conversion
   weirdness when stuffing results back in the V128 union fields,
   which are all unsigned. */

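/* mul32: form the full signed 64-bit product and return just its low
   32 bits, which is all the Mul32x4 lanes below need. */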
static inline UInt mul32 ( Int xx, Int yy )
{
   Long t = ((Long)xx) * ((Long)yy);
   return toUInt(t);
}

static inline UInt max32S ( Int xx, Int yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32S ( Int xx, Int yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UInt max32U ( UInt xx, UInt yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32U ( UInt xx, UInt yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UShort max16U ( UShort xx, UShort yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UShort min16U ( UShort xx, UShort yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar max8S ( Char xx, Char yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline UChar min8S ( Char xx, Char yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

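/* The 64-bit comparisons return an all-ones (0xFF..FF) lane when the
   condition holds and an all-zeroes lane when it does not, matching the
   usual SIMD compare-result convention. */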
static inline ULong cmpEQ64 ( Long xx, Long yy )
{
   return (((Long)xx) == ((Long)yy))
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong cmpGT64S ( Long xx, Long yy )
{
   return (((Long)xx) > ((Long)yy))
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

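/* Arithmetic right shifts: the casts to the signed type make ">>" shift in
   copies of the sign bit.  (Strictly, right-shifting a negative value is
   implementation-defined in ISO C, but the compilers used to build Valgrind
   perform the arithmetic shift intended here.) */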
static inline ULong sar64 ( ULong v, UInt n )
{
   return ((Long)v) >> n;
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

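/* Narrowing helpers.  qnarrow32Sto16U treats its argument as a signed 32-bit
   value and saturates it into the unsigned 16-bit range [0, 65535];
   narrow32to16 and narrow16to8 simply truncate to the low half. */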
static inline UShort qnarrow32Sto16U ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < 0)     xx = 0;
   if (xx > 65535) xx = 65535;
   return (UShort)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}


void VEX_REGPARM(3)
     h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
{
   res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (Sar64x2, etc) it is an error if in
   fact we are ever given an out-of-range shift amount. */

void VEX_REGPARM(3)
     h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn )
{
   /* vassert(nn < 64); */
   nn &= 63;
   res->w64[0] = sar64(argL->w64[0], nn);
   res->w64[1] = sar64(argL->w64[1], nn);
}

void VEX_REGPARM(3)
     h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   res->w8[ 0] = sar8(argL->w8[ 0], nn);
   res->w8[ 1] = sar8(argL->w8[ 1], nn);
   res->w8[ 2] = sar8(argL->w8[ 2], nn);
   res->w8[ 3] = sar8(argL->w8[ 3], nn);
   res->w8[ 4] = sar8(argL->w8[ 4], nn);
   res->w8[ 5] = sar8(argL->w8[ 5], nn);
   res->w8[ 6] = sar8(argL->w8[ 6], nn);
   res->w8[ 7] = sar8(argL->w8[ 7], nn);
   res->w8[ 8] = sar8(argL->w8[ 8], nn);
   res->w8[ 9] = sar8(argL->w8[ 9], nn);
   res->w8[10] = sar8(argL->w8[10], nn);
   res->w8[11] = sar8(argL->w8[11], nn);
   res->w8[12] = sar8(argL->w8[12], nn);
   res->w8[13] = sar8(argL->w8[13], nn);
   res->w8[14] = sar8(argL->w8[14], nn);
   res->w8[15] = sar8(argL->w8[15], nn);
}

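/* In the narrowing binops below, argR's lanes are narrowed into the low
   half of the result and argL's lanes into the high half. */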
void VEX_REGPARM(3)
     h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
                                           V128* argL, V128* argR )
{
   res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
   res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
   res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
   res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
   res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
   res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
   res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
   res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w8[ 0] = narrow16to8(argR->w16[0]);
   res->w8[ 1] = narrow16to8(argR->w16[1]);
   res->w8[ 2] = narrow16to8(argR->w16[2]);
   res->w8[ 3] = narrow16to8(argR->w16[3]);
   res->w8[ 4] = narrow16to8(argR->w16[4]);
   res->w8[ 5] = narrow16to8(argR->w16[5]);
   res->w8[ 6] = narrow16to8(argR->w16[6]);
   res->w8[ 7] = narrow16to8(argR->w16[7]);
   res->w8[ 8] = narrow16to8(argL->w16[0]);
   res->w8[ 9] = narrow16to8(argL->w16[1]);
   res->w8[10] = narrow16to8(argL->w16[2]);
   res->w8[11] = narrow16to8(argL->w16[3]);
   res->w8[12] = narrow16to8(argL->w16[4]);
   res->w8[13] = narrow16to8(argL->w16[5]);
   res->w8[14] = narrow16to8(argL->w16[6]);
   res->w8[15] = narrow16to8(argL->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w16[0] = narrow32to16(argR->w32[0]);
   res->w16[1] = narrow32to16(argR->w32[1]);
   res->w16[2] = narrow32to16(argR->w32[2]);
   res->w16[3] = narrow32to16(argR->w32[3]);
   res->w16[4] = narrow32to16(argL->w32[0]);
   res->w16[5] = narrow32to16(argL->w32[1]);
   res->w16[6] = narrow32to16(argL->w32[2]);
   res->w16[7] = narrow32to16(argL->w32[3]);
}

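/* Perm32x4: each result lane i is a copy of whichever argL lane is selected
   by the low two bits of argR's lane i; the selector's remaining bits are
   ignored. */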
void VEX_REGPARM(3)
     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
}

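/* The PermOrZero8x16 variant below is currently commented out.  Its
   zeroingMask trick: when bit 7 of the selector byte is set, the
   xor-then-arithmetic-shift sequence yields 0x00 and the lane is zeroed;
   otherwise it yields 0xFF and the selected argL byte passes through. */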
//void VEX_REGPARM(3)
//     h_generic_calc_PermOrZero8x16 ( /*OUT*/V128* res,
//                                     V128* argL, V128* argR )
//{
//   for (UInt i = 0; i < 16; i++) {
//      UChar ix = argR->w8[i];
//      Char zeroingMask = (Char)ix;
//      zeroingMask ^= 0x80;
//      zeroingMask >>= 7;
//      ix &= 15;
//      res->w8[i] = (argL->w8[ix] & zeroingMask) & 0xFF;
//   }
//}

UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
{
   /* Some serious bit twiddling going on here.  Mostly we can do it in
      parallel for the upper and lower 64 bits, assuming the processor offers
      a suitably high level of ILP. */
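   /* Keep only the MSB of each byte, move it to the bottom of its byte, then
      fold the eight per-byte bits of each 64-bit half into that half's low
      byte with a shift-and-OR cascade (7, 14, 28).  The high half's byte
      ends up in bits 15:8 of the result and the low half's in bits 7:0. */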
   w64hi &= 0x8080808080808080ULL;
   w64lo &= 0x8080808080808080ULL;
   w64hi >>= 7;
   w64lo >>= 7;
   w64hi |= (w64hi >> 7);
   w64lo |= (w64lo >> 7);
   w64hi |= (w64hi >> 14);
   w64lo |= (w64lo >> 14);
   w64hi |= (w64hi >> 28);
   w64lo |= (w64lo >> 28);
   UInt r = ((w64hi & 0xFF) << 8) | (w64lo & 0xFF);
   return r;
}

/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/