gcc/config/powerpcspe/si2vmx.h

   1 /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
   2    Copyright (C) 2007-2017 Free Software Foundation, Inc.
   3
   4    This file is free software; you can redistribute it and/or modify it under
   5    the terms of the GNU General Public License as published by the Free
   6    Software Foundation; either version 3 of the License, or (at your option)
   7    any later version.
   8
   9    This file is distributed in the hope that it will be useful, but WITHOUT
  10    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12    for more details.
  13
  14    Under Section 7 of GPL version 3, you are granted additional
  15    permissions described in the GCC Runtime Library Exception, version
  16    3.1, as published by the Free Software Foundation.
  17
  18    You should have received a copy of the GNU General Public License and
  19    a copy of the GCC Runtime Library Exception along with this program;
  20    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  21    <http://www.gnu.org/licenses/>.  */
  22
  23 #ifndef _SI2VMX_H_
  24 #define _SI2VMX_H_      1
  25
  26 #ifndef __SPU__
  27
  28 #include <stdlib.h>
  29 #include <vec_types.h>
  30
  31
  32 /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
  33  * Users can override the action by defining it prior to including this
  34  * header file.
  35  */
  36 #ifndef SPU_HALT_ACTION
  37 #define SPU_HALT_ACTION         abort()
  38 #endif
  39
  40 /* Specify a default stop action for the spu_stop intrinsic.
  41  * Users can override the action by defining it prior to including this
  42  * header file.
  43  */
  44 #ifndef SPU_STOP_ACTION
  45 #define SPU_STOP_ACTION         abort()
  46 #endif
  47
  48
  49 /* Specify a default action for unsupported intrinsic.
  50  * Users can override the action by defining it prior to including this
  51  * header file.
  52  */
  53 #ifndef SPU_UNSUPPORTED_ACTION
  54 #define SPU_UNSUPPORTED_ACTION  abort()
  55 #endif
  56
  57
  58 /* Casting intrinsics - from scalar to quadword
  59  */
  60
  61 static __inline qword si_from_uchar(unsigned char c) {
  62   union {
  63     qword q;
  64     unsigned char c[16];
  65   } x;
  66   x.c[3] = c;
  67   return (x.q);
  68 }
  69
  70 static __inline qword si_from_char(signed char c) {
  71   union {
  72     qword q;
  73     signed char c[16];
  74   } x;
  75   x.c[3] = c;
  76   return (x.q);
  77 }
  78
  79 static __inline qword si_from_ushort(unsigned short s) {
  80   union {
  81     qword q;
  82     unsigned short s[8];
  83   } x;
  84   x.s[1] = s;
  85   return (x.q);
  86 }
  87
  88 static __inline qword si_from_short(short s) {
  89   union {
  90     qword q;
  91     short s[8];
  92   } x;
  93   x.s[1] = s;
  94   return (x.q);
  95 }
  96
  97
  98 static __inline qword si_from_uint(unsigned int i) {
  99   union {
 100     qword q;
 101     unsigned int i[4];
 102   } x;
 103   x.i[0] = i;
 104   return (x.q);
 105 }
 106
 107 static __inline qword si_from_int(int i) {
 108   union {
 109     qword q;
 110     int i[4];
 111   } x;
 112   x.i[0] = i;
 113   return (x.q);
 114 }
 115
 116 static __inline qword si_from_ullong(unsigned long long l) {
 117   union {
 118     qword q;
 119     unsigned long long l[2];
 120   } x;
 121   x.l[0] = l;
 122   return (x.q);
 123 }
 124
 125 static __inline qword si_from_llong(long long l) {
 126   union {
 127     qword q;
 128     long long l[2];
 129   } x;
 130   x.l[0] = l;
 131   return (x.q);
 132 }
 133
 134 static __inline qword si_from_float(float f) {
 135   union {
 136     qword q;
 137     float f[4];
 138   } x;
 139   x.f[0] = f;
 140   return (x.q);
 141 }
 142
 143 static __inline qword si_from_double(double d) {
 144   union {
 145     qword q;
 146     double d[2];
 147   } x;
 148   x.d[0] = d;
 149   return (x.q);
 150 }
 151
 152 static __inline qword si_from_ptr(void *ptr) {
 153   union {
 154     qword q;
 155     void *p;
 156   } x;
 157   x.p = ptr;
 158   return (x.q);
 159 }
 160
 161
 162 /* Casting intrinsics - from quadword to scalar
 163  */
 164 static __inline unsigned char si_to_uchar(qword q) {
 165   union {
 166     qword q;
 167     unsigned char c[16];
 168   } x;
 169   x.q = q;
 170   return (x.c[3]);
 171 }
 172
 173 static __inline signed char si_to_char(qword q) {
 174   union {
 175     qword q;
 176     signed char c[16];
 177   } x;
 178   x.q = q;
 179   return (x.c[3]);
 180 }
 181
 182 static __inline unsigned short si_to_ushort(qword q) {
 183   union {
 184     qword q;
 185     unsigned short s[8];
 186   } x;
 187   x.q = q;
 188   return (x.s[1]);
 189 }
 190
 191 static __inline short si_to_short(qword q) {
 192   union {
 193     qword q;
 194     short s[8];
 195   } x;
 196   x.q = q;
 197   return (x.s[1]);
 198 }
 199
 200 static __inline unsigned int si_to_uint(qword q) {
 201   union {
 202     qword q;
 203     unsigned int i[4];
 204   } x;
 205   x.q = q;
 206   return (x.i[0]);
 207 }
 208
 209 static __inline int si_to_int(qword q) {
 210   union {
 211     qword q;
 212     int i[4];
 213   } x;
 214   x.q = q;
 215   return (x.i[0]);
 216 }
 217
 218 static __inline unsigned long long si_to_ullong(qword q) {
 219   union {
 220     qword q;
 221     unsigned long long l[2];
 222   } x;
 223   x.q = q;
 224   return (x.l[0]);
 225 }
 226
 227 static __inline long long si_to_llong(qword q) {
 228   union {
 229     qword q;
 230     long long l[2];
 231   } x;
 232   x.q = q;
 233   return (x.l[0]);
 234 }
 235
 236 static __inline float si_to_float(qword q) {
 237   union {
 238     qword q;
 239     float f[4];
 240   } x;
 241   x.q = q;
 242   return (x.f[0]);
 243 }
 244
 245 static __inline double si_to_double(qword q) {
 246   union {
 247     qword q;
 248     double d[2];
 249   } x;
 250   x.q = q;
 251   return (x.d[0]);
 252 }
 253
 254 static __inline void * si_to_ptr(qword q) {
 255   union {
 256     qword q;
 257     void *p;
 258   } x;
 259   x.q = q;
 260   return (x.p);
 261 }
 262
 263
 264 /* Absolute difference
 265  */
 266 static __inline qword si_absdb(qword a, qword b)
 267 {
 268   vec_uchar16 ac, bc, dc;
 269
 270   ac = (vec_uchar16)(a);
 271   bc = (vec_uchar16)(b);
 272   dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
 273
 274   return ((qword)(dc));
 275 }
 276
 277 /* Add intrinsics
 278  */
 279 #define si_a(_a, _b)            ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))
 280
 281 #define si_ah(_a, _b)           ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
 282
 283 static __inline qword si_ai(qword a, int b)
 284 {
 285   return ((qword)(vec_add((vec_int4)(a),
 286                           vec_splat((vec_int4)(si_from_int(b)), 0))));
 287 }
 288
 289
 290 static __inline qword si_ahi(qword a, short b)
 291 {
 292   return ((qword)(vec_add((vec_short8)(a),
 293                           vec_splat((vec_short8)(si_from_short(b)), 1))));
 294 }
 295
 296
 297 #define si_fa(_a, _b)   ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
 298
 299
 300 static __inline qword si_dfa(qword a, qword b)
 301 {
 302   union {
 303     vec_double2 v;
 304     double d[2];
 305   } ad, bd, dd;
 306
 307   ad.v = (vec_double2)(a);
 308   bd.v = (vec_double2)(b);
 309   dd.d[0] = ad.d[0] + bd.d[0];
 310   dd.d[1] = ad.d[1] + bd.d[1];
 311
 312   return ((qword)(dd.v));
 313 }
 314
 315 /* Add word extended
 316  */
 317 #define si_addx(_a, _b, _c)     ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),     \
 318                                                  vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
 319
 320
 321 /* Bit-wise AND
 322  */
 323 #define si_and(_a, _b)          ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
 324
 325
 326 static __inline qword si_andbi(qword a, signed char b)
 327 {
 328   return ((qword)(vec_and((vec_char16)(a),
 329                           vec_splat((vec_char16)(si_from_char(b)), 3))));
 330 }
 331
 332 static __inline qword si_andhi(qword a, signed short b)
 333 {
 334   return ((qword)(vec_and((vec_short8)(a),
 335                           vec_splat((vec_short8)(si_from_short(b)), 1))));
 336 }
 337
 338
 339 static __inline qword si_andi(qword a, signed int b)
 340 {
 341   return ((qword)(vec_and((vec_int4)(a),
 342                           vec_splat((vec_int4)(si_from_int(b)), 0))));
 343 }
 344
 345
 346 /* Bit-wise AND with complement
 347  */
 348 #define si_andc(_a, _b)         ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))
 349
 350
 351 /* Average byte vectors
 352  */
 353 #define si_avgb(_a, _b)         ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))
 354
 355
 356 /* Branch indirect and set link on external data
 357  */
 358 #define si_bisled(_func)        /* not mappable */
 359 #define si_bisledd(_func)       /* not mappable */
 360 #define si_bislede(_func)       /* not mappable */
 361
 362
 363 /* Borrow generate
 364  */
 365 #define si_bg(_a, _b)           ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))
 366
 367 #define si_bgx(_a, _b, _c)      ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),            \
 368                                                         vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)),    \
 369                                                                 (vec_uint4)(_c))), vec_splat_u32(1))))
 370
 371 /* Compare absolute equal
 372  */
 373 static __inline qword si_fcmeq(qword a, qword b)
 374 {
 375   vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
 376
 377   return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
 378                                   vec_andc((vec_float4)(b), msb))));
 379 }
 380
  /* si_dfcmeq: compare the magnitudes of the doubles held in a and b for
   * equality.
   *
   * Each operand is handled as four 32-bit words, a high and a low word
   * per double.  The sign bit of every high word is cleared, the words
   * are compared, and the per-double verdict is collected in the high
   * word.  A double of a that satisfies the NaN test below forces a zero
   * result for that double; b gets no NaN test of its own here.  The
   * final permute copies word 0 into words 0-1 and word 2 into words 2-3.
   */
  381 static __inline qword si_dfcmeq(qword a, qword b)
  382 {
  383   vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  384   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  385   vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};
  386
  387   vec_uint4 biteq;
  388   vec_uint4 aabs;
  389   vec_uint4 babs;
  390   vec_uint4 a_gt;
  391   vec_uint4 ahi_inf;
  392   vec_uint4 anan;
  393   vec_uint4 result;
  394
        /* x.v is the shift operand handed to vec_slo below.  Only element 3
           of the int view is assigned; the other elements stay unset.  */
  395   union {
  396     vec_uchar16 v;
  397     int i[4];
  398   } x;
  399
  400   /* Shift 4 bytes  */
  401   x.i[3] = 4 << 3;
  402
  403   /*  Mask out sign bits */
  404   aabs = vec_and((vec_uint4)a,sign_mask);
  405   babs = vec_and((vec_uint4)b,sign_mask);
  406
  407   /*  A)  Check for bit equality, store in high word */
  408   biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
        /* AND each high-word verdict with the low-word verdict shifted up
           beside it, so the high word is set only if both words matched.  */
  409   biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
  410
  411   /*
  412       B)  Check if a is NaN, store in high word
  413
  414       B1) If the high word is greater than max_exp (indicates a NaN)
  415       B2) If the low word is greater than 0
  416   */
  417   a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
  418
  419   /*  B3) Check if the high word is equal to the inf exponent */
  420   ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
  421
  422   /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
  423   anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
  424
  425   /*  result = A and not B  */
  426   result = vec_andc(biteq, anan);
  427
  428   /*  Promote high words to 64 bits and return  */
  429   return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
  430 }
 431
 432
 433 /* Compare absolute greater than
 434  */
 435 static __inline qword si_fcmgt(qword a, qword b)
 436 {
 437   vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
 438
 439   return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
 440                                   vec_andc((vec_float4)(b), msb))));
 441 }
 442
  /* si_dfcmgt: greater-than compare of the magnitudes of the doubles held
   * in a and b.
   *
   * Each operand is handled as four 32-bit words, a high and a low word
   * per double, with the sign bit of every high word cleared.  The
   * verdict for a double is "high word greater, or high words equal and
   * low word greater".  It is copied across both words of the double and
   * then cleared wherever a or b satisfies the NaN test.
   */
  443 static __inline qword si_dfcmgt(qword a, qword b)
  444 {
  445   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  446   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  447   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  448
        /* x.v is the shift operand handed to vec_slo below.  Only element 3
           of the int view is assigned; the other elements stay unset.  */
  449   union {
  450     vec_uchar16 v;
  451     int i[4];
  452   } x;
  453
  454   /* Shift 4 bytes  */
  455   x.i[3] = 4 << 3;
  456
  457   // absolute value of a,b
  458   vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  459   vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
  460
  461   // check if a is nan: high word above 0x7FF00000, or high word equal to it with a nonzero low word
  462   vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  463   vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  464   a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  465   a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
  466
  467   // check if b is nan (same test as for a)
  468   vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  469   vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  470   b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  471   b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
  472
  473   // A) Word-wise unsigned greater-than of |a| over |b|; the high-word lanes are the ones used
  474   vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);
  475
  476   // B) Check if high word equal, and low word greater (gt_lo repeats the compare used for gt_hi)
  477   vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
  478   vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
  479   vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
  480
  481   //  If either A or B is true, return true (unless NaNs detected)
  482   vec_uint4 r = vec_or(gt_hi, eqgt);
  483
  484   // splat the high words of the comparison step
  485   r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
  486
  487   // correct for NaNs in input
  488   return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
  489 }
 490
 491
 492 /* Compare equal
 493  */
 494 static __inline qword si_ceqb(qword a, qword b)
 495 {
 496   return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
 497 }
 498
 499 static __inline qword si_ceqh(qword a, qword b)
 500 {
 501   return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
 502 }
 503
 504 static __inline qword si_ceq(qword a, qword b)
 505 {
 506   return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
 507 }
 508
 509 static __inline qword si_fceq(qword a, qword b)
 510 {
 511   return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
 512 }
 513
 514 static __inline qword si_ceqbi(qword a, signed char b)
 515 {
 516   return ((qword)(vec_cmpeq((vec_char16)(a),
 517                             vec_splat((vec_char16)(si_from_char(b)), 3))));
 518 }
 519
 520 static __inline qword si_ceqhi(qword a, signed short b)
 521 {
 522   return ((qword)(vec_cmpeq((vec_short8)(a),
 523                           vec_splat((vec_short8)(si_from_short(b)), 1))));
 524 }
 525
 526 static __inline qword si_ceqi(qword a, signed int b)
 527 {
 528   return ((qword)(vec_cmpeq((vec_int4)(a),
 529                           vec_splat((vec_int4)(si_from_int(b)), 0))));
 530 }
 531
  /* si_dfceq: equality compare of the doubles held in a and b.
   *
   * Each operand is handled as four 32-bit words, a high and a low word
   * per double.  A double compares equal when both of its words match
   * bit for bit (A), or when both magnitudes are zero so that +0 and -0
   * match (C), unless the double of a satisfies the NaN test (B).  b gets
   * no NaN test of its own here.  The final permute copies word 0 into
   * words 0-1 and word 2 into words 2-3.
   */
  532 static __inline qword si_dfceq(qword a, qword b)
  533 {
  534   vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  535   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  536   vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};
  537
  538   vec_uint4 biteq;
  539   vec_uint4 aabs;
  540   vec_uint4 babs;
  541   vec_uint4 a_gt;
  542   vec_uint4 ahi_inf;
  543   vec_uint4 anan;
  544   vec_uint4 iszero;
  545   vec_uint4 result;
  546
        /* x.v is the shift operand handed to vec_slo below.  Only element 3
           of the int view is assigned; the other elements stay unset.  */
  547   union {
  548     vec_uchar16 v;
  549     int i[4];
  550   } x;
  551
  552   /* Shift 4 bytes  */
  553   x.i[3] = 4 << 3;
  554
  555   /*  A)  Check for bit equality, store in high word */
  556   biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
  557   biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));
  558
  559   /*  Mask out sign bits */
  560   aabs = vec_and((vec_uint4)a,sign_mask);
  561   babs = vec_and((vec_uint4)b,sign_mask);
  562
  563   /*
  564       B)  Check if a is NaN, store in high word
  565
  566       B1) If the high word is greater than max_exp (indicates a NaN)
  567       B2) If the low word is greater than 0
  568   */
  569   a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);
  570
  571   /*  B3) Check if the high word is equal to the inf exponent */
  572   ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);
  573
  574   /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
  575   anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));
  576
  577   /*  C)  Check for 0 = -0 special case */
  578   iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
        /* Keep the high-word flag only if the low word is zero as well.  */
  579   iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
  580
  581   /*  result = (A or C) and not B  */
  582   result = vec_or(biteq,iszero);
  583   result = vec_andc(result, anan);
  584
  585   /*  Promote high words to 64 bits and return  */
  586   return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
  587 }
 588
 589
 590 /* Compare greater than
 591  */
 592 static __inline qword si_cgtb(qword a, qword b)
 593 {
 594   return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
 595 }
 596
 597 static __inline qword si_cgth(qword a, qword b)
 598 {
 599   return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
 600 }
 601
 602 static __inline qword si_cgt(qword a, qword b)
 603 {
 604   return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
 605 }
 606
 607 static __inline qword si_clgtb(qword a, qword b)
 608 {
 609   return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
 610 }
 611
 612 static __inline qword si_clgth(qword a, qword b)
 613 {
 614   return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
 615 }
 616
 617 static __inline qword si_clgt(qword a, qword b)
 618 {
 619   return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
 620 }
 621
 622 static __inline qword si_fcgt(qword a, qword b)
 623 {
 624   return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
 625 }
 626
  /* si_dfcgt: greater-than compare of the doubles held in a and b.
   *
   * Each operand is handled as four 32-bit words, a high and a low word
   * per double.  For every double the code forms the magnitude (sign bit
   * cleared) and a negated form of it (complement plus a carry term), and
   * selects between the two according to the sign of the high word.  The
   * selected values are compared with a signed compare on the high words
   * and an unsigned compare on the low words.  The verdict is copied
   * across both words of the double and cleared wherever a or b
   * satisfies the NaN test.
   */
  627 static __inline qword si_dfcgt(qword a, qword b)
  628 {
  629   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  630   vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  631   vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  632   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  633
        /* x.v is the shift operand handed to vec_slo below.  Only element 3
           of the int view is assigned; the other elements stay unset.  */
  634   union {
  635     vec_uchar16 v;
  636     int i[4];
  637   } x;
  638
  639   /* Shift 4 bytes  */
  640   x.i[3] = 4 << 3;
  641
  642   // absolute value of a,b
  643   vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  644   vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);
  645
  646   // check if a is nan: high word above 0x7FF00000, or high word equal to it with a nonzero low word
  647   vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  648   vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  649   a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  650   a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
  651
  652   // check if b is nan (same test as for a)
  653   vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  654   vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  655   b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  656   b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);
  657
  658   // sign of a: arithmetic shift right by 31 of every word, then the high-word result copied over the low word
  659   vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  660   asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);
  661
  662   // sign of b
  663   vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  664   bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);
  665
  666   // negative a
  667   vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
        // pat is built once from borrow_shuffle and reused for b: bytes 192 of borrow_shuffle become
        // index 24 (192 >> 3), every other byte keeps its own position as index
  668   vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
        // two-step permute: first by borrow_shuffle, then by pat against a constant vector
  669   abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
        // ~aabs plus bit 0 of abor; the extra vec_splat_u32(0) addend (absent from the bneg line) adds zero
  670   vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));
  671
  672   // pick the one we want
  673   vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);
  674
  675   // negative b
  676   vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
  677   bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  678   vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));
  679
  680   // pick the one we want
  681   vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);
  682
  683   // A) Word-wise signed greater-than of aval over bval; the high-word lanes are the ones used
  684   vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);
  685
  686   // B) Check if high word equal, and low word greater (unsigned compare for the low word)
  687   vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
  688   vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
  689   vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));
  690
  691   //  If either A or B is true, return true (unless NaNs detected)
  692   vec_uint4 r = vec_or(gt_hi, eqgt);
  693
  694   // splat the high words of the comparison step
  695   r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);
  696
  697   // correct for NaNs in input
  698   return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
  699 }
 700
 701 static __inline qword si_cgtbi(qword a, signed char b)
 702 {
 703   return ((qword)(vec_cmpgt((vec_char16)(a),
 704                             vec_splat((vec_char16)(si_from_char(b)), 3))));
 705 }
 706
 707 static __inline qword si_cgthi(qword a, signed short b)
 708 {
 709   return ((qword)(vec_cmpgt((vec_short8)(a),
 710                             vec_splat((vec_short8)(si_from_short(b)), 1))));
 711 }
 712
 713 static __inline qword si_cgti(qword a, signed int b)
 714 {
 715   return ((qword)(vec_cmpgt((vec_int4)(a),
 716                             vec_splat((vec_int4)(si_from_int(b)), 0))));
 717 }
 718
 719 static __inline qword si_clgtbi(qword a, unsigned char b)
 720 {
 721   return ((qword)(vec_cmpgt((vec_uchar16)(a),
 722                             vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
 723 }
 724
 725 static __inline qword si_clgthi(qword a, unsigned short b)
 726 {
 727   return ((qword)(vec_cmpgt((vec_ushort8)(a),
 728                             vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
 729 }
 730
 731 static __inline qword si_clgti(qword a, unsigned int b)
 732 {
 733   return ((qword)(vec_cmpgt((vec_uint4)(a),
 734                             vec_splat((vec_uint4)(si_from_uint(b)), 0))));
 735 }
 736
  /* si_dftsv: test the doubles held in a against the value classes
   * selected by the bits of b.
   *
   * Class bits tested by the code below:
   *   0x40 NaN      0x20 +inf     0x10 -inf
   *   0x08 +0       0x04 -0       0x02 +denorm  0x01 -denorm
   * The result starts as zero; for every selected class the per-double
   * flag for that class (copied across both words of the double) is ORed
   * in.  The "+" and "-" variants are separated with the sign mask
   * derived from the high word of each double.
   */
  737 static __inline qword si_dftsv(qword a, char b)
  738 {
  739   vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  740   vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  741   vec_uint4 result = (vec_uint4){0};
        /* Arithmetic shift right by 31 of every word, then the high-word
           result copied over the low word of each double.  */
  742   vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  743   sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
  744   vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);
  745
        /* x.v is the shift operand handed to vec_slo below.  Only element 3
           of the int view is assigned; the other elements stay unset.  */
  746   union {
  747     vec_uchar16 v;
  748     int i[4];
  749   } x;
  750
  751   /* Shift 4 bytes  */
  752   x.i[3] = 4 << 3;
  753
  754   /* Nan or +inf or -inf  */
  755   if (b & 0x70)
  756   {
  757     vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  758     vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  759      /* NaN  */
  760      if (b & 0x40)
  761      {
  762        vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  763        a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  764        a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
  765        result = vec_or(result, a_nan);
  766      }
  767      /* inf  */
  768      if (b & 0x30)
  769      {
             /* High word equals 0x7FF00000 and the low word equals 0.  */
  770        a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
  771        a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
  772         /* +inf  */
  773         if (b & 0x20)
  774           result = vec_or(vec_andc(a_inf, sign), result);
  775         /* -inf  */
  776         if (b & 0x10)
  777           result = vec_or(vec_and(a_inf, sign), result);
  778      }
  779   }
  780   /* 0 or denorm  */
  781   if (b & 0xF)
  782   {
  783     vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
          /* Keep the high-word flag only if the low word is zero as well.  */
  784     iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
  785     /* denorm  */
  786     if (b & 0x3)
  787     {
  788       vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
            /* Set where the word is not above 0xFFFFF and the double is not
               zero; the high-word lane is the one splatted below.  */
  789       vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
  790       isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
  791       /* +denorm  */
  792      if (b & 0x2)
  793         result = vec_or(vec_andc(isdenorm, sign), result);
  794       /* -denorm  */
  795      if (b & 0x1)
  796         result = vec_or(vec_and(isdenorm, sign), result);
  797     }
  798     /* 0  */
  799     if (b & 0xC)
  800     {
  801       iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
  802       /* +0  */
  803      if (b & 0x8)
  804         result = vec_or(vec_andc(iszero, sign), result);
  805       /* -0  */
  806      if (b & 0x4)
  807         result = vec_or(vec_and(iszero, sign), result);
  808     }
  809   }
  810   return ((qword)result);
  811 }
 812
 813
 814 /* Carry generate
 815  */
 816 #define si_cg(_a, _b)           ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))
 817
 818 #define si_cgx(_a, _b, _c)      ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)),             \
 819                                                 vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),     \
 820                                                          vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
 821
 822
 823 /* Count ones for bytes
 824  */
 825 static __inline qword si_cntb(qword a)
 826 {
 827   vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
 828   vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
 829   vec_uchar16 av;
 830
 831   av = (vec_uchar16)(a);
 832
 833   return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
 834                           vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
 835 }
 836
  837 /* Count leading zeros for words
 838  */
 839 static __inline qword si_clz(qword a)
 840 {
 841   vec_uchar16 av;
 842   vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
 843   vec_uchar16 four    = vec_splat_u8(4);
 844   vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
 845   vec_uchar16 eight   = vec_splat_u8(8);
 846   vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
 847   vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
 848
 849   av = (vec_uchar16)(a);
 850
 851   cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
 852   cnt_lo = vec_perm(nib_cnt, nib_cnt, av);
 853
 854   cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));
 855
 856   tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
 857   tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
 858   tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));
 859
 860   cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
 861   cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
 862   cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));
 863
 864   return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
 865 }
 866
 867 /* Convert to float
 868  */
 869 #define si_cuflt(_a, _b)        ((qword)(vec_ctf((vec_uint4)(_a), _b)))
 870 #define si_csflt(_a, _b)        ((qword)(vec_ctf((vec_int4)(_a), _b)))
 871
 872 /* Convert to signed int
 873  */
 874 #define si_cflts(_a, _b)        ((qword)(vec_cts((vec_float4)(_a), _b)))
 875
 876 /* Convert to unsigned int
 877  */
 878 #define si_cfltu(_a, _b)        ((qword)(vec_ctu((vec_float4)(_a), _b)))
 879
 880 /* Synchronize
 881  */
 882 #define si_dsync()              /* do nothing */
 883 #define si_sync()               /* do nothing */
 884 #define si_syncc()              /* do nothing */
 885
 886
 887 /* Equivalence
 888  */
 889 static __inline qword si_eqv(qword a, qword b)
 890 {
 891   vec_uchar16 d;
 892
 893   d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
 894   return ((qword)(vec_nor(d, d)));
 895 }
 896
 897 /* Extend
 898  */
 899 static __inline qword si_xsbh(qword a)
 900 {
 901   vec_char16 av;
 902
 903   av = (vec_char16)(a);
 904   return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
 905                                                               0, 0, 0, 0, 0, 0, 0, 0})))));
 906 }
 907
 908 static __inline qword si_xshw(qword a)
 909 {
 910   vec_short8 av;
 911
 912   av = (vec_short8)(a);
 913   return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
 914                                                               10,11,14,15,
 915                                                               0, 0, 0, 0,
 916                                                               0, 0, 0, 0})))));
 917 }
 918
 919 static __inline qword si_xswd(qword a)
 920 {
 921   vec_int4 av;
 922
 923   av = (vec_int4)(a);
 924   return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
 925                            ((vec_uchar16){20, 21, 22, 23,
 926                                            4,  5,  6,  7,
 927                                           28, 29, 30, 31,
 928                                           12, 13, 14, 15}))));
 929 }
 930
 931 static __inline qword si_fesd(qword a)
 932 {
 933   union {
 934     double d[2];
 935     vec_double2 vd;
 936   } out;
 937   union {
 938     float f[4];
 939     vec_float4 vf;
 940   } in;
 941
 942   in.vf = (vec_float4)(a);
 943   out.d[0] = (double)(in.f[0]);
 944   out.d[1] = (double)(in.f[2]);
 945   return ((qword)(out.vd));
 946 }
 947
 948 /* Gather
 949  */
 950 static __inline qword si_gbb(qword a)
 951 {
 952   vec_uchar16 bits;
 953   vec_uint4   bytes;
 954
 955   bits  = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
 956                                                                             7, 6, 5, 4, 3, 2, 1, 0}));
 957   bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));
 958
 959   return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
 960                                                         0, 0, 0, 0, 0, 0, 0, 0}))));
 961 }
 962
 963
 964 static __inline qword si_gbh(qword a)
 965 {
 966   vec_ushort8 bits;
 967   vec_uint4   bytes;
 968
 969   bits  = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));
 970
 971   bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});
 972
 973   return ((qword)(vec_sld(bytes, bytes, 12)));
 974 }
 975
 976 static __inline qword si_gb(qword a)
 977 {
 978   vec_uint4 bits;
 979   vec_uint4 bytes;
 980
 981   bits  = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
 982   bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
 983   return ((qword)(vec_sld(bytes, bytes, 12)));
 984 }
 985
 986
 987 /* Compare and halt
 988  */
 989 static __inline void si_heq(qword a, qword b)
 990 {
 991   union {
 992     vector unsigned int v;
 993     unsigned int i[4];
 994   } aa, bb;
 995
 996   aa.v = (vector unsigned int)(a);
 997   bb.v = (vector unsigned int)(b);
 998
 999   if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
1000 }
1001
1002 static __inline void si_heqi(qword a, unsigned int b)
1003 {
1004   union {
1005     vector unsigned int v;
1006     unsigned int i[4];
1007   } aa;
1008
1009   aa.v = (vector unsigned int)(a);
1010
1011   if (aa.i[0] == b) { SPU_HALT_ACTION; };
1012 }
1013
1014 static __inline void si_hgt(qword a, qword b)
1015 {
1016   union {
1017     vector signed int v;
1018     signed int i[4];
1019   } aa, bb;
1020
1021   aa.v = (vector signed int)(a);
1022   bb.v = (vector signed int)(b);
1023
1024   if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1025 }
1026
1027 static __inline void si_hgti(qword a, signed int b)
1028 {
1029   union {
1030     vector signed int v;
1031     signed int i[4];
1032   } aa;
1033
1034   aa.v = (vector signed int)(a);
1035
1036   if (aa.i[0] > b) { SPU_HALT_ACTION; };
1037 }
1038
1039 static __inline void si_hlgt(qword a, qword b)
1040 {
1041   union {
1042     vector unsigned int v;
1043     unsigned int i[4];
1044   } aa, bb;
1045
1046   aa.v = (vector unsigned int)(a);
1047   bb.v = (vector unsigned int)(b);
1048
1049   if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1050 }
1051
1052 static __inline void si_hlgti(qword a, unsigned int b)
1053 {
1054   union {
1055     vector unsigned int v;
1056     unsigned int i[4];
1057   } aa;
1058
1059   aa.v = (vector unsigned int)(a);
1060
1061   if (aa.i[0] > b) { SPU_HALT_ACTION; };
1062 }
1063
1064
1065 /* Multiply and Add
1066  */
1067 static __inline qword si_mpya(qword a, qword b, qword c)
1068 {
1069   return ((qword)(vec_msum(vec_and((vec_short8)(a),
1070                                    ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
1071                            (vec_short8)(b), (vec_int4)(c))));
1072 }
1073
1074 static __inline qword si_fma(qword a, qword b, qword c)
1075 {
1076   return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1077 }
1078
1079 static __inline qword si_dfma(qword a, qword b, qword c)
1080 {
1081   union {
1082     vec_double2 v;
1083     double d[2];
1084   } aa, bb, cc, dd;
1085
1086   aa.v = (vec_double2)(a);
1087   bb.v = (vec_double2)(b);
1088   cc.v = (vec_double2)(c);
1089   dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
1090   dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
1091   return ((qword)(dd.v));
1092 }
1093
1094 /* Form Mask
1095  */
/* Immediate form of si_fsmb: the integer is wrapped by si_from_int first.  */
#define si_fsmbi(_imm)  si_fsmb(si_from_int(_imm))
1097
1098 static __inline qword si_fsmb(qword a)
1099 {
1100   vec_char16 mask;
1101   vec_ushort8 in;
1102
1103   in = (vec_ushort8)(a);
1104   mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
1105                                                       3, 3, 3, 3, 3, 3, 3, 3})));
1106   return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
1107                                                       0, 1, 2, 3, 4, 5, 6, 7})),
1108                           vec_splat_u8(7))));
1109 }
1110
1111
1112 static __inline qword si_fsmh(qword a)
1113 {
1114   vec_uchar16 in;
1115   vec_short8 mask;
1116
1117   in = (vec_uchar16)(a);
1118   mask = (vec_short8)(vec_splat(in, 3));
1119   return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
1120                           vec_splat_u16(15))));
1121 }
1122
1123 static __inline qword si_fsm(qword a)
1124 {
1125   vec_uchar16 in;
1126   vec_int4 mask;
1127
1128   in = (vec_uchar16)(a);
1129   mask = (vec_int4)(vec_splat(in, 3));
1130   return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
1131                           ((vec_uint4){31,31,31,31}))));
1132 }
1133
1134 /* Move from/to registers
1135  */
/* Register reads expand to an all-zero quadword; register writes expand
   to nothing, so their arguments are discarded.  */
#define si_fscrrd()             ((qword)((vec_uint4){0}))
#define si_fscrwr(_val)

#define si_mfspr(_spr)          ((qword)((vec_uint4){0}))
#define si_mtspr(_spr, _val)
1141
1142 /* Multiply High High Add
1143  */
1144 static __inline qword si_mpyhha(qword a, qword b, qword c)
1145 {
1146   return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
1147 }
1148
1149 static __inline qword si_mpyhhau(qword a, qword b, qword c)
1150 {
1151   return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
1152 }
1153
1154 /* Multiply Subtract
1155  */
1156 static __inline qword si_fms(qword a, qword b, qword c)
1157 {
1158   return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
1159                            vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
1160 }
1161
1162 static __inline qword si_dfms(qword a, qword b, qword c)
1163 {
1164   union {
1165     vec_double2 v;
1166     double d[2];
1167   } aa, bb, cc, dd;
1168
1169   aa.v = (vec_double2)(a);
1170   bb.v = (vec_double2)(b);
1171   cc.v = (vec_double2)(c);
1172   dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
1173   dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
1174   return ((qword)(dd.v));
1175 }
1176
1177 /* Multiply
1178  */
1179 static __inline qword si_fm(qword a, qword b)
1180 {
1181   return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
1182 }
1183
1184 static __inline qword si_dfm(qword a, qword b)
1185 {
1186   union {
1187     vec_double2 v;
1188     double d[2];
1189   } aa, bb, dd;
1190
1191   aa.v = (vec_double2)(a);
1192   bb.v = (vec_double2)(b);
1193   dd.d[0] = aa.d[0] * bb.d[0];
1194   dd.d[1] = aa.d[1] * bb.d[1];
1195   return ((qword)(dd.v));
1196 }
1197
1198 /* Multiply High
1199  */
1200 static __inline qword si_mpyh(qword a, qword b)
1201 {
1202   vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};
1203
1204   return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
1205 }
1206
1207
1208 /* Multiply High High
1209  */
1210 static __inline qword si_mpyhh(qword a, qword b)
1211 {
1212   return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
1213 }
1214
1215 static __inline qword si_mpyhhu(qword a, qword b)
1216 {
1217   return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
1218 }
1219
1220 /* Multiply Odd
1221  */
1222 static __inline qword si_mpy(qword a, qword b)
1223 {
1224   return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
1225 }
1226
1227 static __inline qword si_mpyu(qword a, qword b)
1228 {
1229   return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
1230 }
1231
1232 static __inline qword si_mpyi(qword a, short b)
1233 {
1234   return ((qword)(vec_mulo((vec_short8)(a),
1235                            vec_splat((vec_short8)(si_from_short(b)), 1))));
1236 }
1237
1238 static __inline qword si_mpyui(qword a, unsigned short b)
1239 {
1240   return ((qword)(vec_mulo((vec_ushort8)(a),
1241                            vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1242 }
1243
1244 /* Multiply and Shift Right
1245  */
1246 static __inline qword si_mpys(qword a, qword b)
1247 {
1248   return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
1249 }
1250
1251 /* Nand
1252  */
1253 static __inline qword si_nand(qword a, qword b)
1254 {
1255   vec_uchar16 d;
1256
1257   d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
1258   return ((qword)(vec_nor(d, d)));
1259 }
1260
1261 /* Negative Multiply Add
1262  */
1263 static __inline qword si_dfnma(qword a, qword b, qword c)
1264 {
1265   union {
1266     vec_double2 v;
1267     double d[2];
1268   } aa, bb, cc, dd;
1269
1270   aa.v = (vec_double2)(a);
1271   bb.v = (vec_double2)(b);
1272   cc.v = (vec_double2)(c);
1273   dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
1274   dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
1275   return ((qword)(dd.v));
1276 }
1277
1278 /* Negative Multiply and Subtract
1279  */
1280 static __inline qword si_fnms(qword a, qword b, qword c)
1281 {
1282   return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1283 }
1284
1285 static __inline qword si_dfnms(qword a, qword b, qword c)
1286 {
1287   union {
1288     vec_double2 v;
1289     double d[2];
1290   } aa, bb, cc, dd;
1291
1292   aa.v = (vec_double2)(a);
1293   bb.v = (vec_double2)(b);
1294   cc.v = (vec_double2)(c);
1295   dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
1296   dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
1297   return ((qword)(dd.v));
1298 }
1299
1300 /* Nor
1301  */
1302 static __inline qword si_nor(qword a, qword b)
1303 {
1304   return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
1305 }
1306
1307 /* Or
1308  */
1309 static __inline qword si_or(qword a, qword b)
1310 {
1311   return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
1312 }
1313
1314 static __inline qword si_orbi(qword a, unsigned char b)
1315 {
1316   return ((qword)(vec_or((vec_uchar16)(a),
1317                          vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1318 }
1319
1320 static __inline qword si_orhi(qword a, unsigned short b)
1321 {
1322   return ((qword)(vec_or((vec_ushort8)(a),
1323                           vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1324 }
1325
1326 static __inline qword si_ori(qword a, unsigned int b)
1327 {
1328   return ((qword)(vec_or((vec_uint4)(a),
1329                           vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1330 }
1331
1332 /* Or Complement
1333  */
1334 static __inline qword si_orc(qword a, qword b)
1335 {
1336   return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
1337 }
1338
1339
1340 /* Or Across
1341  */
1342 static __inline qword si_orx(qword a)
1343 {
1344   vec_uchar16 tmp;
1345   tmp = (vec_uchar16)(a);
1346   tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
1347   tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
1348   return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
1349                                               0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
1350 }
1351
1352
1353 /* Estimates
1354  */
1355 static __inline qword si_frest(qword a)
1356 {
1357   return ((qword)(vec_re((vec_float4)(a))));
1358 }
1359
1360 static __inline qword si_frsqest(qword a)
1361 {
1362   return ((qword)(vec_rsqrte((vec_float4)(a))));
1363 }
1364
/* Expands to its second argument unchanged; the first is ignored.  */
#define si_fi(_src, _est)       (_est)
1366
1367 /* Channel Read and Write
1368  */
/* Channel reads and counts expand to an all-zero quadword; channel writes
   expand to nothing.  */
#define si_rdch(_chan)                  ((qword)(vec_splat_u8(0)))      /* not mappable */
#define si_rchcnt(_chan)                ((qword)(vec_splat_u8(0)))      /* not mappable */
#define si_wrch(_chan, _val)            /* not mappable */
1372
1373 /* Rotate Left
1374  */
1375 static __inline qword si_roth(qword a, qword b)
1376 {
1377   return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
1378 }
1379
1380 static __inline qword si_rot(qword a, qword b)
1381 {
1382   return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
1383 }
1384
1385 static __inline qword si_rothi(qword a, int b)
1386 {
1387   return ((qword)(vec_rl((vec_ushort8)(a),
1388                          vec_splat((vec_ushort8)(si_from_int(b)), 1))));
1389 }
1390
1391 static __inline qword si_roti(qword a, int b)
1392 {
1393   return ((qword)(vec_rl((vec_uint4)(a),
1394                          vec_splat((vec_uint4)(si_from_int(b)), 0))));
1395 }
1396
1397 /* Rotate Left with Mask
1398  */
1399 static __inline qword si_rothm(qword a, qword b)
1400 {
1401   vec_ushort8 neg_b;
1402   vec_ushort8 mask;
1403
1404   neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1405   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1406   return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1407 }
1408
1409 static __inline qword si_rotm(qword a, qword b)
1410 {
1411   vec_uint4 neg_b;
1412   vec_uint4 mask;
1413
1414   neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1415   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1416   return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1417 }
1418
1419 static __inline qword si_rothmi(qword a, int b)
1420 {
1421   vec_ushort8 neg_b;
1422   vec_ushort8 mask;
1423
1424   neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1425   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1426   return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1427 }
1428
1429 static __inline qword si_rotmi(qword a, int b)
1430 {
1431   vec_uint4 neg_b;
1432   vec_uint4 mask;
1433
1434   neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1435   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1436   return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1437 }
1438
1439
1440 /* Rotate Left Algebraic with Mask
1441  */
1442 static __inline qword si_rotmah(qword a, qword b)
1443 {
1444   vec_ushort8 neg_b;
1445   vec_ushort8 mask;
1446
1447   neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1448   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1449   return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1450 }
1451
1452 static __inline qword si_rotma(qword a, qword b)
1453 {
1454   vec_uint4 neg_b;
1455   vec_uint4 mask;
1456
1457   neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1458   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1459   return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1460 }
1461
1462
1463 static __inline qword si_rotmahi(qword a, int b)
1464 {
1465   vec_ushort8 neg_b;
1466   vec_ushort8 mask;
1467
1468   neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1469   mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1470   return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1471 }
1472
1473 static __inline qword si_rotmai(qword a, int b)
1474 {
1475   vec_uint4 neg_b;
1476   vec_uint4 mask;
1477
1478   neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1479   mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1480   return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1481 }
1482
1483
1484 /* Rotate Left Quadword by Bytes with Mask
1485  */
1486 static __inline qword si_rotqmbyi(qword a, int count)
1487 {
1488   union {
1489     vec_uchar16 v;
1490     int i[4];
1491   } x;
1492   vec_uchar16 mask;
1493
1494   count = 0 - count;
1495   x.i[3] = count << 3;
1496   mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1497
1498   return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1499 }
1500
1501
1502 static __inline qword si_rotqmby(qword a, qword count)
1503 {
1504   union {
1505     vec_uchar16 v;
1506     int i[4];
1507   } x;
1508   int cnt;
1509   vec_uchar16 mask;
1510
1511   x.v = (vec_uchar16)(count);
1512   x.i[0] = cnt = (0 - x.i[0]) << 3;
1513
1514   x.v = vec_splat(x.v, 3);
1515   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1516
1517   return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1518 }
1519
1520
1521 /* Rotate Left Quadword by Bytes
1522  */
1523 static __inline qword si_rotqbyi(qword a, int count)
1524 {
1525   union {
1526     vec_uchar16 v;
1527     int i[4];
1528   } left, right;
1529
1530   count <<= 3;
1531   left.i[3] = count;
1532   right.i[3] = 0 - count;
1533   return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
1534 }
1535
1536 static __inline qword si_rotqby(qword a, qword count)
1537 {
1538   vec_uchar16 left, right;
1539
1540   left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1541   right = vec_sub(vec_splat_u8(0), left);
1542   return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
1543 }
1544
1545 /* Rotate Left Quadword by Bytes Bit Count
1546  */
1547 static __inline qword si_rotqbybi(qword a, qword count)
1548 {
1549   vec_uchar16 left, right;
1550
1551   left = vec_splat((vec_uchar16)(count), 3);
1552   right = vec_sub(vec_splat_u8(7), left);
1553   return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
1554 }
1555
1556
/* Rotate Left Quadword by Bits
 */
1559 static __inline qword si_rotqbii(qword a, int count)
1560 {
1561   vec_uchar16 x, y;
1562   vec_uchar16 result;
1563
1564   x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
1565   y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1566                            (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1567   result = vec_or(vec_sll((qword)(a), x), y);
1568   return ((qword)(result));
1569 }
1570
1571 static __inline qword si_rotqbi(qword a, qword count)
1572 {
1573   vec_uchar16 x, y;
1574   vec_uchar16 result;
1575
1576   x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
1577   y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1578                            (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1579
1580   result = vec_or(vec_sll((qword)(a), x), y);
1581   return ((qword)(result));
1582 }
1583
1584
1585 /* Rotate Left Quadword and Mask by Bits
1586  */
1587 static __inline qword si_rotqmbii(qword a, int count)
1588 {
1589   return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
1590 }
1591
1592 static __inline qword si_rotqmbi(qword a, qword count)
1593 {
1594   return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
1595 }
1596
1597
1598 /* Rotate Left Quadword and Mask by Bytes with Bit Count
1599  */
1600 static __inline qword si_rotqmbybi(qword a, qword count)
1601 {
1602   union {
1603     vec_uchar16 v;
1604     int i[4];
1605   } x;
1606   int cnt;
1607   vec_uchar16 mask;
1608
1609   x.v = (vec_uchar16)(count);
1610   x.i[0] = cnt = 0 - (x.i[0] & ~7);
1611   x.v = vec_splat(x.v, 3);
1612   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1613
1614   return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1615 }
1616
1617
1618
1619
1620 /* Round Double to Float
1621  */
1622 static __inline qword si_frds(qword a)
1623 {
1624   union {
1625     vec_float4 v;
1626     float f[4];
1627   } d;
1628   union {
1629     vec_double2 v;
1630     double d[2];
1631   } in;
1632
1633   in.v = (vec_double2)(a);
1634   d.v = (vec_float4){0.0f};
1635   d.f[0] = (float)in.d[0];
1636   d.f[2] = (float)in.d[1];
1637
1638   return ((qword)(d.v));
1639 }
1640
1641 /* Select Bits
1642  */
1643 static __inline qword si_selb(qword a, qword b, qword c)
1644 {
1645   return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
1646 }
1647
1648
1649 /* Shuffle Bytes
1650  */
1651 static __inline qword si_shufb(qword a, qword b, qword pattern)
1652 {
1653   vec_uchar16 pat;
1654
1655   pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
1656                 vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
1657                 vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
1658   return ((qword)(vec_perm(vec_perm(a, b, pattern),
1659                            ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
1660                                           0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
1661                            pat)));
1662 }
1663
1664
1665 /* Shift Left
1666  */
1667 static __inline qword si_shlh(qword a, qword b)
1668 {
1669   vec_ushort8 mask;
1670
1671   mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
1672   return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
1673 }
1674
1675 static __inline qword si_shl(qword a, qword b)
1676 {
1677   vec_uint4 mask;
1678
1679   mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1680   return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
1681 }
1682
1683
1684 static __inline qword si_shlhi(qword a, unsigned int b)
1685 {
1686   vec_ushort8 mask;
1687   vec_ushort8 bv;
1688
1689   bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
1690   mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
1691   return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
1692 }
1693
1694 static __inline qword si_shli(qword a, unsigned int b)
1695 {
1696   vec_uint4 bv;
1697   vec_uint4 mask;
1698
1699   bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
1700   mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1701   return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
1702 }
1703
1704
1705 /* Shift Left Quadword
1706  */
1707 static __inline qword si_shlqbii(qword a, unsigned int count)
1708 {
1709   vec_uchar16 x;
1710
1711   x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
1712   return ((qword)(vec_sll((vec_uchar16)(a), x)));
1713 }
1714
1715 static __inline qword si_shlqbi(qword a, qword count)
1716 {
1717   vec_uchar16 x;
1718
1719   x = vec_splat((vec_uchar16)(count), 3);
1720   return ((qword)(vec_sll((vec_uchar16)(a), x)));
1721 }
1722
1723
1724 /* Shift Left Quadword by Bytes
1725  */
1726 static __inline qword si_shlqbyi(qword a, unsigned int count)
1727 {
1728   union {
1729     vec_uchar16 v;
1730     int i[4];
1731   } x;
1732   vec_uchar16 mask;
1733
1734   x.i[3] = count << 3;
1735   mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1736   return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1737 }
1738
1739 static __inline qword si_shlqby(qword a, qword count)
1740 {
1741   union {
1742     vec_uchar16 v;
1743     unsigned int i[4];
1744   } x;
1745   unsigned int cnt;
1746   vec_uchar16 mask;
1747
1748   x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1749   cnt = x.i[0];
1750   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1751   return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1752 }
1753
1754 /* Shift Left Quadword by Bytes with Bit Count
1755  */
1756 static __inline qword si_shlqbybi(qword a, qword count)
1757 {
1758   union {
1759     vec_uchar16 v;
1760     int i[4];
1761   } x;
1762   unsigned int cnt;
1763   vec_uchar16 mask;
1764
1765   x.v = vec_splat((vec_uchar16)(count), 3);
1766   cnt = x.i[0];
1767   mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1768   return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1769 }
1770
1771
1772 /* Stop and Signal
1773  */
/* Both stop forms expand to SPU_STOP_ACTION; their arguments are
   discarded.  */
#define si_stop(_code)          SPU_STOP_ACTION
#define si_stopd(_a, _b, _c)    SPU_STOP_ACTION
1776
1777
1778 /* Subtract
1779  */
1780 static __inline qword si_sfh(qword a, qword b)
1781 {
1782   return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
1783 }
1784
1785 static __inline qword si_sf(qword a, qword b)
1786 {
1787   return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
1788 }
1789
1790 static __inline qword si_fs(qword a, qword b)
1791 {
1792   return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
1793 }
1794
1795 static __inline qword si_dfs(qword a, qword b)
1796 {
1797   union {
1798     vec_double2 v;
1799     double d[2];
1800   } aa, bb, dd;
1801
1802   aa.v = (vec_double2)(a);
1803   bb.v = (vec_double2)(b);
1804   dd.d[0] = aa.d[0] - bb.d[0];
1805   dd.d[1] = aa.d[1] - bb.d[1];
1806   return ((qword)(dd.v));
1807 }
1808
1809 static __inline qword si_sfhi(qword a, short b)
1810 {
1811   return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
1812                           (vec_short8)(a))));
1813 }
1814
1815 static __inline qword si_sfi(qword a, int b)
1816 {
1817   return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
1818                           (vec_int4)(a))));
1819 }
1820
1821 /* Subtract word extended
1822  */
/* Per word: _rb + ~_ra + (_rc & 1).  The first argument is expanded
   twice.  */
#define si_sfx(_ra, _rb, _rc)   ((qword)(vec_add(vec_add((vec_uint4)(_rb),                              \
                                                         vec_nor((vec_uint4)(_ra), (vec_uint4)(_ra))),  \
                                                 vec_and((vec_uint4)(_rc), vec_splat_u32(1)))))
1826
1827
1828 /* Sum Bytes into Shorts
1829  */
1830 static __inline qword si_sumb(qword a, qword b)
1831 {
1832   vec_uint4 zero = (vec_uint4){0};
1833   vec_ushort8 sum_a, sum_b;
1834
1835   sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
1836   sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);
1837
1838   return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
1839                                                         26, 27, 10, 11, 30, 31, 14, 15}))));
1840 }
1841
1842 /* Exclusive OR
1843  */
1844 static __inline qword si_xor(qword a, qword b)
1845 {
1846   return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
1847 }
1848
1849 static __inline qword si_xorbi(qword a, unsigned char b)
1850 {
1851   return ((qword)(vec_xor((vec_uchar16)(a),
1852                           vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1853 }
1854
1855 static __inline qword si_xorhi(qword a, unsigned short b)
1856 {
1857   return ((qword)(vec_xor((vec_ushort8)(a),
1858                           vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1859 }
1860
1861 static __inline qword si_xori(qword a, unsigned int b)
1862 {
1863   return ((qword)(vec_xor((vec_uint4)(a),
1864                           vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1865 }
1866
1867
1868 /* Generate Controls for Sub-Quadword Insertion
1869  */
1870 static __inline qword si_cbd(qword a, int imm)
1871 {
1872   union {
1873     vec_uint4 v;
1874     unsigned char c[16];
1875   } shmask;
1876
1877   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1878   shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
1879   return ((qword)(shmask.v));
1880 }
1881
1882 static __inline qword si_cdd(qword a, int imm)
1883 {
1884   union {
1885     vec_uint4 v;
1886     unsigned long long ll[2];
1887   } shmask;
1888
1889   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1890   shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
1891   return ((qword)(shmask.v));
1892 }
1893
1894 static __inline qword si_chd(qword a, int imm)
1895 {
1896   union {
1897     vec_uint4 v;
1898     unsigned short s[8];
1899   } shmask;
1900
1901   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1902   shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
1903   return ((qword)(shmask.v));
1904 }
1905
1906 static __inline qword si_cwd(qword a, int imm)
1907 {
1908   union {
1909     vec_uint4 v;
1910     unsigned int i[4];
1911   } shmask;
1912
1913   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1914   shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
1915   return ((qword)(shmask.v));
1916 }
1917
1918 static __inline qword si_cbx(qword a, qword b)
1919 {
1920   union {
1921     vec_uint4 v;
1922     unsigned char c[16];
1923   } shmask;
1924
1925   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1926   shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
1927   return ((qword)(shmask.v));
1928 }
1929
1930
1931 static __inline qword si_cdx(qword a, qword b)
1932 {
1933   union {
1934     vec_uint4 v;
1935     unsigned long long ll[2];
1936   } shmask;
1937
1938   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1939   shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
1940   return ((qword)(shmask.v));
1941 }
1942
1943 static __inline qword si_chx(qword a, qword b)
1944 {
1945   union {
1946     vec_uint4 v;
1947     unsigned short s[8];
1948   } shmask;
1949
1950   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1951   shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
1952   return ((qword)(shmask.v));
1953 }
1954
1955 static __inline qword si_cwx(qword a, qword b)
1956 {
1957   union {
1958     vec_uint4 v;
1959     unsigned int i[4];
1960   } shmask;
1961
1962   shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1963   shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
1964   return ((qword)(shmask.v));
1965 }
1966
1967
1968 /* Constant Formation
1969  */
1970 static __inline qword si_il(signed short imm)
1971 {
1972   return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
1973 }
1974
1975
1976 static __inline qword si_ila(unsigned int imm)
1977 {
1978   return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
1979 }
1980
1981 static __inline qword si_ilh(signed short imm)
1982 {
1983   return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
1984 }
1985
1986 static __inline qword si_ilhu(signed short imm)
1987 {
1988   return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
1989 }
1990
1991 static __inline qword si_iohl(qword a, unsigned short imm)
1992 {
1993   return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
1994 }
1995
1996 /* No Operation
1997  */
/* Both no-op forms expand to nothing.  */
#define si_lnop()               /* expands to nothing */
#define si_nop()                /* expands to nothing */
2000
2001
2002 /* Memory Load and Store
2003  */
2004 static __inline qword si_lqa(unsigned int imm)
2005 {
2006   return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2007 }
2008
2009 static __inline qword si_lqd(qword a, unsigned int imm)
2010 {
2011   return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
2012 }
2013
2014 static __inline qword si_lqr(unsigned int imm)
2015 {
2016   return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2017 }
2018
2019 static __inline qword si_lqx(qword a, qword b)
2020 {
2021   return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
2022 }
2023
2024 static __inline void si_stqa(qword a, unsigned int imm)
2025 {
2026   vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2027 }
2028
2029 static __inline void si_stqd(qword a, qword b, unsigned int imm)
2030 {
2031   vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
2032 }
2033
2034 static __inline void si_stqr(qword a, unsigned int imm)
2035 {
2036   vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2037 }
2038
2039 static __inline void si_stqx(qword a, qword b, qword c)
2040 {
2041   vec_st((vec_uchar16)(a),
2042          si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
2043          (vector unsigned char *)(0));
2044 }
2045
2046 #endif /* !__SPU__ */
2047 #endif /* !_SI2VMX_H_ */
2048