/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}
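
// Scalar sketch of the Paeth rule the vector code above implements (helper
// added for reference; `paeth_ref` is illustrative and not part of the aom
// API). Each output pixel is the neighbor (left, top, or top-left) whose
// value is closest to base = top + left - topleft, with ties resolved in
// that order.
static INLINE int paeth_ref(int left, int top, int topleft) {
  const int base = top + left - topleft;
  const int pl = base > left ? base - left : left - base;
  const int pt = base > top ? base - top : top - base;
  const int ptl = base > topleft ? base - topleft : topleft - base;
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}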

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
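
  // The shuffle control 0x8000 has bytes (0x00, 0x80) in each 16-bit lane:
  // _mm_shuffle_epi8 copies left byte 0 into every low byte and, because the
  // control's high bit is set, zeroes every high byte, i.e. it broadcasts
  // left[i] as a 16-bit value. Adding 1 per row advances to left[i + 1].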
  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16(0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED
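
// Both directions blended:
//   pred(x, y) = (above[x] * w_h[y] + left[bh - 1] * (256 - w_h[y]) +
//                 left[y] * w_w[x] + above[bw - 1] * (256 - w_w[x]) +
//                 256) >> 9
// with w_h = sm_weight_arrays + bh, w_w = sm_weight_arrays + bw and
// 256 == 1 << sm_weight_log2_scale (see the C reference in
// aom_dsp/intrapred.c).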

// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
  pixels[1] = _mm_loadl_epi64((const __m128i *)left);

  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
// weights[2]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
                                  __m128i *weights) {
  __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
  const __m128i zero = _mm_setzero_si128();

  weights[0] = _mm_unpacklo_epi8(t, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weights[1] = _mm_sub_epi16(d, weights[0]);
  weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);

  if (height == 8) {
    t = _mm_srli_si128(t, 4);
    weights[0] = _mm_unpacklo_epi8(t, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
                                   int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);
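
  // d's bytes (0x00, 0x01) make _mm_shuffle_epi8 broadcast the first 16-bit
  // weight to every lane; adding 0x0202 per row steps to the next weight.
  // rep broadcasts left[i] per row, as in the Paeth code above.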
  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, weight[2]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i weights[3];
  load_weight_w4(sm_weight_arrays, 4, weights);

  smooth_pred_4xh(pixels, weights, 4, dst, stride);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i weights[3];
  load_weight_w4(sm_weight_arrays, 8, weights);

  smooth_pred_4xh(pixels, weights, 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
  pixels[2] = _mm_load_si128((const __m128i *)left);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();

  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);

  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
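
  // rep 0x8008 has bytes (0x08, 0x80): the byte broadcast starts at left
  // byte 8, so the second_half pass covers rows 8..15 of a 16-tall block.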
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  const __m128i dup16 =
      _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
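
    // Interleaving top with weights_w lets one madd produce, per pixel,
    // top[x] * weights_y + weights_w[x] * left[y]; the remaining terms,
    // (scale - weights_y) * bottom_left and (scale - weights_x) * top_right,
    // are added separately below.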
    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED
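
// Vertical-only blend:
//   pred(x, y) = (above[x] * w_h[y] +
//                 left[bh - 1] * (256 - w_h[y]) + 128) >> 8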

// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height == 4) {
    const __m128i weight =
        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(d, weights[2]);
  }
}

static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 4, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 4, weights);

  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
}

void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 8, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 8, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
}

void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 16, &pixels);

  __m128i weights[4];
  load_weight_v_w4(sm_weight_arrays, 16, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 4, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 4, wh);

  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
}

void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 8, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 8, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
}

void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 16, pixels);

  __m128i wh[4];
  load_weight_v_w8(sm_weight_arrays, 16, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
}

void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 32, pixels);

  __m128i wh[8];
  load_weight_v_w8(sm_weight_arrays, 32, wh);

  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
}

static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i dup16 =
      _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED
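
// Horizontal-only blend:
//   pred(x, y) = (left[y] * w_w[x] +
//                 above[bw - 1] * (256 - w_w[x]) + 128) >> 8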

// pixels[0]: left vector
// pixels[1]: right_pred vector
static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  if (height == 4)
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  else if (height == 8)
    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
}

// weights[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  (void)height;
  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
  const __m128i zero = _mm_setzero_si128();

  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
}

static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}

void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;
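
  // Shift left[8..15] into the low eight bytes so the same byte broadcast
  // covers the second eight rows.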
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

// pixels[0]: left vector
// pixels[1]: right_pred vector
// pixels[2]: left vector + 16
// pixels[3]: right_pred vector
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[0] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[0] = _mm_load_si128((const __m128i *)left);
    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[3] = pixels[1];
  }
}

// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_w) {
  (void)height;
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
}

static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}

void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}

void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}

void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}

static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
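
    // wx_lo/wx_hi pair (scale - w_w[x], w_w[x]) with tr_ly = (top_right,
    // left[y]), so each madd below yields
    // (scale - w_w[x]) * top_right + w_w[x] * left[y].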
    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}