aom_dsp/x86/variance_sse2.c

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/x86/synonyms.h"

#include "aom_ports/mem.h"

#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/reconinter.h"
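
// Sum of squares of a block of int16 values: the loop below reads 32 groups
// of 8 int16 values (256 values in total, i.e. one 16x16 block of residuals)
// and accumulates v * v into four 32-bit lanes before the final reduction.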
unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = xx_loadu_128(src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
}

static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
}

// Reduce the 4 32-bit values in val to a single 32-bit sum.
static INLINE unsigned int add32x4_sse2(__m128i val) {
  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
  return _mm_cvtsi128_si32(val);
}

// Sign-extend the 8 16-bit values in sum and reduce them to 4 32-bit sums.
static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
  return _mm_add_epi32(sum_lo, sum_hi);
}

static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
                                        __m128i *const sse,
                                        __m128i *const sum) {
  const __m128i diff = _mm_sub_epi16(src, ref);
  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
  *sum = _mm_add_epi16(*sum, diff);
}

// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
// Slightly faster than variance_final_256_pel_sse2()
// The diff sum of 128 pixels still fits in a 16-bit integer.
static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
}

// Can handle 256 pixels' diff sum (such as 16x16)
static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
}

// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_unpacklo_epi16(vsum, vsum);
  vsum = _mm_srai_epi32(vsum, 16);
  *sum = add32x4_sse2(vsum);
}

// Can handle 1024 pixels' diff sum (such as 32x32)
static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
                                                unsigned int *const sse,
                                                int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = sum_to_32bit_sse2(vsum);
  *sum = add32x4_sse2(vsum);
}

static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
                                  const uint8_t *ref, const int ref_stride,
                                  const int h, __m128i *const sse,
                                  __m128i *const sum) {
  assert(h <= 256);  // May overflow for larger height.
  *sum = _mm_setzero_si128();

  for (int i = 0; i < h; i += 2) {
    const __m128i s = load4x2_sse2(src, src_stride);
    const __m128i r = load4x2_sse2(ref, ref_stride);

    variance_kernel_sse2(s, r, sse, sum);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
}

static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
                                  const uint8_t *ref, const int ref_stride,
                                  const int h, __m128i *const sse,
                                  __m128i *const sum) {
  assert(h <= 128);  // May overflow for larger height.
  *sum = _mm_setzero_si128();
  for (int i = 0; i < h; i++) {
    const __m128i s = load8_8to16_sse2(src);
    const __m128i r = load8_8to16_sse2(ref);

    variance_kernel_sse2(s, r, sse, sum);
    src += src_stride;
    ref += ref_stride;
  }
}

static INLINE void variance16_kernel_sse2(const uint8_t *const src,
                                          const uint8_t *const ref,
                                          __m128i *const sse,
                                          __m128i *const sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i s = _mm_loadu_si128((const __m128i *)src);
  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);

  variance_kernel_sse2(src0, ref0, sse, sum);
  variance_kernel_sse2(src1, ref1, sse, sum);
}

static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  assert(h <= 64);  // May overflow for larger height.
  *sum = _mm_setzero_si128();

  for (int i = 0; i < h; ++i) {
    variance16_kernel_sse2(src, ref, sse, sum);
    src += src_stride;
    ref += ref_stride;
  }
}

static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  assert(h <= 32);  // May overflow for larger height.
  // Don't initialize sse here since it's an accumulation.
  *sum = _mm_setzero_si128();

  for (int i = 0; i < h; ++i) {
    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
    src += src_stride;
    ref += ref_stride;
  }
}

static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  assert(h <= 16);  // May overflow for larger height.
  *sum = _mm_setzero_si128();

  for (int i = 0; i < h; ++i) {
    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
    src += src_stride;
    ref += ref_stride;
  }
}

static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
                                    const uint8_t *ref, const int ref_stride,
                                    const int h, __m128i *const sse,
                                    __m128i *const sum) {
  assert(h <= 8);  // May overflow for larger height.
  *sum = _mm_setzero_si128();

  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < 4; ++j) {
      const int offset0 = j << 5;
      const int offset1 = offset0 + 16;
      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
    }
    src += src_stride;
    ref += ref_stride;
  }
}

#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
  unsigned int aom_variance##bw##x##bh##_sse2( \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      unsigned int *sse) { \
    __m128i vsse = _mm_setzero_si128(); \
    __m128i vsum; \
    int sum = 0; \
    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \
    assert(sum <= 255 * bw * bh); \
    assert(sum >= -255 * bw * bh); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
  }
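
// For illustration: AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256) expands (asserts
// omitted) to roughly the following function:
//
//   unsigned int aom_variance16x16_sse2(const uint8_t *src, int src_stride,
//                                       const uint8_t *ref, int ref_stride,
//                                       unsigned int *sse) {
//     __m128i vsse = _mm_setzero_si128();
//     __m128i vsum;
//     int sum = 0;
//     variance16_sse2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum);
//     variance_final_256_pel_sse2(vsse, vsum, sse, &sum);
//     return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
//   }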
AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);

AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);

AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);

AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
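
// For taller blocks the 16-bit diff accumulator in variance##bw##_sse2()
// would overflow (see the height asserts above), so the variant below splits
// the block into uh-row strips and widens each strip's sum to 32 bits before
// accumulating it.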
#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \
  unsigned int aom_variance##bw##x##bh##_sse2( \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      unsigned int *sse) { \
    __m128i vsse = _mm_setzero_si128(); \
    __m128i vsum = _mm_setzero_si128(); \
    for (int i = 0; i < (bh / uh); ++i) { \
      __m128i vsum16; \
      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \
                          &vsum16); \
      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \
      src += (src_stride * uh); \
      ref += (ref_stride * uh); \
    } \
    *sse = add32x4_sse2(vsse); \
    int sum = add32x4_sse2(vsum); \
    assert(sum <= 255 * bw * bh); \
    assert(sum >= -255 * bw * bh); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
  }

AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )

AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )

AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
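
// The MSE kernels below reuse the variance helpers and return the accumulated
// SSE directly; the squared-mean term is not subtracted.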
unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are placeholders for the PIC-enabled build.
// These are declarations for functions defined in subpel_variance.asm.
#define DECL(w, opt) \
  int aom_sub_pixel_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
#define DECLS(opt) \
  DECL(4, opt); \
  DECL(8, opt); \
  DECL(16, opt)

DECLS(sse2);
DECLS(ssse3);
#undef DECLS
#undef DECL

#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
    /*Avoid overflow in helper by capping height.*/ \
    const int hf = AOMMIN(h, 64); \
    unsigned int sse = 0; \
    int se = 0; \
    for (int i = 0; i < (w / wf); ++i) { \
      const uint8_t *src_ptr = src; \
      const uint8_t *dst_ptr = dst; \
      for (int j = 0; j < (h / hf); ++j) { \
        unsigned int sse2; \
        const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
            &sse2, NULL, NULL); \
        dst_ptr += hf * dst_stride; \
        src_ptr += hf * src_stride; \
        se += se2; \
        sse += sse2; \
      } \
      src += wf; \
      dst += wf; \
    } \
    *sse_ptr = sse; \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }
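
// For illustration: FN(64, 64, 16, 6, 6, sse2, (int64_t), (int64_t)) defines
// aom_sub_pixel_variance64x64_sse2(): it walks four 16-pixel-wide columns
// (w / wf), calls aom_sub_pixel_variance16xh_sse2() with height
// hf = AOMMIN(64, 64) = 64 on each, accumulates se/sse, and returns
// sse - (se * se >> (6 + 6)).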
#define FNS(opt) \
  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))

FNS(sse2);
FNS(ssse3);

#undef FNS
#undef FN

// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
  int aom_sub_pixel_avg_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
      void *unused)
#define DECLS(opt) \
  DECL(4, opt); \
  DECL(8, opt); \
  DECL(16, opt)

DECLS(sse2);
DECLS(ssse3);
#undef DECL
#undef DECLS
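
// The averaging wrapper below mirrors the plain sub-pixel variance wrapper
// above, but also threads the second predictor `sec` (stored with stride w)
// through the asm helper and advances it in lockstep with src/dst.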
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
      const uint8_t *sec) { \
    /*Avoid overflow in helper by capping height.*/ \
    const int hf = AOMMIN(h, 64); \
    unsigned int sse = 0; \
    int se = 0; \
    for (int i = 0; i < (w / wf); ++i) { \
      const uint8_t *src_ptr = src; \
      const uint8_t *dst_ptr = dst; \
      const uint8_t *sec_ptr = sec; \
      for (int j = 0; j < (h / hf); ++j) { \
        unsigned int sse2; \
        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
            sec_ptr, w, hf, &sse2, NULL, NULL); \
        dst_ptr += hf * dst_stride; \
        src_ptr += hf * src_stride; \
        sec_ptr += hf * w; \
        se += se2; \
        sse += sse2; \
      } \
      src += wf; \
      dst += wf; \
      sec += wf; \
    } \
    *sse_ptr = sse; \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }

#define FNS(opt) \
  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))

FNS(sse2);
FNS(ssse3);

#undef FNS
#undef FN
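
// Builds the sub-pel shifted prediction into comp_pred. Three paths: a
// scaled-reference path that goes through av1_make_inter_predictor(), a plain
// SIMD copy when both sub-pel offsets are zero, and separable 8-tap
// aom_convolve8_horiz()/aom_convolve8_vert() filtering otherwise.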
void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
                             int mi_row, int mi_col, const MV *const mv,
                             uint8_t *comp_pred, int width, int height,
                             int subpel_x_q3, int subpel_y_q3,
                             const uint8_t *ref, int ref_stride) {
  // Expect xd == NULL only in tests.
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Note: This is mostly a copy of the >=8X8 case in
      // build_inter_predictors(), with some small tweaks.

      // Some assumptions.
      const int plane = 0;

      // Get pre-requisites.
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int ssx = pd->subsampling_x;
      const int ssy = pd->subsampling_y;
      assert(ssx == 0 && ssy == 0);
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;

      // Calculate subpel_x/y and x/y_step.
      const int row_start = 0;  // Because ss_y is 0.
      const int col_start = 0;  // Because ss_x is 0.
      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
      int orig_pos_y = pre_y << SUBPEL_BITS;
      orig_pos_y += mv->row * (1 << (1 - ssy));
      int orig_pos_x = pre_x << SUBPEL_BITS;
      orig_pos_x += mv->col * (1 << (1 - ssx));
      int pos_y = sf->scale_value_y(orig_pos_y, sf);
      int pos_x = sf->scale_value_x(orig_pos_x, sf);
      pos_x += SCALE_EXTRA_OFF;
      pos_y += SCALE_EXTRA_OFF;

      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
                         << SCALE_SUBPEL_BITS;
      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
                        << SCALE_SUBPEL_BITS;
      pos_y = clamp(pos_y, top, bottom);
      pos_x = clamp(pos_x, left, right);

      const uint8_t *const pre =
          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
          (pos_x >> SCALE_SUBPEL_BITS);

      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
                                           pos_x & SCALE_SUBPEL_MASK,
                                           pos_y & SCALE_SUBPEL_MASK };

      // Get warp types.
      const WarpedMotionParams *const wm =
          &xd->global_motion[mi->ref_frame[ref_num]];
      const int is_global = is_global_mv_block(mi, wm->wmtype);
      WarpTypesAllowed warp_types;
      warp_types.global_warp_allowed = is_global;
      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;

      // Get convolve parameters.
      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
      const InterpFilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);

      // Get the inter predictor.
      const int build_for_obmc = 0;
      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
                               &subpel_params, sf, width, height, &conv_params,
                               filters, &warp_types, mi_x >> pd->subsampling_x,
                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
                               build_for_obmc, xd, cm->allow_warped_motion);

      return;
    }
  }

  const InterpFilterParams filter =
      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);

  if (!subpel_x_q3 && !subpel_y_q3) {
    if (width >= 16) {
      int i;
      assert(!(width & 15));
      /*Read 16 pixels one row at a time.*/
      for (i = 0; i < height; i++) {
        int j;
        for (j = 0; j < width; j += 16) {
          xx_storeu_128(comp_pred, xx_loadu_128(ref));
          comp_pred += 16;
          ref += 16;
        }
        ref += ref_stride - width;
      }
    } else if (width >= 8) {
      int i;
      assert(!(width & 7));
      assert(!(height & 1));
      /*Read 8 pixels two rows at a time.*/
      for (i = 0; i < height; i += 2) {
        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
        comp_pred += 16;
        ref += 2 * ref_stride;
      }
    } else {
      int i;
      assert(!(width & 3));
      assert(!(height & 3));
      /*Read 4 pixels four rows at a time.*/
      for (i = 0; i < height; i += 4) {
        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
                                               _mm_unpacklo_epi32(row2, row3));
        xx_storeu_128(comp_pred, reg);
        comp_pred += 16;
        ref += 4 * ref_stride;
      }
    }
  } else if (!subpel_y_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
                        width, height);
  } else if (!subpel_x_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
                       width, height);
  } else {
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                        intermediate_height);
    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                       width, height);
  }
}
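
// Builds the upsampled prediction via aom_upsampled_pred() and then averages
// it with pred, 16 pixels per iteration, using the rounding byte average
// _mm_avg_epu8().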
void aom_comp_avg_upsampled_pred_sse2(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride) {
  int n;
  int i;
  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
  assert(!(width * height & 15));
  n = width * height >> 4;
  for (i = 0; i < n; i++) {
    __m128i s0 = xx_loadu_128(comp_pred);
    __m128i p0 = xx_loadu_128(pred);
    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
    comp_pred += 16;
    pred += 16;
  }
}