/*
 * aom_dsp/variance.c (from aom.git) -- C reference implementations of block
 * variance, MSE, and sub-pixel variance kernels.
 */
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/variance.h"

#include "av1/common/filter.h"
#include "av1/common/reconinter.h"
// Returns the sum of squared differences (SSE) between a 4x4 block in |a|
// and the co-located 4x4 block in |b|. Max value (16 * 255^2) fits in int.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}
// Returns the sum of squares of 256 int16_t values (one 16x16 macroblock's
// worth of residual data).
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}
// Accumulates, over a WxH block, the sum of pixel differences (|sum|) and the
// sum of squared differences (|sse|) between |a| and |b|.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}
// SSE between two blocks of arbitrary (odd) dimensions, for sizes that do not
// have a dedicated aom_mseWxH/aom_varianceWxH kernel. The sum is discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse;
  int sum;
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first-pass of 2-D separable filter.
//
// Produces int16_t output to retain precision for the next pass. Two filter
// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
// It defines the offset required to move from one input to the next.
91 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
92 unsigned int src_pixels_per_line,
93 unsigned int pixel_step,
94 unsigned int output_height,
95 unsigned int output_width,
96 const uint8_t *filter) {
97 unsigned int i, j;
99 for (i = 0; i < output_height; ++i) {
100 for (j = 0; j < output_width; ++j) {
101 b[j] = ROUND_POWER_OF_TWO(
102 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
104 ++a;
107 a += src_pixels_per_line - output_width;
108 b += output_width;
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second-pass of 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one input
// to the next. Output is 8-bit.
121 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
122 unsigned int src_pixels_per_line,
123 unsigned int pixel_step,
124 unsigned int output_height,
125 unsigned int output_width,
126 const uint8_t *filter) {
127 unsigned int i, j;
129 for (i = 0; i < output_height; ++i) {
130 for (j = 0; j < output_width; ++j) {
131 b[j] = ROUND_POWER_OF_TWO(
132 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
133 ++a;
136 a += src_pixels_per_line - output_width;
137 b += output_width;
// Defines aom_varianceWxH_c(): SSE minus the squared-mean correction term,
// i.e. the block variance scaled by the pixel count. |sse| is also returned
// by reference.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
// Defines aom_sub_pixel_varianceWxH_c(): bilinear-filters the source at the
// given (xoffset, yoffset) sub-pel position, then computes the variance of
// the result against |b|.
#define SUBPIX_VAR(W, H)                                                      \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
  }
// Defines the compound-prediction sub-pel variance kernels: the sub-pel
// filtered source is averaged with |second_pred| (plain average, or
// distance-weighted via |jcp_param|) before the variance is computed.
#define SUBPIX_AVG_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
                                                                              \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
  }                                                                           \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W,            \
                               jcp_param);                                    \
                                                                              \
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
  }
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / w*h
 */
// Defines aom_getWxHvar_c(): exposes both |sse| and |sum| by reference
// without combining them into a variance.
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }
/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
 * variable.
 */
// Defines aom_mseWxH_c(): returns the raw SSE (the |sum| correction is
// intentionally discarded).
#define MSE(W, H)                                                \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                const uint8_t *b, int b_stride,  \
                                uint32_t *sse) {                 \
    int sum;                                                     \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);         \
    return *sse;                                                 \
  }
225 /* All three forms of the variance are available in the same sizes. */
// Instantiates all three variance forms (plain, sub-pel, compound sub-pel)
// for one block size.
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)
231 VARIANCES(128, 128)
232 VARIANCES(128, 64)
233 VARIANCES(64, 128)
234 VARIANCES(64, 64)
235 VARIANCES(64, 32)
236 VARIANCES(32, 64)
237 VARIANCES(32, 32)
238 VARIANCES(32, 16)
239 VARIANCES(16, 32)
240 VARIANCES(16, 16)
241 VARIANCES(16, 8)
242 VARIANCES(8, 16)
243 VARIANCES(8, 8)
244 VARIANCES(8, 4)
245 VARIANCES(4, 8)
246 VARIANCES(4, 4)
247 VARIANCES(4, 2)
248 VARIANCES(2, 4)
249 VARIANCES(2, 2)
251 // Realtime mode doesn't use rectangular blocks.
252 #if !CONFIG_REALTIME_ONLY
253 VARIANCES(4, 16)
254 VARIANCES(16, 4)
255 VARIANCES(8, 32)
256 VARIANCES(32, 8)
257 VARIANCES(16, 64)
258 VARIANCES(64, 16)
259 #endif
261 GET_VAR(16, 16)
262 GET_VAR(8, 8)
264 MSE(16, 16)
265 MSE(16, 8)
266 MSE(8, 16)
267 MSE(8, 8)
// Writes the rounded average of |pred| and |ref| into |comp_pred|.
// |comp_pred| and |pred| are tightly packed (stride == width); |ref| uses
// |ref_stride|.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
284 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
285 int width, int height, const uint8_t *ref,
286 int ref_stride,
287 const DIST_WTD_COMP_PARAMS *jcp_param) {
288 int i, j;
289 const int fwd_offset = jcp_param->fwd_offset;
290 const int bck_offset = jcp_param->bck_offset;
292 for (i = 0; i < height; ++i) {
293 for (j = 0; j < width; ++j) {
294 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
295 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
296 comp_pred[j] = (uint8_t)tmp;
298 comp_pred += width;
299 pred += width;
300 ref += ref_stride;
304 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth sum/SSE accumulator. |a8|/|b8| are CONVERT_TO_BYTEPTR-wrapped
// 16-bit buffers. Per-row sums are kept in 32-bit |lsum| and folded into the
// 64-bit total once per row to avoid a 64-bit add per pixel.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    int32_t lsum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      tsse += (uint32_t)(diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
// High-bitdepth SSE for arbitrary block dimensions; the sum is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse;
  int64_t sum;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}
// 8-bit-depth wrapper: no normalization needed, just narrow to 32/int.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}
// 10-bit-depth wrapper: normalizes back to the 8-bit domain (sse >> 4 since
// sse scales with the square of the 2 extra bits, sum >> 2 linearly).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
// 12-bit-depth wrapper: normalizes back to the 8-bit domain (sse >> 8,
// sum >> 4 for the 4 extra bits).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
// Defines the 8/10/12-bit variance kernels for one block size. The 10/12-bit
// variants clamp a (rounding-induced) negative result to 0.
#define HIGHBD_VAR(W, H)                                                      \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) {                \
    int sum;                                                                  \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
      uint32_t *sse) {                                                        \
    int sum;                                                                  \
    int64_t var;                                                              \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);            \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                 \
    return (var >= 0) ? (uint32_t)var : 0;                                    \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
      uint32_t *sse) {                                                        \
    int sum;                                                                  \
    int64_t var;                                                              \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);            \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                 \
    return (var >= 0) ? (uint32_t)var : 0;                                    \
  }
// Defines the 8/10/12-bit getSxSvar kernels (square blocks only): returns
// |sse| and |sum| by reference without combining them.
#define HIGHBD_GET_VAR(S)                                                 \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        uint32_t *sse, int *sum) {          \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                         \
                                                                            \
  void aom_highbd_10_get##S##x##S##var_c(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride, uint32_t *sse, int *sum) {                            \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);   \
  }                                                                         \
                                                                            \
  void aom_highbd_12_get##S##x##S##var_c(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride, uint32_t *sse, int *sum) {                            \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);   \
  }
// Defines the 8/10/12-bit MSE kernels: raw (bit-depth-normalized) SSE, with
// the sum discarded.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
438 void aom_highbd_var_filter_block2d_bil_first_pass(
439 const uint8_t *src_ptr8, uint16_t *output_ptr,
440 unsigned int src_pixels_per_line, int pixel_step,
441 unsigned int output_height, unsigned int output_width,
442 const uint8_t *filter) {
443 unsigned int i, j;
444 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
445 for (i = 0; i < output_height; ++i) {
446 for (j = 0; j < output_width; ++j) {
447 output_ptr[j] = ROUND_POWER_OF_TWO(
448 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
449 FILTER_BITS);
451 ++src_ptr;
454 // Next row...
455 src_ptr += src_pixels_per_line - output_width;
456 output_ptr += output_width;
460 void aom_highbd_var_filter_block2d_bil_second_pass(
461 const uint16_t *src_ptr, uint16_t *output_ptr,
462 unsigned int src_pixels_per_line, unsigned int pixel_step,
463 unsigned int output_height, unsigned int output_width,
464 const uint8_t *filter) {
465 unsigned int i, j;
467 for (i = 0; i < output_height; ++i) {
468 for (j = 0; j < output_width; ++j) {
469 output_ptr[j] = ROUND_POWER_OF_TWO(
470 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
471 FILTER_BITS);
472 ++src_ptr;
475 src_ptr += src_pixels_per_line - output_width;
476 output_ptr += output_width;
// Defines the 8/10/12-bit sub-pel variance kernels: bilinear-filter the
// source at (xoffset, yoffset), then take the bit-depth-normalized variance.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
// Defines the 8/10/12-bit compound sub-pel variance kernels: the filtered
// source is averaged with |second_pred| (plain average, or distance-weighted
// via |jcp_param|) before the variance is computed.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
                                          dst_stride, sse);                   \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }
650 /* All three forms of the variance are available in the same sizes. */
// Instantiates all three high-bitdepth variance forms for one block size.
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
656 HIGHBD_VARIANCES(128, 128)
657 HIGHBD_VARIANCES(128, 64)
658 HIGHBD_VARIANCES(64, 128)
659 HIGHBD_VARIANCES(64, 64)
660 HIGHBD_VARIANCES(64, 32)
661 HIGHBD_VARIANCES(32, 64)
662 HIGHBD_VARIANCES(32, 32)
663 HIGHBD_VARIANCES(32, 16)
664 HIGHBD_VARIANCES(16, 32)
665 HIGHBD_VARIANCES(16, 16)
666 HIGHBD_VARIANCES(16, 8)
667 HIGHBD_VARIANCES(8, 16)
668 HIGHBD_VARIANCES(8, 8)
669 HIGHBD_VARIANCES(8, 4)
670 HIGHBD_VARIANCES(4, 8)
671 HIGHBD_VARIANCES(4, 4)
672 HIGHBD_VARIANCES(4, 2)
673 HIGHBD_VARIANCES(2, 4)
674 HIGHBD_VARIANCES(2, 2)
676 // Realtime mode doesn't use 4x rectangular blocks.
677 #if !CONFIG_REALTIME_ONLY
678 HIGHBD_VARIANCES(4, 16)
679 HIGHBD_VARIANCES(16, 4)
680 HIGHBD_VARIANCES(8, 32)
681 HIGHBD_VARIANCES(32, 8)
682 HIGHBD_VARIANCES(16, 64)
683 HIGHBD_VARIANCES(64, 16)
684 #endif
686 HIGHBD_GET_VAR(8)
687 HIGHBD_GET_VAR(16)
689 HIGHBD_MSE(16, 16)
690 HIGHBD_MSE(16, 8)
691 HIGHBD_MSE(8, 16)
692 HIGHBD_MSE(8, 8)
// High-bitdepth rounded average of |pred8| and |ref8| into |comp_pred8|;
// all three are CONVERT_TO_BYTEPTR-wrapped 16-bit buffers.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
712 void aom_highbd_dist_wtd_comp_avg_pred_c(
713 uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
714 const uint8_t *ref8, int ref_stride,
715 const DIST_WTD_COMP_PARAMS *jcp_param) {
716 int i, j;
717 const int fwd_offset = jcp_param->fwd_offset;
718 const int bck_offset = jcp_param->bck_offset;
719 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
720 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
721 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
723 for (i = 0; i < height; ++i) {
724 for (j = 0; j < width; ++j) {
725 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
726 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
727 comp_pred[j] = (uint16_t)tmp;
729 comp_pred += width;
730 pred += width;
731 ref += ref_stride;
734 #endif // CONFIG_AV1_HIGHBITDEPTH
// Per-pixel 64-weight blend of |pred| and |ref| under |mask|; |invert_mask|
// swaps which operand gets mask[j] vs (64 - mask[j]).
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  int i, j;
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  // |pred| is packed (stride == width); |ref| uses |ref_stride|.
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    }
    comp_pred += width;
    src0 += stride0;
    src1 += stride1;
    mask += mask_stride;
  }
}
// Defines aom_masked_sub_pixel_varianceWxH_c(): sub-pel filter the source,
// mask-blend it with |second_pred|, then take the variance against |ref|.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1,       \
                                            H + 1, W,                         \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
776 MASK_SUBPIX_VAR(4, 4)
777 MASK_SUBPIX_VAR(4, 8)
778 MASK_SUBPIX_VAR(8, 4)
779 MASK_SUBPIX_VAR(8, 8)
780 MASK_SUBPIX_VAR(8, 16)
781 MASK_SUBPIX_VAR(16, 8)
782 MASK_SUBPIX_VAR(16, 16)
783 MASK_SUBPIX_VAR(16, 32)
784 MASK_SUBPIX_VAR(32, 16)
785 MASK_SUBPIX_VAR(32, 32)
786 MASK_SUBPIX_VAR(32, 64)
787 MASK_SUBPIX_VAR(64, 32)
788 MASK_SUBPIX_VAR(64, 64)
789 MASK_SUBPIX_VAR(64, 128)
790 MASK_SUBPIX_VAR(128, 64)
791 MASK_SUBPIX_VAR(128, 128)
793 // Realtime mode doesn't use 4x rectangular blocks.
794 #if !CONFIG_REALTIME_ONLY
795 MASK_SUBPIX_VAR(4, 16)
796 MASK_SUBPIX_VAR(16, 4)
797 MASK_SUBPIX_VAR(8, 32)
798 MASK_SUBPIX_VAR(32, 8)
799 MASK_SUBPIX_VAR(16, 64)
800 MASK_SUBPIX_VAR(64, 16)
801 #endif
803 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth masked blend of |pred8| and |ref8|; |invert_mask| swaps which
// operand receives mask[j] vs (64 - mask[j]).
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      if (!invert_mask)
        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
      else
        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
// Defines the 8/10/12-bit masked sub-pel variance kernels: sub-pel filter the
// source, mask-blend with |second_pred|, then take the normalized variance.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              ref, ref_stride, sse);          \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }
893 HIGHBD_MASK_SUBPIX_VAR(4, 4)
894 HIGHBD_MASK_SUBPIX_VAR(4, 8)
895 HIGHBD_MASK_SUBPIX_VAR(8, 4)
896 HIGHBD_MASK_SUBPIX_VAR(8, 8)
897 HIGHBD_MASK_SUBPIX_VAR(8, 16)
898 HIGHBD_MASK_SUBPIX_VAR(16, 8)
899 HIGHBD_MASK_SUBPIX_VAR(16, 16)
900 HIGHBD_MASK_SUBPIX_VAR(16, 32)
901 HIGHBD_MASK_SUBPIX_VAR(32, 16)
902 HIGHBD_MASK_SUBPIX_VAR(32, 32)
903 HIGHBD_MASK_SUBPIX_VAR(32, 64)
904 HIGHBD_MASK_SUBPIX_VAR(64, 32)
905 HIGHBD_MASK_SUBPIX_VAR(64, 64)
906 HIGHBD_MASK_SUBPIX_VAR(64, 128)
907 HIGHBD_MASK_SUBPIX_VAR(128, 64)
908 HIGHBD_MASK_SUBPIX_VAR(128, 128)
909 #if !CONFIG_REALTIME_ONLY
910 HIGHBD_MASK_SUBPIX_VAR(4, 16)
911 HIGHBD_MASK_SUBPIX_VAR(16, 4)
912 HIGHBD_MASK_SUBPIX_VAR(8, 32)
913 HIGHBD_MASK_SUBPIX_VAR(32, 8)
914 HIGHBD_MASK_SUBPIX_VAR(16, 64)
915 HIGHBD_MASK_SUBPIX_VAR(64, 16)
916 #endif
917 #endif // CONFIG_AV1_HIGHBITDEPTH
919 #if !CONFIG_REALTIME_ONLY
920 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
921 const int32_t *wsrc, const int32_t *mask,
922 int w, int h, unsigned int *sse, int *sum) {
923 int i, j;
925 *sse = 0;
926 *sum = 0;
928 for (i = 0; i < h; i++) {
929 for (j = 0; j < w; j++) {
930 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
931 *sum += diff;
932 *sse += diff * diff;
935 pre += pre_stride;
936 wsrc += w;
937 mask += w;
// Defines aom_obmc_varianceWxH_c(): OBMC SSE minus the squared-mean
// correction term.
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }
// Defines aom_obmc_sub_pixel_varianceWxH_c(): bilinear-filter the prediction
// at (xoffset, yoffset), then take the OBMC variance.
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1,       \
                                            H + 1, W,                         \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
  }
// Instantiate the OBMC variance and sub-pixel variance functions for every
// supported block size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
1026 #if CONFIG_AV1_HIGHBITDEPTH
1027 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1028 const int32_t *wsrc,
1029 const int32_t *mask, int w, int h,
1030 uint64_t *sse, int64_t *sum) {
1031 int i, j;
1032 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1034 *sse = 0;
1035 *sum = 0;
1037 for (i = 0; i < h; i++) {
1038 for (j = 0; j < w; j++) {
1039 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1040 *sum += diff;
1041 *sse += diff * diff;
1044 pre += pre_stride;
1045 wsrc += w;
1046 mask += w;
1050 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1051 const int32_t *wsrc,
1052 const int32_t *mask, int w, int h,
1053 unsigned int *sse, int *sum) {
1054 int64_t sum64;
1055 uint64_t sse64;
1056 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1057 *sum = (int)sum64;
1058 *sse = (unsigned int)sse64;
1061 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1062 const int32_t *wsrc,
1063 const int32_t *mask, int w, int h,
1064 unsigned int *sse, int *sum) {
1065 int64_t sum64;
1066 uint64_t sse64;
1067 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1068 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1069 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1072 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1073 const int32_t *wsrc,
1074 const int32_t *mask, int w, int h,
1075 unsigned int *sse, int *sum) {
1076 int64_t sum64;
1077 uint64_t sse64;
1078 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1079 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1080 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
// Defines the 8-, 10- and 12-bit-depth OBMC variance functions for a WxH
// block: sse - sum^2 / (W * H). The 10/12-bit variants clamp a negative
// result (possible because sse and sum are rounded independently) to 0.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
// Defines the 8-, 10- and 12-bit-depth OBMC sub-pixel variance functions:
// bilinear-filter the 16-bit prediction (xoffset horizontally, yoffset
// vertically), then compute the matching OBMC variance on the result.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(              \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
                                                 W, wsrc, mask, sse);        \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }
// Instantiate the high-bitdepth OBMC variance and sub-pixel variance
// functions for every supported block size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH
#endif  // !CONFIG_REALTIME_ONLY
// Sum of squared errors between a w x h 8-bit block `dst` (stride dstride)
// and a 16-bit block `src` (stride sstride). Returns the raw SSE.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535 and 65535^2 overflows a
      // 32-bit int, which is undefined behavior for signed arithmetic.
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
// Sum of squared errors between two w x h 16-bit blocks `dst` (stride
// dstride) and `src` (stride sstride). Returns the raw SSE.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535 and 65535^2 overflows a
      // 32-bit int, which is undefined behavior for signed arithmetic.
      sum += (int64_t)e * e;
    }
  }
  return sum;
}