/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"

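// Eighth-pel bilinear filter taps, indexed by subpel offset (0..7). Each
// pair of taps sums to 128 (1 << FILTER_BITS), so filtering preserves the
// overall pixel magnitude.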
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 },
  { 112, 16 },
  { 96, 32 },
  { 80, 48 },
  { 64, 64 },
  { 48, 80 },
  { 32, 96 },
  { 16, 112 },
};

uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
                            const uint8_t *b, int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}

uint32_t vpx_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}

uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
                                       b, b_stride, sse);
}

uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
                                       b, b_stride, sse);
}

uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
                                           const uint8_t *b, int b_stride,
                                           uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
                                       b, b_stride, sse);
}

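// Note: offset 4 above selects bilinear_filters[4] = { 64, 64 }, i.e. an
// equal average of two neighboring pixels, which is the half-pixel position.
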
static void variance(const uint8_t *a, int a_stride,
                     const uint8_t *b, int b_stride,
                     int w, int h, uint32_t *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

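// Worked example of the identity used below (variance = sse - sum^2 / n):
// for a 2x2 block with diffs { 1, 3, 1, 3 }, sum = 8 and sse = 20, so
// variance = 20 - (8 * 8) / 4 = 4.
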
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first pass of a 2-D separable filter.
//
// Produces int16_t output to retain precision for the next pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one
// input to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
                                              unsigned int src_pixels_per_line,
                                              int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
                                    (int)a[pixel_step] * filter[1],
                                FILTER_BITS);

      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

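// Example of one output tap (illustrative values): with filter { 112, 16 }
// (xoffset = 1) and inputs a[0] = 100, a[1] = 108, the result is
// ROUND_POWER_OF_TWO(100 * 112 + 108 * 16, FILTER_BITS)
//   = (11200 + 1728 + 64) >> 7 = 101.
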
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one
// input to the next. Output is 8-bit.
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
                                               unsigned int src_pixels_per_line,
                                               unsigned int pixel_step,
                                               unsigned int output_height,
                                               unsigned int output_width,
                                               const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
                                    (int)a[pixel_step] * filter[1],
                                FILTER_BITS);
      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

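// The two passes compose into a 2-D separable filter (see the SUBPIX_VAR
// macros below): the first pass filters H + 1 rows horizontally so that the
// second pass, stepping by the row stride (pixel_step = W), has the extra
// row its vertical taps require.
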
#define VAR(W, H) \
uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                   const uint8_t *b, int b_stride, \
                                   uint32_t *sse) { \
  int sum; \
  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
}

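/* The (int64_t) cast above matters: for a 64x64 block, |sum| can reach
 * 255 * 4096 = 1044480, so sum * sum (~1.09e12) would overflow 32-bit
 * arithmetic. */
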
#define SUBPIX_VAR(W, H) \
uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                             int xoffset, int yoffset, \
                                             const uint8_t *b, int b_stride, \
                                             uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint8_t temp2[H * W]; \
\
  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                    bilinear_filters[xoffset]); \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     bilinear_filters[yoffset]); \
\
  return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}

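/* Illustrative usage (src/ref and their strides are hypothetical caller
 * buffers, not part of this file): the variance of a 16x16 block against a
 * reference sampled at a half-pel offset in both directions uses
 * xoffset = yoffset = 4:
 *
 *   uint32_t sse;
 *   uint32_t var = vpx_sub_pixel_variance16x16_c(src, src_stride, 4, 4,
 *                                                ref, ref_stride, &sse);
 */
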
#define SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
                                                 int a_stride, \
                                                 int xoffset, int yoffset, \
                                                 const uint8_t *b, \
                                                 int b_stride, \
                                                 uint32_t *sse, \
                                                 const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint8_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                    bilinear_filters[xoffset]); \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     bilinear_filters[yoffset]); \
\
  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
  return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
}

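// The _avg_ variant differs only in blending the filtered block with
// second_pred via vpx_comp_avg_pred() (C version defined below) before
// measuring variance, mirroring the encoder's averaged (compound)
// prediction path.
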
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / w*h
 */
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
                             const uint8_t *b, int b_stride, \
                             uint32_t *sse, int *sum) { \
  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
 * variable.
 */
#define MSE(W, H) \
uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                              const uint8_t *b, int b_stride, \
                              uint32_t *sse) { \
  int sum; \
  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse; \
}

/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
    VAR(W, H) \
    SUBPIX_VAR(W, H) \
    SUBPIX_AVG_VAR(W, H)

VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)

void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                         int width, int height,
                         const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }

    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

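// Rounding example for the averaging above: pred[j] = 10, ref[j] = 13 gives
// ROUND_POWER_OF_TWO(23, 1) = (23 + 1) >> 1 = 12, i.e. the average with .5
// rounded up.
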
#if CONFIG_VP9_HIGHBITDEPTH
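/* In high-bitdepth builds, 16-bit pixel buffers travel through uint8_t
 * pointers: CONVERT_TO_SHORTPTR() recovers the uint16_t view, and
 * CONVERT_TO_BYTEPTR() re-wraps one for APIs that expect uint8_t *. */
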
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride,
                              int w, int h, uint64_t *sse, uint64_t *sum) {
  int i, j;

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride,
                              int w, int h, uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}

static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}

static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}

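// The shifts normalize the accumulated values back toward an 8-bit scale:
// 10-bit diffs are 4x larger than 8-bit ones, so sse grows 16x (>> 4) and
// sum 4x (>> 2); 12-bit diffs are 16x larger, giving >> 8 and >> 4. This
// also keeps the results within the 32-bit output variables.
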
#define HIGHBD_VAR(W, H) \
uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
                                            int a_stride, \
                                            const uint8_t *b, \
                                            int b_stride, \
                                            uint32_t *sse) { \
  int sum; \
  highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
                                             int a_stride, \
                                             const uint8_t *b, \
                                             int b_stride, \
                                             uint32_t *sse) { \
  int sum; \
  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
                                             int a_stride, \
                                             const uint8_t *b, \
                                             int b_stride, \
                                             uint32_t *sse) { \
  int sum; \
  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
}

#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                      const uint8_t *ref, int ref_stride, \
                                      uint32_t *sse, int *sum) { \
  highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}

#define HIGHBD_MSE(W, H) \
uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
                                       int src_stride, \
                                       const uint8_t *ref, \
                                       int ref_stride, \
                                       uint32_t *sse) { \
  int sum; \
  highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
} \
\
uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
                                        int src_stride, \
                                        const uint8_t *ref, \
                                        int ref_stride, \
                                        uint32_t *sse) { \
  int sum; \
  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
} \
\
uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
                                        int src_stride, \
                                        const uint8_t *ref, \
                                        int ref_stride, \
                                        uint32_t *sse) { \
  int sum; \
  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
}

static void highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8,
    uint16_t *output_ptr,
    unsigned int src_pixels_per_line,
    int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] =
          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
                                 (int)src_ptr[pixel_step] * filter[1],
                             FILTER_BITS);

      ++src_ptr;
    }

    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

static void highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr,
    uint16_t *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] =
          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
                                 (int)src_ptr[pixel_step] * filter[1],
                             FILTER_BITS);
      ++src_ptr;
    }

    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
                                            dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                             W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                             W, dst, dst_stride, sse); \
}

#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse, \
    const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                            dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse, \
    const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                             W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse, \
    const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                             W, dst, dst_stride, sse); \
}

/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
    HIGHBD_VAR(W, H) \
    HIGHBD_SUBPIX_VAR(W, H) \
    HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)

void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
                              int width, int height, const uint8_t *ref8,
                              int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH