/*
 * aom_dsp/variance.c (from aom.git) -- C reference implementations of block
 * variance, MSE, and sub-pixel variance kernels.
 */
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/variance.h"

#include "av1/common/filter.h"
#include "av1/common/reconinter.h"
// Returns the sum of squared differences (SSE) between a 4x4 block in |a|
// and the co-located 4x4 block in |b|. Max value (16 * 255^2) fits in int.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}
// Returns the sum of squares of 256 int16_t values (one 16x16 macroblock's
// worth of residual data).
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}
// Accumulates, over a WxH block, the sum of pixel differences (|sum|) and the
// sum of squared differences (|sse|) between |a| and |b|.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}
// SSE between two blocks of arbitrary (odd) dimensions, for sizes that do not
// have a dedicated aom_mseWxH/aom_varianceWxH kernel. The sum is discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse;
  int sum;
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first-pass of 2-D separable filter.
//
// Produces int16_t output to retain precision for the next pass. Two filter
// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
// It defines the offset required to move from one input to the next.
91 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
92 unsigned int src_pixels_per_line,
93 unsigned int pixel_step,
94 unsigned int output_height,
95 unsigned int output_width,
96 const uint8_t *filter) {
97 unsigned int i, j;
99 for (i = 0; i < output_height; ++i) {
100 for (j = 0; j < output_width; ++j) {
101 b[j] = ROUND_POWER_OF_TWO(
102 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
104 ++a;
107 a += src_pixels_per_line - output_width;
108 b += output_width;
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second-pass of 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one input
// to the next. Output is 8-bit.
121 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
122 unsigned int src_pixels_per_line,
123 unsigned int pixel_step,
124 unsigned int output_height,
125 unsigned int output_width,
126 const uint8_t *filter) {
127 unsigned int i, j;
129 for (i = 0; i < output_height; ++i) {
130 for (j = 0; j < output_width; ++j) {
131 b[j] = ROUND_POWER_OF_TWO(
132 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
133 ++a;
136 a += src_pixels_per_line - output_width;
137 b += output_width;
// Defines aom_varianceWxH_c(): SSE minus the squared-mean correction term,
// i.e. the block variance scaled by the pixel count. |sse| is also returned
// by reference.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
// Defines aom_sub_pixel_varianceWxH_c(): bilinear-filters the source at the
// given (xoffset, yoffset) sub-pel position, then computes the variance of
// the result against |b|.
#define SUBPIX_VAR(W, H)                                                      \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
  }
// Defines the compound-prediction sub-pel variance kernels: the sub-pel
// filtered source is averaged with |second_pred| (plain average, or
// distance-weighted via |jcp_param|) before the variance is computed.
#define SUBPIX_AVG_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
                                                                              \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
  }                                                                           \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W,            \
                               jcp_param);                                    \
                                                                              \
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
  }
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / w*h
 */
// Defines aom_getWxHvar_c(): exposes both |sse| and |sum| by reference
// without combining them into a variance.
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }
/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
 * variable.
 */
// Defines aom_mseWxH_c(): returns the raw SSE (the |sum| correction is
// intentionally discarded).
#define MSE(W, H)                                                \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                const uint8_t *b, int b_stride,  \
                                uint32_t *sse) {                 \
    int sum;                                                     \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);         \
    return *sse;                                                 \
  }
225 /* All three forms of the variance are available in the same sizes. */
// Instantiates all three variance forms (plain, sub-pel, compound sub-pel)
// for one block size.
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)
231 VARIANCES(128, 128)
232 VARIANCES(128, 64)
233 VARIANCES(64, 128)
234 VARIANCES(64, 64)
235 VARIANCES(64, 32)
236 VARIANCES(32, 64)
237 VARIANCES(32, 32)
238 VARIANCES(32, 16)
239 VARIANCES(16, 32)
240 VARIANCES(16, 16)
241 VARIANCES(16, 8)
242 VARIANCES(8, 16)
243 VARIANCES(8, 8)
244 VARIANCES(8, 4)
245 VARIANCES(4, 8)
246 VARIANCES(4, 4)
247 VARIANCES(4, 2)
248 VARIANCES(2, 4)
249 VARIANCES(2, 2)
251 // Realtime mode doesn't use rectangular blocks.
252 #if !CONFIG_REALTIME_ONLY
253 VARIANCES(4, 16)
254 VARIANCES(16, 4)
255 VARIANCES(8, 32)
256 VARIANCES(32, 8)
257 VARIANCES(16, 64)
258 VARIANCES(64, 16)
259 #endif
261 GET_VAR(16, 16)
262 GET_VAR(8, 8)
264 MSE(16, 16)
265 MSE(16, 8)
266 MSE(8, 16)
267 MSE(8, 8)
// Writes the rounded average of |pred| and |ref| into |comp_pred|.
// |comp_pred| and |pred| are tightly packed (stride == width); |ref| uses
// |ref_stride|.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
284 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
285 int width, int height, const uint8_t *ref,
286 int ref_stride,
287 const DIST_WTD_COMP_PARAMS *jcp_param) {
288 int i, j;
289 const int fwd_offset = jcp_param->fwd_offset;
290 const int bck_offset = jcp_param->bck_offset;
292 for (i = 0; i < height; ++i) {
293 for (j = 0; j < width; ++j) {
294 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
295 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
296 comp_pred[j] = (uint8_t)tmp;
298 comp_pred += width;
299 pred += width;
300 ref += ref_stride;
304 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth sum/SSE accumulator. |a8|/|b8| are CONVERT_TO_BYTEPTR-wrapped
// 16-bit buffers. Per-row sums are kept in 32-bit |lsum| and folded into the
// 64-bit total once per row to avoid a 64-bit add per pixel.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    int32_t lsum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      tsse += (uint32_t)(diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
// High-bitdepth SSE for arbitrary block dimensions; the sum is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse;
  int64_t sum;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}
// 8-bit-depth wrapper: no normalization needed, just narrow to 32/int.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}
// 10-bit-depth wrapper: normalizes back to the 8-bit domain (sse >> 4 since
// sse scales with the square of the 2 extra bits, sum >> 2 linearly).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
// 12-bit-depth wrapper: normalizes back to the 8-bit domain (sse >> 8,
// sum >> 4 for the 4 extra bits).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
// Defines the 8/10/12-bit variance kernels for one block size. The 10/12-bit
// variants clamp a (rounding-induced) negative result to 0.
#define HIGHBD_VAR(W, H)                                                      \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) {                \
    int sum;                                                                  \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
      uint32_t *sse) {                                                        \
    int sum;                                                                  \
    int64_t var;                                                              \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);            \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                 \
    return (var >= 0) ? (uint32_t)var : 0;                                    \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,         \
      uint32_t *sse) {                                                        \
    int sum;                                                                  \
    int64_t var;                                                              \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);            \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                 \
    return (var >= 0) ? (uint32_t)var : 0;                                    \
  }
// Defines the 8/10/12-bit getSxSvar kernels (square blocks only): returns
// |sse| and |sum| by reference without combining them.
#define HIGHBD_GET_VAR(S)                                                 \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        uint32_t *sse, int *sum) {          \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                         \
                                                                            \
  void aom_highbd_10_get##S##x##S##var_c(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride, uint32_t *sse, int *sum) {                            \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);   \
  }                                                                         \
                                                                            \
  void aom_highbd_12_get##S##x##S##var_c(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride, uint32_t *sse, int *sum) {                            \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);   \
  }
// Defines the 8/10/12-bit MSE kernels: raw (bit-depth-normalized) SSE, with
// the sum discarded.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
438 void aom_highbd_var_filter_block2d_bil_first_pass(
439 const uint8_t *src_ptr8, uint16_t *output_ptr,
440 unsigned int src_pixels_per_line, int pixel_step,
441 unsigned int output_height, unsigned int output_width,
442 const uint8_t *filter) {
443 unsigned int i, j;
444 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
445 for (i = 0; i < output_height; ++i) {
446 for (j = 0; j < output_width; ++j) {
447 output_ptr[j] = ROUND_POWER_OF_TWO(
448 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
449 FILTER_BITS);
451 ++src_ptr;
454 // Next row...
455 src_ptr += src_pixels_per_line - output_width;
456 output_ptr += output_width;
460 void aom_highbd_var_filter_block2d_bil_second_pass(
461 const uint16_t *src_ptr, uint16_t *output_ptr,
462 unsigned int src_pixels_per_line, unsigned int pixel_step,
463 unsigned int output_height, unsigned int output_width,
464 const uint8_t *filter) {
465 unsigned int i, j;
467 for (i = 0; i < output_height; ++i) {
468 for (j = 0; j < output_width; ++j) {
469 output_ptr[j] = ROUND_POWER_OF_TWO(
470 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
471 FILTER_BITS);
472 ++src_ptr;
475 src_ptr += src_pixels_per_line - output_width;
476 output_ptr += output_width;
// Defines the 8/10/12-bit sub-pel variance kernels: bilinear-filter the
// source at (xoffset, yoffset), then take the bit-depth-normalized variance.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
// Defines the 8/10/12-bit compound sub-pel variance kernels: the filtered
// source is averaged with |second_pred| (plain average, or distance-weighted
// via |jcp_param|) before the variance is computed.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
                                          dst_stride, sse);                   \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }
650 /* All three forms of the variance are available in the same sizes. */
// Instantiates all three high-bitdepth variance forms for one block size.
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
656 HIGHBD_VARIANCES(128, 128)
657 HIGHBD_VARIANCES(128, 64)
658 HIGHBD_VARIANCES(64, 128)
659 HIGHBD_VARIANCES(64, 64)
660 HIGHBD_VARIANCES(64, 32)
661 HIGHBD_VARIANCES(32, 64)
662 HIGHBD_VARIANCES(32, 32)
663 HIGHBD_VARIANCES(32, 16)
664 HIGHBD_VARIANCES(16, 32)
665 HIGHBD_VARIANCES(16, 16)
666 HIGHBD_VARIANCES(16, 8)
667 HIGHBD_VARIANCES(8, 16)
668 HIGHBD_VARIANCES(8, 8)
669 HIGHBD_VARIANCES(8, 4)
670 HIGHBD_VARIANCES(4, 8)
671 HIGHBD_VARIANCES(4, 4)
672 HIGHBD_VARIANCES(4, 2)
673 HIGHBD_VARIANCES(2, 4)
674 HIGHBD_VARIANCES(2, 2)
676 // Realtime mode doesn't use 4x rectangular blocks.
677 #if !CONFIG_REALTIME_ONLY
678 HIGHBD_VARIANCES(4, 16)
679 HIGHBD_VARIANCES(16, 4)
680 HIGHBD_VARIANCES(8, 32)
681 HIGHBD_VARIANCES(32, 8)
682 HIGHBD_VARIANCES(16, 64)
683 HIGHBD_VARIANCES(64, 16)
684 #endif
686 HIGHBD_GET_VAR(8)
687 HIGHBD_GET_VAR(16)
689 HIGHBD_MSE(16, 16)
690 HIGHBD_MSE(16, 8)
691 HIGHBD_MSE(8, 16)
692 HIGHBD_MSE(8, 8)
// High-bitdepth rounded average of |pred8| and |ref8| into |comp_pred8|;
// all three are CONVERT_TO_BYTEPTR-wrapped 16-bit buffers.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
712 void aom_highbd_dist_wtd_comp_avg_pred_c(
713 uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
714 const uint8_t *ref8, int ref_stride,
715 const DIST_WTD_COMP_PARAMS *jcp_param) {
716 int i, j;
717 const int fwd_offset = jcp_param->fwd_offset;
718 const int bck_offset = jcp_param->bck_offset;
719 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
720 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
721 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
723 for (i = 0; i < height; ++i) {
724 for (j = 0; j < width; ++j) {
725 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
726 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
727 comp_pred[j] = (uint16_t)tmp;
729 comp_pred += width;
730 pred += width;
731 ref += ref_stride;
734 #endif // CONFIG_AV1_HIGHBITDEPTH
// Per-pixel 64-weight blend of |pred| and |ref| under |mask|; |invert_mask|
// swaps which operand gets mask[j] vs (64 - mask[j]).
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  int i, j;
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  // |pred| is packed (stride == width); |ref| uses |ref_stride|.
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    }
    comp_pred += width;
    src0 += stride0;
    src1 += stride1;
    mask += mask_stride;
  }
}
// Defines aom_masked_sub_pixel_varianceWxH_c(): sub-pel filter the source,
// mask-blend it with |second_pred|, then take the variance against |ref|.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1,       \
                                            H + 1, W,                         \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
776 MASK_SUBPIX_VAR(4, 4)
777 MASK_SUBPIX_VAR(4, 8)
778 MASK_SUBPIX_VAR(8, 4)
779 MASK_SUBPIX_VAR(8, 8)
780 MASK_SUBPIX_VAR(8, 16)
781 MASK_SUBPIX_VAR(16, 8)
782 MASK_SUBPIX_VAR(16, 16)
783 MASK_SUBPIX_VAR(16, 32)
784 MASK_SUBPIX_VAR(32, 16)
785 MASK_SUBPIX_VAR(32, 32)
786 MASK_SUBPIX_VAR(32, 64)
787 MASK_SUBPIX_VAR(64, 32)
788 MASK_SUBPIX_VAR(64, 64)
789 MASK_SUBPIX_VAR(64, 128)
790 MASK_SUBPIX_VAR(128, 64)
791 MASK_SUBPIX_VAR(128, 128)
793 // Realtime mode doesn't use 4x rectangular blocks.
794 #if !CONFIG_REALTIME_ONLY
795 MASK_SUBPIX_VAR(4, 16)
796 MASK_SUBPIX_VAR(16, 4)
797 MASK_SUBPIX_VAR(8, 32)
798 MASK_SUBPIX_VAR(32, 8)
799 MASK_SUBPIX_VAR(16, 64)
800 MASK_SUBPIX_VAR(64, 16)
801 #endif
803 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth masked blend of |pred8| and |ref8|; |invert_mask| swaps which
// operand receives mask[j] vs (64 - mask[j]).
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      if (!invert_mask)
        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
      else
        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
// Defines the 8/10/12-bit masked sub-pel variance kernels: sub-pel filter the
// source, mask-blend with |second_pred|, then take the normalized variance.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              ref, ref_stride, sse);          \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }
893 HIGHBD_MASK_SUBPIX_VAR(4, 4)
894 HIGHBD_MASK_SUBPIX_VAR(4, 8)
895 HIGHBD_MASK_SUBPIX_VAR(8, 4)
896 HIGHBD_MASK_SUBPIX_VAR(8, 8)
897 HIGHBD_MASK_SUBPIX_VAR(8, 16)
898 HIGHBD_MASK_SUBPIX_VAR(16, 8)
899 HIGHBD_MASK_SUBPIX_VAR(16, 16)
900 HIGHBD_MASK_SUBPIX_VAR(16, 32)
901 HIGHBD_MASK_SUBPIX_VAR(32, 16)
902 HIGHBD_MASK_SUBPIX_VAR(32, 32)
903 HIGHBD_MASK_SUBPIX_VAR(32, 64)
904 HIGHBD_MASK_SUBPIX_VAR(64, 32)
905 HIGHBD_MASK_SUBPIX_VAR(64, 64)
906 HIGHBD_MASK_SUBPIX_VAR(64, 128)
907 HIGHBD_MASK_SUBPIX_VAR(128, 64)
908 HIGHBD_MASK_SUBPIX_VAR(128, 128)
909 #if !CONFIG_REALTIME_ONLY
910 HIGHBD_MASK_SUBPIX_VAR(4, 16)
911 HIGHBD_MASK_SUBPIX_VAR(16, 4)
912 HIGHBD_MASK_SUBPIX_VAR(8, 32)
913 HIGHBD_MASK_SUBPIX_VAR(32, 8)
914 HIGHBD_MASK_SUBPIX_VAR(16, 64)
915 HIGHBD_MASK_SUBPIX_VAR(64, 16)
916 #endif
917 #endif // CONFIG_AV1_HIGHBITDEPTH
919 #if !CONFIG_REALTIME_ONLY
920 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
921 const int32_t *wsrc, const int32_t *mask,
922 int w, int h, unsigned int *sse, int *sum) {
923 int i, j;
925 *sse = 0;
926 *sum = 0;
928 for (i = 0; i < h; i++) {
929 for (j = 0; j < w; j++) {
930 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
931 *sum += diff;
932 *sse += diff * diff;
935 pre += pre_stride;
936 wsrc += w;
937 mask += w;
// Defines aom_obmc_varianceWxH_c(): OBMC SSE minus the squared-mean
// correction term.
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }
// Defines aom_obmc_sub_pixel_varianceWxH_c(): bilinear-filter the prediction
// at (xoffset, yoffset), then take the OBMC variance.
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1,       \
                                            H + 1, W,                         \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
  }
// Instantiate the OBMC variance and sub-pixel variance functions for every
// supported block size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
1026 #if CONFIG_AV1_HIGHBITDEPTH
1027 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1028 const int32_t *wsrc,
1029 const int32_t *mask, int w, int h,
1030 uint64_t *sse, int64_t *sum) {
1031 int i, j;
1032 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1034 *sse = 0;
1035 *sum = 0;
1037 for (i = 0; i < h; i++) {
1038 for (j = 0; j < w; j++) {
1039 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1040 *sum += diff;
1041 *sse += diff * diff;
1044 pre += pre_stride;
1045 wsrc += w;
1046 mask += w;
1050 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1051 const int32_t *wsrc,
1052 const int32_t *mask, int w, int h,
1053 unsigned int *sse, int *sum) {
1054 int64_t sum64;
1055 uint64_t sse64;
1056 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1057 *sum = (int)sum64;
1058 *sse = (unsigned int)sse64;
1061 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1062 const int32_t *wsrc,
1063 const int32_t *mask, int w, int h,
1064 unsigned int *sse, int *sum) {
1065 int64_t sum64;
1066 uint64_t sse64;
1067 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1068 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1069 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1072 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1073 const int32_t *wsrc,
1074 const int32_t *mask, int w, int h,
1075 unsigned int *sse, int *sum) {
1076 int64_t sum64;
1077 uint64_t sse64;
1078 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1079 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1080 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
// Defines the 8-, 10- and 12-bit-depth OBMC variance functions for a WxH
// block: sse - sum^2 / (W * H). The 10/12-bit variants clamp a negative
// result (possible because sse and sum are rounded independently) to 0.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
// Defines the 8-, 10- and 12-bit-depth OBMC sub-pixel variance functions:
// bilinear-filter the 16-bit prediction (xoffset horizontally, yoffset
// vertically), then compute the matching OBMC variance on the result.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(              \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
                                                 W, wsrc, mask, sse);        \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }
// Instantiate the high-bitdepth OBMC variance and sub-pixel variance
// functions for every supported block size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH
#endif  // !CONFIG_REALTIME_ONLY
// Sum of squared errors between a w x h 8-bit block `dst` (stride dstride)
// and a 16-bit block `src` (stride sstride). Returns the raw SSE.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535 and 65535^2 overflows a
      // 32-bit int, which is undefined behavior for signed arithmetic.
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
// Sum of squared errors between two w x h 16-bit blocks `dst` (stride
// dstride) and `src` (stride sstride). Returns the raw SSE.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535 and 65535^2 overflows a
      // 32-bit int, which is undefined behavior for signed arithmetic.
      sum += (int64_t)e * e;
    }
  }
  return sum;
}