/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"

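// Eighth-pel bilinear filter taps, indexed by subpel offset (0..7). Each
// pair of taps sums to 128 (1 << FILTER_BITS), so filtering preserves the
// overall pixel magnitude.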
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 },
  { 112, 16 },
  { 96, 32 },
  { 80, 48 },
  { 64, 64 },
  { 48, 80 },
  { 32, 96 },
  { 16, 112 },
};

uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
                            const uint8_t *b, int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}

uint32_t vpx_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}

uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
                                       b, b_stride, sse);
}

uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
                                       b, b_stride, sse);
}

uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
                                           const uint8_t *b, int b_stride,
                                           uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
                                       b, b_stride, sse);
}

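// Note: offset 4 above selects bilinear_filters[4] = { 64, 64 }, i.e. an
// equal average of two neighboring pixels, which is the half-pixel position.
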
static void variance(const uint8_t *a, int a_stride,
                     const uint8_t *b, int b_stride,
                     int w, int h, uint32_t *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

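// Worked example of the identity used below (variance = sse - sum^2 / n):
// for a 2x2 block with diffs { 1, 3, 1, 3 }, sum = 8 and sse = 20, so
// variance = 20 - (8 * 8) / 4 = 4.
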
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first pass of a 2-D separable filter.
//
// Produces int16_t output to retain precision for the next pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one
// input to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
                                              unsigned int src_pixels_per_line,
                                              int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
                                    (int)a[pixel_step] * filter[1],
                                FILTER_BITS);

      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

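// Example of one output tap (illustrative values): with filter { 112, 16 }
// (xoffset = 1) and inputs a[0] = 100, a[1] = 108, the result is
// ROUND_POWER_OF_TWO(100 * 112 + 108 * 16, FILTER_BITS)
//   = (11200 + 1728 + 64) >> 7 = 101.
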
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one
// input to the next. Output is 8-bit.
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
                                               unsigned int src_pixels_per_line,
                                               unsigned int pixel_step,
                                               unsigned int output_height,
                                               unsigned int output_width,
                                               const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
                                    (int)a[pixel_step] * filter[1],
                                FILTER_BITS);
      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

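// The two passes compose into a 2-D separable filter (see the SUBPIX_VAR
// macros below): the first pass filters H + 1 rows horizontally so that the
// second pass, stepping by the row stride (pixel_step = W), has the extra
// row its vertical taps require.
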
#define VAR(W, H) \
uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                   const uint8_t *b, int b_stride, \
                                   uint32_t *sse) { \
  int sum; \
  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
}

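/* The (int64_t) cast above matters: for a 64x64 block, |sum| can reach
 * 255 * 4096 = 1044480, so sum * sum (~1.09e12) would overflow 32-bit
 * arithmetic. */
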
#define SUBPIX_VAR(W, H) \
uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                             int xoffset, int yoffset, \
                                             const uint8_t *b, int b_stride, \
                                             uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint8_t temp2[H * W]; \
\
  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                    bilinear_filters[xoffset]); \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     bilinear_filters[yoffset]); \
\
  return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}

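/* Illustrative usage (src/ref and their strides are hypothetical caller
 * buffers, not part of this file): the variance of a 16x16 block against a
 * reference sampled at a half-pel offset in both directions uses
 * xoffset = yoffset = 4:
 *
 *   uint32_t sse;
 *   uint32_t var = vpx_sub_pixel_variance16x16_c(src, src_stride, 4, 4,
 *                                                ref, ref_stride, &sse);
 */
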
#define SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
                                                 int a_stride, \
                                                 int xoffset, int yoffset, \
                                                 const uint8_t *b, \
                                                 int b_stride, \
                                                 uint32_t *sse, \
                                                 const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint8_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                    bilinear_filters[xoffset]); \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     bilinear_filters[yoffset]); \
\
  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
  return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
}

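// The _avg_ variant differs only in blending the filtered block with
// second_pred via vpx_comp_avg_pred() (C version defined below) before
// measuring variance, mirroring the encoder's averaged (compound)
// prediction path.
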
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / w*h
 */
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
                             const uint8_t *b, int b_stride, \
                             uint32_t *sse, int *sum) { \
  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
 * variable.
 */
#define MSE(W, H) \
uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                              const uint8_t *b, int b_stride, \
                              uint32_t *sse) { \
  int sum; \
  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse; \
}

/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
    VAR(W, H) \
    SUBPIX_VAR(W, H) \
    SUBPIX_AVG_VAR(W, H)

VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)

void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                         int width, int height,
                         const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }

    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

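// Rounding example for the averaging above: pred[j] = 10, ref[j] = 13 gives
// ROUND_POWER_OF_TWO(23, 1) = (23 + 1) >> 1 = 12, i.e. the average with .5
// rounded up.
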
#if CONFIG_VP9_HIGHBITDEPTH
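/* In high-bitdepth builds, 16-bit pixel buffers travel through uint8_t
 * pointers: CONVERT_TO_SHORTPTR() recovers the uint16_t view, and
 * CONVERT_TO_BYTEPTR() re-wraps one for APIs that expect uint8_t *. */
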
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride,
                              int w, int h, uint64_t *sse, uint64_t *sum) {
  int i, j;

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride,
                              int w, int h, uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}

static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}

static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}

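// The shifts normalize the accumulated values back toward an 8-bit scale:
// 10-bit diffs are 4x larger than 8-bit ones, so sse grows 16x (>> 4) and
// sum 4x (>> 2); 12-bit diffs are 16x larger, giving >> 8 and >> 4. This
// also keeps the results within the 32-bit output variables.
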
#define HIGHBD_VAR(W, H) \
uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
                                            int a_stride, \
                                            const uint8_t *b, \
                                            int b_stride, \
                                            uint32_t *sse) { \
  int sum; \
  highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
                                             int a_stride, \
                                             const uint8_t *b, \
                                             int b_stride, \
                                             uint32_t *sse) { \
  int sum; \
  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
                                             int a_stride, \
                                             const uint8_t *b, \
                                             int b_stride, \
                                             uint32_t *sse) { \
  int sum; \
  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
}

#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                      const uint8_t *ref, int ref_stride, \
                                      uint32_t *sse, int *sum) { \
  highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}

#define HIGHBD_MSE(W, H) \
uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
                                       int src_stride, \
                                       const uint8_t *ref, \
                                       int ref_stride, \
                                       uint32_t *sse) { \
  int sum; \
  highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
} \
\
uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
                                        int src_stride, \
                                        const uint8_t *ref, \
                                        int ref_stride, \
                                        uint32_t *sse) { \
  int sum; \
  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
} \
\
uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
                                        int src_stride, \
                                        const uint8_t *ref, \
                                        int ref_stride, \
                                        uint32_t *sse) { \
  int sum; \
  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
}

static void highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8,
    uint16_t *output_ptr,
    unsigned int src_pixels_per_line,
    int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] =
          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
                                 (int)src_ptr[pixel_step] * filter[1],
                             FILTER_BITS);

      ++src_ptr;
    }

    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

static void highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr,
    uint16_t *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] =
          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
                                 (int)src_ptr[pixel_step] * filter[1],
                             FILTER_BITS);
      ++src_ptr;
    }

    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
                                            dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                             W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                             W, dst, dst_stride, sse); \
}

#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse, \
    const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                            dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse, \
    const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                             W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
    const uint8_t *src, int src_stride, \
    int xoffset, int yoffset, \
    const uint8_t *dst, int dst_stride, \
    uint32_t *sse, \
    const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint16_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
                                           W, bilinear_filters[xoffset]); \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            bilinear_filters[yoffset]); \
\
  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                             W, dst, dst_stride, sse); \
}

/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
    HIGHBD_VAR(W, H) \
    HIGHBD_SUBPIX_VAR(W, H) \
    HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)

void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
                              int width, int height, const uint8_t *ref8,
                              int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH