/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
12 #include <immintrin.h> // AVX2
14 #include "config/aom_dsp_rtcd.h"
16 #include "aom_ports/mem.h"
18 /* clang-format off */
19 DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2
[512]) = {
20 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
21 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
22 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
23 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
24 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
25 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
26 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
27 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
28 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
29 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
30 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
31 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
32 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
33 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
34 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
35 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
// Bilinearly filter the 16-bit expanded source lanes:
//   result = (p0 * coeff0 + p1 * coeff1 + 8) >> 4  per element.
// NOTE(review): relies on exp_src_lo / exp_src_hi (set by MERGE_*)
// and pw8 == _mm256_set1_epi16(8) being in scope at the use site.
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
// Interleave the bytes of src_reg with reg into the 16-bit staging
// registers exp_src_lo / exp_src_hi.  With reg == zero_reg this is a
// plain zero-extension of each byte to 16 bits; with a second pixel
// register it forms the (p0, p1) byte pairs consumed by FILTER_SRC.
#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
// Load one 32-byte row from the (unaligned) src and dst pointers into
// src_reg and dst_reg.  The pointers themselves are not advanced here.
#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
</dst_update>
// Half-pel interpolation: average src_reg with the row size_stride
// bytes further on (1 for horizontal, src_stride for vertical).
// Writes the scratch register src_next_reg, which must be declared
// at the use site.
#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
// Load the neighboring row/column (size_stride bytes ahead) and
// interleave it with src_reg into exp_src_lo / exp_src_hi, forming
// the (p0, p1) pairs for a subsequent FILTER_SRC.  Uses the scratch
// register src_next_reg declared at the use site.
#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
// Accumulate per-row difference statistics for one 32-pixel row:
//   sum_reg += (src - dst)      (16-bit lanes)
//   sse_reg += (src - dst)^2    (32-bit lanes, via madd)
// Expects exp_src_lo / exp_src_hi to hold the (interpolated) source,
// and dst_reg / zero_reg / sum_reg / sse_reg to be in scope.
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* accumulate sum; square the differences for the sse */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* accumulate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final calculation to sum and sse
// Horizontally reduce sum_reg (signed 16-bit lanes) and sse_reg
// (32-bit lanes) into the scalar `sum` and the output `*sse`.
// The cmpgt mask supplies the sign bits so that unpacking widens the
// 16-bit sums to 32 bits with correct sign extension; the final adds
// combine the low and high 128-bit lanes of the 256-bit registers.
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  /* sign-extend 16-bit sums to 32 bits by interleaving with the mask */ \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  /* fold within each 128-bit lane */ \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  /* add the low and high lanes for the final scalar results */ \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
107 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src
, int src_stride
,
108 int x_offset
, int y_offset
,
109 const uint8_t *dst
, int dst_stride
,
110 int height
, unsigned int *sse
) {
111 __m256i src_reg
, dst_reg
, exp_src_lo
, exp_src_hi
, exp_dst_lo
, exp_dst_hi
;
112 __m256i sse_reg
, sum_reg
, sse_reg_hi
, res_cmp
, sum_reg_lo
, sum_reg_hi
;
115 sum_reg
= _mm256_set1_epi16(0);
116 sse_reg
= _mm256_set1_epi16(0);
117 zero_reg
= _mm256_set1_epi16(0);
119 // x_offset = 0 and y_offset = 0
122 for (i
= 0; i
< height
; i
++) {
124 // expend each byte to 2 bytes
125 MERGE_WITH_SRC(src_reg
, zero_reg
)
126 CALC_SUM_SSE_INSIDE_LOOP
130 // x_offset = 0 and y_offset = 8
131 } else if (y_offset
== 8) {
132 __m256i src_next_reg
;
133 for (i
= 0; i
< height
; i
++) {
135 AVG_NEXT_SRC(src_reg
, src_stride
)
136 // expend each byte to 2 bytes
137 MERGE_WITH_SRC(src_reg
, zero_reg
)
138 CALC_SUM_SSE_INSIDE_LOOP
142 // x_offset = 0 and y_offset = bilin interpolation
144 __m256i filter
, pw8
, src_next_reg
;
147 filter
= _mm256_load_si256(
148 (__m256i
const *)(bilinear_filters_avx2
+ y_offset
));
149 pw8
= _mm256_set1_epi16(8);
150 for (i
= 0; i
< height
; i
++) {
152 MERGE_NEXT_SRC(src_reg
, src_stride
)
154 CALC_SUM_SSE_INSIDE_LOOP
159 // x_offset = 8 and y_offset = 0
160 } else if (x_offset
== 8) {
162 __m256i src_next_reg
;
163 for (i
= 0; i
< height
; i
++) {
165 AVG_NEXT_SRC(src_reg
, 1)
166 // expand each byte to 2 bytes
167 MERGE_WITH_SRC(src_reg
, zero_reg
)
168 CALC_SUM_SSE_INSIDE_LOOP
172 // x_offset = 8 and y_offset = 8
173 } else if (y_offset
== 8) {
174 __m256i src_next_reg
, src_avg
;
175 // load source and another source starting from the next
177 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
178 AVG_NEXT_SRC(src_reg
, 1)
179 for (i
= 0; i
< height
; i
++) {
183 AVG_NEXT_SRC(src_reg
, 1)
184 // average between previous average to current average
185 src_avg
= _mm256_avg_epu8(src_avg
, src_reg
);
186 // expand each byte to 2 bytes
187 MERGE_WITH_SRC(src_avg
, zero_reg
)
188 // save current source average
189 CALC_SUM_SSE_INSIDE_LOOP
192 // x_offset = 8 and y_offset = bilin interpolation
194 __m256i filter
, pw8
, src_next_reg
, src_avg
;
196 filter
= _mm256_load_si256(
197 (__m256i
const *)(bilinear_filters_avx2
+ y_offset
));
198 pw8
= _mm256_set1_epi16(8);
199 // load source and another source starting from the next
201 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
202 AVG_NEXT_SRC(src_reg
, 1)
203 for (i
= 0; i
< height
; i
++) {
204 // save current source average
208 AVG_NEXT_SRC(src_reg
, 1)
209 MERGE_WITH_SRC(src_avg
, src_reg
)
211 CALC_SUM_SSE_INSIDE_LOOP
215 // x_offset = bilin interpolation and y_offset = 0
218 __m256i filter
, pw8
, src_next_reg
;
220 filter
= _mm256_load_si256(
221 (__m256i
const *)(bilinear_filters_avx2
+ x_offset
));
222 pw8
= _mm256_set1_epi16(8);
223 for (i
= 0; i
< height
; i
++) {
225 MERGE_NEXT_SRC(src_reg
, 1)
227 CALC_SUM_SSE_INSIDE_LOOP
231 // x_offset = bilin interpolation and y_offset = 8
232 } else if (y_offset
== 8) {
233 __m256i filter
, pw8
, src_next_reg
, src_pack
;
235 filter
= _mm256_load_si256(
236 (__m256i
const *)(bilinear_filters_avx2
+ x_offset
));
237 pw8
= _mm256_set1_epi16(8);
238 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
239 MERGE_NEXT_SRC(src_reg
, 1)
241 // convert each 16 bit to 8 bit to each low and high lane source
242 src_pack
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
243 for (i
= 0; i
< height
; i
++) {
246 MERGE_NEXT_SRC(src_reg
, 1)
248 src_reg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
249 // average between previous pack to the current
250 src_pack
= _mm256_avg_epu8(src_pack
, src_reg
);
251 MERGE_WITH_SRC(src_pack
, zero_reg
)
252 CALC_SUM_SSE_INSIDE_LOOP
256 // x_offset = bilin interpolation and y_offset = bilin interpolation
258 __m256i xfilter
, yfilter
, pw8
, src_next_reg
, src_pack
;
260 xfilter
= _mm256_load_si256(
261 (__m256i
const *)(bilinear_filters_avx2
+ x_offset
));
263 yfilter
= _mm256_load_si256(
264 (__m256i
const *)(bilinear_filters_avx2
+ y_offset
));
265 pw8
= _mm256_set1_epi16(8);
266 // load source and another source starting from the next
268 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
269 MERGE_NEXT_SRC(src_reg
, 1)
272 // convert each 16 bit to 8 bit to each low and high lane source
273 src_pack
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
274 for (i
= 0; i
< height
; i
++) {
277 MERGE_NEXT_SRC(src_reg
, 1)
279 src_reg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
280 // merge previous pack to current pack source
281 MERGE_WITH_SRC(src_pack
, src_reg
)
285 CALC_SUM_SSE_INSIDE_LOOP
295 unsigned int aom_sub_pixel_avg_variance32xh_avx2(
296 const uint8_t *src
, int src_stride
, int x_offset
, int y_offset
,
297 const uint8_t *dst
, int dst_stride
, const uint8_t *sec
, int sec_stride
,
298 int height
, unsigned int *sse
) {
300 __m256i src_reg
, dst_reg
, exp_src_lo
, exp_src_hi
, exp_dst_lo
, exp_dst_hi
;
301 __m256i sse_reg
, sum_reg
, sse_reg_hi
, res_cmp
, sum_reg_lo
, sum_reg_hi
;
304 sum_reg
= _mm256_set1_epi16(0);
305 sse_reg
= _mm256_set1_epi16(0);
306 zero_reg
= _mm256_set1_epi16(0);
308 // x_offset = 0 and y_offset = 0
311 for (i
= 0; i
< height
; i
++) {
313 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
314 src_reg
= _mm256_avg_epu8(src_reg
, sec_reg
);
316 // expend each byte to 2 bytes
317 MERGE_WITH_SRC(src_reg
, zero_reg
)
318 CALC_SUM_SSE_INSIDE_LOOP
322 } else if (y_offset
== 8) {
323 __m256i src_next_reg
;
324 for (i
= 0; i
< height
; i
++) {
326 AVG_NEXT_SRC(src_reg
, src_stride
)
327 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
328 src_reg
= _mm256_avg_epu8(src_reg
, sec_reg
);
330 // expend each byte to 2 bytes
331 MERGE_WITH_SRC(src_reg
, zero_reg
)
332 CALC_SUM_SSE_INSIDE_LOOP
336 // x_offset = 0 and y_offset = bilin interpolation
338 __m256i filter
, pw8
, src_next_reg
;
341 filter
= _mm256_load_si256(
342 (__m256i
const *)(bilinear_filters_avx2
+ y_offset
));
343 pw8
= _mm256_set1_epi16(8);
344 for (i
= 0; i
< height
; i
++) {
346 MERGE_NEXT_SRC(src_reg
, src_stride
)
348 src_reg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
349 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
350 src_reg
= _mm256_avg_epu8(src_reg
, sec_reg
);
352 MERGE_WITH_SRC(src_reg
, zero_reg
)
353 CALC_SUM_SSE_INSIDE_LOOP
358 // x_offset = 8 and y_offset = 0
359 } else if (x_offset
== 8) {
361 __m256i src_next_reg
;
362 for (i
= 0; i
< height
; i
++) {
364 AVG_NEXT_SRC(src_reg
, 1)
365 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
366 src_reg
= _mm256_avg_epu8(src_reg
, sec_reg
);
368 // expand each byte to 2 bytes
369 MERGE_WITH_SRC(src_reg
, zero_reg
)
370 CALC_SUM_SSE_INSIDE_LOOP
374 // x_offset = 8 and y_offset = 8
375 } else if (y_offset
== 8) {
376 __m256i src_next_reg
, src_avg
;
377 // load source and another source starting from the next
379 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
380 AVG_NEXT_SRC(src_reg
, 1)
381 for (i
= 0; i
< height
; i
++) {
382 // save current source average
386 AVG_NEXT_SRC(src_reg
, 1)
387 // average between previous average to current average
388 src_avg
= _mm256_avg_epu8(src_avg
, src_reg
);
389 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
390 src_avg
= _mm256_avg_epu8(src_avg
, sec_reg
);
392 // expand each byte to 2 bytes
393 MERGE_WITH_SRC(src_avg
, zero_reg
)
394 CALC_SUM_SSE_INSIDE_LOOP
397 // x_offset = 8 and y_offset = bilin interpolation
399 __m256i filter
, pw8
, src_next_reg
, src_avg
;
401 filter
= _mm256_load_si256(
402 (__m256i
const *)(bilinear_filters_avx2
+ y_offset
));
403 pw8
= _mm256_set1_epi16(8);
404 // load source and another source starting from the next
406 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
407 AVG_NEXT_SRC(src_reg
, 1)
408 for (i
= 0; i
< height
; i
++) {
409 // save current source average
413 AVG_NEXT_SRC(src_reg
, 1)
414 MERGE_WITH_SRC(src_avg
, src_reg
)
416 src_avg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
417 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
418 src_avg
= _mm256_avg_epu8(src_avg
, sec_reg
);
419 // expand each byte to 2 bytes
420 MERGE_WITH_SRC(src_avg
, zero_reg
)
422 CALC_SUM_SSE_INSIDE_LOOP
426 // x_offset = bilin interpolation and y_offset = 0
429 __m256i filter
, pw8
, src_next_reg
;
431 filter
= _mm256_load_si256(
432 (__m256i
const *)(bilinear_filters_avx2
+ x_offset
));
433 pw8
= _mm256_set1_epi16(8);
434 for (i
= 0; i
< height
; i
++) {
436 MERGE_NEXT_SRC(src_reg
, 1)
438 src_reg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
439 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
440 src_reg
= _mm256_avg_epu8(src_reg
, sec_reg
);
441 MERGE_WITH_SRC(src_reg
, zero_reg
)
443 CALC_SUM_SSE_INSIDE_LOOP
447 // x_offset = bilin interpolation and y_offset = 8
448 } else if (y_offset
== 8) {
449 __m256i filter
, pw8
, src_next_reg
, src_pack
;
451 filter
= _mm256_load_si256(
452 (__m256i
const *)(bilinear_filters_avx2
+ x_offset
));
453 pw8
= _mm256_set1_epi16(8);
454 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
455 MERGE_NEXT_SRC(src_reg
, 1)
457 // convert each 16 bit to 8 bit to each low and high lane source
458 src_pack
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
459 for (i
= 0; i
< height
; i
++) {
462 MERGE_NEXT_SRC(src_reg
, 1)
464 src_reg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
465 // average between previous pack to the current
466 src_pack
= _mm256_avg_epu8(src_pack
, src_reg
);
467 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
468 src_pack
= _mm256_avg_epu8(src_pack
, sec_reg
);
470 MERGE_WITH_SRC(src_pack
, zero_reg
)
472 CALC_SUM_SSE_INSIDE_LOOP
475 // x_offset = bilin interpolation and y_offset = bilin interpolation
477 __m256i xfilter
, yfilter
, pw8
, src_next_reg
, src_pack
;
479 xfilter
= _mm256_load_si256(
480 (__m256i
const *)(bilinear_filters_avx2
+ x_offset
));
482 yfilter
= _mm256_load_si256(
483 (__m256i
const *)(bilinear_filters_avx2
+ y_offset
));
484 pw8
= _mm256_set1_epi16(8);
485 // load source and another source starting from the next
487 src_reg
= _mm256_loadu_si256((__m256i
const *)(src
));
488 MERGE_NEXT_SRC(src_reg
, 1)
491 // convert each 16 bit to 8 bit to each low and high lane source
492 src_pack
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
493 for (i
= 0; i
< height
; i
++) {
496 MERGE_NEXT_SRC(src_reg
, 1)
498 src_reg
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
499 // merge previous pack to current pack source
500 MERGE_WITH_SRC(src_pack
, src_reg
)
503 src_pack
= _mm256_packus_epi16(exp_src_lo
, exp_src_hi
);
504 sec_reg
= _mm256_loadu_si256((__m256i
const *)(sec
));
505 src_pack
= _mm256_avg_epu8(src_pack
, sec_reg
);
506 MERGE_WITH_SRC(src_pack
, zero_reg
)
509 CALC_SUM_SSE_INSIDE_LOOP