/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"
/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
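
// Illustrative scalar sketch (not used by the library; the helper name is
// made up for this note): FILTER_SRC applies one 32-byte row of
// bilinear_filters_avx2, whose two taps always sum to 16, to each
// interleaved pair of neighbouring pixels, adds 8 for rounding, and shifts
// right by 4. Per output pixel that is roughly:
static inline int bilinear_tap_sketch(int pixel_a, int pixel_b, int tap_a,
                                      int tap_b) {
  // tap_a + tap_b == 16, e.g. {16, 0}, {14, 2}, ..., {2, 14} from the table.
  return (pixel_a * tap_a + pixel_b * tap_b + 8) >> 4;
}
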
#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final calculation to sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
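
// Reference sketch of what the kernels below accumulate (illustrative only;
// the function name is made up and nothing in the library calls it): for
// every pixel of the block, the difference between the (optionally
// bilinearly filtered) source and the destination contributes to a signed
// sum and to a sum of squares. CALC_SUM_AND_SSE then reduces the vector
// accumulators to the scalar `sum` and `*sse`.
static inline void sum_sse_reference_sketch(const uint8_t *src, int src_stride,
                                            const uint8_t *dst, int dst_stride,
                                            int width, int height,
                                            unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      const int diff = src[i * src_stride + j] - dst[i * dst_stride + j];
      *sum += diff;
      *sse += diff * diff;
    }
  }
}
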
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source row and the row one byte to the right,
      // then average them
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current row average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source row and the row one byte to the right,
      // then average them
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current row average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source row and the row one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous and current packed rows
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}
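
// Caller sketch (illustrative only; the helper name is made up and the real
// per-size wrappers elsewhere in aom_dsp may round differently): a full
// sub-pixel variance combines the returned signed sum with the SSE written
// through `sse` as variance = SSE - sum^2 / num_pixels.
static inline unsigned int variance_from_sum_sse_sketch(unsigned int sse,
                                                        long long sum,
                                                        int num_pixels) {
  return sse - (unsigned int)((sum * sum) / num_pixels);
}
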
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source row and the row one byte to the right,
      // then average them
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current row average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source row and the row one byte to the right,
      // then average them
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current row average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source row and the row one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous and current packed rows
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}
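
// Comp-avg sketch (illustrative only; the helper name is made up): the
// function above differs from aom_sub_pixel_variance32xh_avx2 only in that
// each filtered source pixel is first averaged, with rounding, against the
// second predictor `sec` before the difference with `dst` enters the
// sum/SSE accumulation. Per pixel that is roughly:
static inline int comp_avg_diff_sketch(int filtered_src, int sec_pixel,
                                       int dst_pixel) {
  const int pred = (filtered_src + sec_pixel + 1) >> 1;  // _mm256_avg_epu8
  return pred - dst_pixel;
}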