cdef: Add cdef_filter_block_highbd().
[aom.git] / av1 / common / cdef_block_simd.h
blob c2fd5c3057eb12a06ca95f90730126f84a59e8a6

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_

#include "config/av1_rtcd.h"

#include "av1/common/cdef_block.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0 y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
                                    v128 const2) {
  v128 tmp;
  /* Reverse partial B. */
  partialb = v128_shuffle_8(
      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = v128_ziplo_16(partialb, partiala);
  partialb = v128_ziphi_16(partialb, tmp);
  /* Square and add the corresponding x and y values. */
  partiala = v128_madd_s16(partiala, partiala);
  partialb = v128_madd_s16(partialb, partialb);
  /* Multiply by constant. */
  partiala = v128_mullo_s32(partiala, const1);
  partialb = v128_mullo_s32(partialb, const2);
  /* Sum all results. */
  partiala = v128_add_32(partiala, partialb);
  return partiala;
}
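
/* For reference, a scalar model of the terms fold_mul_and_sum() accumulates,
   assuming x[0..7] = x1..x8, y[0..7] = y1..y7 followed by 0, and
   c[0..7] = C1..C8 (const1 followed by const2). Illustrative sketch only,
   not part of the codec:

     static INLINE void fold_mul_and_sum_ref(const int16_t x[8],
                                             const int16_t y[8],
                                             const int32_t c[8],
                                             int32_t out[8]) {
       for (int i = 0; i < 8; i++)
         out[i] = (x[i] * x[i] + y[i] * y[i]) * c[i];
     }

   Each 32-bit lane of the returned v128 holds the sum of two such terms;
   hsum4() below then folds the four lanes into the full per-direction
   total. */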

static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
  v128 t0, t1, t2, t3;
  t0 = v128_ziplo_32(x1, x0);
  t1 = v128_ziplo_32(x3, x2);
  t2 = v128_ziphi_32(x1, x0);
  t3 = v128_ziphi_32(x3, x2);
  x0 = v128_ziplo_64(t1, t0);
  x1 = v128_ziphi_64(t1, t0);
  x2 = v128_ziplo_64(t3, t2);
  x3 = v128_ziphi_64(t3, t2);
  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
}
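
/* hsum4() transposes four vectors of four 32-bit lanes and adds the columns,
   so lane i of the result is the horizontal sum of all four lanes of x_i.
   Scalar equivalent of one output lane (illustrative only):

     int32_t hsum4_lane(const int32_t v[4]) {
       return v[0] + v[1] + v[2] + v[3];
     }
*/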

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  v128 partial6;
  v128 tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = v128_shl_n_byte(lines[0], 14);
  partial4b = v128_shr_n_byte(lines[0], 2);
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  tmp = v128_add_16(lines[0], lines[1]);
  partial5a = v128_shl_n_byte(tmp, 10);
  partial5b = v128_shr_n_byte(tmp, 6);
  partial7a = v128_shl_n_byte(tmp, 4);
  partial7b = v128_shr_n_byte(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  tmp = v128_add_16(lines[2], lines[3]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  tmp = v128_add_16(lines[4], lines[5]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  partial4a = v128_add_16(partial4a, lines[7]);
  tmp = v128_add_16(lines[6], lines[7]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  partial6 = v128_add_16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
                       v128_from_32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial6 = v128_madd_s16(partial6, partial6);
  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  v128_store_unaligned(tmp_cost1, partial4a);
  return partial4a;
}
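
/* The cost of a direction is the sum, over every line along that direction,
   of (partial_sum_of_line)^2 / N, where N is the number of pixels in the
   line. The multipliers handed to fold_mul_and_sum() above are 840 / N
   (840 = lcm(1..8)), so all directions end up scaled by the same factor and
   can be compared directly without divisions. A scalar sketch of the same
   cost for one direction, assuming partial[] and count[] have already been
   accumulated (illustrative only):

     static int32_t direction_cost(const int32_t *partial, const int *count,
                                   int num_lines) {
       int32_t cost = 0;
       for (int k = 0; k < num_lines; k++)
         cost += partial[k] * partial[k] * (840 / count[k]);
       return cost;
     }
*/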

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);

  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);

  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  res[0] = v128_ziphi_64(tr1_7, tr1_6);
}

int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
                             int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  v128 lines[8];
  for (i = 0; i < 8; i++) {
    lines[i] = v128_load_unaligned(&img[i * stride]);
    lines[i] =
        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  }

  /* Compute "mostly vertical" directions. */
  v128 dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  v128 dir03 = compute_directions(lines, cost);

  v128 max = v128_max_s32(dir03, dir47);
  max = v128_max_s32(max, v128_align(max, max, 8));
  max = v128_max_s32(max, v128_align(max, max, 4));
  best_cost = v128_low_u32(max);
  v128 t =
      v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
  best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}
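
/* Typical use: cdef_find_dir() is run per 8x8 block of the padded 16-bit
   source to pick the primary filter direction and a directionality measure.
   A minimal call sketch, assuming block16 points at the block's top-left
   sample with row stride `stride` and coeff_shift is bit_depth - 8
   (illustrative only):

     int32_t var;
     const int dir =
         SIMD_FUNC(cdef_find_dir)(block16, stride, &var, coeff_shift);
     // dir indexes cdef_directions[]; var lets the caller decide how
     // aggressively to apply the primary filter.
*/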

// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                           unsigned int adjdamp) {
  const v256 diff16 = v256_sub_16(a, b);
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
  diff = v128_abs_s8(diff);
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
}
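
/* Scalar model of the constraint applied above (the SIMD version additionally
   packs the 16-bit differences down to 8 bits and uses the xor/add trick to
   restore the sign). Sketch of the per-pixel math, with adjdamp already
   reduced by get_msb(strength) as in the callers below (illustrative only):

     static INLINE int constrain_px(int a, int b, int strength, int adjdamp) {
       const int diff = a - b;
       const int adiff = abs(diff);
       const int clipped =
           AOMMIN(adiff, AOMMAX(0, strength - (adiff >> adjdamp)));
       return diff < 0 ? -clipped : clipped;
     }
*/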

void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                                        const uint16_t *in, int pri_strength,
                                        int sec_strength, int dir,
                                        int pri_damping, int sec_damping,
                                        int coeff_shift) {
  v128 p0, p1, p2, p3;
  v256 sum, row, res, tap;
  v256 max, min;
  v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

  sum = v256_zero();
  row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
  max = min = row;

  if (pri_strength) {
    v256 max_u8;
    // Primary near taps
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
    max_u8 = tap;
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, pri_strength, pri_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, pri_strength, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
                                         v256_from_v128(v128_ziphi_8(p0, p1),
                                                        v128_ziplo_8(p0, p1))));

    // Primary far taps
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, pri_strength, pri_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, pri_strength, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
                                         v256_from_v128(v128_ziphi_8(p0, p1),
                                                        v128_ziplo_8(p0, p1))));
    /* The source is 16 bits, however, we only really care about the lower
       8 bits. The upper 8 bits contain the "large" flag. After the final
       primary max has been calculated, zero out the upper 8 bits. Use this
       to find the "16 bit" max. */
    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
  }

  if (sec_strength) {
    v256 max_u8;
    // Secondary near taps
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
    max_u8 = tap;
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, sec_strength, sec_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, sec_strength, sec_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p2 = constrain(tap, row, sec_strength, sec_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p3 = constrain(tap, row, sec_strength, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    p0 = v128_add_8(p0, p1);
    p2 = v128_add_8(p2, p3);
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
                                         v256_from_v128(v128_ziphi_8(p0, p2),
                                                        v128_ziplo_8(p0, p2))));

    // Secondary far taps
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, sec_strength, sec_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, sec_strength, sec_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p2 = constrain(tap, row, sec_strength, sec_damping);
    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p3 = constrain(tap, row, sec_strength, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    p0 = v128_add_8(p0, p1);
    p2 = v128_add_8(p2, p3);

    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
                                         v256_from_v128(v128_ziphi_8(p0, p2),
                                                        v128_ziplo_8(p0, p2))));
    /* The source is 16 bits, however, we only really care about the lower
       8 bits. The upper 8 bits contain the "large" flag. After the final
       secondary max has been calculated, zero out the upper 8 bits. Use this
       to find the "16 bit" max. */
    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));
  }

  // res = row + ((sum - (sum < 0) + 8) >> 4)
  sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  res = v256_add_16(sum, v256_dup_16(8));
  res = v256_shr_n_s16(res, 4);
  res = v256_add_16(row, res);
  res = v256_min_s16(v256_max_s16(res, min), max);
  res = v256_pack_s16_u8(res, res);

  p0 = v256_low_v128(res);
  u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
  u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
  u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
  u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
}
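
/* The output step above is the CDEF equation: the filtered pixel is the
   source pixel plus the rounded, right-shifted weighted sum of constrained
   tap differences, clamped to the min/max of the source pixel and its taps.
   Scalar sketch of that final step for one pixel, where x, sum, min_tap and
   max_tap are the per-pixel equivalents of row/sum/min/max (illustrative
   only):

     int filtered = x + ((8 + sum - (sum < 0)) >> 4);
     filtered = AOMMAX(min_tap, AOMMIN(max_tap, filtered));

   The "- (sum < 0)" term makes the shift round negative sums toward zero,
   which is what the v256_cmplt_s16() adjustment implements. */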

void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
                                        const uint16_t *in, int pri_strength,
                                        int sec_strength, int dir,
                                        int pri_damping, int sec_damping,
                                        int coeff_shift) {
  int i;
  v128 p0, p1, p2, p3;
  v256 sum, row, res, tap;
  v256 max, min;
  v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  for (i = 0; i < 8; i += 2) {
    v256 max_u8;
    sum = v256_zero();
    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));

    max = min = row;
    // Primary near taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
    max_u8 = tap;
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, pri_strength, pri_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, pri_strength, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
                                         v256_from_v128(v128_ziphi_8(p0, p1),
                                                        v128_ziplo_8(p0, p1))));

    // Primary far taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, pri_strength, pri_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, pri_strength, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
                                         v256_from_v128(v128_ziphi_8(p0, p1),
                                                        v128_ziplo_8(p0, p1))));

    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));

    // Secondary near taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
    max_u8 = tap;
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p2 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p3 = constrain(tap, row, sec_strength, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    p0 = v128_add_8(p0, p1);
    p2 = v128_add_8(p2, p3);
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
                                         v256_from_v128(v128_ziphi_8(p0, p2),
                                                        v128_ziplo_8(p0, p2))));

    // Secondary far taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p2 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p3 = constrain(tap, row, sec_strength, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    p0 = v128_add_8(p0, p1);
    p2 = v128_add_8(p2, p3);
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
                                         v256_from_v128(v128_ziphi_8(p0, p2),
                                                        v128_ziplo_8(p0, p2))));

    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
    res = v256_add_16(sum, v256_dup_16(8));
    res = v256_shr_n_s16(res, 4);
    res = v256_add_16(row, res);
    res = v256_min_s16(v256_max_s16(res, min), max);
    res = v256_pack_s16_u8(res, res);

    p0 = v256_low_v128(res);
    v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
    v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
  }
}

void SIMD_FUNC(cdef_filter_block)(void *dest, int dstride, const uint16_t *in,
                                  int pri_strength, int sec_strength, int dir,
                                  int pri_damping, int sec_damping, int bsize,
                                  int coeff_shift) {
  uint8_t *dst8 = (uint8_t *)dest;
  if (bsize == BLOCK_8X8) {
    SIMD_FUNC(cdef_filter_block_8x8_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else if (bsize == BLOCK_4X8) {
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
     sec_strength, dir, pri_damping, sec_damping, coeff_shift);
  } else if (bsize == BLOCK_8X4) {
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else {
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  }
}
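
/* The dispatcher above is the entry point the CDEF loop uses for 8-bit
   frames: `in` points into the padded 16-bit staging buffer (row stride
   CDEF_BSTRIDE, edge samples set to CDEF_VERY_LARGE) and the result is
   written straight to the 8-bit reconstruction. A minimal call sketch with
   hypothetical surrounding buffers, calling whichever SIMD_FUNC()
   specialization the build selects (illustrative only):

     SIMD_FUNC(cdef_filter_block)
     (dst8, dst_stride, src16, pri_strength, sec_strength, dir, pri_damping,
      sec_damping, BLOCK_8X8, coeff_shift);
*/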

#if CONFIG_AV1_HIGHBITDEPTH
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
                             unsigned int adjdamp) {
  v256 diff = v256_sub_16(a, b);
  const v256 sign = v256_shr_n_s16(diff, 15);
  diff = v256_abs_s16(diff);
  const v256 s =
      v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
  return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
}
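
/* constrain16() is the high-bit-depth counterpart of constrain() above: the
   same sign(d) * min(|d|, max(0, threshold - (|d| >> adjdamp))) formula, but
   computed directly on 16-bit lanes with no pack down to 8 bits, since pixel
   differences no longer fit in a byte beyond 8-bit content. The arithmetic
   shift by 15 gives an all-ones mask for negative differences, and the
   add/xor pair is the usual branchless way to reapply the sign. */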

void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
                                         int coeff_shift) {
  v256 p0, p1, p2, p3, sum, row, res;
  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

  sum = v256_zero();
  row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
  min = max = row;

  // Primary near taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  min = v256_min_s16(v256_min_s16(min, p0), p1);
  p0 = constrain16(p0, row, pri_strength, pri_damping);
  p1 = constrain16(p1, row, pri_strength, pri_damping);

  // sum += pri_taps[0] * (p0 + p1)
  sum = v256_add_16(
      sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));

  // Primary far taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  min = v256_min_s16(v256_min_s16(min, p0), p1);
  p0 = constrain16(p0, row, pri_strength, pri_damping);
  p1 = constrain16(p1, row, pri_strength, pri_damping);

  // sum += pri_taps[1] * (p0 + p1)
  sum = v256_add_16(
      sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));

  // Secondary near taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
  p2 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
  p3 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
  min = v256_min_s16(v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2),
                     p3);
  p0 = constrain16(p0, row, sec_strength, sec_damping);
  p1 = constrain16(p1, row, sec_strength, sec_damping);
  p2 = constrain16(p2, row, sec_strength, sec_damping);
  p3 = constrain16(p3, row, sec_strength, sec_damping);

  // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
                                        v256_add_16(v256_add_16(p0, p1),
                                                    v256_add_16(p2, p3))));

  // Secondary far taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
  p2 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
  p3 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
  min = v256_min_s16(v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2),
                     p3);
  p0 = constrain16(p0, row, sec_strength, sec_damping);
  p1 = constrain16(p1, row, sec_strength, sec_damping);
  p2 = constrain16(p2, row, sec_strength, sec_damping);
  p3 = constrain16(p3, row, sec_strength, sec_damping);

  // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
                                        v256_add_16(v256_add_16(p0, p1),
                                                    v256_add_16(p2, p3))));

  // res = row + ((sum - (sum < 0) + 8) >> 4)
  sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  res = v256_add_16(sum, v256_dup_16(8));
  res = v256_shr_n_s16(res, 4);
  res = v256_add_16(row, res);
  res = v256_min_s16(v256_max_s16(res, min), max);

  v64_store_aligned(&dst[0 * dstride], v128_high_v64(v256_high_v128(res)));
  v64_store_aligned(&dst[1 * dstride], v128_low_v64(v256_high_v128(res)));
  v64_store_aligned(&dst[2 * dstride], v128_high_v64(v256_low_v128(res)));
  v64_store_aligned(&dst[3 * dstride], v128_low_v64(v256_low_v128(res)));
}

void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
                                         int coeff_shift) {
  int i;
  v256 sum, p0, p1, p2, p3, row, res;
  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

  for (i = 0; i < 8; i += 2) {
    sum = v256_zero();
    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));

    min = max = row;
    // Primary near taps
    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    min = v256_min_s16(v256_min_s16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength, pri_damping);
    p1 = constrain16(p1, row, pri_strength, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = v256_add_16(
        sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));

    // Primary far taps
    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    min = v256_min_s16(v256_min_s16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength, pri_damping);
    p1 = constrain16(p1, row, pri_strength, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = v256_add_16(
        sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));

    // Secondary near taps
    p0 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
    p1 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
    p2 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
    p3 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
    min = v256_min_s16(
        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
    p0 = constrain16(p0, row, sec_strength, sec_damping);
    p1 = constrain16(p1, row, sec_strength, sec_damping);
    p2 = constrain16(p2, row, sec_strength, sec_damping);
    p3 = constrain16(p3, row, sec_strength, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
                                          v256_add_16(v256_add_16(p0, p1),
                                                      v256_add_16(p2, p3))));

    // Secondary far taps
    p0 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
    p1 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
    p2 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
    p3 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
    min = v256_min_s16(
        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
    p0 = constrain16(p0, row, sec_strength, sec_damping);
    p1 = constrain16(p1, row, sec_strength, sec_damping);
    p2 = constrain16(p2, row, sec_strength, sec_damping);
    p3 = constrain16(p3, row, sec_strength, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
                                          v256_add_16(v256_add_16(p0, p1),
                                                      v256_add_16(p2, p3))));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
    res = v256_add_16(sum, v256_dup_16(8));
    res = v256_shr_n_s16(res, 4);
    res = v256_add_16(row, res);
    res = v256_min_s16(v256_max_s16(res, min), max);
    v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
    v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
  }
}

void SIMD_FUNC(cdef_filter_block_highbd)(void *dest, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
                                         int bsize, int coeff_shift) {
  uint16_t *dst16 = (uint16_t *)dest;
  if (bsize == BLOCK_8X8) {
    SIMD_FUNC(cdef_filter_block_8x8_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else if (bsize == BLOCK_4X8) {
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
     sec_strength, dir, pri_damping, sec_damping, coeff_shift);
  } else if (bsize == BLOCK_8X4) {
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else {
    assert(bsize == BLOCK_4X4);
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
                                              const uint8_t *src, int sstride,
                                              int v, int h) {
  int i, j;
  for (i = 0; i < v; i++) {
    for (j = 0; j < (h & ~0x7); j += 8) {
      v64 row = v64_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
    }
    for (; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}
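
/* These copy helpers feed the filter kernels: CDEF always filters from a
   16-bit staging buffer, so the function above widens 8-bit rows eight pixels
   at a time (v128_unpack_u8_s16() zero-extends eight bytes to eight 16-bit
   lanes), while the variant below copies high-bit-depth rows unchanged. The
   scalar tail loop handles widths that are not a multiple of 8. */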

void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
                                               const uint16_t *src, int sstride,
                                               int v, int h) {
  int i, j;
  for (i = 0; i < v; i++) {
    for (j = 0; j < (h & ~0x7); j += 8) {
      v128 row = v128_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], row);
    }
    for (; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}

#endif  // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_