/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_

#include "config/av1_rtcd.h"

#include "av1/common/cdef_block.h"
/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
                                    v128 const2) {
  v128 tmp;
  /* Reverse partial B. */
  partialb = v128_shuffle_8(
      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = v128_ziplo_16(partialb, partiala);
  partialb = v128_ziphi_16(partialb, tmp);
  /* Square and add the corresponding x and y values. */
  partiala = v128_madd_s16(partiala, partiala);
  partialb = v128_madd_s16(partialb, partialb);
  /* Multiply by constant. */
  partiala = v128_mullo_s32(partiala, const1);
  partialb = v128_mullo_s32(partialb, const2);
  /* Sum all results. */
  partiala = v128_add_32(partiala, partialb);
  return partiala;
}
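
/* A scalar sketch of fold_mul_and_sum() (illustrative only, kept out of the
   build).  It assumes the x1..x8 / y1..y7 layout described above, with y8
   taken as 0, and returns the fully folded sum; the vector version instead
   leaves four 32-bit partial sums that hsum4() combines later. */
#if 0
static int64_t fold_mul_and_sum_ref(const int16_t x[8], const int16_t y[8],
                                    const int32_t c[8]) {
  int64_t sum = 0;
  for (int i = 0; i < 8; i++) {
    /* x[i], y[i], c[i] stand for x(i+1), y(i+1), C(i+1); y[7] must be 0. */
    sum += (int64_t)(x[i] * x[i] + y[i] * y[i]) * c[i];
  }
  return sum;
}
#endif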

static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
  v128 t0, t1, t2, t3;
  t0 = v128_ziplo_32(x1, x0);
  t1 = v128_ziplo_32(x3, x2);
  t2 = v128_ziphi_32(x1, x0);
  t3 = v128_ziphi_32(x3, x2);
  x0 = v128_ziplo_64(t1, t0);
  x1 = v128_ziphi_64(t1, t0);
  x2 = v128_ziplo_64(t3, t2);
  x3 = v128_ziphi_64(t3, t2);
  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
}
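
/* hsum4() above is effectively a 4x4 transpose of 32-bit lanes followed by
   adds: lane i of the result holds the horizontal sum of the four 32-bit
   lanes of input xi, i.e. out[i] = xi[0] + xi[1] + xi[2] + xi[3]
   (scalar sketch, illustrative only). */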

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  v128 partial6;
  v128 tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = v128_shl_n_byte(lines[0], 14);
  partial4b = v128_shr_n_byte(lines[0], 2);
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  tmp = v128_add_16(lines[0], lines[1]);
  partial5a = v128_shl_n_byte(tmp, 10);
  partial5b = v128_shr_n_byte(tmp, 6);
  partial7a = v128_shl_n_byte(tmp, 4);
  partial7b = v128_shr_n_byte(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  tmp = v128_add_16(lines[2], lines[3]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  tmp = v128_add_16(lines[4], lines[5]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  partial4a = v128_add_16(partial4a, lines[7]);
  tmp = v128_add_16(lines[6], lines[7]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  partial6 = v128_add_16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
                       v128_from_32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial6 = v128_madd_s16(partial6, partial6);
  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
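  /* Note: the constants above appear to be 840 / N, where N is the number of
     samples folded into each partial sum (840 is the least common multiple of
     1..8), so every direction cost ends up on a common scale; compare the
     "divide by 840" comment in cdef_find_dir() below. */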

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  v128_store_unaligned(tmp_cost1, partial4a);
  return partial4a;
}

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);

  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);

  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  res[0] = v128_ziphi_64(tr1_7, tr1_6);
}
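
/* Scalar sketch of the mapping above (illustrative only): writing in[r] for
   input line r and indexing lanes as columns c, a transpose followed by
   reversing the line order gives res[r][c] = in[c][7 - r]. */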

int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
                             int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  v128 lines[8];
  for (i = 0; i < 8; i++) {
    lines[i] = v128_load_unaligned(&img[i * stride]);
    lines[i] =
        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  }

  /* Compute "mostly vertical" directions. */
  v128 dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  v128 dir03 = compute_directions(lines, cost);

  v128 max = v128_max_s32(dir03, dir47);
  max = v128_max_s32(max, v128_align(max, max, 8));
  max = v128_max_s32(max, v128_align(max, max, 4));
  best_cost = v128_low_u32(max);
  v128 t =
      v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
  best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}
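
/* Note on the "count trailing zeros" idiom used in cdef_find_dir() above: for
   any nonzero x, x ^ (x - 1) sets every bit up to and including the lowest
   set bit, so get_msb(x ^ (x - 1)) is the index of that lowest set bit.
   Worked example (illustrative): x = 0x20 gives x ^ (x - 1) = 0x3f and
   get_msb(0x3f) = 5, the number of trailing zeros of 0x20. */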

// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                           unsigned int adjdamp) {
  const v256 diff16 = v256_sub_16(a, b);
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
  diff = v128_abs_s8(diff);
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
}
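
/* A scalar per-lane sketch of constrain() (illustrative only, kept out of the
   build).  It follows the formula in the comment above; the vector version
   additionally packs the 16-bit differences to saturated 8-bit lanes before
   applying it. */
#if 0
static int constrain_ref(int a, int b, unsigned int strength,
                         unsigned int adjdamp) {
  const int diff = a - b;
  const int absdiff = diff < 0 ? -diff : diff;
  /* max(0, strength - (|a - b| >> adjdamp)): the saturating subtract above. */
  const int bound = AOMMAX(0, (int)strength - (absdiff >> adjdamp));
  const int mag = AOMMIN(absdiff, bound);
  return diff < 0 ? -mag : mag;
}
#endif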

void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
                                        const uint16_t *in, int pri_strength,
                                        int sec_strength, int dir,
                                        int pri_damping, int sec_damping,
                                        int coeff_shift) {
  v128 p0, p1, p2, p3;
  v256 sum, row, res, tap;
  v256 max, min, max_u8;
  v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

  sum = v256_zero();
  row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
  max = min = row;

  // Primary near taps
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
  max_u8 = tap;
  min = v256_min_s16(min, tap);
  p0 = constrain(tap, row, pri_strength, pri_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p1 = constrain(tap, row, pri_strength, pri_damping);

  // sum += pri_taps[0] * (p0 + p1)
  sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
                                       v256_from_v128(v128_ziphi_8(p0, p1),
                                                      v128_ziplo_8(p0, p1))));

  // Primary far taps
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p0 = constrain(tap, row, pri_strength, pri_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p1 = constrain(tap, row, pri_strength, pri_damping);

  // sum += pri_taps[1] * (p0 + p1)
  sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
                                       v256_from_v128(v128_ziphi_8(p0, p1),
                                                      v128_ziplo_8(p0, p1))));
  /* The source is 16 bits, however, we only really care about the lower
     8 bits.  The upper 8 bits contain the "large" flag.  After the final
     primary max has been calculated, zero out the upper 8 bits.  Use this
     to find the "16 bit" max. */
  max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));

  // Secondary near taps
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p0 = constrain(tap, row, sec_strength, sec_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p1 = constrain(tap, row, sec_strength, sec_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p2 = constrain(tap, row, sec_strength, sec_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p3 = constrain(tap, row, sec_strength, sec_damping);

  // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  p0 = v128_add_8(p0, p1);
  p2 = v128_add_8(p2, p3);
  sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
                                       v256_from_v128(v128_ziphi_8(p0, p2),
                                                      v128_ziplo_8(p0, p2))));

  // Secondary far taps
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p0 = constrain(tap, row, sec_strength, sec_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p1 = constrain(tap, row, sec_strength, sec_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p2 = constrain(tap, row, sec_strength, sec_damping);
  tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
                      v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
                      v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
                      v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
  max_u8 = v256_max_u8(max_u8, tap);
  min = v256_min_s16(min, tap);
  p3 = constrain(tap, row, sec_strength, sec_damping);

  // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  p0 = v128_add_8(p0, p1);
  p2 = v128_add_8(p2, p3);
  sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
                                       v256_from_v128(v128_ziphi_8(p0, p2),
                                                      v128_ziplo_8(p0, p2))));

  /* The source is 16 bits, however, we only really care about the lower
     8 bits.  The upper 8 bits contain the "large" flag.  After the final
     primary max has been calculated, zero out the upper 8 bits.  Use this
     to find the "16 bit" max. */
  max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));

  // res = row + ((sum - (sum < 0) + 8) >> 4)
  sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
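  /* Note: v256_cmplt_s16() above yields -1 in each lane where sum < 0, so
     adding it subtracts 1 from negative sums, matching the "- (sum < 0)" term
     and making the biased shift round to nearest with ties away from zero. */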
  res = v256_add_16(sum, v256_dup_16(8));
  res = v256_shr_n_s16(res, 4);
  res = v256_add_16(row, res);
  res = v256_min_s16(v256_max_s16(res, min), max);
  res = v256_pack_s16_u8(res, res);

  p0 = v256_low_v128(res);
  u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
  u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
  u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
  u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
}

void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
                                        const uint16_t *in, int pri_strength,
                                        int sec_strength, int dir,
                                        int pri_damping, int sec_damping,
                                        int coeff_shift) {
  int i;
  v128 p0, p1, p2, p3;
  v256 sum, row, res, tap;
  v256 max, min, max_u8;
  v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  for (i = 0; i < 8; i += 2) {
    sum = v256_zero();
    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
    max = min = row;

    // Primary near taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
    max_u8 = tap;
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, pri_strength, pri_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, pri_strength, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
                                         v256_from_v128(v128_ziphi_8(p0, p1),
                                                        v128_ziplo_8(p0, p1))));

    // Primary far taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, pri_strength, pri_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, pri_strength, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
                                         v256_from_v128(v128_ziphi_8(p0, p1),
                                                        v128_ziplo_8(p0, p1))));

    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));

    // Secondary near taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p2 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p3 = constrain(tap, row, sec_strength, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    p0 = v128_add_8(p0, p1);
    p2 = v128_add_8(p2, p3);
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
                                         v256_from_v128(v128_ziphi_8(p0, p2),
                                                        v128_ziplo_8(p0, p2))));

    // Secondary far taps
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p0 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p1 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p2 = constrain(tap, row, sec_strength, sec_damping);
    tap =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
    max_u8 = v256_max_u8(max_u8, tap);
    min = v256_min_s16(min, tap);
    p3 = constrain(tap, row, sec_strength, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    p0 = v128_add_8(p0, p1);
    p2 = v128_add_8(p2, p3);
    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
                                         v256_from_v128(v128_ziphi_8(p0, p2),
                                                        v128_ziplo_8(p0, p2))));

    max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
    res = v256_add_16(sum, v256_dup_16(8));
    res = v256_shr_n_s16(res, 4);
    res = v256_add_16(row, res);
    res = v256_min_s16(v256_max_s16(res, min), max);
    res = v256_pack_s16_u8(res, res);

    p0 = v256_low_v128(res);
    v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
    v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
  }
}

void SIMD_FUNC(cdef_filter_block)(void *dest, int dstride, const uint16_t *in,
                                  int pri_strength, int sec_strength, int dir,
                                  int pri_damping, int sec_damping, int bsize,
                                  int coeff_shift) {
  uint8_t *dst8 = (uint8_t *)dest;
  if (bsize == BLOCK_8X8) {
    SIMD_FUNC(cdef_filter_block_8x8_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else if (bsize == BLOCK_4X8) {
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
     sec_strength, dir, pri_damping, sec_damping, coeff_shift);
  } else if (bsize == BLOCK_8X4) {
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else {
    SIMD_FUNC(cdef_filter_block_4x4_8)
    (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
                             unsigned int adjdamp) {
  v256 diff = v256_sub_16(a, b);
  const v256 sign = v256_shr_n_s16(diff, 15);
  diff = v256_abs_s16(diff);
  const v256 s =
      v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
  return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
}
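
/* constrain16() is the 16-bit analogue of constrain() above: the same
   sign(d) * min(|d|, max(0, threshold - (|d| >> adjdamp))) formula, with the
   sign obtained from an arithmetic shift and applied via the same xor/add
   trick, and no packing down to 8 bits. */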

void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
                                         int coeff_shift) {
  v256 p0, p1, p2, p3, sum, row, res;
  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

  sum = v256_zero();
  row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
                      v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
  max = min = row;

  // Primary near taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  min = v256_min_s16(v256_min_s16(min, p0), p1);
  p0 = constrain16(p0, row, pri_strength, pri_damping);
  p1 = constrain16(p1, row, pri_strength, pri_damping);

  // sum += pri_taps[0] * (p0 + p1)
  sum = v256_add_16(
      sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));

  // Primary far taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  min = v256_min_s16(v256_min_s16(min, p0), p1);
  p0 = constrain16(p0, row, pri_strength, pri_damping);
  p1 = constrain16(p1, row, pri_strength, pri_damping);

  // sum += pri_taps[1] * (p0 + p1)
  sum = v256_add_16(
      sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));

  // Secondary near taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
  p2 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
  p3 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
  min = v256_min_s16(v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2),
                     p3);
  p0 = constrain16(p0, row, sec_strength, sec_damping);
  p1 = constrain16(p1, row, sec_strength, sec_damping);
  p2 = constrain16(p2, row, sec_strength, sec_damping);
  p3 = constrain16(p3, row, sec_strength, sec_damping);

  // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
                                        v256_add_16(v256_add_16(p0, p1),
                                                    v256_add_16(p2, p3))));

  // Secondary far taps
  p0 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
  p1 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
  p2 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
  p3 = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
                     v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
                     v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
                     v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
  max = v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
  min = v256_min_s16(v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2),
                     p3);
  p0 = constrain16(p0, row, sec_strength, sec_damping);
  p1 = constrain16(p1, row, sec_strength, sec_damping);
  p2 = constrain16(p2, row, sec_strength, sec_damping);
  p3 = constrain16(p3, row, sec_strength, sec_damping);

  // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
                                        v256_add_16(v256_add_16(p0, p1),
                                                    v256_add_16(p2, p3))));

  // res = row + ((sum - (sum < 0) + 8) >> 4)
  sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
  res = v256_add_16(sum, v256_dup_16(8));
  res = v256_shr_n_s16(res, 4);
  res = v256_add_16(row, res);
  res = v256_min_s16(v256_max_s16(res, min), max);

  v64_store_aligned(&dst[0 * dstride], v128_high_v64(v256_high_v128(res)));
  v64_store_aligned(&dst[1 * dstride], v128_low_v64(v256_high_v128(res)));
  v64_store_aligned(&dst[2 * dstride], v128_high_v64(v256_low_v128(res)));
  v64_store_aligned(&dst[3 * dstride], v128_low_v64(v256_low_v128(res)));
}

void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
                                         int coeff_shift) {
  int i;
  v256 sum, p0, p1, p2, p3, row, res;
  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
  int po1 = cdef_directions[dir][0];
  int po2 = cdef_directions[dir][1];
  int s1o1 = cdef_directions[dir + 2][0];
  int s1o2 = cdef_directions[dir + 2][1];
  int s2o1 = cdef_directions[dir - 2][0];
  int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength)
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  if (sec_strength)
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

  for (i = 0; i < 8; i += 2) {
    sum = v256_zero();
    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));

    max = min = row;
    // Primary near taps
    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    min = v256_min_s16(v256_min_s16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength, pri_damping);
    p1 = constrain16(p1, row, pri_strength, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = v256_add_16(
        sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));

    // Primary far taps
    p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
    p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    min = v256_min_s16(v256_min_s16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength, pri_damping);
    p1 = constrain16(p1, row, pri_strength, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = v256_add_16(
        sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));

    // Secondary near taps
    p0 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
    p1 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
    p2 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
    p3 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
    min = v256_min_s16(
        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
    p0 = constrain16(p0, row, sec_strength, sec_damping);
    p1 = constrain16(p1, row, sec_strength, sec_damping);
    p2 = constrain16(p2, row, sec_strength, sec_damping);
    p3 = constrain16(p3, row, sec_strength, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
                                          v256_add_16(v256_add_16(p0, p1),
                                                      v256_add_16(p2, p3))));

    // Secondary far taps
    p0 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
    p1 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
    p2 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
    p3 =
        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
                     v256_andn(p1, v256_cmpeq_16(p1, large)));
    max =
        v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
                     v256_andn(p3, v256_cmpeq_16(p3, large)));
    min = v256_min_s16(
        v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
    p0 = constrain16(p0, row, sec_strength, sec_damping);
    p1 = constrain16(p1, row, sec_strength, sec_damping);
    p2 = constrain16(p2, row, sec_strength, sec_damping);
    p3 = constrain16(p3, row, sec_strength, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
                                          v256_add_16(v256_add_16(p0, p1),
                                                      v256_add_16(p2, p3))));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
    res = v256_add_16(sum, v256_dup_16(8));
    res = v256_shr_n_s16(res, 4);
    res = v256_add_16(row, res);
    res = v256_min_s16(v256_max_s16(res, min), max);
    v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
    v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
  }
}

void SIMD_FUNC(cdef_filter_block_highbd)(void *dest, int dstride,
                                         const uint16_t *in, int pri_strength,
                                         int sec_strength, int dir,
                                         int pri_damping, int sec_damping,
                                         int bsize, int coeff_shift) {
  uint16_t *dst16 = (uint16_t *)dest;
  if (bsize == BLOCK_8X8) {
    SIMD_FUNC(cdef_filter_block_8x8_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else if (bsize == BLOCK_4X8) {
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
     sec_strength, dir, pri_damping, sec_damping, coeff_shift);
  } else if (bsize == BLOCK_8X4) {
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  } else {
    assert(bsize == BLOCK_4X4);
    SIMD_FUNC(cdef_filter_block_4x4_16)
    (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
     sec_damping, coeff_shift);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
                                              const uint8_t *src, int sstride,
                                              int v, int h) {
  int i, j;
  for (i = 0; i < v; i++) {
    for (j = 0; j < (h & ~0x7); j += 8) {
      v64 row = v64_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
    }
    for (; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}

void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
                                               const uint16_t *src, int sstride,
                                               int v, int h) {
  int i, j;
  for (i = 0; i < v; i++) {
    for (j = 0; j < (h & ~0x7); j += 8) {
      v128 row = v128_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], row);
    }
    for (; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}

#endif  // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_