/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <stdlib.h>

#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
17 static INLINE
int8_t signed_char_clamp(int t
) {
18 return (int8_t)clamp(t
, -128, 127);
#if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth counterpart of signed_char_clamp(): saturates t to the signed
// range matching bit depth bd, i.e. the 8-bit range scaled by 2^(bd - 8).
// Any bd other than 10 or 12 is treated as 8.
static INLINE int16_t signed_char_clamp_high(int t, int bd) {
  switch (bd) {
    case 10:
      return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
    case 12:
      return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
    case 8:
    default:
      return (int16_t)clamp(t, -128, 128 - 1);
  }
}
#endif
35 // should we apply any filter at all: 11111111 yes, 00000000 no
36 static INLINE
int8_t filter_mask(uint8_t limit
, uint8_t blimit
,
37 uint8_t p3
, uint8_t p2
,
38 uint8_t p1
, uint8_t p0
,
39 uint8_t q0
, uint8_t q1
,
40 uint8_t q2
, uint8_t q3
) {
42 mask
|= (abs(p3
- p2
) > limit
) * -1;
43 mask
|= (abs(p2
- p1
) > limit
) * -1;
44 mask
|= (abs(p1
- p0
) > limit
) * -1;
45 mask
|= (abs(q1
- q0
) > limit
) * -1;
46 mask
|= (abs(q2
- q1
) > limit
) * -1;
47 mask
|= (abs(q3
- q2
) > limit
) * -1;
48 mask
|= (abs(p0
- q0
) * 2 + abs(p1
- q1
) / 2 > blimit
) * -1;
52 static INLINE
int8_t flat_mask4(uint8_t thresh
,
53 uint8_t p3
, uint8_t p2
,
54 uint8_t p1
, uint8_t p0
,
55 uint8_t q0
, uint8_t q1
,
56 uint8_t q2
, uint8_t q3
) {
58 mask
|= (abs(p1
- p0
) > thresh
) * -1;
59 mask
|= (abs(q1
- q0
) > thresh
) * -1;
60 mask
|= (abs(p2
- p0
) > thresh
) * -1;
61 mask
|= (abs(q2
- q0
) > thresh
) * -1;
62 mask
|= (abs(p3
- p0
) > thresh
) * -1;
63 mask
|= (abs(q3
- q0
) > thresh
) * -1;
67 static INLINE
int8_t flat_mask5(uint8_t thresh
,
68 uint8_t p4
, uint8_t p3
,
69 uint8_t p2
, uint8_t p1
,
70 uint8_t p0
, uint8_t q0
,
71 uint8_t q1
, uint8_t q2
,
72 uint8_t q3
, uint8_t q4
) {
73 int8_t mask
= ~flat_mask4(thresh
, p3
, p2
, p1
, p0
, q0
, q1
, q2
, q3
);
74 mask
|= (abs(p4
- p0
) > thresh
) * -1;
75 mask
|= (abs(q4
- q0
) > thresh
) * -1;
79 // is there high edge variance internal edge: 11111111 yes, 00000000 no
80 static INLINE
int8_t hev_mask(uint8_t thresh
, uint8_t p1
, uint8_t p0
,
81 uint8_t q0
, uint8_t q1
) {
83 hev
|= (abs(p1
- p0
) > thresh
) * -1;
84 hev
|= (abs(q1
- q0
) > thresh
) * -1;
88 static INLINE
void filter4(int8_t mask
, uint8_t thresh
, uint8_t *op1
,
89 uint8_t *op0
, uint8_t *oq0
, uint8_t *oq1
) {
90 int8_t filter1
, filter2
;
92 const int8_t ps1
= (int8_t) *op1
^ 0x80;
93 const int8_t ps0
= (int8_t) *op0
^ 0x80;
94 const int8_t qs0
= (int8_t) *oq0
^ 0x80;
95 const int8_t qs1
= (int8_t) *oq1
^ 0x80;
96 const uint8_t hev
= hev_mask(thresh
, *op1
, *op0
, *oq0
, *oq1
);
98 // add outer taps if we have high edge variance
99 int8_t filter
= signed_char_clamp(ps1
- qs1
) & hev
;
102 filter
= signed_char_clamp(filter
+ 3 * (qs0
- ps0
)) & mask
;
104 // save bottom 3 bits so that we round one side +4 and the other +3
105 // if it equals 4 we'll set to adjust by -1 to account for the fact
106 // we'd round 3 the other way
107 filter1
= signed_char_clamp(filter
+ 4) >> 3;
108 filter2
= signed_char_clamp(filter
+ 3) >> 3;
110 *oq0
= signed_char_clamp(qs0
- filter1
) ^ 0x80;
111 *op0
= signed_char_clamp(ps0
+ filter2
) ^ 0x80;
113 // outer tap adjustments
114 filter
= ROUND_POWER_OF_TWO(filter1
, 1) & ~hev
;
116 *oq1
= signed_char_clamp(qs1
- filter
) ^ 0x80;
117 *op1
= signed_char_clamp(ps1
+ filter
) ^ 0x80;
// Applies the narrow (4-tap) loop filter across a horizontal edge.
// `s` points at the first sample below the edge (q0); samples above the edge
// are reached with negative multiples of the pitch `p`.  Eight consecutive
// columns are processed.  `blimit`, `limit` and `thresh` each point to the
// single threshold value that is read.
void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                            const uint8_t *blimit, const uint8_t *limit,
                            const uint8_t *thresh) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
    ++s;  // next column
  }
}
// Runs vpx_lpf_horizontal_4_c() twice: once at `s` with the first threshold
// set and once at `s + 8` with the second threshold set.
void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
}
// Applies the narrow (4-tap) loop filter across a vertical edge.
// `s` points at the first sample right of the edge (q0); samples left of the
// edge are at negative offsets.  Eight consecutive rows, `pitch` samples
// apart, are processed.
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
    s += pitch;  // next row
  }
}
// Runs vpx_lpf_vertical_4_c() twice: once at `s` with the first threshold set
// and once eight rows further down (`s + 8 * pitch`) with the second set.
void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
169 static INLINE
void filter8(int8_t mask
, uint8_t thresh
, uint8_t flat
,
170 uint8_t *op3
, uint8_t *op2
,
171 uint8_t *op1
, uint8_t *op0
,
172 uint8_t *oq0
, uint8_t *oq1
,
173 uint8_t *oq2
, uint8_t *oq3
) {
175 const uint8_t p3
= *op3
, p2
= *op2
, p1
= *op1
, p0
= *op0
;
176 const uint8_t q0
= *oq0
, q1
= *oq1
, q2
= *oq2
, q3
= *oq3
;
178 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
179 *op2
= ROUND_POWER_OF_TWO(p3
+ p3
+ p3
+ 2 * p2
+ p1
+ p0
+ q0
, 3);
180 *op1
= ROUND_POWER_OF_TWO(p3
+ p3
+ p2
+ 2 * p1
+ p0
+ q0
+ q1
, 3);
181 *op0
= ROUND_POWER_OF_TWO(p3
+ p2
+ p1
+ 2 * p0
+ q0
+ q1
+ q2
, 3);
182 *oq0
= ROUND_POWER_OF_TWO(p2
+ p1
+ p0
+ 2 * q0
+ q1
+ q2
+ q3
, 3);
183 *oq1
= ROUND_POWER_OF_TWO(p1
+ p0
+ q0
+ 2 * q1
+ q2
+ q3
+ q3
, 3);
184 *oq2
= ROUND_POWER_OF_TWO(p0
+ q0
+ q1
+ 2 * q2
+ q3
+ q3
+ q3
, 3);
186 filter4(mask
, thresh
, op1
, op0
, oq0
, oq1
);
// Applies the 8-sample loop filter (filter8) across a horizontal edge for
// eight consecutive columns.  `s` points at q0; `p` is the pitch used to
// reach the rows above and below the edge.  Flatness is tested with a fixed
// threshold of 1.
void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
            s, s + 1 * p, s + 2 * p, s + 3 * p);
    ++s;  // next column
  }
}
// Runs vpx_lpf_horizontal_8_c() twice: once at `s` with the first threshold
// set and once at `s + 8` with the second threshold set.
void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
}
// Applies the 8-sample loop filter (filter8) across a vertical edge for eight
// consecutive rows, `pitch` samples apart.  `s` points at q0.  Flatness is
// tested with a fixed threshold of 1.
void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;

  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
            s, s + 1, s + 2, s + 3);
    s += pitch;  // next row
  }
}
// Runs vpx_lpf_vertical_8_c() twice: once at `s` with the first threshold set
// and once eight rows further down (`s + 8 * pitch`) with the second set.
void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
241 static INLINE
void filter16(int8_t mask
, uint8_t thresh
,
242 uint8_t flat
, uint8_t flat2
,
243 uint8_t *op7
, uint8_t *op6
,
244 uint8_t *op5
, uint8_t *op4
,
245 uint8_t *op3
, uint8_t *op2
,
246 uint8_t *op1
, uint8_t *op0
,
247 uint8_t *oq0
, uint8_t *oq1
,
248 uint8_t *oq2
, uint8_t *oq3
,
249 uint8_t *oq4
, uint8_t *oq5
,
250 uint8_t *oq6
, uint8_t *oq7
) {
251 if (flat2
&& flat
&& mask
) {
252 const uint8_t p7
= *op7
, p6
= *op6
, p5
= *op5
, p4
= *op4
,
253 p3
= *op3
, p2
= *op2
, p1
= *op1
, p0
= *op0
;
255 const uint8_t q0
= *oq0
, q1
= *oq1
, q2
= *oq2
, q3
= *oq3
,
256 q4
= *oq4
, q5
= *oq5
, q6
= *oq6
, q7
= *oq7
;
258 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
259 *op6
= ROUND_POWER_OF_TWO(p7
* 7 + p6
* 2 + p5
+ p4
+ p3
+ p2
+ p1
+ p0
+
261 *op5
= ROUND_POWER_OF_TWO(p7
* 6 + p6
+ p5
* 2 + p4
+ p3
+ p2
+ p1
+ p0
+
263 *op4
= ROUND_POWER_OF_TWO(p7
* 5 + p6
+ p5
+ p4
* 2 + p3
+ p2
+ p1
+ p0
+
265 *op3
= ROUND_POWER_OF_TWO(p7
* 4 + p6
+ p5
+ p4
+ p3
* 2 + p2
+ p1
+ p0
+
266 q0
+ q1
+ q2
+ q3
, 4);
267 *op2
= ROUND_POWER_OF_TWO(p7
* 3 + p6
+ p5
+ p4
+ p3
+ p2
* 2 + p1
+ p0
+
268 q0
+ q1
+ q2
+ q3
+ q4
, 4);
269 *op1
= ROUND_POWER_OF_TWO(p7
* 2 + p6
+ p5
+ p4
+ p3
+ p2
+ p1
* 2 + p0
+
270 q0
+ q1
+ q2
+ q3
+ q4
+ q5
, 4);
271 *op0
= ROUND_POWER_OF_TWO(p7
+ p6
+ p5
+ p4
+ p3
+ p2
+ p1
+ p0
* 2 +
272 q0
+ q1
+ q2
+ q3
+ q4
+ q5
+ q6
, 4);
273 *oq0
= ROUND_POWER_OF_TWO(p6
+ p5
+ p4
+ p3
+ p2
+ p1
+ p0
+
274 q0
* 2 + q1
+ q2
+ q3
+ q4
+ q5
+ q6
+ q7
, 4);
275 *oq1
= ROUND_POWER_OF_TWO(p5
+ p4
+ p3
+ p2
+ p1
+ p0
+
276 q0
+ q1
* 2 + q2
+ q3
+ q4
+ q5
+ q6
+ q7
* 2, 4);
277 *oq2
= ROUND_POWER_OF_TWO(p4
+ p3
+ p2
+ p1
+ p0
+
278 q0
+ q1
+ q2
* 2 + q3
+ q4
+ q5
+ q6
+ q7
* 3, 4);
279 *oq3
= ROUND_POWER_OF_TWO(p3
+ p2
+ p1
+ p0
+
280 q0
+ q1
+ q2
+ q3
* 2 + q4
+ q5
+ q6
+ q7
* 4, 4);
281 *oq4
= ROUND_POWER_OF_TWO(p2
+ p1
+ p0
+
282 q0
+ q1
+ q2
+ q3
+ q4
* 2 + q5
+ q6
+ q7
* 5, 4);
283 *oq5
= ROUND_POWER_OF_TWO(p1
+ p0
+
284 q0
+ q1
+ q2
+ q3
+ q4
+ q5
* 2 + q6
+ q7
* 6, 4);
285 *oq6
= ROUND_POWER_OF_TWO(p0
+
286 q0
+ q1
+ q2
+ q3
+ q4
+ q5
+ q6
* 2 + q7
* 7, 4);
288 filter8(mask
, thresh
, flat
, op3
, op2
, op1
, op0
, oq0
, oq1
, oq2
, oq3
);
// Applies the wide (16-sample) loop filter across a horizontal edge for
// `8 * count` consecutive columns.  `s` points at q0 and `p` is the pitch;
// rows -8..+7 relative to the edge are accessed.  Both flatness tests use a
// fixed threshold of 1.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8 * count; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    // Outer flatness: p7..p4 against p0 and q4..q7 against q0.
    const int8_t flat2 = flat_mask5(1,
                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);

    filter16(mask, *thresh, flat, flat2,
             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
             s,         s + 1 * p, s + 2 * p, s + 3 * p,
             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
    ++s;  // next column
  }
}
// Wide horizontal-edge filter over 8 columns (count = 1).
void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
}
// Wide horizontal-edge filter over 16 columns (count = 2).
void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
}
// Applies the wide (16-sample) loop filter across a vertical edge for `count`
// consecutive rows, `p` samples apart.  `s` points at q0; columns -8..+7
// relative to the edge are accessed.  Both flatness tests use a fixed
// threshold of 1.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh,
                                   int count) {
  int i;

  for (i = 0; i < count; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                    q0, s[4], s[5], s[6], s[7]);

    filter16(mask, *thresh, flat, flat2,
             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
    s += p;  // next row
  }
}
// Wide vertical-edge filter over 8 rows.
void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
}
// Wide vertical-edge filter over 16 rows, using one threshold set for all.
void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
}
361 #if CONFIG_VP9_HIGHBITDEPTH
362 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
363 static INLINE
int8_t highbd_filter_mask(uint8_t limit
, uint8_t blimit
,
364 uint16_t p3
, uint16_t p2
,
365 uint16_t p1
, uint16_t p0
,
366 uint16_t q0
, uint16_t q1
,
367 uint16_t q2
, uint16_t q3
, int bd
) {
369 int16_t limit16
= (uint16_t)limit
<< (bd
- 8);
370 int16_t blimit16
= (uint16_t)blimit
<< (bd
- 8);
371 mask
|= (abs(p3
- p2
) > limit16
) * -1;
372 mask
|= (abs(p2
- p1
) > limit16
) * -1;
373 mask
|= (abs(p1
- p0
) > limit16
) * -1;
374 mask
|= (abs(q1
- q0
) > limit16
) * -1;
375 mask
|= (abs(q2
- q1
) > limit16
) * -1;
376 mask
|= (abs(q3
- q2
) > limit16
) * -1;
377 mask
|= (abs(p0
- q0
) * 2 + abs(p1
- q1
) / 2 > blimit16
) * -1;
381 static INLINE
int8_t highbd_flat_mask4(uint8_t thresh
,
382 uint16_t p3
, uint16_t p2
,
383 uint16_t p1
, uint16_t p0
,
384 uint16_t q0
, uint16_t q1
,
385 uint16_t q2
, uint16_t q3
, int bd
) {
387 int16_t thresh16
= (uint16_t)thresh
<< (bd
- 8);
388 mask
|= (abs(p1
- p0
) > thresh16
) * -1;
389 mask
|= (abs(q1
- q0
) > thresh16
) * -1;
390 mask
|= (abs(p2
- p0
) > thresh16
) * -1;
391 mask
|= (abs(q2
- q0
) > thresh16
) * -1;
392 mask
|= (abs(p3
- p0
) > thresh16
) * -1;
393 mask
|= (abs(q3
- q0
) > thresh16
) * -1;
397 static INLINE
int8_t highbd_flat_mask5(uint8_t thresh
,
398 uint16_t p4
, uint16_t p3
,
399 uint16_t p2
, uint16_t p1
,
400 uint16_t p0
, uint16_t q0
,
401 uint16_t q1
, uint16_t q2
,
402 uint16_t q3
, uint16_t q4
, int bd
) {
403 int8_t mask
= ~highbd_flat_mask4(thresh
, p3
, p2
, p1
, p0
, q0
, q1
, q2
, q3
, bd
);
404 int16_t thresh16
= (uint16_t)thresh
<< (bd
- 8);
405 mask
|= (abs(p4
- p0
) > thresh16
) * -1;
406 mask
|= (abs(q4
- q0
) > thresh16
) * -1;
410 // Is there high edge variance internal edge:
411 // 11111111_11111111 yes, 00000000_00000000 no ?
412 static INLINE
int16_t highbd_hev_mask(uint8_t thresh
, uint16_t p1
, uint16_t p0
,
413 uint16_t q0
, uint16_t q1
, int bd
) {
415 int16_t thresh16
= (uint16_t)thresh
<< (bd
- 8);
416 hev
|= (abs(p1
- p0
) > thresh16
) * -1;
417 hev
|= (abs(q1
- q0
) > thresh16
) * -1;
421 static INLINE
void highbd_filter4(int8_t mask
, uint8_t thresh
, uint16_t *op1
,
422 uint16_t *op0
, uint16_t *oq0
, uint16_t *oq1
,
424 int16_t filter1
, filter2
;
425 // ^0x80 equivalent to subtracting 0x80 from the values to turn them
426 // into -128 to +127 instead of 0 to 255.
428 const int16_t ps1
= (int16_t)*op1
- (0x80 << shift
);
429 const int16_t ps0
= (int16_t)*op0
- (0x80 << shift
);
430 const int16_t qs0
= (int16_t)*oq0
- (0x80 << shift
);
431 const int16_t qs1
= (int16_t)*oq1
- (0x80 << shift
);
432 const uint16_t hev
= highbd_hev_mask(thresh
, *op1
, *op0
, *oq0
, *oq1
, bd
);
434 // Add outer taps if we have high edge variance.
435 int16_t filter
= signed_char_clamp_high(ps1
- qs1
, bd
) & hev
;
438 filter
= signed_char_clamp_high(filter
+ 3 * (qs0
- ps0
), bd
) & mask
;
440 // Save bottom 3 bits so that we round one side +4 and the other +3
441 // if it equals 4 we'll set to adjust by -1 to account for the fact
442 // we'd round 3 the other way.
443 filter1
= signed_char_clamp_high(filter
+ 4, bd
) >> 3;
444 filter2
= signed_char_clamp_high(filter
+ 3, bd
) >> 3;
446 *oq0
= signed_char_clamp_high(qs0
- filter1
, bd
) + (0x80 << shift
);
447 *op0
= signed_char_clamp_high(ps0
+ filter2
, bd
) + (0x80 << shift
);
449 // Outer tap adjustments.
450 filter
= ROUND_POWER_OF_TWO(filter1
, 1) & ~hev
;
452 *oq1
= signed_char_clamp_high(qs1
- filter
, bd
) + (0x80 << shift
);
453 *op1
= signed_char_clamp_high(ps1
+ filter
, bd
) + (0x80 << shift
);
// High-bitdepth narrow loop filter across a horizontal edge for eight
// consecutive columns.  `s` points at q0, `p` is the pitch in samples and
// `bd` is the bit depth passed on to the mask and filter helpers.
void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
    const uint16_t p0 = s[-p];
    const uint16_t q0 = s[0 * p];
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
    ++s;  // next column
  }
}
// Runs vpx_highbd_lpf_horizontal_4_c() twice: once at `s` with the first
// threshold set and once at `s + 8` with the second threshold set.
void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
// High-bitdepth narrow loop filter across a vertical edge for eight
// consecutive rows, `pitch` samples apart.  `s` points at q0.
void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
    s += pitch;  // next row
  }
}
// Runs vpx_highbd_lpf_vertical_4_c() twice: once at `s` with the first
// threshold set and once eight rows further down with the second set.
void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                              thresh1, bd);
}
521 static INLINE
void highbd_filter8(int8_t mask
, uint8_t thresh
, uint8_t flat
,
522 uint16_t *op3
, uint16_t *op2
,
523 uint16_t *op1
, uint16_t *op0
,
524 uint16_t *oq0
, uint16_t *oq1
,
525 uint16_t *oq2
, uint16_t *oq3
, int bd
) {
527 const uint16_t p3
= *op3
, p2
= *op2
, p1
= *op1
, p0
= *op0
;
528 const uint16_t q0
= *oq0
, q1
= *oq1
, q2
= *oq2
, q3
= *oq3
;
530 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
531 *op2
= ROUND_POWER_OF_TWO(p3
+ p3
+ p3
+ 2 * p2
+ p1
+ p0
+ q0
, 3);
532 *op1
= ROUND_POWER_OF_TWO(p3
+ p3
+ p2
+ 2 * p1
+ p0
+ q0
+ q1
, 3);
533 *op0
= ROUND_POWER_OF_TWO(p3
+ p2
+ p1
+ 2 * p0
+ q0
+ q1
+ q2
, 3);
534 *oq0
= ROUND_POWER_OF_TWO(p2
+ p1
+ p0
+ 2 * q0
+ q1
+ q2
+ q3
, 3);
535 *oq1
= ROUND_POWER_OF_TWO(p1
+ p0
+ q0
+ 2 * q1
+ q2
+ q3
+ q3
, 3);
536 *oq2
= ROUND_POWER_OF_TWO(p0
+ q0
+ q1
+ 2 * q2
+ q3
+ q3
+ q3
, 3);
538 highbd_filter4(mask
, thresh
, op1
, op0
, oq0
, oq1
, bd
);
// High-bitdepth 8-sample loop filter across a horizontal edge for eight
// consecutive columns.  `s` points at q0 and `p` is the pitch in samples.
// Flatness is tested with a base threshold of 1 (scaled by the helper).
void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    highbd_filter8(mask, *thresh, flat,
                   s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                   s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
    ++s;  // next column
  }
}
// Runs vpx_highbd_lpf_horizontal_8_c() twice: once at `s` with the first
// threshold set and once at `s + 8` with the second threshold set.
void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
// High-bitdepth 8-sample loop filter across a vertical edge for eight
// consecutive rows, `pitch` samples apart.  `s` points at q0.
void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int i;

  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    highbd_filter8(mask, *thresh, flat,
                   s - 4, s - 3, s - 2, s - 1,
                   s, s + 1, s + 2, s + 3,
                   bd);
    s += pitch;  // next row
  }
}
// Runs vpx_highbd_lpf_vertical_8_c() twice: once at `s` with the first
// threshold set and once eight rows further down with the second set.
void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                              thresh1, bd);
}
609 static INLINE
void highbd_filter16(int8_t mask
, uint8_t thresh
,
610 uint8_t flat
, uint8_t flat2
,
611 uint16_t *op7
, uint16_t *op6
,
612 uint16_t *op5
, uint16_t *op4
,
613 uint16_t *op3
, uint16_t *op2
,
614 uint16_t *op1
, uint16_t *op0
,
615 uint16_t *oq0
, uint16_t *oq1
,
616 uint16_t *oq2
, uint16_t *oq3
,
617 uint16_t *oq4
, uint16_t *oq5
,
618 uint16_t *oq6
, uint16_t *oq7
, int bd
) {
619 if (flat2
&& flat
&& mask
) {
620 const uint16_t p7
= *op7
;
621 const uint16_t p6
= *op6
;
622 const uint16_t p5
= *op5
;
623 const uint16_t p4
= *op4
;
624 const uint16_t p3
= *op3
;
625 const uint16_t p2
= *op2
;
626 const uint16_t p1
= *op1
;
627 const uint16_t p0
= *op0
;
628 const uint16_t q0
= *oq0
;
629 const uint16_t q1
= *oq1
;
630 const uint16_t q2
= *oq2
;
631 const uint16_t q3
= *oq3
;
632 const uint16_t q4
= *oq4
;
633 const uint16_t q5
= *oq5
;
634 const uint16_t q6
= *oq6
;
635 const uint16_t q7
= *oq7
;
637 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
638 *op6
= ROUND_POWER_OF_TWO(p7
* 7 + p6
* 2 + p5
+ p4
+ p3
+ p2
+ p1
+ p0
+
640 *op5
= ROUND_POWER_OF_TWO(p7
* 6 + p6
+ p5
* 2 + p4
+ p3
+ p2
+ p1
+ p0
+
642 *op4
= ROUND_POWER_OF_TWO(p7
* 5 + p6
+ p5
+ p4
* 2 + p3
+ p2
+ p1
+ p0
+
644 *op3
= ROUND_POWER_OF_TWO(p7
* 4 + p6
+ p5
+ p4
+ p3
* 2 + p2
+ p1
+ p0
+
645 q0
+ q1
+ q2
+ q3
, 4);
646 *op2
= ROUND_POWER_OF_TWO(p7
* 3 + p6
+ p5
+ p4
+ p3
+ p2
* 2 + p1
+ p0
+
647 q0
+ q1
+ q2
+ q3
+ q4
, 4);
648 *op1
= ROUND_POWER_OF_TWO(p7
* 2 + p6
+ p5
+ p4
+ p3
+ p2
+ p1
* 2 + p0
+
649 q0
+ q1
+ q2
+ q3
+ q4
+ q5
, 4);
650 *op0
= ROUND_POWER_OF_TWO(p7
+ p6
+ p5
+ p4
+ p3
+ p2
+ p1
+ p0
* 2 +
651 q0
+ q1
+ q2
+ q3
+ q4
+ q5
+ q6
, 4);
652 *oq0
= ROUND_POWER_OF_TWO(p6
+ p5
+ p4
+ p3
+ p2
+ p1
+ p0
+
653 q0
* 2 + q1
+ q2
+ q3
+ q4
+ q5
+ q6
+ q7
, 4);
654 *oq1
= ROUND_POWER_OF_TWO(p5
+ p4
+ p3
+ p2
+ p1
+ p0
+
655 q0
+ q1
* 2 + q2
+ q3
+ q4
+ q5
+ q6
+ q7
* 2, 4);
656 *oq2
= ROUND_POWER_OF_TWO(p4
+ p3
+ p2
+ p1
+ p0
+
657 q0
+ q1
+ q2
* 2 + q3
+ q4
+ q5
+ q6
+ q7
* 3, 4);
658 *oq3
= ROUND_POWER_OF_TWO(p3
+ p2
+ p1
+ p0
+
659 q0
+ q1
+ q2
+ q3
* 2 + q4
+ q5
+ q6
+ q7
* 4, 4);
660 *oq4
= ROUND_POWER_OF_TWO(p2
+ p1
+ p0
+
661 q0
+ q1
+ q2
+ q3
+ q4
* 2 + q5
+ q6
+ q7
* 5, 4);
662 *oq5
= ROUND_POWER_OF_TWO(p1
+ p0
+
663 q0
+ q1
+ q2
+ q3
+ q4
+ q5
* 2 + q6
+ q7
* 6, 4);
664 *oq6
= ROUND_POWER_OF_TWO(p0
+
665 q0
+ q1
+ q2
+ q3
+ q4
+ q5
+ q6
* 2 + q7
* 7, 4);
667 highbd_filter8(mask
, thresh
, flat
, op3
, op2
, op1
, op0
, oq0
, oq1
, oq2
, oq3
,
// High-bitdepth wide loop filter across a horizontal edge for `8 * count`
// consecutive columns.  `s` points at q0 and `p` is the pitch in samples;
// rows -8..+7 relative to the edge are accessed.
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh,
                                            int count, int bd) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8 * count; ++i) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
    const uint16_t p0 = s[-p];
    const uint16_t q0 = s[0 * p];
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    // Outer flatness: p7..p4 against p0 and q4..q7 against q0.
    const int8_t flat2 = highbd_flat_mask5(
        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                    s, s + 1 * p, s + 2 * p, s + 3 * p,
                    s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
                    bd);
    ++s;  // next column
  }
}
// High-bitdepth wide horizontal-edge filter over 8 columns (count = 1).
void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
}
// High-bitdepth wide horizontal-edge filter over 16 columns (count = 2).
void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh, int bd) {
  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
}
// High-bitdepth wide loop filter across a vertical edge for `count`
// consecutive rows, `p` samples apart.  `s` points at q0; columns -8..+7
// relative to the edge are accessed.
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh,
                                          int count, int bd) {
  int i;

  for (i = 0; i < count; ++i) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                           q0, s[4], s[5], s[6], s[7], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
                    bd);
    s += p;  // next row
  }
}
// High-bitdepth wide vertical-edge filter over 8 rows.
void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
}
759 void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s
, int p
,
760 const uint8_t *blimit
,
761 const uint8_t *limit
,
762 const uint8_t *thresh
,
764 highbd_mb_lpf_vertical_edge_w(s
, p
, blimit
, limit
, thresh
, 16, bd
);
766 #endif // CONFIG_VP9_HIGHBITDEPTH