2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 #include "common/attributes.h"
33 #include "common/intops.h"
35 #include "src/loopfilter.h"
/*
 * loop_filter: filters four positions across one edge.
 *
 * For each of 4 iterations (dst advances by stridea per iteration) the code
 * reads samples on both sides of the edge along strideb: p0, p1, ... at
 * offsets -1, -2, ... and q0, q1, ... at offsets +0, +1, ...; the farthest
 * taps read are p6 (offset -7) and q6 (offset +6).
 *
 *   dst      - pointer to the q0 sample of the first position
 *   E, I, H  - thresholds used by the fm test (E, I) and the hev test (H)
 *   stridea  - step between the 4 filtered positions
 *   strideb  - step between neighbouring taps across the edge
 *   wd       - filter width selector; the visible branches test
 *              wd >= 16, wd >= 8 and wd == 6
 *
 * NOTE(review): the leading integers on many lines look like line numbers
 * carried over from another listing, and they skip values (e.g. 39 -> 42,
 * 43 -> 48, 56 -> 59, 68 -> 74, 129 -> 131, 152 -> 155). Statements such as
 * guards around the tap loads, the return type, braces and the end of the
 * parameter list are therefore probably missing here - confirm against the
 * complete file before relying on this text.
 */
38 loop_filter(pixel
*dst
, int E
, int I
, int H
,
39 const ptrdiff_t stridea
, const ptrdiff_t strideb
, const int wd
// bitdepth_max is not declared in the visible lines; presumably it is
// supplied by the part of the parameter list that is missing - TODO confirm.
42 const int bitdepth_min_8
= bitdepth_from_max(bitdepth_max
) - 8;
// F is the threshold used by the flat8out / flat8in tests below.
43 const int F
= 1 << bitdepth_min_8
;
// Four positions per call; dst steps by stridea each time.
48 for (int i
= 0; i
< 4; i
++, dst
+= stridea
) {
49 int p6
, p5
, p4
, p3
, p2
;
50 int p1
= dst
[strideb
* -2], p0
= dst
[strideb
* -1];
51 int q0
= dst
[strideb
* +0], q1
= dst
[strideb
* +1];
52 int q2
, q3
, q4
, q5
, q6
;
53 int fm
, flat8out
, flat8in
;
// fm: the two inner step sizes are bounded by I and the weighted
// difference across the edge is bounded by E.
55 fm
= abs(p1
- p0
) <= I
&& abs(q1
- q0
) <= I
&&
56 abs(p0
- q0
) * 2 + (abs(p1
- q1
) >> 1) <= E
;
// Third tap on each side, folded into fm with the same I bound.
59 p2
= dst
[strideb
* -3];
60 q2
= dst
[strideb
* +2];
62 fm
&= abs(p2
- p1
) <= I
&& abs(q2
- q1
) <= I
;
// Fourth tap on each side, folded into fm with the same I bound.
65 p3
= dst
[strideb
* -4];
66 q3
= dst
[strideb
* +3];
68 fm
&= abs(p3
- p2
) <= I
&& abs(q3
- q2
) <= I
;
// Outer taps p4..p6 / q4..q6; they are used by flat8out and by the
// wd >= 16 branch.
74 p6
= dst
[strideb
* -7];
75 p5
= dst
[strideb
* -6];
76 p4
= dst
[strideb
* -5];
77 q4
= dst
[strideb
* +4];
78 q5
= dst
[strideb
* +5];
79 q6
= dst
[strideb
* +6];
// flat8out: every outer tap is within F of the nearest edge sample
// (p0 for the p side, q0 for the q side).
81 flat8out
= abs(p6
- p0
) <= F
&& abs(p5
- p0
) <= F
&&
82 abs(p4
- p0
) <= F
&& abs(q4
- q0
) <= F
&&
83 abs(q5
- q0
) <= F
&& abs(q6
- q0
) <= F
;
// flat8in: taps 1 and 2 on each side are within F of p0 / q0.
87 flat8in
= abs(p2
- p0
) <= F
&& abs(p1
- p0
) <= F
&&
88 abs(q1
- q0
) <= F
&& abs(q2
- q0
) <= F
;
// flat8in additionally requires p3 / q3 to be within F of p0 / q0.
91 flat8in
&= abs(p3
- p0
) <= F
&& abs(q3
- q0
) <= F
;
// Widest branch: rewrites offsets -6..+5. The weights in each sum
// add up to 16, and the result is (sum + 8) >> 4.
93 if (wd
>= 16 && (flat8out
& flat8in
)) {
94 dst
[strideb
* -6] = (p6
+ p6
+ p6
+ p6
+ p6
+ p6
* 2 + p5
* 2 +
95 p4
* 2 + p3
+ p2
+ p1
+ p0
+ q0
+ 8) >> 4;
96 dst
[strideb
* -5] = (p6
+ p6
+ p6
+ p6
+ p6
+ p5
* 2 + p4
* 2 +
97 p3
* 2 + p2
+ p1
+ p0
+ q0
+ q1
+ 8) >> 4;
98 dst
[strideb
* -4] = (p6
+ p6
+ p6
+ p6
+ p5
+ p4
* 2 + p3
* 2 +
99 p2
* 2 + p1
+ p0
+ q0
+ q1
+ q2
+ 8) >> 4;
100 dst
[strideb
* -3] = (p6
+ p6
+ p6
+ p5
+ p4
+ p3
* 2 + p2
* 2 +
101 p1
* 2 + p0
+ q0
+ q1
+ q2
+ q3
+ 8) >> 4;
102 dst
[strideb
* -2] = (p6
+ p6
+ p5
+ p4
+ p3
+ p2
* 2 + p1
* 2 +
103 p0
* 2 + q0
+ q1
+ q2
+ q3
+ q4
+ 8) >> 4;
104 dst
[strideb
* -1] = (p6
+ p5
+ p4
+ p3
+ p2
+ p1
* 2 + p0
* 2 +
105 q0
* 2 + q1
+ q2
+ q3
+ q4
+ q5
+ 8) >> 4;
106 dst
[strideb
* +0] = (p5
+ p4
+ p3
+ p2
+ p1
+ p0
* 2 + q0
* 2 +
107 q1
* 2 + q2
+ q3
+ q4
+ q5
+ q6
+ 8) >> 4;
108 dst
[strideb
* +1] = (p4
+ p3
+ p2
+ p1
+ p0
+ q0
* 2 + q1
* 2 +
109 q2
* 2 + q3
+ q4
+ q5
+ q6
+ q6
+ 8) >> 4;
110 dst
[strideb
* +2] = (p3
+ p2
+ p1
+ p0
+ q0
+ q1
* 2 + q2
* 2 +
111 q3
* 2 + q4
+ q5
+ q6
+ q6
+ q6
+ 8) >> 4;
112 dst
[strideb
* +3] = (p2
+ p1
+ p0
+ q0
+ q1
+ q2
* 2 + q3
* 2 +
113 q4
* 2 + q5
+ q6
+ q6
+ q6
+ q6
+ 8) >> 4;
114 dst
[strideb
* +4] = (p1
+ p0
+ q0
+ q1
+ q2
+ q3
* 2 + q4
* 2 +
115 q5
* 2 + q6
+ q6
+ q6
+ q6
+ q6
+ 8) >> 4;
116 dst
[strideb
* +5] = (p0
+ q0
+ q1
+ q2
+ q3
+ q4
* 2 + q5
* 2 +
117 q6
* 2 + q6
+ q6
+ q6
+ q6
+ q6
+ 8) >> 4;
// Medium branch: rewrites offsets -3..+2; weights add up to 8 and
// the result is (sum + 4) >> 3.
118 } else if (wd
>= 8 && flat8in
) {
119 dst
[strideb
* -3] = (p3
+ p3
+ p3
+ 2 * p2
+ p1
+ p0
+ q0
+ 4) >> 3;
120 dst
[strideb
* -2] = (p3
+ p3
+ p2
+ 2 * p1
+ p0
+ q0
+ q1
+ 4) >> 3;
121 dst
[strideb
* -1] = (p3
+ p2
+ p1
+ 2 * p0
+ q0
+ q1
+ q2
+ 4) >> 3;
122 dst
[strideb
* +0] = (p2
+ p1
+ p0
+ 2 * q0
+ q1
+ q2
+ q3
+ 4) >> 3;
123 dst
[strideb
* +1] = (p1
+ p0
+ q0
+ 2 * q1
+ q2
+ q3
+ q3
+ 4) >> 3;
124 dst
[strideb
* +2] = (p0
+ q0
+ q1
+ 2 * q2
+ q3
+ q3
+ q3
+ 4) >> 3;
// wd == 6 branch: rewrites offsets -2..+1 using only p2..q2; weights
// add up to 8 and the result is (sum + 4) >> 3.
125 } else if (wd
== 6 && flat8in
) {
126 dst
[strideb
* -2] = (p2
+ 2 * p2
+ 2 * p1
+ 2 * p0
+ q0
+ 4) >> 3;
127 dst
[strideb
* -1] = (p2
+ 2 * p1
+ 2 * p0
+ 2 * q0
+ q1
+ 4) >> 3;
128 dst
[strideb
* +0] = (p1
+ 2 * p0
+ 2 * q0
+ 2 * q1
+ q2
+ 4) >> 3;
129 dst
[strideb
* +1] = (p0
+ 2 * q0
+ 2 * q1
+ 2 * q2
+ q2
+ 4) >> 3;
// hev is set when either inner step exceeds H.
// NOTE(review): the statement that opens this final branch is not in
// the listing (numbering jumps 129 -> 131) - confirm.
131 const int hev
= abs(p1
- p0
) > H
|| abs(q1
- q0
) > H
;
// iclip_diff passes the bounds -128 << bitdepth_min_8 and
// (128 << bitdepth_min_8) - 1 to iclip(); presumably iclip clamps v to
// that range - verify against its definition.
133 #define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \
134 128 * (1 << bitdepth_min_8) - 1)
// First variant: f is built from both (p1 - q1) and 3 * (q0 - p0);
// only p0 and q0 are rewritten.
// NOTE(review): the condition choosing between the two variants is not
// visible in this listing.
137 int f
= iclip_diff(p1
- q1
), f1
, f2
;
138 f
= iclip_diff(3 * (q0
- p0
) + f
);
140 f1
= imin(f
+ 4, (128 << bitdepth_min_8
) - 1) >> 3;
141 f2
= imin(f
+ 3, (128 << bitdepth_min_8
) - 1) >> 3;
143 dst
[strideb
* -1] = iclip_pixel(p0
+ f2
);
144 dst
[strideb
* +0] = iclip_pixel(q0
- f1
);
// Second variant: f is built from 3 * (q0 - p0) only; p0 and q0 are
// rewritten, then p1 and q1 as well.
146 int f
= iclip_diff(3 * (q0
- p0
)), f1
, f2
;
148 f1
= imin(f
+ 4, (128 << bitdepth_min_8
) - 1) >> 3;
149 f2
= imin(f
+ 3, (128 << bitdepth_min_8
) - 1) >> 3;
151 dst
[strideb
* -1] = iclip_pixel(p0
+ f2
);
152 dst
[strideb
* +0] = iclip_pixel(q0
- f1
);
// NOTE(review): numbering jumps 152 -> 155, so f may be reassigned on
// a line that is missing here before it is applied to p1 / q1 - confirm.
155 dst
[strideb
* -2] = iclip_pixel(p1
+ f
);
156 dst
[strideb
* +1] = iclip_pixel(q1
- f
);
/*
 * loop_filter_h_sb128y_c: walks the bits of vmask[0] | vmask[1] | vmask[2]
 * from the lowest upwards and calls loop_filter() for each step with
 * stridea = PXSTRIDE(stride) and strideb = 1.
 *
 *   dst       - first sample to filter; advanced by 4 * PXSTRIDE(stride)
 *               per mask bit
 *   vmask     - three bit masks; the highest-indexed mask holding the
 *               current bit selects the filter width
 *   l         - per-step level entries; advanced by b4_stride per mask bit
 *   b4_stride - step applied to l per mask bit
 *   lut       - provides the e[] and i[] tables indexed by the level
 *   h         - not referenced in the visible lines
 *
 * NOTE(review): the embedded numbering skips values (166 -> 169, 171 -> 174,
 * 174 -> 176, 179 -> 185), so the end of the parameter list, braces, any
 * early-continue checks and the end of the loop_filter() call are not
 * visible here - confirm against the complete file.
 */
163 static void loop_filter_h_sb128y_c(pixel
*dst
, const ptrdiff_t stride
,
164 const uint32_t *const vmask
,
165 const uint8_t (*l
)[4], ptrdiff_t b4_stride
,
166 const Av1FilterLUT
*lut
, const int h
169 const unsigned vm
= vmask
[0] | vmask
[1] | vmask
[2];
// y is a single-bit cursor; the loop continues while vm has any bit set at
// or above y's position.
170 for (unsigned y
= 1; vm
& ~(y
- 1);
171 y
<<= 1, dst
+= 4 * PXSTRIDE(stride
), l
+= b4_stride
)
// Level: l[0][0], falling back to l[-1][0] when that entry is zero.
174 const int L
= l
[0][0] ? l
[0][0] : l
[-1][0];
176 const int H
= L
>> 4;
177 const int E
= lut
->e
[L
], I
= lut
->i
[L
];
// idx is 2 if vmask[2] has the bit, else 1 if vmask[1] has it, else 0;
// the width passed on is 4 << idx, i.e. 4, 8 or 16.
178 const int idx
= (vmask
[2] & y
) ? 2 : !!(vmask
[1] & y
);
179 loop_filter(dst
, E
, I
, H
, PXSTRIDE(stride
), 1, 4 << idx
/*
 * loop_filter_v_sb128y_c: walks the bits of vmask[0] | vmask[1] | vmask[2]
 * from the lowest upwards and calls loop_filter() for each step with
 * stridea = 1 and strideb = PXSTRIDE(stride).
 *
 *   dst       - first sample to filter; advanced by 4 per mask bit
 *   vmask     - three bit masks; the highest-indexed mask holding the
 *               current bit selects the filter width
 *   l         - per-step level entries; advanced by one entry per mask bit
 *   b4_stride - distance (in entries of l) to the fallback level entry
 *   lut       - provides the e[] and i[] tables indexed by the level
 *   w         - not referenced in the visible lines
 *
 * NOTE(review): the embedded numbering skips values (188 -> 191, 192 -> 194,
 * 194 -> 196, 199 -> 205), so the end of the parameter list, braces, any
 * early-continue checks and the end of the loop_filter() call are not
 * visible here - confirm against the complete file.
 */
185 static void loop_filter_v_sb128y_c(pixel
*dst
, const ptrdiff_t stride
,
186 const uint32_t *const vmask
,
187 const uint8_t (*l
)[4], ptrdiff_t b4_stride
,
188 const Av1FilterLUT
*lut
, const int w
191 const unsigned vm
= vmask
[0] | vmask
[1] | vmask
[2];
// x is a single-bit cursor; the loop continues while vm has any bit set at
// or above x's position.
192 for (unsigned x
= 1; vm
& ~(x
- 1); x
<<= 1, dst
+= 4, l
++) {
// Level: l[0][0], falling back to l[-b4_stride][0] when that entry is zero.
194 const int L
= l
[0][0] ? l
[0][0] : l
[-b4_stride
][0];
196 const int H
= L
>> 4;
197 const int E
= lut
->e
[L
], I
= lut
->i
[L
];
// idx is 2 if vmask[2] has the bit, else 1 if vmask[1] has it, else 0;
// the width passed on is 4 << idx, i.e. 4, 8 or 16.
198 const int idx
= (vmask
[2] & x
) ? 2 : !!(vmask
[1] & x
);
199 loop_filter(dst
, E
, I
, H
, 1, PXSTRIDE(stride
), 4 << idx
/*
 * loop_filter_h_sb128uv_c: walks the bits of vmask[0] | vmask[1] from the
 * lowest upwards and calls loop_filter() for each step with
 * stridea = PXSTRIDE(stride) and strideb = 1. Only two masks are consulted,
 * and the width passed on is 4 or 6.
 *
 *   dst       - first sample to filter; advanced by 4 * PXSTRIDE(stride)
 *               per mask bit
 *   vmask     - bit masks; only entries 0 and 1 are read here
 *   l         - per-step level entries; advanced by b4_stride per mask bit
 *   b4_stride - step applied to l per mask bit
 *   lut       - provides the e[] and i[] tables indexed by the level
 *   h         - not referenced in the visible lines
 *
 * NOTE(review): the embedded numbering skips values (208 -> 211, 213 -> 216,
 * 216 -> 218, 221 -> 227), so the end of the parameter list, braces, any
 * early-continue checks and the end of the loop_filter() call are not
 * visible here - confirm against the complete file.
 */
205 static void loop_filter_h_sb128uv_c(pixel
*dst
, const ptrdiff_t stride
,
206 const uint32_t *const vmask
,
207 const uint8_t (*l
)[4], ptrdiff_t b4_stride
,
208 const Av1FilterLUT
*lut
, const int h
211 const unsigned vm
= vmask
[0] | vmask
[1];
// y is a single-bit cursor; the loop continues while vm has any bit set at
// or above y's position.
212 for (unsigned y
= 1; vm
& ~(y
- 1);
213 y
<<= 1, dst
+= 4 * PXSTRIDE(stride
), l
+= b4_stride
)
// Level: l[0][0], falling back to l[-1][0] when that entry is zero.
216 const int L
= l
[0][0] ? l
[0][0] : l
[-1][0];
218 const int H
= L
>> 4;
219 const int E
= lut
->e
[L
], I
= lut
->i
[L
];
// idx is 1 when vmask[1] has the bit, else 0; width is 4 + 2 * idx (4 or 6).
220 const int idx
= !!(vmask
[1] & y
);
221 loop_filter(dst
, E
, I
, H
, PXSTRIDE(stride
), 1, 4 + 2 * idx
/*
 * loop_filter_v_sb128uv_c: walks the bits of vmask[0] | vmask[1] from the
 * lowest upwards and calls loop_filter() for each step with stridea = 1 and
 * strideb = PXSTRIDE(stride). Only two masks are consulted, and the width
 * passed on is 4 or 6.
 *
 *   dst       - first sample to filter; advanced by 4 per mask bit
 *   vmask     - bit masks; only entries 0 and 1 are read here
 *   l         - per-step level entries; advanced by one entry per mask bit
 *   b4_stride - distance (in entries of l) to the fallback level entry
 *   lut       - provides the e[] and i[] tables indexed by the level
 *   w         - not referenced in the visible lines
 *
 * NOTE(review): the embedded numbering skips values (230 -> 233, 234 -> 236,
 * 236 -> 238, 241 -> 248), so the end of the parameter list, braces, any
 * early-continue checks and the end of the loop_filter() call are not
 * visible here - confirm against the complete file.
 */
227 static void loop_filter_v_sb128uv_c(pixel
*dst
, const ptrdiff_t stride
,
228 const uint32_t *const vmask
,
229 const uint8_t (*l
)[4], ptrdiff_t b4_stride
,
230 const Av1FilterLUT
*lut
, const int w
233 const unsigned vm
= vmask
[0] | vmask
[1];
// x is a single-bit cursor; the loop continues while vm has any bit set at
// or above x's position.
234 for (unsigned x
= 1; vm
& ~(x
- 1); x
<<= 1, dst
+= 4, l
++) {
// Level: l[0][0], falling back to l[-b4_stride][0] when that entry is zero.
236 const int L
= l
[0][0] ? l
[0][0] : l
[-b4_stride
][0];
238 const int H
= L
>> 4;
239 const int E
= lut
->e
[L
], I
= lut
->i
[L
];
// idx is 1 when vmask[1] has the bit, else 0; width is 4 + 2 * idx (4 or 6).
240 const int idx
= !!(vmask
[1] & x
);
241 loop_filter(dst
, E
, I
, H
, 1, PXSTRIDE(stride
), 4 + 2 * idx
248 #if ARCH_AARCH64 || ARCH_ARM
249 #include "src/arm/loopfilter.h"
250 #elif ARCH_LOONGARCH64
251 #include "src/loongarch/loopfilter.h"
253 #include "src/ppc/loopfilter.h"
255 #include "src/x86/loopfilter.h"
/*
 * dav1d_loop_filter_dsp_init: fills c->loop_filter_sb[][] with the C
 * implementations defined in this file, then calls an architecture-specific
 * init routine on the same context.
 *
 * As assigned below, the first index picks the "y" (0) or "uv" (1) functions
 * and the second index picks the "h" (0) or "v" (1) function.
 *
 * NOTE(review): the embedded numbering skips values (263 -> 266, 269 -> 271,
 * 271 -> 273), so the preprocessor guards for the ppc and x86 calls, any
 * outer guard and the closing #endif / brace are not visible here - confirm
 * against the complete file. Presumably the arch-specific routines replace
 * some of the pointers set here; verify against their definitions.
 */
259 COLD
void bitfn(dav1d_loop_filter_dsp_init
)(Dav1dLoopFilterDSPContext
*const c
) {
260 c
->loop_filter_sb
[0][0] = loop_filter_h_sb128y_c
;
261 c
->loop_filter_sb
[0][1] = loop_filter_v_sb128y_c
;
262 c
->loop_filter_sb
[1][0] = loop_filter_h_sb128uv_c
;
263 c
->loop_filter_sb
[1][1] = loop_filter_v_sb128uv_c
;
// Architecture-specific initialisation, selected at compile time.
266 #if ARCH_AARCH64 || ARCH_ARM
267 loop_filter_dsp_init_arm(c
);
268 #elif ARCH_LOONGARCH64
269 loop_filter_dsp_init_loongarch(c
);
271 loop_filter_dsp_init_ppc(c
);
273 loop_filter_dsp_init_x86(c
);