vpx_dsp/mips/loopfilter_msa.h
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#ifndef VPX_DSP_LOOPFILTER_MSA_H_
#define VPX_DSP_LOOPFILTER_MSA_H_

#include "vpx_dsp/mips/macros_msa.h"
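
/* VP9 four-tap loop filter on (p1, p0, q0, q1), vectorized with MSA.
 * Pixels are flipped to signed range (XOR with 0x80), a clamped filter
 * value is built from (p1 - q1) and 3 * (q0 - p0), gated by hev_in and
 * mask_in, and applied to the four pixel vectors.  Only the right
 * (__msa_ilvr_b) half is widened to 16 bits, so eight lanes of results
 * are computed and duplicated into both halves by the final pack. */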
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) { \
  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
  v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
  v8i16 q0_sub_p0_r, filt_r, cnst3h; \
\
  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
\
  filt = __msa_subs_s_b(p1_m, q1_m); \
  filt = filt & (v16i8)hev_in; \
  q0_sub_p0 = q0_m - p0_m; \
  filt_sign = __msa_clti_s_b(filt, 0); \
\
  cnst3h = __msa_ldi_h(3); \
  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
  filt_r += q0_sub_p0_r; \
  filt_r = __msa_sat_s_h(filt_r, 7); \
\
  /* combine left and right part */ \
  filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
\
  filt = filt & (v16i8)mask_in; \
  cnst4b = __msa_ldi_b(4); \
  filt1 = __msa_adds_s_b(filt, cnst4b); \
  filt1 >>= 3; \
\
  cnst3b = __msa_ldi_b(3); \
  filt2 = __msa_adds_s_b(filt, cnst3b); \
  filt2 >>= 3; \
\
  q0_m = __msa_subs_s_b(q0_m, filt1); \
  q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
  p0_m = __msa_adds_s_b(p0_m, filt2); \
  p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
\
  filt = __msa_srari_b(filt1, 1); \
  hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
  filt = filt & (v16i8)hev_in; \
\
  q1_m = __msa_subs_s_b(q1_m, filt); \
  q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
  p1_m = __msa_adds_s_b(p1_m, filt); \
  p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}
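
/* Same four-tap filter as VP9_LPF_FILTER4_8W, but both the right and
 * left interleaved halves are widened to 16 bits and saturated, so all
 * sixteen byte lanes are filtered. */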
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) { \
  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
  v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
  v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
\
  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
\
  filt = __msa_subs_s_b(p1_m, q1_m); \
\
  filt = filt & (v16i8)hev_in; \
\
  q0_sub_p0 = q0_m - p0_m; \
  filt_sign = __msa_clti_s_b(filt, 0); \
\
  cnst3h = __msa_ldi_h(3); \
  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
  filt_r += q0_sub_p0_r; \
  filt_r = __msa_sat_s_h(filt_r, 7); \
\
  q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
  q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
  filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
  filt_l += q0_sub_p0_l; \
  filt_l = __msa_sat_s_h(filt_l, 7); \
\
  filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
  filt = filt & (v16i8)mask_in; \
\
  cnst4b = __msa_ldi_b(4); \
  filt1 = __msa_adds_s_b(filt, cnst4b); \
  filt1 >>= 3; \
\
  cnst3b = __msa_ldi_b(3); \
  filt2 = __msa_adds_s_b(filt, cnst3b); \
  filt2 >>= 3; \
\
  q0_m = __msa_subs_s_b(q0_m, filt1); \
  q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
  p0_m = __msa_adds_s_b(p0_m, filt2); \
  p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
\
  filt = __msa_srari_b(filt1, 1); \
  hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
  filt = filt & (v16i8)hev_in; \
\
  q1_m = __msa_subs_s_b(q1_m, filt); \
  q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
  p1_m = __msa_adds_s_b(p1_m, filt); \
  p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}
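
/* Flatness test for the strong filter: flat_out lanes become all-ones
 * where |p2 - p0|, |q2 - q0|, |p3 - p0| and |q3 - q0| are all <= 1.
 * Note that the final AND references `mask`, which is not a macro
 * parameter: a vector named mask must be in scope at the call site. */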
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \
  v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
  v16u8 zero_in = { 0 }; \
\
  tmp = __msa_ori_b(zero_in, 1); \
  p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
  q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
  p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
  q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
\
  p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
  flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
  p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
  flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
\
  flat_out = (tmp < (v16u8)flat_out); \
  flat_out = __msa_xori_b(flat_out, 0xff); \
  flat_out = flat_out & (mask); \
}
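
/* Extends the flatness test to the outer taps: flat2_out lanes become
 * all-ones where |p4..p7 - p0| and |q4..q7 - q0| are all <= 1, ANDed
 * with flat_in. */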
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
                  q5_in, q6_in, q7_in, flat_in, flat2_out) { \
  v16u8 tmp, zero_in = { 0 }; \
  v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
  v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
\
  tmp = __msa_ori_b(zero_in, 1); \
  p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
  q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
  p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
  q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
  p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
  q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
  p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
  q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
\
  p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
  flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
  flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
  p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
  flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
  p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
  flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
\
  flat2_out = (tmp < (v16u8)flat2_out); \
  flat2_out = __msa_xori_b(flat2_out, 0xff); \
  flat2_out = flat2_out & flat_in; \
}
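
/* Seven-tap "flat" filter producing p2..q2 as rounded averages of their
 * neighborhood, e.g. p2_out = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3.
 * Inputs are expected pre-widened to 16 bits (v8u16). */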
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
                    q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out) { \
  v8u16 tmp0, tmp1, tmp2; \
\
  tmp2 = p2_in + p1_in + p0_in; \
  tmp0 = p3_in << 1; \
\
  tmp0 = tmp0 + tmp2 + q0_in; \
  tmp1 = tmp0 + p3_in + p2_in; \
  p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
  tmp1 = tmp0 + p1_in + q1_in; \
  p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
  tmp1 = q2_in + q1_in + q0_in; \
  tmp2 = tmp2 + tmp1; \
  tmp0 = tmp2 + (p0_in); \
  tmp0 = tmp0 + (p3_in); \
  p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \
\
  tmp0 = q2_in + q3_in; \
  tmp0 = p0_in + tmp1 + tmp0; \
  tmp1 = q3_in + q3_in; \
  tmp1 = tmp1 + tmp0; \
  q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
  tmp0 = tmp2 + q3_in; \
  tmp1 = tmp0 + q0_in; \
  q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
  tmp1 = tmp0 - p2_in; \
  tmp0 = q1_in + q3_in; \
  tmp1 = tmp0 + tmp1; \
  q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
}
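
/* Builds the filter on/off mask and the high-edge-variance (hev) mask.
 * hev_out is set where max(|p1 - p0|, |q1 - q0|) > thresh_in; mask_out
 * is set where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit_in and every
 * neighboring-pixel difference is <= limit_in.  flat_out carries
 * max(|p1 - p0|, |q1 - q0|) out for reuse by the flatness checks. */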
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
                     q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, \
                     hev_out, mask_out, flat_out) { \
  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
\
  /* absolute subtraction of pixel values */ \
  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
\
  /* calculation of hev */ \
  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
  hev_out = thresh_in < (v16u8)flat_out; \
\
  /* calculation of mask */ \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
  p1_asub_q1_m >>= 1; \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
\
  mask_out = b_limit_in < p0_asub_q0_m; \
  mask_out = __msa_max_u_b(flat_out, mask_out); \
  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
\
  mask_out = limit_in < (v16u8)mask_out; \
  mask_out = __msa_xori_b(mask_out, 0xff); \
}
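
/* Illustrative only (not part of the original header): a typical 4-pixel
 * filter path chains the mask computation and the filter, with p3..q3 as
 * v16u8 vectors already loaded and limit/b_limit/thresh splatted across
 * all lanes:
 *
 *   v16u8 mask, hev, flat;
 *   v16u8 p1_out, p0_out, q0_out, q1_out;
 *   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
 *                hev, mask, flat);
 *   VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out,
 *                      q0_out, q1_out);
 */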

#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */