/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2
#include "vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);

prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);

extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;

#if HAVE_MMX
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
}

/* Vertical MB Filtering */
void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
}

/* Horizontal B Filtering */
void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
                            unsigned char *u_ptr, unsigned char *v_ptr,
                            int y_stride, int uv_stride,
                            struct loop_filter_info *lfi) {
}

void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
                                             y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
                                             y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
                                             y_stride, blimit);
}

/* Vertical B Filtering */
void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
                            unsigned char *u_ptr, unsigned char *v_ptr,
                            int y_stride, int uv_stride,
                            struct loop_filter_info *lfi) {
  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);

  if (v_ptr)
    vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
}

void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
}
#endif

#if HAVE_SSE2

void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);

  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);

  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
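
  // Load the ten rows p4..q4 around the horizontal edge and stash them in the
  // aligned ap[]/aq[] scratch arrays for the flat/wide-flat passes below.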
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

  _mm_store_si128((__m128i *)ap[4], p4);
  _mm_store_si128((__m128i *)ap[3], p3);
  _mm_store_si128((__m128i *)ap[2], p2);
  _mm_store_si128((__m128i *)ap[1], p1);
  _mm_store_si128((__m128i *)ap[0], p0);
  _mm_store_si128((__m128i *)aq[4], q4);
  _mm_store_si128((__m128i *)aq[3], q3);
  _mm_store_si128((__m128i *)aq[2], q2);
  _mm_store_si128((__m128i *)aq[1], q1);
  _mm_store_si128((__m128i *)aq[0], q0);
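
  // Build the filter mask (mask) and high-edge-variance mask (hev), and start
  // the flat mask from |p1 - p0| and |q1 - q0|.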
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done
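
    // flat selects the 8-tap flat filter (p3..q3 all within one of p0/q0);
    // flat2 additionally requires p7..p4 and q4..q7 to be close to p0/q0 and
    // gates the wide (16-sample) filter.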
    {
      __m128i work;
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                       _mm_subs_epu8(p0, p2)),
                          _mm_or_si128(_mm_subs_epu8(q2, q0),
                                       _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                       _mm_subs_epu8(p0, p3)),
                          _mm_or_si128(_mm_subs_epu8(q3, q0),
                                       _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                       _mm_subs_epu8(p0, p4)),
                          _mm_or_si128(_mm_subs_epu8(q4, q0),
                                       _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
                                        _mm_subs_epu8(p0, p5)),
                           _mm_or_si128(_mm_subs_epu8(q5, q0),
                                        _mm_subs_epu8(q0, q5)));
      _mm_store_si128((__m128i *)ap[5], p5);
      _mm_store_si128((__m128i *)aq[5], q5);
      flat2 = _mm_max_epu8(work, flat2);
      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
                                       _mm_subs_epu8(p0, p6)),
                          _mm_or_si128(_mm_subs_epu8(q6, q0),
                                       _mm_subs_epu8(q0, q6)));
      _mm_store_si128((__m128i *)ap[6], p6);
      _mm_store_si128((__m128i *)aq[6], q6);
      flat2 = _mm_max_epu8(work, flat2);

      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
                                       _mm_subs_epu8(p0, p7)),
                          _mm_or_si128(_mm_subs_epu8(q7, q0),
                                       _mm_subs_epu8(q0, q7)));
      _mm_store_si128((__m128i *)ap[7], p7);
      _mm_store_si128((__m128i *)aq[7], q7);
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i temp_flat2 = flat2;
      unsigned char *src = s;
      int i = 0;
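
      // Process the 16 pixels in two batches of 8, widening each sample to
      // 16 bits so the running sums a, b, c for the rounded averages
      // (>> 3 for flat, >> 4 for wide flat) cannot overflow.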
      do {
        __m128i workp_shft;
        __m128i a, b, c;

        unsigned int off = i * 8;
        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);

        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));

        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

        _mm_storel_epi64((__m128i *)&flat_op[2][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[6][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q1, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
        _mm_storel_epi64((__m128i *)&flat_op[1][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[5][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q2, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
        _mm_storel_epi64((__m128i *)&flat_op[0][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[4][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q3, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
        _mm_storel_epi64((__m128i *)&flat_oq[0][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[3][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
        _mm_storel_epi64((__m128i *)&flat_oq[1][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(q4, c);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[2][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
        _mm_storel_epi64((__m128i *)&flat_oq[2][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));
        a = _mm_add_epi16(q5, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[1][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q6, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[0][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[0][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[1][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[2][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[3][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[4][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[5][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[6][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
        src += 8;
      } while (++i < 2);

      // wide flat
      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    }
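
    // Per pixel, select between the standard filter output (ps1/ps0/qs0/qs1)
    // and the flat-filter output using the flat mask; the blended p2..q2 rows
    // stay in the flat_op/flat_oq scratch arrays for the flat2 selection below.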
    work_a = _mm_load_si128((__m128i *)ap[2]);
    p2 = _mm_load_si128((__m128i *)flat_op[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_store_si128((__m128i *)flat_op[2], p2);

    p1 = _mm_load_si128((__m128i *)flat_op[1]);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_store_si128((__m128i *)flat_op[1], p1);

    p0 = _mm_load_si128((__m128i *)flat_op[0]);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_store_si128((__m128i *)flat_op[0], p0);

    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_store_si128((__m128i *)flat_oq[0], q0);

    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_store_si128((__m128i *)flat_oq[1], q1);

    work_a = _mm_load_si128((__m128i *)aq[2]);
    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_store_si128((__m128i *)flat_oq[2], q2);

    // write out op6 - op3
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)ap[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }

    work_a = _mm_load_si128((__m128i *)flat_op[2]);
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    work_a = _mm_load_si128((__m128i *)flat_op[1]);
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    work_a = _mm_load_si128((__m128i *)flat_op[0]);
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq7
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)aq[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}

void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  __m128i mask, hev, flat;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
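  // Build the filter, hev and flat masks for the 8-tap (flat) filter.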
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;
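
    // Compute the six flat-filter outputs for the 16 pixels in two batches of
    // 8, widening to 16 bits for the rounded (>> 3) averages.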
    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);
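
    // Choose, per pixel, between the standard filter result and the flat
    // filter result using the flat mask, then write the six filtered rows
    // p2..q2 back around the edge.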
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}

void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                                               int p,
                                               const unsigned char *_blimit,
                                               const unsigned char *_limit,
                                               const unsigned char *_thresh,
                                               unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);

  /* Read source */
  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 5 * p)));
  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 4 * p)));
  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 3 * p)));
  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 2 * p)));
  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 1 * p)));
  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
                                        _mm_loadl_epi64((__m128i *)(v)));
  const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 1 * p)));
  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 2 * p)));
  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 3 * p)));
  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 4 * p)));
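
  // Each 16-byte row of the scratch buffer holds the u row in its low half
  // and the v row in its high half, so the single call below filters both
  // chroma planes at once.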
  _mm_store_si128((__m128i *)(src), p4);
  _mm_store_si128((__m128i *)(src + 16), p3);
  _mm_store_si128((__m128i *)(src + 32), p2);
  _mm_store_si128((__m128i *)(src + 48), p1);
  _mm_store_si128((__m128i *)(src + 64), p0);
  _mm_store_si128((__m128i *)(src + 80), q0);
  _mm_store_si128((__m128i *)(src + 96), q1);
  _mm_store_si128((__m128i *)(src + 112), q2);
  _mm_store_si128((__m128i *)(src + 128), q3);
  _mm_store_si128((__m128i *)(src + 144), q4);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
                                         _thresh);

  /* Store result */
  _mm_storel_epi64((__m128i *)(u - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 32)));
  _mm_storel_epi64((__m128i *)(u - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 48)));
  _mm_storel_epi64((__m128i *)(u - p),
                   _mm_loadl_epi64((__m128i *)(src + 64)));
  _mm_storel_epi64((__m128i *)u,
                   _mm_loadl_epi64((__m128i *)(src + 80)));
  _mm_storel_epi64((__m128i *)(u + p),
                   _mm_loadl_epi64((__m128i *)(src + 96)));
  _mm_storel_epi64((__m128i *)(u + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 112)));

  _mm_storel_epi64((__m128i *)(v - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 40)));
  _mm_storel_epi64((__m128i *)(v - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 56)));
  _mm_storel_epi64((__m128i *)(v - p),
                   _mm_loadl_epi64((__m128i *)(src + 72)));
  _mm_storel_epi64((__m128i *)v,
                   _mm_loadl_epi64((__m128i *)(src + 88)));
  _mm_storel_epi64((__m128i *)(v + p),
                   _mm_loadl_epi64((__m128i *)(src + 104)));
  _mm_storel_epi64((__m128i *)(v + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 120)));
}
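
// Transpose a 16x8 region (16 rows of 8 pixels, read as two 8-row halves from
// in0 and in1) into an 8x16 block at out, so vertical edges can be filtered
// with the horizontal-edge code above.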
static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
                                   int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
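
// Transpose each listed 8x8 block from src[] (stride in_p) into the matching
// dst[] entry (stride out_p).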
static __inline void transpose(unsigned char *src[], int in_p,
                               unsigned char *dst[], int out_p,
                               int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
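
// The vertical-edge filters below transpose 16x16 pixels into a scratch
// buffer, run the horizontal-edge filters on the transposed rows, then
// transpose the filtered columns back into the frame.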
void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;
  src[2] = t_dst + 8;
  src[3] = t_dst + 8 * 16 + 8;

  dst[0] = s - 8;
  dst[1] = s - 8 + 8;
  dst[2] = s - 8 + p * 8;
  dst[3] = s - 8 + p * 8 + 8;

  /* Transpose 16x16 */
  transpose(src, 16, dst, p, 4);
}

void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
                                             int p,
                                             const unsigned char *blimit,
                                             const unsigned char *limit,
                                             const unsigned char *thresh,
                                             unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(u - 8, v - 8, p, t_dst, 16);
  transpose8x16(u, v, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = u - 5;
  dst[1] = v - 5;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

/* Horizontal MB filtering */
void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
  vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim,
                                         lfi->lim, lfi->hev_thr);

  /* u,v */
  if (u_ptr)
    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                              lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                        unsigned char *v_ptr, int y_stride, int uv_stride,
                        struct loop_filter_info *lfi) {
  vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
                                    lfi->mblim, lfi->lim, lfi->hev_thr);

  /* u,v */
  if (u_ptr)
    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                              lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr, int y_stride,
                                int uv_stride, struct loop_filter_info *lfi) {
  vp9_mbloop_filter_horizontal_edge_sse2(
      y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr,
                                            v_ptr + 4 * uv_stride);
}

/* Vertical MB Filtering */
void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride,
                              int uv_stride, struct loop_filter_info *lfi) {
  vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                       lfi->hev_thr);

  /* u,v */
  if (u_ptr)
    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                            lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                        unsigned char *v_ptr, int y_stride, int uv_stride,
                        struct loop_filter_info *lfi) {
  vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
                                  lfi->mblim, lfi->lim, lfi->hev_thr);

  /* u,v */
  if (u_ptr)
    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                            lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr, int y_stride,
                                int uv_stride, struct loop_filter_info *lfi) {
  vp9_mbloop_filter_vertical_edge_sse2(
      y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
                                          lfi->blim, lfi->lim, lfi->hev_thr,
                                          v_ptr + 4);
}

/* Horizontal B Filtering */
void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr,
                                            v_ptr + 4 * uv_stride);
}

void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
                                              y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
                                              y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
                                              y_stride, blimit);
}

/* Vertical B Filtering */
void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
                                          lfi->blim, lfi->lim, lfi->hev_thr,
                                          v_ptr + 4);
}

void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
}

#endif