/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2
#include "vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);

prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);

extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
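
/* prototype_loopfilter() declares the edge-filter kernels that are
 * implemented outside this file (in x86 assembly); the *_uv variants are
 * declared through the loop_filter_uvfunction typedef so that a single
 * call can cover both chroma planes. */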
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
}

/* Vertical MB Filtering */
void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
}

/* Horizontal B Filtering */
void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
                            unsigned char *u_ptr, unsigned char *v_ptr,
                            int y_stride, int uv_stride,
                            struct loop_filter_info *lfi) {
}

void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
                                             y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
                                             y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
                                             y_stride, blimit);
}

/* Vertical B Filtering */
void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
                            unsigned char *u_ptr, unsigned char *v_ptr,
                            int y_stride, int uv_stride,
                            struct loop_filter_info *lfi) {
  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);

  if (v_ptr)
    vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
}

void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
}

void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);

  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);

  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
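  /* Byte-broadcast trick: multiplying the threshold byte by 0x01010101
   * replicates it through a 32-bit word, and _mm_shuffle_epi32(..., 0)
   * then splats that word so all 16 byte lanes hold the same value. */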

  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

  _mm_store_si128((__m128i *)ap[4], p4);
  _mm_store_si128((__m128i *)ap[3], p3);
  _mm_store_si128((__m128i *)ap[2], p2);
  _mm_store_si128((__m128i *)ap[1], p1);
  _mm_store_si128((__m128i *)ap[0], p0);
  _mm_store_si128((__m128i *)aq[4], q4);
  _mm_store_si128((__m128i *)aq[3], q3);
  _mm_store_si128((__m128i *)aq[2], q2);
  _mm_store_si128((__m128i *)aq[1], q1);
  _mm_store_si128((__m128i *)aq[0], q0);
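  /* The rows around the edge are cached in the aligned ap[]/aq[] arrays so
   * the wide-flat pass below can re-read them 8 pixels at a time with
   * aligned loads. */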

  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }
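  /* At this point each byte lane of mask is 0xff where the edge passes the
   * activity tests (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit and all
   * first-difference magnitudes <= limit), and hev flags lanes whose p1/q1
   * deltas exceed thresh. */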

  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
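    /* The block above is the standard 4-tap filter in the sign-offset
     * domain: XOR with 0x80 maps uint8 to int8, filter1/filter2 apply the
     * (f + 4) >> 3 and (f + 3) >> 3 steps to q0/p0 (the signed shift is
     * emulated with a 16-bit shift plus masking, since SSE2 has no 8-bit
     * shift), and the outer pixels p1/q1 move by (filter1 + 1) >> 1 only
     * on lanes where hev is not set. */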

    {
      __m128i work;

      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                       _mm_subs_epu8(p0, p2)),
                          _mm_or_si128(_mm_subs_epu8(q2, q0),
                                       _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                       _mm_subs_epu8(p0, p3)),
                          _mm_or_si128(_mm_subs_epu8(q3, q0),
                                       _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                       _mm_subs_epu8(p0, p4)),
                          _mm_or_si128(_mm_subs_epu8(q4, q0),
                                       _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
                                        _mm_subs_epu8(p0, p5)),
                           _mm_or_si128(_mm_subs_epu8(q5, q0),
                                        _mm_subs_epu8(q0, q5)));
      _mm_store_si128((__m128i *)ap[5], p5);
      _mm_store_si128((__m128i *)aq[5], q5);
      flat2 = _mm_max_epu8(work, flat2);
      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
                                       _mm_subs_epu8(p0, p6)),
                          _mm_or_si128(_mm_subs_epu8(q6, q0),
                                       _mm_subs_epu8(q0, q6)));
      _mm_store_si128((__m128i *)ap[6], p6);
      _mm_store_si128((__m128i *)aq[6], q6);
      flat2 = _mm_max_epu8(work, flat2);

      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
                                       _mm_subs_epu8(p0, p7)),
                          _mm_or_si128(_mm_subs_epu8(q7, q0),
                                       _mm_subs_epu8(q0, q7)));
      _mm_store_si128((__m128i *)ap[7], p7);
      _mm_store_si128((__m128i *)aq[7], q7);
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }
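    /* flat is 0xff where p1..p3 and q1..q3 all lie within 1 of p0/q0 (use
     * the 8-wide smoothing filter); flat2 additionally requires p4..p7 and
     * q4..q7 to be that flat (use the 16-wide filter). Both are ANDed with
     * mask. */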

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i temp_flat2 = flat2;
      unsigned char *src = s;

      do {
        __m128i workp_shft;
        __m128i a, b, c;

        unsigned int off = i * 8;
        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
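        /* What follows evaluates the 7-tap (flat) and 15-tap (flat2)
         * smoothing filters with running sums: b carries the narrow-window
         * terms (rounded by `four`, output (a + b) >> 3) and c the
         * wide-window terms (rounded by `eight`, output (a + c) >> 4);
         * each step slides the window by subtracting the outgoing taps and
         * adding the incoming ones. */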

        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));

        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

        _mm_storel_epi64((__m128i *)&flat_op[2][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          _mm_srli_epi16(_mm_add_epi16(a, b), 3)));

        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[6][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q1, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
        _mm_storel_epi64((__m128i *)&flat_op[1][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          _mm_srli_epi16(_mm_add_epi16(a, b), 3)));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[5][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q2, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
        _mm_storel_epi64((__m128i *)&flat_op[0][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          _mm_srli_epi16(_mm_add_epi16(a, b), 3)));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[4][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q3, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
        _mm_storel_epi64((__m128i *)&flat_oq[0][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          _mm_srli_epi16(_mm_add_epi16(a, b), 3)));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[3][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
        _mm_storel_epi64((__m128i *)&flat_oq[1][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          _mm_srli_epi16(_mm_add_epi16(a, b), 3)));

        c = _mm_add_epi16(q4, c);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[2][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
        _mm_storel_epi64((__m128i *)&flat_oq[2][i * 8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          _mm_srli_epi16(_mm_add_epi16(a, b), 3)));

        a = _mm_add_epi16(q5, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[1][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q6, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[0][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[0][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[1][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[2][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[3][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[4][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[5][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[6][i * 8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
        src += 8;
      } while (++i < 2);
    }
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    work_a = _mm_load_si128((__m128i *)ap[2]);
    p2 = _mm_load_si128((__m128i *)flat_op[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_store_si128((__m128i *)flat_op[2], p2);

    p1 = _mm_load_si128((__m128i *)flat_op[1]);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_store_si128((__m128i *)flat_op[1], p1);

    p0 = _mm_load_si128((__m128i *)flat_op[0]);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_store_si128((__m128i *)flat_op[0], p0);

    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_store_si128((__m128i *)flat_oq[0], q0);

    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_store_si128((__m128i *)flat_oq[1], q1);

    work_a = _mm_load_si128((__m128i *)aq[2]);
    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_store_si128((__m128i *)flat_oq[2], q2);

    // write out op6 - op3
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)ap[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }

    work_a = _mm_load_si128((__m128i *)flat_op[2]);
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    work_a = _mm_load_si128((__m128i *)flat_op[1]);
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    work_a = _mm_load_si128((__m128i *)flat_op[0]);
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq7
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)aq[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}
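
/* Callers pass the per-segment thresholds from struct loop_filter_info,
 * e.g. (as vp9_lpf_mbh_w_sse2 below does):
 *
 *   vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
 *                                     lfi->mblim, lfi->lim, lfi->hev_thr);
 */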

void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  __m128i mask, hev, flat;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
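  /* The loop above is the 8-in-wide "flat" path: each half of the 16
   * pixels is widened to 16 bits and the VP9 7-tap smoothing outputs
   * op2..oq2 are computed as (sum + 4) >> 3, again via the running sums
   * workp_a/workp_b. */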
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}

void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                                               int p,
                                               const unsigned char *_blimit,
                                               const unsigned char *_limit,
                                               const unsigned char *_thresh,
                                               unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);

  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 5 * p)));
  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 4 * p)));
  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 3 * p)));
  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 2 * p)));
  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 1 * p)));
  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
                                        _mm_loadl_epi64((__m128i *)(v)));
  const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 1 * p)));
  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 2 * p)));
  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 3 * p)));
  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 4 * p)));
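  /* Each 16-byte row packs the 8 u pixels in the low half and the 8 v
   * pixels in the high half, so a single pass of the 16-wide horizontal
   * filter below handles both chroma planes at once. */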

  _mm_store_si128((__m128i *)(src), p4);
  _mm_store_si128((__m128i *)(src + 16), p3);
  _mm_store_si128((__m128i *)(src + 32), p2);
  _mm_store_si128((__m128i *)(src + 48), p1);
  _mm_store_si128((__m128i *)(src + 64), p0);
  _mm_store_si128((__m128i *)(src + 80), q0);
  _mm_store_si128((__m128i *)(src + 96), q1);
  _mm_store_si128((__m128i *)(src + 112), q2);
  _mm_store_si128((__m128i *)(src + 128), q3);
  _mm_store_si128((__m128i *)(src + 144), q4);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
                                         _thresh);

  _mm_storel_epi64((__m128i *)(u - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 32)));
  _mm_storel_epi64((__m128i *)(u - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 48)));
  _mm_storel_epi64((__m128i *)(u - p),
                   _mm_loadl_epi64((__m128i *)(src + 64)));
  _mm_storel_epi64((__m128i *)u,
                   _mm_loadl_epi64((__m128i *)(src + 80)));
  _mm_storel_epi64((__m128i *)(u + p),
                   _mm_loadl_epi64((__m128i *)(src + 96)));
  _mm_storel_epi64((__m128i *)(u + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 112)));

  _mm_storel_epi64((__m128i *)(v - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 40)));
  _mm_storel_epi64((__m128i *)(v - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 56)));
  _mm_storel_epi64((__m128i *)(v - p),
                   _mm_loadl_epi64((__m128i *)(src + 72)));
  _mm_storel_epi64((__m128i *)v,
                   _mm_loadl_epi64((__m128i *)(src + 88)));
  _mm_storel_epi64((__m128i *)(v + p),
                   _mm_loadl_epi64((__m128i *)(src + 104)));
  _mm_storel_epi64((__m128i *)(v + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 120)));
}

static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
                                   int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

static __inline void transpose(unsigned char *src[], int in_p,
                               unsigned char *dst[], int out_p,
                               int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
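
/* The vertical-edge filters below reuse the horizontal kernels: the 16
 * rows around the edge are transposed into the 16-byte-stride scratch
 * buffer t_dst, filtered with the horizontal code, and the columns that
 * may have changed are transposed back into the frame. */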

void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;
  src[2] = t_dst + 8;
  src[3] = t_dst + 8 * 16 + 8;

  dst[0] = s - 8;
  dst[1] = s - 8 + 8;
  dst[2] = s - 8 + p * 8;
  dst[3] = s - 8 + p * 8 + 8;

  /* Transpose 16x16 */
  transpose(src, 16, dst, p, 4);
}

void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
                                             int p,
                                             const unsigned char *blimit,
                                             const unsigned char *limit,
                                             const unsigned char *thresh,
                                             unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(u - 8, v - 8, p, t_dst, 16);
  transpose8x16(u, v, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = u - 5;
  dst[1] = v - 5;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

/* Horizontal MB filtering */
void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
  vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim,
                                         lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                              lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                        unsigned char *v_ptr, int y_stride, int uv_stride,
                        struct loop_filter_info *lfi) {
  vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
                                    lfi->mblim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                              lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr, int y_stride, int uv_stride,
                                struct loop_filter_info *lfi) {
  vp9_mbloop_filter_horizontal_edge_sse2(
      y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr,
                                            v_ptr + 4 * uv_stride);
}

/* Vertical MB Filtering */
void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
  vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                       lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                            lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                        unsigned char *v_ptr, int y_stride, int uv_stride,
                        struct loop_filter_info *lfi) {
  vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
                                  lfi->mblim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                            lfi->lim, lfi->hev_thr, v_ptr);
}

void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr, int y_stride, int uv_stride,
                                struct loop_filter_info *lfi) {
  vp9_mbloop_filter_vertical_edge_sse2(
      y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
                                          lfi->blim, lfi->lim, lfi->hev_thr,
                                          v_ptr + 4);
}

/* Horizontal B Filtering */
void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr,
                                            v_ptr + 4 * uv_stride);
}

void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
                                              y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
                                              y_stride, blimit);
  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
                                              y_stride, blimit);
}

/* Vertical B Filtering */
void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
                                          lfi->blim, lfi->lim, lfi->hev_thr,
                                          v_ptr + 4);
}

void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
}
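
/* Naming note: the mbh/mbv wrappers filter macroblock-boundary edges, bh/bv
 * filter the internal 4x4 block edges (offsets 4, 8, 12), the *8x8 variants
 * filter the internal 8x8 edge, and the bhs/bvs variants apply the simple
 * (blimit-only) filter. */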