/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  /* SSE2 */
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);

extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);

  DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);

  DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
  DECLARE_ALIGNED(16, unsigned char, aq[8][8]);

  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit  = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
  p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
  q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));

  _mm_storel_epi64((__m128i *)ap[4], p4);
  _mm_storel_epi64((__m128i *)ap[3], p3);
  _mm_storel_epi64((__m128i *)ap[2], p2);
  _mm_storel_epi64((__m128i *)ap[1], p1);
  _mm_storel_epi64((__m128i *)ap[0], p0);
  _mm_storel_epi64((__m128i *)aq[4], q4);
  _mm_storel_epi64((__m128i *)aq[3], q3);
  _mm_storel_epi64((__m128i *)aq[2], q2);
  _mm_storel_epi64((__m128i *)aq[1], q1);
  _mm_storel_epi64((__m128i *)aq[0], q0);
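  /* Compute the filter mask. SSE2 has no per-lane branches, so every
     threshold test below is done with saturating subtracts and byte
     compares that yield 0x00/0xff per-pixel masks. */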
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }
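  /* 4-tap filter. Pixels are biased by 0x80 so signed byte arithmetic can
     be used; the signed >> 3 is emulated with a 16-bit shift plus masking,
     since SSE2 has no 8-bit arithmetic shift. */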
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done
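    // Flatness check for the 8-tap path: the flat filter is used only
    // where p1..p3 and q1..q3 all lie within one step of p0/q0 and the
    // edge already passed the filter mask.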
    {
      __m128i work;
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                       _mm_subs_epu8(p0, p2)),
                          _mm_or_si128(_mm_subs_epu8(q2, q0),
                                       _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                       _mm_subs_epu8(p0, p3)),
                          _mm_or_si128(_mm_subs_epu8(q3, q0),
                                       _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                       _mm_subs_epu8(p0, p4)),
                          _mm_or_si128(_mm_subs_epu8(q4, q0),
                                       _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);
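      // Wide flatness check: flat2 extends the same one-step test to
      // p4..p7 and q4..q7 (the p4/q4 term comes from the work value
      // computed just above) and gates the 15-tap filter.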
      p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
                                        _mm_subs_epu8(p0, p5)),
                           _mm_or_si128(_mm_subs_epu8(q5, q0),
                                        _mm_subs_epu8(q0, q5)));
      _mm_storel_epi64((__m128i *)ap[5], p5);
      _mm_storel_epi64((__m128i *)aq[5], q5);
      flat2 = _mm_max_epu8(work, flat2);
      p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
                                       _mm_subs_epu8(p0, p6)),
                          _mm_or_si128(_mm_subs_epu8(q6, q0),
                                       _mm_subs_epu8(q0, q6)));
      _mm_storel_epi64((__m128i *)ap[6], p6);
      _mm_storel_epi64((__m128i *)aq[6], q6);
      flat2 = _mm_max_epu8(work, flat2);

      p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
                                       _mm_subs_epu8(p0, p7)),
                          _mm_or_si128(_mm_subs_epu8(q7, q0),
                                       _mm_subs_epu8(q0, q7)));
      _mm_storel_epi64((__m128i *)ap[7], p7);
      _mm_storel_epi64((__m128i *)aq[7], q7);
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i workp_shft;
      __m128i a, b, c;
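      // The averages are computed in 16-bit precision. a and b are the
      // running sums of the 7-tap (flat) filter and c the extra terms of
      // the 15-tap (flat2) filter; each output updates the sums by adding
      // the incoming tap and subtracting the outgoing ones, so the 7-tap
      // result is (a + b) >> 3 and the 15-tap result is (a + c) >> 4,
      // with the rounding constants folded into b and c.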
      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);

      c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
      c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));

      b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
      a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
      a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

      _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                        , b));

      c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q1, a);
      b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
      _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                        , b));

      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q2, a);
      b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
      _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                        , b));

      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q3, a);
      b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
      _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                        , b));

      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      b = _mm_add_epi16(q3, b);
      b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
      _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                        , b));

      c = _mm_add_epi16(q4, c);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      b = _mm_add_epi16(q3, b);
      b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
      _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                        , b));

      a = _mm_add_epi16(q5, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q6, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      a = _mm_add_epi16(q7, a);
      c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
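    // Blend filtered and original pixels without branches:
    // result = (mask & filtered) | (~mask & original), built from
    // andnot/and/or. The same select idiom repeats for every output row.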
    work_a = _mm_loadl_epi64((__m128i *)ap[2]);
    p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storel_epi64((__m128i *)flat_op[2], p2);

    p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storel_epi64((__m128i *)flat_op[1], p1);

    p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storel_epi64((__m128i *)flat_op[0], p0);

    q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storel_epi64((__m128i *)flat_oq[0], q0);

    q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storel_epi64((__m128i *)flat_oq[1], q1);

    work_a = _mm_loadl_epi64((__m128i *)aq[2]);
    q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storel_epi64((__m128i *)flat_oq[2], q2);
    // write out op6 - op3
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_loadl_epi64((__m128i *)ap[i]);
        flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storel_epi64((__m128i *)dst, work_a);
        dst += p;
      }
    }
    work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
    p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);

    work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
    p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);

    work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
    p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);

    work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
    q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storel_epi64((__m128i *)(s - 0 * p), q0);

    work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
    q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);

    work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
    q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq7
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_loadl_epi64((__m128i *)aq[i]);
        flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storel_epi64((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}
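/* 8-tap/4-tap filter over a full 16-pixel-wide edge in one call. It also
   serves as the row filter for the transposed vertical-edge paths below. */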
void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  __m128i mask, hev, flat;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit  = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
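  // The flat-path outputs are computed 8 pixels at a time in 16-bit
  // precision; two passes of the loop below cover the 16-pixel span.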
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;
    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}
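/* The u and v rows are packed side by side into one 16-byte-wide scratch
   buffer so that a single 16-pixel filter pass covers both chroma planes. */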
void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                                               int p,
                                               const unsigned char *_blimit,
                                               const unsigned char *_limit,
                                               const unsigned char *_thresh,
                                               unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);
  /* Read source */
  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 5 * p)));
  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 4 * p)));
  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 3 * p)));
  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 2 * p)));
  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 1 * p)));
  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
                                        _mm_loadl_epi64((__m128i *)(v)));
  const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 1 * p)));
  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 2 * p)));
  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 3 * p)));
  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 4 * p)));

  _mm_store_si128((__m128i *)(src), p4);
  _mm_store_si128((__m128i *)(src + 16), p3);
  _mm_store_si128((__m128i *)(src + 32), p2);
  _mm_store_si128((__m128i *)(src + 48), p1);
  _mm_store_si128((__m128i *)(src + 64), p0);
  _mm_store_si128((__m128i *)(src + 80), q0);
  _mm_store_si128((__m128i *)(src + 96), q1);
  _mm_store_si128((__m128i *)(src + 112), q2);
  _mm_store_si128((__m128i *)(src + 128), q3);
  _mm_store_si128((__m128i *)(src + 144), q4);
  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
                                         _thresh);

  /* Store result */
  _mm_storel_epi64((__m128i *)(u - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 32)));
  _mm_storel_epi64((__m128i *)(u - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 48)));
  _mm_storel_epi64((__m128i *)(u - p),
                   _mm_loadl_epi64((__m128i *)(src + 64)));
  _mm_storel_epi64((__m128i *)u,
                   _mm_loadl_epi64((__m128i *)(src + 80)));
  _mm_storel_epi64((__m128i *)(u + p),
                   _mm_loadl_epi64((__m128i *)(src + 96)));
  _mm_storel_epi64((__m128i *)(u + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 112)));

  _mm_storel_epi64((__m128i *)(v - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 40)));
  _mm_storel_epi64((__m128i *)(v - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 56)));
  _mm_storel_epi64((__m128i *)(v - p),
                   _mm_loadl_epi64((__m128i *)(src + 72)));
  _mm_storel_epi64((__m128i *)v,
                   _mm_loadl_epi64((__m128i *)(src + 88)));
  _mm_storel_epi64((__m128i *)(v + p),
                   _mm_loadl_epi64((__m128i *)(src + 104)));
  _mm_storel_epi64((__m128i *)(v + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 120)));
}
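/* Transpose 16 rows of 8 pixels (8 rows from in0, 8 from in1) into 8 rows
   of 16 pixels at out, via byte -> word -> dword -> qword unpack stages. */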
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
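/* Vertical edges are filtered by transposing the affected columns into a
   scratch buffer, running the horizontal filter there, and transposing the
   modified rows back. */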
void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}
void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 16;

  src[0] = s - 8;
  src[1] = s;

  /* Transpose 16x16 */
  transpose(src, p, dst, 16, 2);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;

  dst[0] = s - 8;
  dst[1] = s;

  transpose(src, 16, dst, p, 2);
}
void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
                                             int p,
                                             const unsigned char *blimit,
                                             const unsigned char *limit,
                                             const unsigned char *thresh,
                                             unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(u - 8, v - 8, p, t_dst, 16);
  transpose8x16(u, v, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = u - 5;
  dst[1] = v - 5;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}
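/* Wrappers dispatching the SSE2 edge filters for the luma plane and, when
   chroma pointers are given, the corresponding u/v edges. */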
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
  vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim,
                                         lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                              lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                        unsigned char *v_ptr, int y_stride, int uv_stride,
                        struct loop_filter_info *lfi) {
  vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
                                    lfi->mblim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                              lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr, int y_stride, int uv_stride,
                                struct loop_filter_info *lfi) {
  vp9_mbloop_filter_horizontal_edge_sse2(
      y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr,
                                            v_ptr + 4 * uv_stride);
}
/* Vertical MB Filtering */
void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
  vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                       lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                            lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                        unsigned char *v_ptr, int y_stride, int uv_stride,
                        struct loop_filter_info *lfi) {
  vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
                                  lfi->mblim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
                                            lfi->lim, lfi->hev_thr, v_ptr);
}
void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                unsigned char *v_ptr, int y_stride, int uv_stride,
                                struct loop_filter_info *lfi) {
  vp9_mbloop_filter_vertical_edge_sse2(
      y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
                                          lfi->blim, lfi->lim, lfi->hev_thr,
                                          v_ptr + 4);
}
/* Horizontal B Filtering */
void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
                                            lfi->blim, lfi->lim, lfi->hev_thr,
                                            v_ptr + 4 * uv_stride);
}
/* Vertical B Filtering */
void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride,
                             struct loop_filter_info *lfi) {
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);

  if (u_ptr)
    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
                                          lfi->blim, lfi->lim, lfi->hev_thr,
                                          v_ptr + 4);
}