Quick modifications to wide loopfilter intrinsic functions
vp9/common/x86/vp9_loopfilter_intrin_sse2.c
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
11 #include <emmintrin.h> /* SSE2 */
12 #include "vp9/common/vp9_loopfilter.h"
13 #include "vpx_ports/emmintrin_compat.h"
15 prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
16 prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
18 extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
19 extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
21 void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
22 int p,
23 const unsigned char *_blimit,
24 const unsigned char *_limit,
25 const unsigned char *_thresh) {
26 DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
27 DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
29 DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
30 DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);
32 DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
33 DECLARE_ALIGNED(16, unsigned char, aq[8][8]);
36 __m128i mask, hev, flat, flat2;
37 const __m128i zero = _mm_set1_epi16(0);
38 const __m128i one = _mm_set1_epi8(1);
39 __m128i p7, p6, p5;
40 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
41 __m128i q5, q6, q7;
42 int i = 0;
43 const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
44 const unsigned int extended_limit = _limit[0] * 0x01010101u;
45 const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
46 const __m128i thresh =
47 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
48 const __m128i limit =
49 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
50 const __m128i blimit =
51 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
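  /* Multiplying the single threshold byte by 0x01010101 replicates it into
   * each byte of a 32-bit word, and _mm_shuffle_epi32(..., 0) then copies
   * that word into all four dwords, so every one of the 16 byte lanes holds
   * the same blimit/limit/thresh value (e.g. 0x03 -> 0x03030303 repeated
   * across the whole register). */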
53 p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
54 p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
55 p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
56 p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
57 p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
58 q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
59 q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
60 q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
61 q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
62 q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
64 _mm_storel_epi64((__m128i *)ap[4], p4);
65 _mm_storel_epi64((__m128i *)ap[3], p3);
66 _mm_storel_epi64((__m128i *)ap[2], p2);
67 _mm_storel_epi64((__m128i *)ap[1], p1);
68 _mm_storel_epi64((__m128i *)ap[0], p0);
69 _mm_storel_epi64((__m128i *)aq[4], q4);
70 _mm_storel_epi64((__m128i *)aq[3], q3);
71 _mm_storel_epi64((__m128i *)aq[2], q2);
72 _mm_storel_epi64((__m128i *)aq[1], q1);
73 _mm_storel_epi64((__m128i *)aq[0], q0);
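  /* The eight p4..p0 / q0..q4 pixel rows are parked in the aligned ap[]/aq[]
   * scratch arrays so they can be reloaded and widened to 16 bits further
   * down without re-reading the frame buffer. */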
77 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
78 _mm_subs_epu8(p0, p1));
79 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
80 _mm_subs_epu8(q0, q1));
81 const __m128i fe = _mm_set1_epi8(0xfe);
82 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
83 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
84 _mm_subs_epu8(q0, p0));
85 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
86 _mm_subs_epu8(q1, p1));
87 __m128i work;
88 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
89 hev = _mm_subs_epu8(flat, thresh);
90 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
92         abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
93 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
94 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
95 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
96 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
97 mask = _mm_max_epu8(flat, mask);
98 // mask |= (abs(p1 - p0) > limit) * -1;
99 // mask |= (abs(q1 - q0) > limit) * -1;
100 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
101 _mm_subs_epu8(p1, p2)),
102 _mm_or_si128(_mm_subs_epu8(p3, p2),
103 _mm_subs_epu8(p2, p3)));
104 mask = _mm_max_epu8(work, mask);
105 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
106 _mm_subs_epu8(q1, q2)),
107 _mm_or_si128(_mm_subs_epu8(q3, q2),
108 _mm_subs_epu8(q2, q3)));
109 mask = _mm_max_epu8(work, mask);
110 mask = _mm_subs_epu8(mask, limit);
111 mask = _mm_cmpeq_epi8(mask, zero);
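  /* Per pixel, mask ends up 0xff only when
   *   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit
   * and every one of abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(p3 - p2),
   * abs(q2 - q1), abs(q3 - q2) is <= limit.  The absolute differences are
   * built as two saturating unsigned subtractions OR'ed together, since SSE2
   * has no per-byte absolute-difference instruction, and hev marks pixels
   * where abs(p1 - p0) or abs(q1 - q0) exceeds thresh. */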
114 // lp filter
116 const __m128i t4 = _mm_set1_epi8(4);
117 const __m128i t3 = _mm_set1_epi8(3);
118 const __m128i t80 = _mm_set1_epi8(0x80);
119 const __m128i te0 = _mm_set1_epi8(0xe0);
120 const __m128i t1f = _mm_set1_epi8(0x1f);
121 const __m128i t1 = _mm_set1_epi8(0x1);
122 const __m128i t7f = _mm_set1_epi8(0x7f);
124 __m128i ps1 = _mm_xor_si128(p1, t80);
125 __m128i ps0 = _mm_xor_si128(p0, t80);
126 __m128i qs0 = _mm_xor_si128(q0, t80);
127 __m128i qs1 = _mm_xor_si128(q1, t80);
128 __m128i filt;
129 __m128i work_a;
130 __m128i filter1, filter2;
132 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
133 work_a = _mm_subs_epi8(qs0, ps0);
134 filt = _mm_adds_epi8(filt, work_a);
135 filt = _mm_adds_epi8(filt, work_a);
136 filt = _mm_adds_epi8(filt, work_a);
137 /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
138 filt = _mm_and_si128(filt, mask);
140 filter1 = _mm_adds_epi8(filt, t4);
141 filter2 = _mm_adds_epi8(filt, t3);
143 /* Filter1 >> 3 */
144 work_a = _mm_cmpgt_epi8(zero, filter1);
145 filter1 = _mm_srli_epi16(filter1, 3);
146 work_a = _mm_and_si128(work_a, te0);
147 filter1 = _mm_and_si128(filter1, t1f);
148 filter1 = _mm_or_si128(filter1, work_a);
149 qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
151 /* Filter2 >> 3 */
152 work_a = _mm_cmpgt_epi8(zero, filter2);
153 filter2 = _mm_srli_epi16(filter2, 3);
154 work_a = _mm_and_si128(work_a, te0);
155 filter2 = _mm_and_si128(filter2, t1f);
156 filter2 = _mm_or_si128(filter2, work_a);
157 ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
159 /* filt >> 1 */
160 filt = _mm_adds_epi8(filter1, t1);
161 work_a = _mm_cmpgt_epi8(zero, filt);
162 filt = _mm_srli_epi16(filt, 1);
163 work_a = _mm_and_si128(work_a, t80);
164 filt = _mm_and_si128(filt, t7f);
165 filt = _mm_or_si128(filt, work_a);
166 filt = _mm_andnot_si128(hev, filt);
167 ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
168 qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
169 // loopfilter done
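  /* The 4-tap filter above works in a signed domain: XOR with 0x80 maps
   * [0, 255] to [-128, 127] and back.  SSE2 has no 8-bit arithmetic shift,
   * so "Filter1 >> 3" is emulated by shifting the 16-bit lanes logically,
   * masking each byte to its low five bits (t1f) and OR-ing in 0xe0 sign
   * bits (te0) for lanes that were negative; "filt >> 1" uses the same
   * trick with t7f/t80. */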
172 __m128i work;
173 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
174 _mm_subs_epu8(p0, p2)),
175 _mm_or_si128(_mm_subs_epu8(q2, q0),
176 _mm_subs_epu8(q0, q2)));
177 flat = _mm_max_epu8(work, flat);
178 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
179 _mm_subs_epu8(p0, p3)),
180 _mm_or_si128(_mm_subs_epu8(q3, q0),
181 _mm_subs_epu8(q0, q3)));
182 flat = _mm_max_epu8(work, flat);
183 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
184 _mm_subs_epu8(p0, p4)),
185 _mm_or_si128(_mm_subs_epu8(q4, q0),
186 _mm_subs_epu8(q0, q4)));
187 flat = _mm_subs_epu8(flat, one);
188 flat = _mm_cmpeq_epi8(flat, zero);
189 flat = _mm_and_si128(flat, mask);
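  /* flat is 0xff where p1, p2, p3 and q1, q2, q3 each differ from p0/q0 by
   * at most 1 (and the pixel already passed the mask test), selecting the
   * 7-tap smoothing output further down.  The p4/q4 difference computed just
   * above is carried in work and folded into flat2 below, not into flat. */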
191 p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
192 q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
193 flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
194 _mm_subs_epu8(p0, p5)),
195 _mm_or_si128(_mm_subs_epu8(q5, q0),
196 _mm_subs_epu8(q0, q5)));
197 _mm_storel_epi64((__m128i *)ap[5], p5);
198 _mm_storel_epi64((__m128i *)aq[5], q5);
199 flat2 = _mm_max_epu8(work, flat2);
200 p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
201 q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
202 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
203 _mm_subs_epu8(p0, p6)),
204 _mm_or_si128(_mm_subs_epu8(q6, q0),
205 _mm_subs_epu8(q0, q6)));
206 _mm_storel_epi64((__m128i *)ap[6], p6);
207 _mm_storel_epi64((__m128i *)aq[6], q6);
208 flat2 = _mm_max_epu8(work, flat2);
210 p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
211 q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
212 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
213 _mm_subs_epu8(p0, p7)),
214 _mm_or_si128(_mm_subs_epu8(q7, q0),
215 _mm_subs_epu8(q0, q7)));
216 _mm_storel_epi64((__m128i *)ap[7], p7);
217 _mm_storel_epi64((__m128i *)aq[7], q7);
218 flat2 = _mm_max_epu8(work, flat2);
219 flat2 = _mm_subs_epu8(flat2, one);
220 flat2 = _mm_cmpeq_epi8(flat2, zero);
221 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
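  /* flat2 additionally requires p4..p7 and q4..q7 to be within 1 of p0/q0;
   * AND-ing with flat (which already includes mask) gives the final
   * wide-filter select: flat2 & flat & mask. */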
224 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
225 // flat and wide flat calculations
227 const __m128i eight = _mm_set1_epi16(8);
228 const __m128i four = _mm_set1_epi16(4);
230 __m128i workp_shft;
231 __m128i a, b, c;
233 p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
234 p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
235 p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
236 p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
237 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
238 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
239 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
240 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
241 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
242 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
243 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
244 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
245 q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
246 q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
247 q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
248 q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);
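  /* The 7-tap and 15-tap outputs below are computed with running sums:
   * a + b carries the 7-tap window (b holds the +4 rounding term) and
   * a + c the 15-tap window (c holds the +8 term).  Each step subtracts the
   * tap leaving the window and adds the one entering it, so every output
   * costs a few adds instead of a full re-summation, e.g.
   *   op2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
   *   op6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
   * before the 16-bit results are packed back to bytes. */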
250 c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
251 c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
253 b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
254 a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
255 a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
257 _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
258 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
259 , b));
261 c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
262 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
263 _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
264 _mm_packus_epi16(workp_shft, workp_shft));
266 a = _mm_add_epi16(q1, a);
267 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
268 _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
269 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
270 , b));
272 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
273 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
274 _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
275 _mm_packus_epi16(workp_shft, workp_shft));
277 a = _mm_add_epi16(q2, a);
278 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
279 _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
280 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
281 , b));
283 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
284 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
285 _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
286 _mm_packus_epi16(workp_shft, workp_shft));
288 a = _mm_add_epi16(q3, a);
289 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
290 _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
291 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
292 , b));
294 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
295 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
296 _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
297 _mm_packus_epi16(workp_shft, workp_shft));
299 b = _mm_add_epi16(q3, b);
300 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
301 _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
302 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
303 , b));
305 c = _mm_add_epi16(q4, c);
306 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
307 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
308 _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
309 _mm_packus_epi16(workp_shft, workp_shft));
311 b = _mm_add_epi16(q3, b);
312 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
313 _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
314 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
315 , b));
316 a = _mm_add_epi16(q5, a);
317 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
318 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
319 _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
320 _mm_packus_epi16(workp_shft, workp_shft));
322 a = _mm_add_epi16(q6, a);
323 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
324 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
325 _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
326 _mm_packus_epi16(workp_shft, workp_shft));
328 a = _mm_add_epi16(q7, a);
329 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
330 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
331 _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
332 _mm_packus_epi16(workp_shft, workp_shft));
334 a = _mm_add_epi16(q7, a);
335 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
336 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
337 _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
338 _mm_packus_epi16(workp_shft, workp_shft));
340 a = _mm_add_epi16(q7, a);
341 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
342 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
343 _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
344 _mm_packus_epi16(workp_shft, workp_shft));
346 a = _mm_add_epi16(q7, a);
347 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
348 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
349 _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
350 _mm_packus_epi16(workp_shft, workp_shft));
352 a = _mm_add_epi16(q7, a);
353 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
354 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
355 _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
356 _mm_packus_epi16(workp_shft, workp_shft));
358 a = _mm_add_epi16(q7, a);
359 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
360 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
361 _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
362 _mm_packus_epi16(workp_shft, workp_shft));
364 a = _mm_add_epi16(q7, a);
365 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
366 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
367 _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
368 _mm_packus_epi16(workp_shft, workp_shft));
371 // wide flat
372 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
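  /* SSE2 has no byte blend, so each output row is selected with the usual
   * and/andnot/or pattern: result = (flat & filtered) | (~flat & previous).
   * The first group below picks between the 7-tap result and the 4-tap
   * result (or the untouched p2/q2 rows), the second group between that and
   * the 15-tap result under flat2. */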
374 work_a = _mm_loadl_epi64((__m128i *)ap[2]);
375 p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
376 work_a = _mm_andnot_si128(flat, work_a);
377 p2 = _mm_and_si128(flat, p2);
378 p2 = _mm_or_si128(work_a, p2);
379 _mm_storel_epi64((__m128i *)flat_op[2], p2);
381 p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
382 work_a = _mm_andnot_si128(flat, ps1);
383 p1 = _mm_and_si128(flat, p1);
384 p1 = _mm_or_si128(work_a, p1);
385 _mm_storel_epi64((__m128i *)flat_op[1], p1);
387 p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
388 work_a = _mm_andnot_si128(flat, ps0);
389 p0 = _mm_and_si128(flat, p0);
390 p0 = _mm_or_si128(work_a, p0);
391 _mm_storel_epi64((__m128i *)flat_op[0], p0);
393 q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
394 work_a = _mm_andnot_si128(flat, qs0);
395 q0 = _mm_and_si128(flat, q0);
396 q0 = _mm_or_si128(work_a, q0);
397 _mm_storel_epi64((__m128i *)flat_oq[0], q0);
399 q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
400 work_a = _mm_andnot_si128(flat, qs1);
401 q1 = _mm_and_si128(flat, q1);
402 q1 = _mm_or_si128(work_a, q1);
403 _mm_storel_epi64((__m128i *)flat_oq[1], q1);
405 work_a = _mm_loadl_epi64((__m128i *)aq[2]);
406 q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
407 work_a = _mm_andnot_si128(flat, work_a);
408 q2 = _mm_and_si128(flat, q2);
409 q2 = _mm_or_si128(work_a, q2);
410 _mm_storel_epi64((__m128i *)flat_oq[2], q2);
412 // write out op6 - op3
414 unsigned char *dst = (s - 7 * p);
415 for (i = 6; i > 2; i--) {
416 __m128i flat2_output;
417 work_a = _mm_loadl_epi64((__m128i *)ap[i]);
418 flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
419 work_a = _mm_andnot_si128(flat2, work_a);
420 flat2_output = _mm_and_si128(flat2, flat2_output);
421 work_a = _mm_or_si128(work_a, flat2_output);
422 _mm_storel_epi64((__m128i *)dst, work_a);
423 dst += p;
427 work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
428 p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
429 work_a = _mm_andnot_si128(flat2, work_a);
430 p2 = _mm_and_si128(flat2, p2);
431 p2 = _mm_or_si128(work_a, p2);
432 _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
434 work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
435 p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
436 work_a = _mm_andnot_si128(flat2, work_a);
437 p1 = _mm_and_si128(flat2, p1);
438 p1 = _mm_or_si128(work_a, p1);
439 _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
441 work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
442 p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
443 work_a = _mm_andnot_si128(flat2, work_a);
444 p0 = _mm_and_si128(flat2, p0);
445 p0 = _mm_or_si128(work_a, p0);
446 _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
448 work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
449 q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
450 work_a = _mm_andnot_si128(flat2, work_a);
451 q0 = _mm_and_si128(flat2, q0);
452 q0 = _mm_or_si128(work_a, q0);
453 _mm_storel_epi64((__m128i *)(s - 0 * p), q0);
455 work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
456 q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
457 work_a = _mm_andnot_si128(flat2, work_a);
458 q1 = _mm_and_si128(flat2, q1);
459 q1 = _mm_or_si128(work_a, q1);
460 _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
462 work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
463 q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
464 work_a = _mm_andnot_si128(flat2, work_a);
465 q2 = _mm_and_si128(flat2, q2);
466 q2 = _mm_or_si128(work_a, q2);
467 _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
469   // write out oq3 - oq6
471 unsigned char *dst = (s + 3 * p);
472 for (i = 3; i < 7; i++) {
473 __m128i flat2_output;
474 work_a = _mm_loadl_epi64((__m128i *)aq[i]);
475 flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
476 work_a = _mm_andnot_si128(flat2, work_a);
477 flat2_output = _mm_and_si128(flat2, flat2_output);
478 work_a = _mm_or_si128(work_a, flat2_output);
479 _mm_storel_epi64((__m128i *)dst, work_a);
480 dst += p;
486 void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
487 int p,
488 const unsigned char *_blimit,
489 const unsigned char *_limit,
490 const unsigned char *_thresh) {
491 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
492 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
493 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
494 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
495 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
496 DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
497 __m128i mask, hev, flat;
498 const __m128i zero = _mm_set1_epi16(0);
499 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
500 const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
501 const unsigned int extended_limit = _limit[0] * 0x01010101u;
502 const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
503 const __m128i thresh =
504 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
505 const __m128i limit =
506 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
507 const __m128i blimit =
508 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
510 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
511 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
512 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
513 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
514 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
515 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
516 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
517 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
519 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
520 _mm_subs_epu8(p0, p1));
521 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
522 _mm_subs_epu8(q0, q1));
523 const __m128i one = _mm_set1_epi8(1);
524 const __m128i fe = _mm_set1_epi8(0xfe);
525 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
526 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
527 _mm_subs_epu8(q0, p0));
528 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
529 _mm_subs_epu8(q1, p1));
530 __m128i work;
531 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
532 hev = _mm_subs_epu8(flat, thresh);
533 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
535         abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
536 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
537 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
538 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
539 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
540 mask = _mm_max_epu8(flat, mask);
541 // mask |= (abs(p1 - p0) > limit) * -1;
542 // mask |= (abs(q1 - q0) > limit) * -1;
543 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
544 _mm_subs_epu8(p1, p2)),
545 _mm_or_si128(_mm_subs_epu8(p3, p2),
546 _mm_subs_epu8(p2, p3)));
547 mask = _mm_max_epu8(work, mask);
548 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
549 _mm_subs_epu8(q1, q2)),
550 _mm_or_si128(_mm_subs_epu8(q3, q2),
551 _mm_subs_epu8(q2, q3)));
552 mask = _mm_max_epu8(work, mask);
553 mask = _mm_subs_epu8(mask, limit);
554 mask = _mm_cmpeq_epi8(mask, zero);
556 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
557 _mm_subs_epu8(p0, p2)),
558 _mm_or_si128(_mm_subs_epu8(q2, q0),
559 _mm_subs_epu8(q0, q2)));
560 flat = _mm_max_epu8(work, flat);
561 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
562 _mm_subs_epu8(p0, p3)),
563 _mm_or_si128(_mm_subs_epu8(q3, q0),
564 _mm_subs_epu8(q0, q3)));
565 flat = _mm_max_epu8(work, flat);
566 flat = _mm_subs_epu8(flat, one);
567 flat = _mm_cmpeq_epi8(flat, zero);
568 flat = _mm_and_si128(flat, mask);
571 const __m128i four = _mm_set1_epi16(4);
572 unsigned char *src = s;
573 int i = 0;
574 do {
575 __m128i workp_a, workp_b, workp_shft;
576 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
577 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
578 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
579 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
580 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
581 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
582 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
583 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
585 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
586 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
587 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
588 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
589 _mm_storel_epi64((__m128i *)&flat_op2[i*8],
590 _mm_packus_epi16(workp_shft, workp_shft));
592 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
593 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
594 _mm_storel_epi64((__m128i *)&flat_op1[i*8],
595 _mm_packus_epi16(workp_shft, workp_shft));
597 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
598 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
599 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
600 _mm_storel_epi64((__m128i *)&flat_op0[i*8],
601 _mm_packus_epi16(workp_shft, workp_shft));
603 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
604 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
605 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
606 _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
607 _mm_packus_epi16(workp_shft, workp_shft));
609 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
610 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
611 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
612 _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
613 _mm_packus_epi16(workp_shft, workp_shft));
615 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
616 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
617 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
618 _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
619 _mm_packus_epi16(workp_shft, workp_shft));
621 src += 8;
622 } while (++i < 2);
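  /* The do/while runs twice because the 16-pixel row is processed in two
   * 8-pixel halves once the bytes are widened to 16 bits.  Per pixel this is
   * the standard 7-tap flat filter, e.g.
   *   op2 = (p3 + p3 + p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
   *   oq0 = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
   * with the remaining outputs following the same sliding-window update. */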
624 // lp filter
626 const __m128i t4 = _mm_set1_epi8(4);
627 const __m128i t3 = _mm_set1_epi8(3);
628 const __m128i t80 = _mm_set1_epi8(0x80);
629 const __m128i te0 = _mm_set1_epi8(0xe0);
630 const __m128i t1f = _mm_set1_epi8(0x1f);
631 const __m128i t1 = _mm_set1_epi8(0x1);
632 const __m128i t7f = _mm_set1_epi8(0x7f);
634 const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
635 t80);
636 const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
637 t80);
638 const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
639 t80);
640 const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
641 t80);
642 __m128i filt;
643 __m128i work_a;
644 __m128i filter1, filter2;
646 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
647 work_a = _mm_subs_epi8(qs0, ps0);
648 filt = _mm_adds_epi8(filt, work_a);
649 filt = _mm_adds_epi8(filt, work_a);
650 filt = _mm_adds_epi8(filt, work_a);
651 /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
652 filt = _mm_and_si128(filt, mask);
654 filter1 = _mm_adds_epi8(filt, t4);
655 filter2 = _mm_adds_epi8(filt, t3);
657 /* Filter1 >> 3 */
658 work_a = _mm_cmpgt_epi8(zero, filter1);
659 filter1 = _mm_srli_epi16(filter1, 3);
660 work_a = _mm_and_si128(work_a, te0);
661 filter1 = _mm_and_si128(filter1, t1f);
662 filter1 = _mm_or_si128(filter1, work_a);
664 /* Filter2 >> 3 */
665 work_a = _mm_cmpgt_epi8(zero, filter2);
666 filter2 = _mm_srli_epi16(filter2, 3);
667 work_a = _mm_and_si128(work_a, te0);
668 filter2 = _mm_and_si128(filter2, t1f);
669 filter2 = _mm_or_si128(filter2, work_a);
671 /* filt >> 1 */
672 filt = _mm_adds_epi8(filter1, t1);
673 work_a = _mm_cmpgt_epi8(zero, filt);
674 filt = _mm_srli_epi16(filt, 1);
675 work_a = _mm_and_si128(work_a, t80);
676 filt = _mm_and_si128(filt, t7f);
677 filt = _mm_or_si128(filt, work_a);
679 filt = _mm_andnot_si128(hev, filt);
681 work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
682 q0 = _mm_load_si128((__m128i *)flat_oq0);
683 work_a = _mm_andnot_si128(flat, work_a);
684 q0 = _mm_and_si128(flat, q0);
685 q0 = _mm_or_si128(work_a, q0);
687 work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
688 q1 = _mm_load_si128((__m128i *)flat_oq1);
689 work_a = _mm_andnot_si128(flat, work_a);
690 q1 = _mm_and_si128(flat, q1);
691 q1 = _mm_or_si128(work_a, q1);
693 work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
694 q2 = _mm_load_si128((__m128i *)flat_oq2);
695 work_a = _mm_andnot_si128(flat, work_a);
696 q2 = _mm_and_si128(flat, q2);
697 q2 = _mm_or_si128(work_a, q2);
699 work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
700 p0 = _mm_load_si128((__m128i *)flat_op0);
701 work_a = _mm_andnot_si128(flat, work_a);
702 p0 = _mm_and_si128(flat, p0);
703 p0 = _mm_or_si128(work_a, p0);
705 work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
706 p1 = _mm_load_si128((__m128i *)flat_op1);
707 work_a = _mm_andnot_si128(flat, work_a);
708 p1 = _mm_and_si128(flat, p1);
709 p1 = _mm_or_si128(work_a, p1);
711 work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
712 p2 = _mm_load_si128((__m128i *)flat_op2);
713 work_a = _mm_andnot_si128(flat, work_a);
714 p2 = _mm_and_si128(flat, p2);
715 p2 = _mm_or_si128(work_a, p2);
717 _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
718 _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
719 _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
720 _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
721 _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
722 _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
726 void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
727 int p,
728 const unsigned char *_blimit,
729 const unsigned char *_limit,
730 const unsigned char *_thresh,
731 unsigned char *v) {
732 DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);
734 /* Read source */
735 const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
736 _mm_loadl_epi64((__m128i *)(v - 5 * p)));
737 const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
738 _mm_loadl_epi64((__m128i *)(v - 4 * p)));
739 const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
740 _mm_loadl_epi64((__m128i *)(v - 3 * p)));
741 const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
742 _mm_loadl_epi64((__m128i *)(v - 2 * p)));
743 const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
744 _mm_loadl_epi64((__m128i *)(v - 1 * p)));
745 const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
746 _mm_loadl_epi64((__m128i *)(v)));
747 const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
748 _mm_loadl_epi64((__m128i *)(v + 1 * p)));
749 const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
750 _mm_loadl_epi64((__m128i *)(v + 2 * p)));
751 const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
752 _mm_loadl_epi64((__m128i *)(v + 3 * p)));
753 const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
754 _mm_loadl_epi64((__m128i *)(v + 4 * p)));
756 _mm_store_si128((__m128i *)(src), p4);
757 _mm_store_si128((__m128i *)(src + 16), p3);
758 _mm_store_si128((__m128i *)(src + 32), p2);
759 _mm_store_si128((__m128i *)(src + 48), p1);
760 _mm_store_si128((__m128i *)(src + 64), p0);
761 _mm_store_si128((__m128i *)(src + 80), q0);
762 _mm_store_si128((__m128i *)(src + 96), q1);
763 _mm_store_si128((__m128i *)(src + 112), q2);
764 _mm_store_si128((__m128i *)(src + 128), q3);
765 _mm_store_si128((__m128i *)(src + 144), q4);
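  /* U and V are only 8 pixels wide here, so each row is built by packing the
   * 8 U bytes and 8 V bytes side by side into one 16-byte line of the src
   * scratch buffer (stride 16).  A single call to the 16-wide horizontal
   * filter then handles both planes, and the results are scattered back to
   * the two planes below. */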
767 /* Loop filtering */
768 vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
769 _thresh);
771 /* Store result */
772 _mm_storel_epi64((__m128i *)(u - 3 * p),
773 _mm_loadl_epi64((__m128i *)(src + 32)));
774 _mm_storel_epi64((__m128i *)(u - 2 * p),
775 _mm_loadl_epi64((__m128i *)(src + 48)));
776 _mm_storel_epi64((__m128i *)(u - p),
777 _mm_loadl_epi64((__m128i *)(src + 64)));
778 _mm_storel_epi64((__m128i *)u,
779 _mm_loadl_epi64((__m128i *)(src + 80)));
780 _mm_storel_epi64((__m128i *)(u + p),
781 _mm_loadl_epi64((__m128i *)(src + 96)));
782 _mm_storel_epi64((__m128i *)(u + 2 * p),
783 _mm_loadl_epi64((__m128i *)(src + 112)));
785 _mm_storel_epi64((__m128i *)(v - 3 * p),
786 _mm_loadl_epi64((__m128i *)(src + 40)));
787 _mm_storel_epi64((__m128i *)(v - 2 * p),
788 _mm_loadl_epi64((__m128i *)(src + 56)));
789 _mm_storel_epi64((__m128i *)(v - p),
790 _mm_loadl_epi64((__m128i *)(src + 72)));
791 _mm_storel_epi64((__m128i *)v,
792 _mm_loadl_epi64((__m128i *)(src + 88)));
793 _mm_storel_epi64((__m128i *)(v + p),
794 _mm_loadl_epi64((__m128i *)(src + 104)));
795 _mm_storel_epi64((__m128i *)(v + 2 * p),
796 _mm_loadl_epi64((__m128i *)(src + 120)));
799 static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
800 int in_p, unsigned char *out, int out_p) {
801 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
802 __m128i x8, x9, x10, x11, x12, x13, x14, x15;
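  /* Transposes a 16-row x 8-column block (eight rows from in0 followed by
   * eight rows from in1) into 8 rows of 16 bytes at out, using
   * unpacklo/unpackhi at 8-, 16-, 32- and 64-bit granularity.  This turns a
   * vertical edge into a horizontal one so the row-wise filters above can be
   * reused. */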
804 /* Read in 16 lines */
805 x0 = _mm_loadl_epi64((__m128i *)in0);
806 x8 = _mm_loadl_epi64((__m128i *)in1);
807 x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
808 x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
809 x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
810 x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
811 x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
812 x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
813 x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
814 x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
815 x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
816 x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
817 x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
818 x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
819 x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
820 x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
822 x0 = _mm_unpacklo_epi8(x0, x1);
823 x1 = _mm_unpacklo_epi8(x2, x3);
824 x2 = _mm_unpacklo_epi8(x4, x5);
825 x3 = _mm_unpacklo_epi8(x6, x7);
827 x8 = _mm_unpacklo_epi8(x8, x9);
828 x9 = _mm_unpacklo_epi8(x10, x11);
829 x10 = _mm_unpacklo_epi8(x12, x13);
830 x11 = _mm_unpacklo_epi8(x14, x15);
832 x4 = _mm_unpacklo_epi16(x0, x1);
833 x5 = _mm_unpacklo_epi16(x2, x3);
834 x12 = _mm_unpacklo_epi16(x8, x9);
835 x13 = _mm_unpacklo_epi16(x10, x11);
837 x6 = _mm_unpacklo_epi32(x4, x5);
838 x7 = _mm_unpackhi_epi32(x4, x5);
839 x14 = _mm_unpacklo_epi32(x12, x13);
840 x15 = _mm_unpackhi_epi32(x12, x13);
842 /* Store first 4-line result */
843 _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
844 _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
845 _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
846 _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
848 x4 = _mm_unpackhi_epi16(x0, x1);
849 x5 = _mm_unpackhi_epi16(x2, x3);
850 x12 = _mm_unpackhi_epi16(x8, x9);
851 x13 = _mm_unpackhi_epi16(x10, x11);
853 x6 = _mm_unpacklo_epi32(x4, x5);
854 x7 = _mm_unpackhi_epi32(x4, x5);
855 x14 = _mm_unpacklo_epi32(x12, x13);
856 x15 = _mm_unpackhi_epi32(x12, x13);
858 /* Store second 4-line result */
859 _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
860 _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
861 _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
862 _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
865 static INLINE void transpose(unsigned char *src[], int in_p,
866 unsigned char *dst[], int out_p,
867 int num_8x8_to_transpose) {
868 int idx8x8 = 0;
869 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
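  /* Generic 8x8 byte transpose.  Each iteration interleaves the eight input
   * rows down to 32-bit granularity and then writes the low/high 8 bytes of
   * each result with _mm_storel_pd/_mm_storeh_pd, avoiding an extra shuffle
   * to move the upper half into a separate register. */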
870 do {
871 unsigned char *in = src[idx8x8];
872 unsigned char *out = dst[idx8x8];
874 x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
875 x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
876 x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
877 x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
878 x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
879 x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
880 x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
881 x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
882 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
883 x0 = _mm_unpacklo_epi8(x0, x1);
884 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
885 x1 = _mm_unpacklo_epi8(x2, x3);
886 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
887 x2 = _mm_unpacklo_epi8(x4, x5);
888 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
889 x3 = _mm_unpacklo_epi8(x6, x7);
890 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
891 x4 = _mm_unpacklo_epi16(x0, x1);
892 // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
893 x5 = _mm_unpacklo_epi16(x2, x3);
894 // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
895 x6 = _mm_unpacklo_epi32(x4, x5);
896 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
897 x7 = _mm_unpackhi_epi32(x4, x5);
899 _mm_storel_pd((double *)(out + 0*out_p),
900 _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
901 _mm_storeh_pd((double *)(out + 1*out_p),
902 _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
903 _mm_storel_pd((double *)(out + 2*out_p),
904 _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
905 _mm_storeh_pd((double *)(out + 3*out_p),
906 _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
908 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
909 x4 = _mm_unpackhi_epi16(x0, x1);
910 // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
911 x5 = _mm_unpackhi_epi16(x2, x3);
912 // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
913 x6 = _mm_unpacklo_epi32(x4, x5);
914 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
915 x7 = _mm_unpackhi_epi32(x4, x5);
917 _mm_storel_pd((double *)(out + 4*out_p),
918 _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
919 _mm_storeh_pd((double *)(out + 5*out_p),
920 _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
921 _mm_storel_pd((double *)(out + 6*out_p),
922 _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
923 _mm_storeh_pd((double *)(out + 7*out_p),
924 _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
925 } while (++idx8x8 < num_8x8_to_transpose);
928 void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
929 int p,
930 const unsigned char *blimit,
931 const unsigned char *limit,
932 const unsigned char *thresh) {
933 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
934 unsigned char *src[2];
935 unsigned char *dst[2];
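  /* Vertical-edge filtering is done by transposing a 16x16 neighbourhood of
   * the edge into t_dst (stride 16), running the horizontal-edge filter on
   * the middle of that buffer, and transposing the modified columns back.
   * The 8-tap filter only changes pixels close to the edge, so just an 8x16
   * strip around it is copied back. */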
937 /* Transpose 16x16 */
938 transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
939 transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
941 /* Loop filtering */
942 vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
943 thresh);
944 src[0] = t_dst + 3 * 16;
945 src[1] = t_dst + 3 * 16 + 8;
947 dst[0] = s - 5;
948 dst[1] = s - 5 + p * 8;
950 /* Transpose 16x8 */
951 transpose(src, 16, dst, p, 2);
954 void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
955 int p,
956 const unsigned char *blimit,
957 const unsigned char *limit,
958 const unsigned char *thresh) {
959 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
960 unsigned char *src[4];
961 unsigned char *dst[4];
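  /* Same transpose / filter / transpose-back pattern as above, but the wide
   * filter can modify pixels as far out as p6/q6, so the whole 16x16 block
   * is transposed back. */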
963 dst[0] = t_dst;
964 dst[1] = t_dst + 8 * 16;
966 src[0] = s - 8;
967 src[1] = s - 8 + 8;
969 /* Transpose 16x16 */
970 transpose(src, p, dst, 16, 2);
972 /* Loop filtering */
973 vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
974 thresh);
976 src[0] = t_dst;
977 src[1] = t_dst + 8 * 16;
979 dst[0] = s - 8;
980 dst[1] = s - 8 + 8;
982 transpose(src, 16, dst, p, 2);
986 void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
987 int p,
988 const unsigned char *blimit,
989 const unsigned char *limit,
990 const unsigned char *thresh,
991 unsigned char *v) {
992 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
993 unsigned char *src[2];
994 unsigned char *dst[2];
996 /* Transpose 16x16 */
997 transpose8x16(u - 8, v - 8, p, t_dst, 16);
998 transpose8x16(u, v, p, t_dst + 16 * 8, 16);
1000 /* Loop filtering */
1001 vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
1002 thresh);
1004 src[0] = t_dst + 3 * 16;
1005 src[1] = t_dst + 3 * 16 + 8;
1007 dst[0] = u - 5;
1008 dst[1] = v - 5;
1010 /* Transpose 16x8 */
1011 transpose(src, 16, dst, p, 2);
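  /* The entry points below are thin wrappers: each applies the appropriate
   * horizontal or vertical edge filter to the Y plane and, when present, to
   * the U/V planes, using the mblim/blim/lim/hev_thr thresholds carried in
   * struct loop_filter_info. */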
1014 /* Horizontal MB filtering */
1015 void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
1016 unsigned char *u_ptr, unsigned char *v_ptr,
1017 int y_stride, int uv_stride,
1018 struct loop_filter_info *lfi) {
1019 vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim,
1020 lfi->lim, lfi->hev_thr);
1022 /* u,v */
1023 if (u_ptr)
1024 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1025 lfi->lim, lfi->hev_thr, v_ptr);
1029 void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1030 unsigned char *v_ptr, int y_stride, int uv_stride,
1031 struct loop_filter_info *lfi) {
1032 vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
1033 lfi->mblim, lfi->lim, lfi->hev_thr);
1035 /* u,v */
1036 if (u_ptr)
1037 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1038 lfi->lim, lfi->hev_thr, v_ptr);
1042 void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1043 unsigned char *v_ptr, int y_stride, int uv_stride,
1044 struct loop_filter_info *lfi) {
1045 vp9_mbloop_filter_horizontal_edge_sse2(
1046 y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
1048 if (u_ptr)
1049 vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
1050 lfi->blim, lfi->lim, lfi->hev_thr,
1051 v_ptr + 4 * uv_stride);
1054 /* Vertical MB Filtering */
1055 void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1056 unsigned char *v_ptr, int y_stride, int uv_stride,
1057 struct loop_filter_info *lfi) {
1058 vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
1059 lfi->hev_thr);
1061 /* u,v */
1062 if (u_ptr)
1063 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1064 lfi->lim, lfi->hev_thr, v_ptr);
1068 void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1069 unsigned char *v_ptr, int y_stride, int uv_stride,
1070 struct loop_filter_info *lfi) {
1071 vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
1072 lfi->mblim, lfi->lim, lfi->hev_thr);
1074 /* u,v */
1075 if (u_ptr)
1076 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1077 lfi->lim, lfi->hev_thr, v_ptr);
1081 void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1082 unsigned char *v_ptr, int y_stride, int uv_stride,
1083 struct loop_filter_info *lfi) {
1084 vp9_mbloop_filter_vertical_edge_sse2(
1085 y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
1087 if (u_ptr)
1088 vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
1089 lfi->blim, lfi->lim, lfi->hev_thr,
1090 v_ptr + 4);
1093 /* Horizontal B Filtering */
1094 void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
1095 unsigned char *u_ptr, unsigned char *v_ptr,
1096 int y_stride, int uv_stride,
1097 struct loop_filter_info *lfi) {
1098 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
1099 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1100 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
1101 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1102 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
1103 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1105 if (u_ptr)
1106 vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
1107 lfi->blim, lfi->lim, lfi->hev_thr,
1108 v_ptr + 4 * uv_stride);
1111 /* Vertical B Filtering */
1112 void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
1113 unsigned char *u_ptr, unsigned char *v_ptr,
1114 int y_stride, int uv_stride,
1115 struct loop_filter_info *lfi) {
1116 vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
1117 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1118 vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
1119 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1120 vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
1121 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1123 if (u_ptr)
1124 vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
1125 lfi->blim, lfi->lim, lfi->hev_thr,
1126 v_ptr + 4);