2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
12 #include <emmintrin.h>
14 #include "./aom_dsp_rtcd.h"
16 static INLINE
void dc_store_4xh(uint32_t dc
, int height
, uint8_t *dst
,
18 for (int i
= 0; i
< height
; i
+= 2) {
19 *(uint32_t *)dst
= dc
;
21 *(uint32_t *)dst
= dc
;
26 static INLINE
void dc_store_8xh(const __m128i
*row
, int height
, uint8_t *dst
,
29 for (i
= 0; i
< height
; ++i
) {
30 _mm_storel_epi64((__m128i
*)dst
, *row
);
35 static INLINE
void dc_store_16xh(const __m128i
*row
, int height
, uint8_t *dst
,
38 for (i
= 0; i
< height
; ++i
) {
39 _mm_store_si128((__m128i
*)dst
, *row
);
44 static INLINE
void dc_store_32xh(const __m128i
*row
, int height
, uint8_t *dst
,
47 for (i
= 0; i
< height
; ++i
) {
48 _mm_store_si128((__m128i
*)dst
, *row
);
49 _mm_store_si128((__m128i
*)(dst
+ 16), *row
);
54 static INLINE
void dc_store_64xh(const __m128i
*row
, int height
, uint8_t *dst
,
56 for (int i
= 0; i
< height
; ++i
) {
57 _mm_store_si128((__m128i
*)dst
, *row
);
58 _mm_store_si128((__m128i
*)(dst
+ 16), *row
);
59 _mm_store_si128((__m128i
*)(dst
+ 32), *row
);
60 _mm_store_si128((__m128i
*)(dst
+ 48), *row
);
65 static INLINE __m128i
dc_sum_4(const uint8_t *ref
) {
66 __m128i x
= _mm_loadl_epi64((__m128i
const *)ref
);
67 const __m128i zero
= _mm_setzero_si128();
68 x
= _mm_unpacklo_epi8(x
, zero
);
69 return _mm_sad_epu8(x
, zero
);
72 static INLINE __m128i
dc_sum_8(const uint8_t *ref
) {
73 __m128i x
= _mm_loadl_epi64((__m128i
const *)ref
);
74 const __m128i zero
= _mm_setzero_si128();
75 return _mm_sad_epu8(x
, zero
);
78 static INLINE __m128i
dc_sum_16(const uint8_t *ref
) {
79 __m128i x
= _mm_load_si128((__m128i
const *)ref
);
80 const __m128i zero
= _mm_setzero_si128();
81 x
= _mm_sad_epu8(x
, zero
);
82 const __m128i high
= _mm_unpackhi_epi64(x
, x
);
83 return _mm_add_epi16(x
, high
);
86 static INLINE __m128i
dc_sum_32(const uint8_t *ref
) {
87 __m128i x0
= _mm_load_si128((__m128i
const *)ref
);
88 __m128i x1
= _mm_load_si128((__m128i
const *)(ref
+ 16));
89 const __m128i zero
= _mm_setzero_si128();
90 x0
= _mm_sad_epu8(x0
, zero
);
91 x1
= _mm_sad_epu8(x1
, zero
);
92 x0
= _mm_add_epi16(x0
, x1
);
93 const __m128i high
= _mm_unpackhi_epi64(x0
, x0
);
94 return _mm_add_epi16(x0
, high
);
97 static INLINE __m128i
dc_sum_64(const uint8_t *ref
) {
98 __m128i x0
= _mm_load_si128((__m128i
const *)ref
);
99 __m128i x1
= _mm_load_si128((__m128i
const *)(ref
+ 16));
100 __m128i x2
= _mm_load_si128((__m128i
const *)(ref
+ 32));
101 __m128i x3
= _mm_load_si128((__m128i
const *)(ref
+ 48));
102 const __m128i zero
= _mm_setzero_si128();
103 x0
= _mm_sad_epu8(x0
, zero
);
104 x1
= _mm_sad_epu8(x1
, zero
);
105 x2
= _mm_sad_epu8(x2
, zero
);
106 x3
= _mm_sad_epu8(x3
, zero
);
107 x0
= _mm_add_epi16(x0
, x1
);
108 x2
= _mm_add_epi16(x2
, x3
);
109 x0
= _mm_add_epi16(x0
, x2
);
110 const __m128i high
= _mm_unpackhi_epi64(x0
, x0
);
111 return _mm_add_epi16(x0
, high
);
114 // -----------------------------------------------------------------------------
117 void aom_dc_predictor_4x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
118 const uint8_t *above
, const uint8_t *left
) {
119 const __m128i sum_left
= dc_sum_8(left
);
120 __m128i sum_above
= dc_sum_4(above
);
121 sum_above
= _mm_add_epi16(sum_left
, sum_above
);
123 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
127 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
128 const uint32_t pred
= _mm_cvtsi128_si32(row
);
129 dc_store_4xh(pred
, 8, dst
, stride
);
132 void aom_dc_predictor_4x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
133 const uint8_t *above
, const uint8_t *left
) {
134 const __m128i sum_left
= dc_sum_16(left
);
135 __m128i sum_above
= dc_sum_4(above
);
136 sum_above
= _mm_add_epi16(sum_left
, sum_above
);
138 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
142 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
143 const uint32_t pred
= _mm_cvtsi128_si32(row
);
144 dc_store_4xh(pred
, 16, dst
, stride
);
147 void aom_dc_predictor_8x4_sse2(uint8_t *dst
, ptrdiff_t stride
,
148 const uint8_t *above
, const uint8_t *left
) {
149 const __m128i sum_left
= dc_sum_4(left
);
150 __m128i sum_above
= dc_sum_8(above
);
151 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
153 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
157 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
158 dc_store_8xh(&row
, 4, dst
, stride
);
161 void aom_dc_predictor_8x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
162 const uint8_t *above
, const uint8_t *left
) {
163 const __m128i sum_left
= dc_sum_16(left
);
164 __m128i sum_above
= dc_sum_8(above
);
165 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
167 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
170 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
171 dc_store_8xh(&row
, 16, dst
, stride
);
174 void aom_dc_predictor_8x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
175 const uint8_t *above
, const uint8_t *left
) {
176 const __m128i sum_left
= dc_sum_32(left
);
177 __m128i sum_above
= dc_sum_8(above
);
178 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
180 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
183 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
184 dc_store_8xh(&row
, 32, dst
, stride
);
187 void aom_dc_predictor_16x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
188 const uint8_t *above
, const uint8_t *left
) {
189 const __m128i sum_left
= dc_sum_8(left
);
190 __m128i sum_above
= dc_sum_16(above
);
191 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
193 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
196 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
197 dc_store_16xh(&row
, 8, dst
, stride
);
200 void aom_dc_predictor_16x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
201 const uint8_t *above
, const uint8_t *left
) {
202 const __m128i sum_left
= dc_sum_32(left
);
203 __m128i sum_above
= dc_sum_16(above
);
204 sum_above
= _mm_add_epi16(sum_left
, sum_above
);
206 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
209 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
210 dc_store_16xh(&row
, 32, dst
, stride
);
213 void aom_dc_predictor_16x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
214 const uint8_t *above
, const uint8_t *left
) {
215 const __m128i sum_left
= dc_sum_64(left
);
216 __m128i sum_above
= dc_sum_16(above
);
217 sum_above
= _mm_add_epi16(sum_left
, sum_above
);
219 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
222 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
223 dc_store_16xh(&row
, 64, dst
, stride
);
226 void aom_dc_predictor_32x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
227 const uint8_t *above
, const uint8_t *left
) {
228 __m128i sum_above
= dc_sum_32(above
);
229 const __m128i sum_left
= dc_sum_16(left
);
230 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
232 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
235 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
236 dc_store_32xh(&row
, 16, dst
, stride
);
239 void aom_dc_predictor_32x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
240 const uint8_t *above
, const uint8_t *left
) {
241 __m128i sum_above
= dc_sum_32(above
);
242 const __m128i sum_left
= dc_sum_64(left
);
243 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
245 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
248 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
249 dc_store_32xh(&row
, 64, dst
, stride
);
252 void aom_dc_predictor_64x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
253 const uint8_t *above
, const uint8_t *left
) {
254 __m128i sum_above
= dc_sum_64(above
);
255 const __m128i sum_left
= dc_sum_64(left
);
256 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
258 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
261 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
262 dc_store_64xh(&row
, 64, dst
, stride
);
265 void aom_dc_predictor_64x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
266 const uint8_t *above
, const uint8_t *left
) {
267 __m128i sum_above
= dc_sum_64(above
);
268 const __m128i sum_left
= dc_sum_32(left
);
269 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
271 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
274 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
275 dc_store_64xh(&row
, 32, dst
, stride
);
278 void aom_dc_predictor_64x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
279 const uint8_t *above
, const uint8_t *left
) {
280 __m128i sum_above
= dc_sum_64(above
);
281 const __m128i sum_left
= dc_sum_16(left
);
282 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
284 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
287 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
288 dc_store_64xh(&row
, 16, dst
, stride
);
291 // -----------------------------------------------------------------------------
294 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
295 const uint8_t *above
, const uint8_t *left
) {
297 __m128i sum_above
= dc_sum_4(above
);
298 const __m128i two
= _mm_set1_epi16((int16_t)2);
299 sum_above
= _mm_add_epi16(sum_above
, two
);
300 sum_above
= _mm_srai_epi16(sum_above
, 2);
301 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
302 sum_above
= _mm_packus_epi16(sum_above
, sum_above
);
304 const uint32_t pred
= _mm_cvtsi128_si32(sum_above
);
305 dc_store_4xh(pred
, 8, dst
, stride
);
308 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
309 const uint8_t *above
, const uint8_t *left
) {
311 __m128i sum_above
= dc_sum_4(above
);
312 const __m128i two
= _mm_set1_epi16((int16_t)2);
313 sum_above
= _mm_add_epi16(sum_above
, two
);
314 sum_above
= _mm_srai_epi16(sum_above
, 2);
315 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
316 sum_above
= _mm_packus_epi16(sum_above
, sum_above
);
318 const uint32_t pred
= _mm_cvtsi128_si32(sum_above
);
319 dc_store_4xh(pred
, 16, dst
, stride
);
322 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst
, ptrdiff_t stride
,
323 const uint8_t *above
, const uint8_t *left
) {
325 __m128i sum_above
= dc_sum_8(above
);
326 const __m128i four
= _mm_set1_epi16((uint16_t)4);
327 sum_above
= _mm_add_epi16(sum_above
, four
);
328 sum_above
= _mm_srai_epi16(sum_above
, 3);
329 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
330 const __m128i row
= _mm_shufflelo_epi16(sum_above
, 0);
331 dc_store_8xh(&row
, 4, dst
, stride
);
334 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
335 const uint8_t *above
, const uint8_t *left
) {
337 __m128i sum_above
= dc_sum_8(above
);
338 const __m128i four
= _mm_set1_epi16((uint16_t)4);
339 sum_above
= _mm_add_epi16(sum_above
, four
);
340 sum_above
= _mm_srai_epi16(sum_above
, 3);
341 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
342 const __m128i row
= _mm_shufflelo_epi16(sum_above
, 0);
343 dc_store_8xh(&row
, 16, dst
, stride
);
346 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
347 const uint8_t *above
, const uint8_t *left
) {
349 __m128i sum_above
= dc_sum_8(above
);
350 const __m128i four
= _mm_set1_epi16((uint16_t)4);
351 sum_above
= _mm_add_epi16(sum_above
, four
);
352 sum_above
= _mm_srai_epi16(sum_above
, 3);
353 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
354 const __m128i row
= _mm_shufflelo_epi16(sum_above
, 0);
355 dc_store_8xh(&row
, 32, dst
, stride
);
358 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
359 const uint8_t *above
, const uint8_t *left
) {
361 __m128i sum_above
= dc_sum_16(above
);
362 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
363 sum_above
= _mm_add_epi16(sum_above
, eight
);
364 sum_above
= _mm_srai_epi16(sum_above
, 4);
365 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
366 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
367 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
368 dc_store_16xh(&row
, 8, dst
, stride
);
371 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
372 const uint8_t *above
,
373 const uint8_t *left
) {
375 __m128i sum_above
= dc_sum_16(above
);
376 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
377 sum_above
= _mm_add_epi16(sum_above
, eight
);
378 sum_above
= _mm_srai_epi16(sum_above
, 4);
379 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
380 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
381 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
382 dc_store_16xh(&row
, 32, dst
, stride
);
385 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
386 const uint8_t *above
,
387 const uint8_t *left
) {
389 __m128i sum_above
= dc_sum_16(above
);
390 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
391 sum_above
= _mm_add_epi16(sum_above
, eight
);
392 sum_above
= _mm_srai_epi16(sum_above
, 4);
393 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
394 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
395 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
396 dc_store_16xh(&row
, 64, dst
, stride
);
399 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
400 const uint8_t *above
,
401 const uint8_t *left
) {
403 __m128i sum_above
= dc_sum_32(above
);
404 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
405 sum_above
= _mm_add_epi16(sum_above
, sixteen
);
406 sum_above
= _mm_srai_epi16(sum_above
, 5);
407 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
408 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
409 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
410 dc_store_32xh(&row
, 16, dst
, stride
);
413 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
414 const uint8_t *above
,
415 const uint8_t *left
) {
417 __m128i sum_above
= dc_sum_32(above
);
418 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
419 sum_above
= _mm_add_epi16(sum_above
, sixteen
);
420 sum_above
= _mm_srai_epi16(sum_above
, 5);
421 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
422 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
423 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
424 dc_store_32xh(&row
, 64, dst
, stride
);
427 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
428 const uint8_t *above
,
429 const uint8_t *left
) {
431 __m128i sum_above
= dc_sum_64(above
);
432 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
433 sum_above
= _mm_add_epi16(sum_above
, thirtytwo
);
434 sum_above
= _mm_srai_epi16(sum_above
, 6);
435 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
436 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
437 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
438 dc_store_64xh(&row
, 64, dst
, stride
);
441 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
442 const uint8_t *above
,
443 const uint8_t *left
) {
445 __m128i sum_above
= dc_sum_64(above
);
446 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
447 sum_above
= _mm_add_epi16(sum_above
, thirtytwo
);
448 sum_above
= _mm_srai_epi16(sum_above
, 6);
449 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
450 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
451 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
452 dc_store_64xh(&row
, 32, dst
, stride
);
455 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
456 const uint8_t *above
,
457 const uint8_t *left
) {
459 __m128i sum_above
= dc_sum_64(above
);
460 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
461 sum_above
= _mm_add_epi16(sum_above
, thirtytwo
);
462 sum_above
= _mm_srai_epi16(sum_above
, 6);
463 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
464 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
465 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
466 dc_store_64xh(&row
, 16, dst
, stride
);
469 // -----------------------------------------------------------------------------
472 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
473 const uint8_t *above
, const uint8_t *left
) {
475 __m128i sum_left
= dc_sum_8(left
);
476 const __m128i four
= _mm_set1_epi16((uint16_t)4);
477 sum_left
= _mm_add_epi16(sum_left
, four
);
478 sum_left
= _mm_srai_epi16(sum_left
, 3);
479 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
480 sum_left
= _mm_packus_epi16(sum_left
, sum_left
);
482 const uint32_t pred
= _mm_cvtsi128_si32(sum_left
);
483 dc_store_4xh(pred
, 8, dst
, stride
);
486 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
487 const uint8_t *above
,
488 const uint8_t *left
) {
490 __m128i sum_left
= dc_sum_16(left
);
491 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
492 sum_left
= _mm_add_epi16(sum_left
, eight
);
493 sum_left
= _mm_srai_epi16(sum_left
, 4);
494 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
495 sum_left
= _mm_packus_epi16(sum_left
, sum_left
);
497 const uint32_t pred
= _mm_cvtsi128_si32(sum_left
);
498 dc_store_4xh(pred
, 16, dst
, stride
);
501 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst
, ptrdiff_t stride
,
502 const uint8_t *above
, const uint8_t *left
) {
504 __m128i sum_left
= dc_sum_4(left
);
505 const __m128i two
= _mm_set1_epi16((uint16_t)2);
506 sum_left
= _mm_add_epi16(sum_left
, two
);
507 sum_left
= _mm_srai_epi16(sum_left
, 2);
508 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
509 const __m128i row
= _mm_shufflelo_epi16(sum_left
, 0);
510 dc_store_8xh(&row
, 4, dst
, stride
);
513 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
514 const uint8_t *above
,
515 const uint8_t *left
) {
517 __m128i sum_left
= dc_sum_16(left
);
518 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
519 sum_left
= _mm_add_epi16(sum_left
, eight
);
520 sum_left
= _mm_srai_epi16(sum_left
, 4);
521 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
522 const __m128i row
= _mm_shufflelo_epi16(sum_left
, 0);
523 dc_store_8xh(&row
, 16, dst
, stride
);
526 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
527 const uint8_t *above
,
528 const uint8_t *left
) {
530 __m128i sum_left
= dc_sum_32(left
);
531 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
532 sum_left
= _mm_add_epi16(sum_left
, sixteen
);
533 sum_left
= _mm_srai_epi16(sum_left
, 5);
534 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
535 const __m128i row
= _mm_shufflelo_epi16(sum_left
, 0);
536 dc_store_8xh(&row
, 32, dst
, stride
);
539 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
540 const uint8_t *above
,
541 const uint8_t *left
) {
543 __m128i sum_left
= dc_sum_8(left
);
544 const __m128i four
= _mm_set1_epi16((uint16_t)4);
545 sum_left
= _mm_add_epi16(sum_left
, four
);
546 sum_left
= _mm_srai_epi16(sum_left
, 3);
547 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
548 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
549 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
550 dc_store_16xh(&row
, 8, dst
, stride
);
553 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
554 const uint8_t *above
,
555 const uint8_t *left
) {
557 __m128i sum_left
= dc_sum_32(left
);
558 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
559 sum_left
= _mm_add_epi16(sum_left
, sixteen
);
560 sum_left
= _mm_srai_epi16(sum_left
, 5);
561 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
562 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
563 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
564 dc_store_16xh(&row
, 32, dst
, stride
);
567 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
568 const uint8_t *above
,
569 const uint8_t *left
) {
571 __m128i sum_left
= dc_sum_64(left
);
572 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
573 sum_left
= _mm_add_epi16(sum_left
, thirtytwo
);
574 sum_left
= _mm_srai_epi16(sum_left
, 6);
575 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
576 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
577 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
578 dc_store_16xh(&row
, 64, dst
, stride
);
581 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
582 const uint8_t *above
,
583 const uint8_t *left
) {
585 __m128i sum_left
= dc_sum_16(left
);
586 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
587 sum_left
= _mm_add_epi16(sum_left
, eight
);
588 sum_left
= _mm_srai_epi16(sum_left
, 4);
589 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
590 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
591 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
592 dc_store_32xh(&row
, 16, dst
, stride
);
595 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
596 const uint8_t *above
,
597 const uint8_t *left
) {
599 __m128i sum_left
= dc_sum_64(left
);
600 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
601 sum_left
= _mm_add_epi16(sum_left
, thirtytwo
);
602 sum_left
= _mm_srai_epi16(sum_left
, 6);
603 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
604 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
605 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
606 dc_store_32xh(&row
, 64, dst
, stride
);
609 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
610 const uint8_t *above
,
611 const uint8_t *left
) {
613 __m128i sum_left
= dc_sum_64(left
);
614 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
615 sum_left
= _mm_add_epi16(sum_left
, thirtytwo
);
616 sum_left
= _mm_srai_epi16(sum_left
, 6);
617 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
618 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
619 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
620 dc_store_64xh(&row
, 64, dst
, stride
);
623 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
624 const uint8_t *above
,
625 const uint8_t *left
) {
627 __m128i sum_left
= dc_sum_32(left
);
628 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
629 sum_left
= _mm_add_epi16(sum_left
, sixteen
);
630 sum_left
= _mm_srai_epi16(sum_left
, 5);
631 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
632 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
633 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
634 dc_store_64xh(&row
, 32, dst
, stride
);
637 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
638 const uint8_t *above
,
639 const uint8_t *left
) {
641 __m128i sum_left
= dc_sum_16(left
);
642 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
643 sum_left
= _mm_add_epi16(sum_left
, eight
);
644 sum_left
= _mm_srai_epi16(sum_left
, 4);
645 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
646 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
647 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
648 dc_store_64xh(&row
, 16, dst
, stride
);
651 // -----------------------------------------------------------------------------
// DC_128 4x8: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const uint32_t pred = 0x80808080;  // four 128 bytes
  dc_store_4xh(pred, 8, dst, stride);
}
// DC_128 4x16: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const uint32_t pred = 0x80808080;  // four 128 bytes
  dc_store_4xh(pred, 16, dst, stride);
}
// DC_128 8x4: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 4, dst, stride);
}
// DC_128 8x16: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 16, dst, stride);
}
// DC_128 8x32: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 32, dst, stride);
}
// DC_128 16x8: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 8, dst, stride);
}
// DC_128 16x32: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 32, dst, stride);
}
// DC_128 16x64: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 64, dst, stride);
}
// DC_128 32x16: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 16, dst, stride);
}
// DC_128 32x64: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 64, dst, stride);
}
// DC_128 64x64: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 64, dst, stride);
}
// DC_128 64x32: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 32, dst, stride);
}
// DC_128 64x16: no neighbors available, fill with the mid-gray value 128.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 16, dst, stride);
}
765 // -----------------------------------------------------------------------------
// V prediction 4x8: replicate the 4-byte above row into every row.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4xh(pred, 8, dst, stride);
}
// V prediction 4x16: replicate the 4-byte above row into every row.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4xh(pred, 16, dst, stride);
}
// V prediction 8x4: replicate the 8-byte above row into every row.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 4, dst, stride);
}
// V prediction 8x16: replicate the 8-byte above row into every row.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 16, dst, stride);
}
// V prediction 8x32: replicate the 8-byte above row into every row.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 32, dst, stride);
}
// V prediction 16x8: replicate the 16-byte above row into every row.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 8, dst, stride);
}
// V prediction 16x32: replicate the 16-byte above row into every row.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 32, dst, stride);
}
// V prediction 16x64: replicate the 16-byte above row into every row.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 64, dst, stride);
}
824 static INLINE
void v_predictor_32xh(uint8_t *dst
, ptrdiff_t stride
,
825 const uint8_t *above
, int height
) {
826 const __m128i row0
= _mm_load_si128((__m128i
const *)above
);
827 const __m128i row1
= _mm_load_si128((__m128i
const *)(above
+ 16));
828 for (int i
= 0; i
< height
; ++i
) {
829 _mm_store_si128((__m128i
*)dst
, row0
);
830 _mm_store_si128((__m128i
*)(dst
+ 16), row1
);
// V prediction 32x16: replicate the 32-byte above row into every row.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 16);
}
// V prediction 32x64: replicate the 32-byte above row into every row.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 64);
}
847 static INLINE
void v_predictor_64xh(uint8_t *dst
, ptrdiff_t stride
,
848 const uint8_t *above
, int height
) {
849 const __m128i row0
= _mm_load_si128((__m128i
const *)above
);
850 const __m128i row1
= _mm_load_si128((__m128i
const *)(above
+ 16));
851 const __m128i row2
= _mm_load_si128((__m128i
const *)(above
+ 32));
852 const __m128i row3
= _mm_load_si128((__m128i
const *)(above
+ 48));
853 for (int i
= 0; i
< height
; ++i
) {
854 _mm_store_si128((__m128i
*)dst
, row0
);
855 _mm_store_si128((__m128i
*)(dst
+ 16), row1
);
856 _mm_store_si128((__m128i
*)(dst
+ 32), row2
);
857 _mm_store_si128((__m128i
*)(dst
+ 48), row3
);
// V prediction 64x64: replicate the 64-byte above row into every row.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 64);
}
// V prediction 64x32: replicate the 64-byte above row into every row.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 32);
}
// V prediction 64x16: replicate the 64-byte above row into every row.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 16);
}
880 // -----------------------------------------------------------------------------
// H prediction 4x8: each output row is its left neighbor replicated 4 times.
// The 8 left bytes are widened to 16-bit lanes (byte duplicated), then each
// row is broadcast with a low-lane shuffle and stored as 4 bytes.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;
  // Rows 4-7 come from the high half of the widened left column.
  left_col = _mm_unpackhi_epi64(left_col, left_col);
  row0 = _mm_shufflelo_epi16(left_col, 0);
  row1 = _mm_shufflelo_epi16(left_col, 0x55);
  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
}
// H prediction 4x16: each output row is its left neighbor replicated 4 times.
// The 16 left bytes are widened (byte duplicated) into low/high halves; four
// rows are emitted per quarter of the widened column.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);

  // Rows 0-3 (left[0..3]).
  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;

  // Rows 4-7 (left[4..7]).
  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
  row0 = _mm_shufflelo_epi16(left_col_low, 0);
  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;

  // Rows 8-11 (left[8..11]).
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;

  // Rows 12-15 (left[12..15]).
  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
}
// Horizontal prediction, 8x4: row r of the output is filled with left[r].
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  // Double each byte so a word shuffle broadcasts one pixel over 8 bytes.
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(left_col, 0));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(left_col, 0x55));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(left_col, 0xaa));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(left_col, 0xff));
}
993 static INLINE
void h_predictor_8x16xc(uint8_t *dst
, ptrdiff_t stride
,
994 const uint8_t *above
, const uint8_t *left
,
997 for (int i
= 0; i
< count
; ++i
) {
998 const __m128i left_col
= _mm_load_si128((__m128i
const *)left
);
999 __m128i left_col_low
= _mm_unpacklo_epi8(left_col
, left_col
);
1000 __m128i left_col_high
= _mm_unpackhi_epi8(left_col
, left_col
);
1002 __m128i row0
= _mm_shufflelo_epi16(left_col_low
, 0);
1003 __m128i row1
= _mm_shufflelo_epi16(left_col_low
, 0x55);
1004 __m128i row2
= _mm_shufflelo_epi16(left_col_low
, 0xaa);
1005 __m128i row3
= _mm_shufflelo_epi16(left_col_low
, 0xff);
1006 _mm_storel_epi64((__m128i
*)dst
, row0
);
1008 _mm_storel_epi64((__m128i
*)dst
, row1
);
1010 _mm_storel_epi64((__m128i
*)dst
, row2
);
1012 _mm_storel_epi64((__m128i
*)dst
, row3
);
1015 left_col_low
= _mm_unpackhi_epi64(left_col_low
, left_col_low
);
1016 row0
= _mm_shufflelo_epi16(left_col_low
, 0);
1017 row1
= _mm_shufflelo_epi16(left_col_low
, 0x55);
1018 row2
= _mm_shufflelo_epi16(left_col_low
, 0xaa);
1019 row3
= _mm_shufflelo_epi16(left_col_low
, 0xff);
1020 _mm_storel_epi64((__m128i
*)dst
, row0
);
1022 _mm_storel_epi64((__m128i
*)dst
, row1
);
1024 _mm_storel_epi64((__m128i
*)dst
, row2
);
1026 _mm_storel_epi64((__m128i
*)dst
, row3
);
1029 row0
= _mm_shufflelo_epi16(left_col_high
, 0);
1030 row1
= _mm_shufflelo_epi16(left_col_high
, 0x55);
1031 row2
= _mm_shufflelo_epi16(left_col_high
, 0xaa);
1032 row3
= _mm_shufflelo_epi16(left_col_high
, 0xff);
1033 _mm_storel_epi64((__m128i
*)dst
, row0
);
1035 _mm_storel_epi64((__m128i
*)dst
, row1
);
1037 _mm_storel_epi64((__m128i
*)dst
, row2
);
1039 _mm_storel_epi64((__m128i
*)dst
, row3
);
1042 left_col_high
= _mm_unpackhi_epi64(left_col_high
, left_col_high
);
1043 row0
= _mm_shufflelo_epi16(left_col_high
, 0);
1044 row1
= _mm_shufflelo_epi16(left_col_high
, 0x55);
1045 row2
= _mm_shufflelo_epi16(left_col_high
, 0xaa);
1046 row3
= _mm_shufflelo_epi16(left_col_high
, 0xff);
1047 _mm_storel_epi64((__m128i
*)dst
, row0
);
1049 _mm_storel_epi64((__m128i
*)dst
, row1
);
1051 _mm_storel_epi64((__m128i
*)dst
, row2
);
1053 _mm_storel_epi64((__m128i
*)dst
, row3
);
// Horizontal prediction, 8x16: one 16-row group.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 1);
}
// Horizontal prediction, 8x32: two 16-row groups.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 2);
}
1069 static INLINE
void h_pred_store_16xh(const __m128i
*row
, int h
, uint8_t *dst
,
1072 for (i
= 0; i
< h
; ++i
) {
1073 _mm_store_si128((__m128i
*)dst
, row
[i
]);
1078 static INLINE
void repeat_low_4pixels(const __m128i
*x
, __m128i
*row
) {
1079 const __m128i u0
= _mm_shufflelo_epi16(*x
, 0);
1080 const __m128i u1
= _mm_shufflelo_epi16(*x
, 0x55);
1081 const __m128i u2
= _mm_shufflelo_epi16(*x
, 0xaa);
1082 const __m128i u3
= _mm_shufflelo_epi16(*x
, 0xff);
1084 row
[0] = _mm_unpacklo_epi64(u0
, u0
);
1085 row
[1] = _mm_unpacklo_epi64(u1
, u1
);
1086 row
[2] = _mm_unpacklo_epi64(u2
, u2
);
1087 row
[3] = _mm_unpacklo_epi64(u3
, u3
);
1090 static INLINE
void repeat_high_4pixels(const __m128i
*x
, __m128i
*row
) {
1091 const __m128i u0
= _mm_shufflehi_epi16(*x
, 0);
1092 const __m128i u1
= _mm_shufflehi_epi16(*x
, 0x55);
1093 const __m128i u2
= _mm_shufflehi_epi16(*x
, 0xaa);
1094 const __m128i u3
= _mm_shufflehi_epi16(*x
, 0xff);
1096 row
[0] = _mm_unpackhi_epi64(u0
, u0
);
1097 row
[1] = _mm_unpackhi_epi64(u1
, u1
);
1098 row
[2] = _mm_unpackhi_epi64(u2
, u2
);
1099 row
[3] = _mm_unpackhi_epi64(u3
, u3
);
1102 // Process 16x8, first 4 rows
1103 // Use first 8 bytes of left register: xxxxxxxx33221100
1104 static INLINE
void h_prediction_16x8_1(const __m128i
*left
, uint8_t *dst
,
1107 repeat_low_4pixels(left
, row
);
1108 h_pred_store_16xh(row
, 4, dst
, stride
);
1111 // Process 16x8, second 4 rows
1112 // Use second 8 bytes of left register: 77665544xxxxxxxx
1113 static INLINE
void h_prediction_16x8_2(const __m128i
*left
, uint8_t *dst
,
1116 repeat_high_4pixels(left
, row
);
1117 h_pred_store_16xh(row
, 4, dst
, stride
);
// Horizontal prediction, 16x8: each row is filled with its left pixel.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;  // advance past the 4 rows just written
  h_prediction_16x8_2(&left_col_8p, dst, stride);
}
1130 static INLINE
void h_predictor_16xh(uint8_t *dst
, ptrdiff_t stride
,
1131 const uint8_t *left
, int count
) {
1134 const __m128i left_col
= _mm_load_si128((const __m128i
*)left
);
1135 const __m128i left_col_8p_lo
= _mm_unpacklo_epi8(left_col
, left_col
);
1136 h_prediction_16x8_1(&left_col_8p_lo
, dst
, stride
);
1138 h_prediction_16x8_2(&left_col_8p_lo
, dst
, stride
);
1141 const __m128i left_col_8p_hi
= _mm_unpackhi_epi8(left_col
, left_col
);
1142 h_prediction_16x8_1(&left_col_8p_hi
, dst
, stride
);
1144 h_prediction_16x8_2(&left_col_8p_hi
, dst
, stride
);
1149 } while (i
< count
);
// Horizontal prediction, 16x32: two 16-row groups.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  h_predictor_16xh(dst, stride, left, 2);
}
// Horizontal prediction, 16x64: four 16-row groups.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  h_predictor_16xh(dst, stride, left, 4);
}
1164 static INLINE
void h_pred_store_32xh(const __m128i
*row
, int h
, uint8_t *dst
,
1167 for (i
= 0; i
< h
; ++i
) {
1168 _mm_store_si128((__m128i
*)dst
, row
[i
]);
1169 _mm_store_si128((__m128i
*)(dst
+ 16), row
[i
]);
1174 // Process 32x8, first 4 rows
1175 // Use first 8 bytes of left register: xxxxxxxx33221100
1176 static INLINE
void h_prediction_32x8_1(const __m128i
*left
, uint8_t *dst
,
1179 repeat_low_4pixels(left
, row
);
1180 h_pred_store_32xh(row
, 4, dst
, stride
);
1183 // Process 32x8, second 4 rows
1184 // Use second 8 bytes of left register: 77665544xxxxxxxx
1185 static INLINE
void h_prediction_32x8_2(const __m128i
*left
, uint8_t *dst
,
1188 repeat_high_4pixels(left
, row
);
1189 h_pred_store_32xh(row
, 4, dst
, stride
);
// Horizontal prediction, 32x16: each row is filled with its left pixel.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row

  const __m128i left_col = _mm_load_si128((const __m128i *)left);

  // Rows 0-7 come from the low 8 left pixels.
  __m128i left_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_8p, dst, stride);
  dst += stride << 2;

  // Rows 8-15 come from the high 8 left pixels.
  left_8p = _mm_unpackhi_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_8p, dst, stride);
}
1211 static INLINE
void h_predictor_32xh(uint8_t *dst
, ptrdiff_t stride
,
1212 const uint8_t *left
, int height
) {
1213 int i
= height
>> 2;
1215 __m128i left4
= _mm_cvtsi32_si128(((uint32_t *)left
)[0]);
1216 left4
= _mm_unpacklo_epi8(left4
, left4
);
1217 left4
= _mm_unpacklo_epi8(left4
, left4
);
1218 const __m128i r0
= _mm_shuffle_epi32(left4
, 0x0);
1219 const __m128i r1
= _mm_shuffle_epi32(left4
, 0x55);
1220 _mm_store_si128((__m128i
*)dst
, r0
);
1221 _mm_store_si128((__m128i
*)(dst
+ 16), r0
);
1222 _mm_store_si128((__m128i
*)(dst
+ stride
), r1
);
1223 _mm_store_si128((__m128i
*)(dst
+ stride
+ 16), r1
);
1224 const __m128i r2
= _mm_shuffle_epi32(left4
, 0xaa);
1225 const __m128i r3
= _mm_shuffle_epi32(left4
, 0xff);
1226 _mm_store_si128((__m128i
*)(dst
+ stride
* 2), r2
);
1227 _mm_store_si128((__m128i
*)(dst
+ stride
* 2 + 16), r2
);
1228 _mm_store_si128((__m128i
*)(dst
+ stride
* 3), r3
);
1229 _mm_store_si128((__m128i
*)(dst
+ stride
* 3 + 16), r3
);
// Horizontal prediction, 32x64.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  h_predictor_32xh(dst, stride, left, 64);
}
1241 static INLINE
void h_predictor_64xh(uint8_t *dst
, ptrdiff_t stride
,
1242 const uint8_t *left
, int height
) {
1243 int i
= height
>> 2;
1245 __m128i left4
= _mm_cvtsi32_si128(((uint32_t *)left
)[0]);
1246 left4
= _mm_unpacklo_epi8(left4
, left4
);
1247 left4
= _mm_unpacklo_epi8(left4
, left4
);
1248 const __m128i r0
= _mm_shuffle_epi32(left4
, 0x0);
1249 const __m128i r1
= _mm_shuffle_epi32(left4
, 0x55);
1250 _mm_store_si128((__m128i
*)dst
, r0
);
1251 _mm_store_si128((__m128i
*)(dst
+ 16), r0
);
1252 _mm_store_si128((__m128i
*)(dst
+ 32), r0
);
1253 _mm_store_si128((__m128i
*)(dst
+ 48), r0
);
1254 _mm_store_si128((__m128i
*)(dst
+ stride
), r1
);
1255 _mm_store_si128((__m128i
*)(dst
+ stride
+ 16), r1
);
1256 _mm_store_si128((__m128i
*)(dst
+ stride
+ 32), r1
);
1257 _mm_store_si128((__m128i
*)(dst
+ stride
+ 48), r1
);
1258 const __m128i r2
= _mm_shuffle_epi32(left4
, 0xaa);
1259 const __m128i r3
= _mm_shuffle_epi32(left4
, 0xff);
1260 _mm_store_si128((__m128i
*)(dst
+ stride
* 2), r2
);
1261 _mm_store_si128((__m128i
*)(dst
+ stride
* 2 + 16), r2
);
1262 _mm_store_si128((__m128i
*)(dst
+ stride
* 2 + 32), r2
);
1263 _mm_store_si128((__m128i
*)(dst
+ stride
* 2 + 48), r2
);
1264 _mm_store_si128((__m128i
*)(dst
+ stride
* 3), r3
);
1265 _mm_store_si128((__m128i
*)(dst
+ stride
* 3 + 16), r3
);
1266 _mm_store_si128((__m128i
*)(dst
+ stride
* 3 + 32), r3
);
1267 _mm_store_si128((__m128i
*)(dst
+ stride
* 3 + 48), r3
);
// Horizontal prediction, 64x64.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  h_predictor_64xh(dst, stride, left, 64);
}
// Horizontal prediction, 64x32.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  h_predictor_64xh(dst, stride, left, 32);
}
// Horizontal prediction, 64x16.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;  // horizontal prediction ignores the above row
  h_predictor_64xh(dst, stride, left, 16);
}