/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <emmintrin.h>

#include "./aom_dsp_rtcd.h"
16 static INLINE
void dc_store_4x8(uint32_t dc
, uint8_t *dst
, ptrdiff_t stride
) {
18 for (i
= 0; i
< 4; ++i
) {
19 *(uint32_t *)dst
= dc
;
21 *(uint32_t *)dst
= dc
;
26 static INLINE
void dc_store_8xh(const __m128i
*row
, int height
, uint8_t *dst
,
29 for (i
= 0; i
< height
; ++i
) {
30 _mm_storel_epi64((__m128i
*)dst
, *row
);
35 static INLINE
void dc_store_16xh(const __m128i
*row
, int height
, uint8_t *dst
,
38 for (i
= 0; i
< height
; ++i
) {
39 _mm_store_si128((__m128i
*)dst
, *row
);
44 static INLINE
void dc_store_32xh(const __m128i
*row
, int height
, uint8_t *dst
,
47 for (i
= 0; i
< height
; ++i
) {
48 _mm_store_si128((__m128i
*)dst
, *row
);
49 _mm_store_si128((__m128i
*)(dst
+ 16), *row
);
54 static INLINE
void dc_store_64xh(const __m128i
*row
, int height
, uint8_t *dst
,
56 for (int i
= 0; i
< height
; ++i
) {
57 _mm_store_si128((__m128i
*)dst
, *row
);
58 _mm_store_si128((__m128i
*)(dst
+ 16), *row
);
59 _mm_store_si128((__m128i
*)(dst
+ 32), *row
);
60 _mm_store_si128((__m128i
*)(dst
+ 48), *row
);
65 static INLINE __m128i
dc_sum_4(const uint8_t *ref
) {
66 __m128i x
= _mm_loadl_epi64((__m128i
const *)ref
);
67 const __m128i zero
= _mm_setzero_si128();
68 x
= _mm_unpacklo_epi8(x
, zero
);
69 return _mm_sad_epu8(x
, zero
);
72 static INLINE __m128i
dc_sum_8(const uint8_t *ref
) {
73 __m128i x
= _mm_loadl_epi64((__m128i
const *)ref
);
74 const __m128i zero
= _mm_setzero_si128();
75 return _mm_sad_epu8(x
, zero
);
78 static INLINE __m128i
dc_sum_16(const uint8_t *ref
) {
79 __m128i x
= _mm_load_si128((__m128i
const *)ref
);
80 const __m128i zero
= _mm_setzero_si128();
81 x
= _mm_sad_epu8(x
, zero
);
82 const __m128i high
= _mm_unpackhi_epi64(x
, x
);
83 return _mm_add_epi16(x
, high
);
86 static INLINE __m128i
dc_sum_32(const uint8_t *ref
) {
87 __m128i x0
= _mm_load_si128((__m128i
const *)ref
);
88 __m128i x1
= _mm_load_si128((__m128i
const *)(ref
+ 16));
89 const __m128i zero
= _mm_setzero_si128();
90 x0
= _mm_sad_epu8(x0
, zero
);
91 x1
= _mm_sad_epu8(x1
, zero
);
92 x0
= _mm_add_epi16(x0
, x1
);
93 const __m128i high
= _mm_unpackhi_epi64(x0
, x0
);
94 return _mm_add_epi16(x0
, high
);
97 static INLINE __m128i
dc_sum_64(const uint8_t *ref
) {
98 __m128i x0
= _mm_load_si128((__m128i
const *)ref
);
99 __m128i x1
= _mm_load_si128((__m128i
const *)(ref
+ 16));
100 __m128i x2
= _mm_load_si128((__m128i
const *)(ref
+ 32));
101 __m128i x3
= _mm_load_si128((__m128i
const *)(ref
+ 48));
102 const __m128i zero
= _mm_setzero_si128();
103 x0
= _mm_sad_epu8(x0
, zero
);
104 x1
= _mm_sad_epu8(x1
, zero
);
105 x2
= _mm_sad_epu8(x2
, zero
);
106 x3
= _mm_sad_epu8(x3
, zero
);
107 x0
= _mm_add_epi16(x0
, x1
);
108 x2
= _mm_add_epi16(x2
, x3
);
109 x0
= _mm_add_epi16(x0
, x2
);
110 const __m128i high
= _mm_unpackhi_epi64(x0
, x0
);
111 return _mm_add_epi16(x0
, high
);
// -----------------------------------------------------------------------------
117 void aom_dc_predictor_4x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
118 const uint8_t *above
, const uint8_t *left
) {
119 const __m128i sum_left
= dc_sum_8(left
);
120 __m128i sum_above
= dc_sum_4(above
);
121 sum_above
= _mm_add_epi16(sum_left
, sum_above
);
123 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
127 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
128 const uint32_t pred
= _mm_cvtsi128_si32(row
);
129 dc_store_4x8(pred
, dst
, stride
);
132 void aom_dc_predictor_8x4_sse2(uint8_t *dst
, ptrdiff_t stride
,
133 const uint8_t *above
, const uint8_t *left
) {
134 const __m128i sum_left
= dc_sum_4(left
);
135 __m128i sum_above
= dc_sum_8(above
);
136 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
138 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
142 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
143 dc_store_8xh(&row
, 4, dst
, stride
);
146 void aom_dc_predictor_8x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
147 const uint8_t *above
, const uint8_t *left
) {
148 const __m128i sum_left
= dc_sum_16(left
);
149 __m128i sum_above
= dc_sum_8(above
);
150 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
152 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
155 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
156 dc_store_8xh(&row
, 16, dst
, stride
);
159 void aom_dc_predictor_16x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
160 const uint8_t *above
, const uint8_t *left
) {
161 const __m128i sum_left
= dc_sum_8(left
);
162 __m128i sum_above
= dc_sum_16(above
);
163 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
165 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
168 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
169 dc_store_16xh(&row
, 8, dst
, stride
);
172 void aom_dc_predictor_16x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
173 const uint8_t *above
, const uint8_t *left
) {
174 const __m128i sum_left
= dc_sum_32(left
);
175 __m128i sum_above
= dc_sum_16(above
);
176 sum_above
= _mm_add_epi16(sum_left
, sum_above
);
178 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
181 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
182 dc_store_16xh(&row
, 32, dst
, stride
);
185 void aom_dc_predictor_32x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
186 const uint8_t *above
, const uint8_t *left
) {
187 __m128i sum_above
= dc_sum_32(above
);
188 const __m128i sum_left
= dc_sum_16(left
);
189 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
191 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
194 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
195 dc_store_32xh(&row
, 16, dst
, stride
);
198 void aom_dc_predictor_32x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
199 const uint8_t *above
, const uint8_t *left
) {
200 __m128i sum_above
= dc_sum_32(above
);
201 const __m128i sum_left
= dc_sum_64(left
);
202 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
204 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
207 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
208 dc_store_32xh(&row
, 64, dst
, stride
);
211 void aom_dc_predictor_64x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
212 const uint8_t *above
, const uint8_t *left
) {
213 __m128i sum_above
= dc_sum_64(above
);
214 const __m128i sum_left
= dc_sum_64(left
);
215 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
217 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
220 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
221 dc_store_64xh(&row
, 64, dst
, stride
);
224 void aom_dc_predictor_64x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
225 const uint8_t *above
, const uint8_t *left
) {
226 __m128i sum_above
= dc_sum_64(above
);
227 const __m128i sum_left
= dc_sum_32(left
);
228 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
230 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
233 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
234 dc_store_64xh(&row
, 32, dst
, stride
);
237 void aom_dc_predictor_64x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
238 const uint8_t *above
, const uint8_t *left
) {
239 __m128i sum_above
= dc_sum_64(above
);
240 const __m128i sum_left
= dc_sum_16(left
);
241 sum_above
= _mm_add_epi16(sum_above
, sum_left
);
243 uint32_t sum
= _mm_cvtsi128_si32(sum_above
);
246 const __m128i row
= _mm_set1_epi8((uint8_t)sum
);
247 dc_store_64xh(&row
, 16, dst
, stride
);
// -----------------------------------------------------------------------------
253 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
254 const uint8_t *above
, const uint8_t *left
) {
256 __m128i sum_above
= dc_sum_4(above
);
257 const __m128i two
= _mm_set1_epi16((int16_t)2);
258 sum_above
= _mm_add_epi16(sum_above
, two
);
259 sum_above
= _mm_srai_epi16(sum_above
, 2);
260 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
261 sum_above
= _mm_packus_epi16(sum_above
, sum_above
);
263 const uint32_t pred
= _mm_cvtsi128_si32(sum_above
);
264 dc_store_4x8(pred
, dst
, stride
);
267 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst
, ptrdiff_t stride
,
268 const uint8_t *above
, const uint8_t *left
) {
270 __m128i sum_above
= dc_sum_8(above
);
271 const __m128i four
= _mm_set1_epi16((uint16_t)4);
272 sum_above
= _mm_add_epi16(sum_above
, four
);
273 sum_above
= _mm_srai_epi16(sum_above
, 3);
274 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
275 const __m128i row
= _mm_shufflelo_epi16(sum_above
, 0);
276 dc_store_8xh(&row
, 4, dst
, stride
);
279 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
280 const uint8_t *above
, const uint8_t *left
) {
282 __m128i sum_above
= dc_sum_8(above
);
283 const __m128i four
= _mm_set1_epi16((uint16_t)4);
284 sum_above
= _mm_add_epi16(sum_above
, four
);
285 sum_above
= _mm_srai_epi16(sum_above
, 3);
286 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
287 const __m128i row
= _mm_shufflelo_epi16(sum_above
, 0);
288 dc_store_8xh(&row
, 16, dst
, stride
);
291 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
292 const uint8_t *above
, const uint8_t *left
) {
294 __m128i sum_above
= dc_sum_16(above
);
295 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
296 sum_above
= _mm_add_epi16(sum_above
, eight
);
297 sum_above
= _mm_srai_epi16(sum_above
, 4);
298 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
299 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
300 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
301 dc_store_16xh(&row
, 8, dst
, stride
);
304 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
305 const uint8_t *above
,
306 const uint8_t *left
) {
308 __m128i sum_above
= dc_sum_16(above
);
309 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
310 sum_above
= _mm_add_epi16(sum_above
, eight
);
311 sum_above
= _mm_srai_epi16(sum_above
, 4);
312 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
313 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
314 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
315 dc_store_16xh(&row
, 32, dst
, stride
);
318 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
319 const uint8_t *above
,
320 const uint8_t *left
) {
322 __m128i sum_above
= dc_sum_32(above
);
323 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
324 sum_above
= _mm_add_epi16(sum_above
, sixteen
);
325 sum_above
= _mm_srai_epi16(sum_above
, 5);
326 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
327 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
328 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
329 dc_store_32xh(&row
, 16, dst
, stride
);
332 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
333 const uint8_t *above
,
334 const uint8_t *left
) {
336 __m128i sum_above
= dc_sum_32(above
);
337 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
338 sum_above
= _mm_add_epi16(sum_above
, sixteen
);
339 sum_above
= _mm_srai_epi16(sum_above
, 5);
340 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
341 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
342 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
343 dc_store_32xh(&row
, 64, dst
, stride
);
346 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
347 const uint8_t *above
,
348 const uint8_t *left
) {
350 __m128i sum_above
= dc_sum_64(above
);
351 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
352 sum_above
= _mm_add_epi16(sum_above
, thirtytwo
);
353 sum_above
= _mm_srai_epi16(sum_above
, 6);
354 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
355 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
356 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
357 dc_store_64xh(&row
, 64, dst
, stride
);
360 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
361 const uint8_t *above
,
362 const uint8_t *left
) {
364 __m128i sum_above
= dc_sum_64(above
);
365 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
366 sum_above
= _mm_add_epi16(sum_above
, thirtytwo
);
367 sum_above
= _mm_srai_epi16(sum_above
, 6);
368 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
369 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
370 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
371 dc_store_64xh(&row
, 32, dst
, stride
);
374 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
375 const uint8_t *above
,
376 const uint8_t *left
) {
378 __m128i sum_above
= dc_sum_64(above
);
379 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
380 sum_above
= _mm_add_epi16(sum_above
, thirtytwo
);
381 sum_above
= _mm_srai_epi16(sum_above
, 6);
382 sum_above
= _mm_unpacklo_epi8(sum_above
, sum_above
);
383 sum_above
= _mm_shufflelo_epi16(sum_above
, 0);
384 const __m128i row
= _mm_unpacklo_epi64(sum_above
, sum_above
);
385 dc_store_64xh(&row
, 16, dst
, stride
);
// -----------------------------------------------------------------------------
391 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
392 const uint8_t *above
, const uint8_t *left
) {
394 __m128i sum_left
= dc_sum_8(left
);
395 const __m128i four
= _mm_set1_epi16((uint16_t)4);
396 sum_left
= _mm_add_epi16(sum_left
, four
);
397 sum_left
= _mm_srai_epi16(sum_left
, 3);
398 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
399 sum_left
= _mm_packus_epi16(sum_left
, sum_left
);
401 const uint32_t pred
= _mm_cvtsi128_si32(sum_left
);
402 dc_store_4x8(pred
, dst
, stride
);
405 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst
, ptrdiff_t stride
,
406 const uint8_t *above
, const uint8_t *left
) {
408 __m128i sum_left
= dc_sum_4(left
);
409 const __m128i two
= _mm_set1_epi16((uint16_t)2);
410 sum_left
= _mm_add_epi16(sum_left
, two
);
411 sum_left
= _mm_srai_epi16(sum_left
, 2);
412 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
413 const __m128i row
= _mm_shufflelo_epi16(sum_left
, 0);
414 dc_store_8xh(&row
, 4, dst
, stride
);
417 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
418 const uint8_t *above
,
419 const uint8_t *left
) {
421 __m128i sum_left
= dc_sum_16(left
);
422 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
423 sum_left
= _mm_add_epi16(sum_left
, eight
);
424 sum_left
= _mm_srai_epi16(sum_left
, 4);
425 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
426 const __m128i row
= _mm_shufflelo_epi16(sum_left
, 0);
427 dc_store_8xh(&row
, 16, dst
, stride
);
430 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst
, ptrdiff_t stride
,
431 const uint8_t *above
,
432 const uint8_t *left
) {
434 __m128i sum_left
= dc_sum_8(left
);
435 const __m128i four
= _mm_set1_epi16((uint16_t)4);
436 sum_left
= _mm_add_epi16(sum_left
, four
);
437 sum_left
= _mm_srai_epi16(sum_left
, 3);
438 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
439 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
440 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
441 dc_store_16xh(&row
, 8, dst
, stride
);
444 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
445 const uint8_t *above
,
446 const uint8_t *left
) {
448 __m128i sum_left
= dc_sum_32(left
);
449 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
450 sum_left
= _mm_add_epi16(sum_left
, sixteen
);
451 sum_left
= _mm_srai_epi16(sum_left
, 5);
452 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
453 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
454 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
455 dc_store_16xh(&row
, 32, dst
, stride
);
458 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
459 const uint8_t *above
,
460 const uint8_t *left
) {
462 __m128i sum_left
= dc_sum_16(left
);
463 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
464 sum_left
= _mm_add_epi16(sum_left
, eight
);
465 sum_left
= _mm_srai_epi16(sum_left
, 4);
466 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
467 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
468 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
469 dc_store_32xh(&row
, 16, dst
, stride
);
472 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
473 const uint8_t *above
,
474 const uint8_t *left
) {
476 __m128i sum_left
= dc_sum_64(left
);
477 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
478 sum_left
= _mm_add_epi16(sum_left
, thirtytwo
);
479 sum_left
= _mm_srai_epi16(sum_left
, 6);
480 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
481 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
482 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
483 dc_store_32xh(&row
, 64, dst
, stride
);
486 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst
, ptrdiff_t stride
,
487 const uint8_t *above
,
488 const uint8_t *left
) {
490 __m128i sum_left
= dc_sum_64(left
);
491 const __m128i thirtytwo
= _mm_set1_epi16((uint16_t)32);
492 sum_left
= _mm_add_epi16(sum_left
, thirtytwo
);
493 sum_left
= _mm_srai_epi16(sum_left
, 6);
494 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
495 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
496 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
497 dc_store_64xh(&row
, 64, dst
, stride
);
500 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst
, ptrdiff_t stride
,
501 const uint8_t *above
,
502 const uint8_t *left
) {
504 __m128i sum_left
= dc_sum_32(left
);
505 const __m128i sixteen
= _mm_set1_epi16((uint16_t)16);
506 sum_left
= _mm_add_epi16(sum_left
, sixteen
);
507 sum_left
= _mm_srai_epi16(sum_left
, 5);
508 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
509 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
510 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
511 dc_store_64xh(&row
, 32, dst
, stride
);
514 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst
, ptrdiff_t stride
,
515 const uint8_t *above
,
516 const uint8_t *left
) {
518 __m128i sum_left
= dc_sum_16(left
);
519 const __m128i eight
= _mm_set1_epi16((uint16_t)8);
520 sum_left
= _mm_add_epi16(sum_left
, eight
);
521 sum_left
= _mm_srai_epi16(sum_left
, 4);
522 sum_left
= _mm_unpacklo_epi8(sum_left
, sum_left
);
523 sum_left
= _mm_shufflelo_epi16(sum_left
, 0);
524 const __m128i row
= _mm_unpacklo_epi64(sum_left
, sum_left
);
525 dc_store_64xh(&row
, 16, dst
, stride
);
// -----------------------------------------------------------------------------
// DC_128 predictor, 4x8: fills the block with the constant 128 (neither
// neighbor array is used).
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const uint32_t pred = 0x80808080;
  dc_store_4x8(pred, dst, stride);
}
// DC_128 predictor, 8x4: fills the block with the constant 128.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 4, dst, stride);
}
// DC_128 predictor, 8x16: fills the block with the constant 128.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 16, dst, stride);
}
// DC_128 predictor, 16x8: fills the block with the constant 128.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 8, dst, stride);
}
// DC_128 predictor, 16x32: fills the block with the constant 128.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 32, dst, stride);
}
// DC_128 predictor, 32x16: fills the block with the constant 128.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 16, dst, stride);
}
// DC_128 predictor, 32x64: fills the block with the constant 128.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 64, dst, stride);
}
// DC_128 predictor, 64x64: fills the block with the constant 128.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 64, dst, stride);
}
// DC_128 predictor, 64x32: fills the block with the constant 128.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 32, dst, stride);
}
// DC_128 predictor, 64x16: fills the block with the constant 128.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 16, dst, stride);
}
// -----------------------------------------------------------------------------
// V predictor, 4x8: copies the 4 above pixels into every row; `left` unused.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4x8(pred, dst, stride);
}
// V predictor, 8x4: copies the 8 above pixels into every row; `left` unused.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 4, dst, stride);
}
// V predictor, 8x16: copies the 8 above pixels into every row; `left` unused.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 16, dst, stride);
}
// V predictor, 16x8: copies the 16 above pixels into every row; `left` unused.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 8, dst, stride);
}
// V predictor, 16x32: copies the 16 above pixels into every row; `left` unused.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 32, dst, stride);
}
// V predictor, 32x16: copies the 32 above pixels (two 16-byte registers)
// into each of the 16 rows; `left` is unused.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row0 = _mm_load_si128((__m128i const *)above);
  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
  (void)left;
  int i;
  for (i = 0; i < 16; ++i) {
    _mm_store_si128((__m128i *)dst, row0);
    _mm_store_si128((__m128i *)(dst + 16), row1);
    dst += stride;
  }
}
// -----------------------------------------------------------------------------
// H predictor, 4x8: each output row is its left-neighbor pixel replicated
// across the 4 columns; `above` is unused.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Duplicate each left pixel into a 16-bit lane, then broadcast one lane
  // per row with shufflelo.
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;
  // Rows 4-7 come from the upper 4 duplicated left pixels.
  left_col = _mm_unpackhi_epi64(left_col, left_col);
  row0 = _mm_shufflelo_epi16(left_col, 0);
  row1 = _mm_shufflelo_epi16(left_col, 0x55);
  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
}
// H predictor, 8x4: each output row is its left-neighbor pixel replicated
// across the 8 columns; `above` is unused.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
// H predictor, 8x16: each output row is its left-neighbor pixel replicated
// across the 8 columns; `above` is unused. Processes 16 rows as four
// 4-row groups driven by the duplicated low/high halves of `left`.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);

  // Rows 0-3.
  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst += stride;

  // Rows 4-7.
  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
  row0 = _mm_shufflelo_epi16(left_col_low, 0);
  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst += stride;

  // Rows 8-11.
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst += stride;

  // Rows 12-15.
  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
781 static INLINE
void h_pred_store_16xh(const __m128i
*row
, int h
, uint8_t *dst
,
784 for (i
= 0; i
< h
; ++i
) {
785 _mm_store_si128((__m128i
*)dst
, row
[i
]);
790 static INLINE
void repeat_low_4pixels(const __m128i
*x
, __m128i
*row
) {
791 const __m128i u0
= _mm_shufflelo_epi16(*x
, 0);
792 const __m128i u1
= _mm_shufflelo_epi16(*x
, 0x55);
793 const __m128i u2
= _mm_shufflelo_epi16(*x
, 0xaa);
794 const __m128i u3
= _mm_shufflelo_epi16(*x
, 0xff);
796 row
[0] = _mm_unpacklo_epi64(u0
, u0
);
797 row
[1] = _mm_unpacklo_epi64(u1
, u1
);
798 row
[2] = _mm_unpacklo_epi64(u2
, u2
);
799 row
[3] = _mm_unpacklo_epi64(u3
, u3
);
802 static INLINE
void repeat_high_4pixels(const __m128i
*x
, __m128i
*row
) {
803 const __m128i u0
= _mm_shufflehi_epi16(*x
, 0);
804 const __m128i u1
= _mm_shufflehi_epi16(*x
, 0x55);
805 const __m128i u2
= _mm_shufflehi_epi16(*x
, 0xaa);
806 const __m128i u3
= _mm_shufflehi_epi16(*x
, 0xff);
808 row
[0] = _mm_unpackhi_epi64(u0
, u0
);
809 row
[1] = _mm_unpackhi_epi64(u1
, u1
);
810 row
[2] = _mm_unpackhi_epi64(u2
, u2
);
811 row
[3] = _mm_unpackhi_epi64(u3
, u3
);
814 // Process 16x8, first 4 rows
815 // Use first 8 bytes of left register: xxxxxxxx33221100
816 static INLINE
void h_prediction_16x8_1(const __m128i
*left
, uint8_t *dst
,
819 repeat_low_4pixels(left
, row
);
820 h_pred_store_16xh(row
, 4, dst
, stride
);
823 // Process 16x8, second 4 rows
824 // Use second 8 bytes of left register: 77665544xxxxxxxx
825 static INLINE
void h_prediction_16x8_2(const __m128i
*left
, uint8_t *dst
,
828 repeat_high_4pixels(left
, row
);
829 h_pred_store_16xh(row
, 4, dst
, stride
);
// H predictor, 16x8: replicates each of the 8 left pixels across its row;
// `above` is unused.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
  dst += stride * 4;
  h_prediction_16x8_2(&left_col_8p, dst, stride);
}
// H predictor, 16x32: replicates each of the 32 left pixels across its row;
// `above` is unused. Handles 16 left pixels (16 rows) per loop iteration.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  __m128i left_col, left_col_8p;
  (void)above;
  int i = 0;

  do {
    left_col = _mm_load_si128((const __m128i *)left);
    left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
    h_prediction_16x8_1(&left_col_8p, dst, stride);
    dst += stride * 4;
    h_prediction_16x8_2(&left_col_8p, dst, stride);
    dst += stride * 4;

    left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
    h_prediction_16x8_1(&left_col_8p, dst, stride);
    dst += stride * 4;
    h_prediction_16x8_2(&left_col_8p, dst, stride);
    dst += stride * 4;

    left += 16;
    i++;
  } while (i < 2);
}
867 static INLINE
void h_pred_store_32xh(const __m128i
*row
, int h
, uint8_t *dst
,
870 for (i
= 0; i
< h
; ++i
) {
871 _mm_store_si128((__m128i
*)dst
, row
[i
]);
872 _mm_store_si128((__m128i
*)(dst
+ 16), row
[i
]);
877 // Process 32x8, first 4 rows
878 // Use first 8 bytes of left register: xxxxxxxx33221100
879 static INLINE
void h_prediction_32x8_1(const __m128i
*left
, uint8_t *dst
,
882 repeat_low_4pixels(left
, row
);
883 h_pred_store_32xh(row
, 4, dst
, stride
);
886 // Process 32x8, second 4 rows
887 // Use second 8 bytes of left register: 77665544xxxxxxxx
888 static INLINE
void h_prediction_32x8_2(const __m128i
*left
, uint8_t *dst
,
891 repeat_high_4pixels(left
, row
);
892 h_pred_store_32xh(row
, 4, dst
, stride
);
// H predictor, 32x16: replicates each of the 16 left pixels across its row;
// `above` is unused.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  __m128i left_col, left_col_8p;
  (void)above;

  left_col = _mm_load_si128((const __m128i *)left);

  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride * 4;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
  dst += stride * 4;

  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride * 4;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
}