// aom_dsp/x86/intrapred_sse2.c
// SSE2 intra predictors (DC / DC_TOP / DC_LEFT / DC_128 / V / H), including
// the 8x32 variants.
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <emmintrin.h>
#include <string.h>

#include "./aom_dsp_rtcd.h"
16 static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
17 ptrdiff_t stride) {
18 for (int i = 0; i < height; i += 2) {
19 *(uint32_t *)dst = dc;
20 dst += stride;
21 *(uint32_t *)dst = dc;
22 dst += stride;
26 static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
27 ptrdiff_t stride) {
28 int i;
29 for (i = 0; i < height; ++i) {
30 _mm_storel_epi64((__m128i *)dst, *row);
31 dst += stride;
35 static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
36 ptrdiff_t stride) {
37 int i;
38 for (i = 0; i < height; ++i) {
39 _mm_store_si128((__m128i *)dst, *row);
40 dst += stride;
44 static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
45 ptrdiff_t stride) {
46 int i;
47 for (i = 0; i < height; ++i) {
48 _mm_store_si128((__m128i *)dst, *row);
49 _mm_store_si128((__m128i *)(dst + 16), *row);
50 dst += stride;
54 static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
55 ptrdiff_t stride) {
56 for (int i = 0; i < height; ++i) {
57 _mm_store_si128((__m128i *)dst, *row);
58 _mm_store_si128((__m128i *)(dst + 16), *row);
59 _mm_store_si128((__m128i *)(dst + 32), *row);
60 _mm_store_si128((__m128i *)(dst + 48), *row);
61 dst += stride;
65 static INLINE __m128i dc_sum_4(const uint8_t *ref) {
66 __m128i x = _mm_loadl_epi64((__m128i const *)ref);
67 const __m128i zero = _mm_setzero_si128();
68 x = _mm_unpacklo_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);
72 static INLINE __m128i dc_sum_8(const uint8_t *ref) {
73 __m128i x = _mm_loadl_epi64((__m128i const *)ref);
74 const __m128i zero = _mm_setzero_si128();
75 return _mm_sad_epu8(x, zero);
78 static INLINE __m128i dc_sum_16(const uint8_t *ref) {
79 __m128i x = _mm_load_si128((__m128i const *)ref);
80 const __m128i zero = _mm_setzero_si128();
81 x = _mm_sad_epu8(x, zero);
82 const __m128i high = _mm_unpackhi_epi64(x, x);
83 return _mm_add_epi16(x, high);
86 static INLINE __m128i dc_sum_32(const uint8_t *ref) {
87 __m128i x0 = _mm_load_si128((__m128i const *)ref);
88 __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
89 const __m128i zero = _mm_setzero_si128();
90 x0 = _mm_sad_epu8(x0, zero);
91 x1 = _mm_sad_epu8(x1, zero);
92 x0 = _mm_add_epi16(x0, x1);
93 const __m128i high = _mm_unpackhi_epi64(x0, x0);
94 return _mm_add_epi16(x0, high);
97 static INLINE __m128i dc_sum_64(const uint8_t *ref) {
98 __m128i x0 = _mm_load_si128((__m128i const *)ref);
99 __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
100 __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
101 __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
102 const __m128i zero = _mm_setzero_si128();
103 x0 = _mm_sad_epu8(x0, zero);
104 x1 = _mm_sad_epu8(x1, zero);
105 x2 = _mm_sad_epu8(x2, zero);
106 x3 = _mm_sad_epu8(x3, zero);
107 x0 = _mm_add_epi16(x0, x1);
108 x2 = _mm_add_epi16(x2, x3);
109 x0 = _mm_add_epi16(x0, x2);
110 const __m128i high = _mm_unpackhi_epi64(x0, x0);
111 return _mm_add_epi16(x0, high);
// -----------------------------------------------------------------------------
// DC_PRED
117 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
118 const uint8_t *above, const uint8_t *left) {
119 const __m128i sum_left = dc_sum_8(left);
120 __m128i sum_above = dc_sum_4(above);
121 sum_above = _mm_add_epi16(sum_left, sum_above);
123 uint32_t sum = _mm_cvtsi128_si32(sum_above);
124 sum += 6;
125 sum /= 12;
127 const __m128i row = _mm_set1_epi8((uint8_t)sum);
128 const uint32_t pred = _mm_cvtsi128_si32(row);
129 dc_store_4xh(pred, 8, dst, stride);
132 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
133 const uint8_t *above, const uint8_t *left) {
134 const __m128i sum_left = dc_sum_16(left);
135 __m128i sum_above = dc_sum_4(above);
136 sum_above = _mm_add_epi16(sum_left, sum_above);
138 uint32_t sum = _mm_cvtsi128_si32(sum_above);
139 sum += 10;
140 sum /= 20;
142 const __m128i row = _mm_set1_epi8((uint8_t)sum);
143 const uint32_t pred = _mm_cvtsi128_si32(row);
144 dc_store_4xh(pred, 16, dst, stride);
147 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
148 const uint8_t *above, const uint8_t *left) {
149 const __m128i sum_left = dc_sum_4(left);
150 __m128i sum_above = dc_sum_8(above);
151 sum_above = _mm_add_epi16(sum_above, sum_left);
153 uint32_t sum = _mm_cvtsi128_si32(sum_above);
154 sum += 6;
155 sum /= 12;
157 const __m128i row = _mm_set1_epi8((uint8_t)sum);
158 dc_store_8xh(&row, 4, dst, stride);
161 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
162 const uint8_t *above, const uint8_t *left) {
163 const __m128i sum_left = dc_sum_16(left);
164 __m128i sum_above = dc_sum_8(above);
165 sum_above = _mm_add_epi16(sum_above, sum_left);
167 uint32_t sum = _mm_cvtsi128_si32(sum_above);
168 sum += 12;
169 sum /= 24;
170 const __m128i row = _mm_set1_epi8((uint8_t)sum);
171 dc_store_8xh(&row, 16, dst, stride);
174 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
175 const uint8_t *above, const uint8_t *left) {
176 const __m128i sum_left = dc_sum_32(left);
177 __m128i sum_above = dc_sum_8(above);
178 sum_above = _mm_add_epi16(sum_above, sum_left);
180 uint32_t sum = _mm_cvtsi128_si32(sum_above);
181 sum += 20;
182 sum /= 40;
183 const __m128i row = _mm_set1_epi8((uint8_t)sum);
184 dc_store_8xh(&row, 32, dst, stride);
187 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
188 const uint8_t *above, const uint8_t *left) {
189 const __m128i sum_left = dc_sum_8(left);
190 __m128i sum_above = dc_sum_16(above);
191 sum_above = _mm_add_epi16(sum_above, sum_left);
193 uint32_t sum = _mm_cvtsi128_si32(sum_above);
194 sum += 12;
195 sum /= 24;
196 const __m128i row = _mm_set1_epi8((uint8_t)sum);
197 dc_store_16xh(&row, 8, dst, stride);
200 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
201 const uint8_t *above, const uint8_t *left) {
202 const __m128i sum_left = dc_sum_32(left);
203 __m128i sum_above = dc_sum_16(above);
204 sum_above = _mm_add_epi16(sum_left, sum_above);
206 uint32_t sum = _mm_cvtsi128_si32(sum_above);
207 sum += 24;
208 sum /= 48;
209 const __m128i row = _mm_set1_epi8((uint8_t)sum);
210 dc_store_16xh(&row, 32, dst, stride);
213 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
214 const uint8_t *above, const uint8_t *left) {
215 const __m128i sum_left = dc_sum_64(left);
216 __m128i sum_above = dc_sum_16(above);
217 sum_above = _mm_add_epi16(sum_left, sum_above);
219 uint32_t sum = _mm_cvtsi128_si32(sum_above);
220 sum += 40;
221 sum /= 80;
222 const __m128i row = _mm_set1_epi8((uint8_t)sum);
223 dc_store_16xh(&row, 64, dst, stride);
226 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
227 const uint8_t *above, const uint8_t *left) {
228 __m128i sum_above = dc_sum_32(above);
229 const __m128i sum_left = dc_sum_16(left);
230 sum_above = _mm_add_epi16(sum_above, sum_left);
232 uint32_t sum = _mm_cvtsi128_si32(sum_above);
233 sum += 24;
234 sum /= 48;
235 const __m128i row = _mm_set1_epi8((uint8_t)sum);
236 dc_store_32xh(&row, 16, dst, stride);
239 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
240 const uint8_t *above, const uint8_t *left) {
241 __m128i sum_above = dc_sum_32(above);
242 const __m128i sum_left = dc_sum_64(left);
243 sum_above = _mm_add_epi16(sum_above, sum_left);
245 uint32_t sum = _mm_cvtsi128_si32(sum_above);
246 sum += 48;
247 sum /= 96;
248 const __m128i row = _mm_set1_epi8((uint8_t)sum);
249 dc_store_32xh(&row, 64, dst, stride);
252 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
253 const uint8_t *above, const uint8_t *left) {
254 __m128i sum_above = dc_sum_64(above);
255 const __m128i sum_left = dc_sum_64(left);
256 sum_above = _mm_add_epi16(sum_above, sum_left);
258 uint32_t sum = _mm_cvtsi128_si32(sum_above);
259 sum += 64;
260 sum /= 128;
261 const __m128i row = _mm_set1_epi8((uint8_t)sum);
262 dc_store_64xh(&row, 64, dst, stride);
265 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
266 const uint8_t *above, const uint8_t *left) {
267 __m128i sum_above = dc_sum_64(above);
268 const __m128i sum_left = dc_sum_32(left);
269 sum_above = _mm_add_epi16(sum_above, sum_left);
271 uint32_t sum = _mm_cvtsi128_si32(sum_above);
272 sum += 48;
273 sum /= 96;
274 const __m128i row = _mm_set1_epi8((uint8_t)sum);
275 dc_store_64xh(&row, 32, dst, stride);
278 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
279 const uint8_t *above, const uint8_t *left) {
280 __m128i sum_above = dc_sum_64(above);
281 const __m128i sum_left = dc_sum_16(left);
282 sum_above = _mm_add_epi16(sum_above, sum_left);
284 uint32_t sum = _mm_cvtsi128_si32(sum_above);
285 sum += 40;
286 sum /= 80;
287 const __m128i row = _mm_set1_epi8((uint8_t)sum);
288 dc_store_64xh(&row, 16, dst, stride);
// -----------------------------------------------------------------------------
// DC_TOP
294 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
295 const uint8_t *above, const uint8_t *left) {
296 (void)left;
297 __m128i sum_above = dc_sum_4(above);
298 const __m128i two = _mm_set1_epi16((int16_t)2);
299 sum_above = _mm_add_epi16(sum_above, two);
300 sum_above = _mm_srai_epi16(sum_above, 2);
301 sum_above = _mm_shufflelo_epi16(sum_above, 0);
302 sum_above = _mm_packus_epi16(sum_above, sum_above);
304 const uint32_t pred = _mm_cvtsi128_si32(sum_above);
305 dc_store_4xh(pred, 8, dst, stride);
308 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
309 const uint8_t *above, const uint8_t *left) {
310 (void)left;
311 __m128i sum_above = dc_sum_4(above);
312 const __m128i two = _mm_set1_epi16((int16_t)2);
313 sum_above = _mm_add_epi16(sum_above, two);
314 sum_above = _mm_srai_epi16(sum_above, 2);
315 sum_above = _mm_shufflelo_epi16(sum_above, 0);
316 sum_above = _mm_packus_epi16(sum_above, sum_above);
318 const uint32_t pred = _mm_cvtsi128_si32(sum_above);
319 dc_store_4xh(pred, 16, dst, stride);
322 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
323 const uint8_t *above, const uint8_t *left) {
324 (void)left;
325 __m128i sum_above = dc_sum_8(above);
326 const __m128i four = _mm_set1_epi16((uint16_t)4);
327 sum_above = _mm_add_epi16(sum_above, four);
328 sum_above = _mm_srai_epi16(sum_above, 3);
329 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
330 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
331 dc_store_8xh(&row, 4, dst, stride);
334 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
335 const uint8_t *above, const uint8_t *left) {
336 (void)left;
337 __m128i sum_above = dc_sum_8(above);
338 const __m128i four = _mm_set1_epi16((uint16_t)4);
339 sum_above = _mm_add_epi16(sum_above, four);
340 sum_above = _mm_srai_epi16(sum_above, 3);
341 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
342 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
343 dc_store_8xh(&row, 16, dst, stride);
346 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
347 const uint8_t *above, const uint8_t *left) {
348 (void)left;
349 __m128i sum_above = dc_sum_8(above);
350 const __m128i four = _mm_set1_epi16((uint16_t)4);
351 sum_above = _mm_add_epi16(sum_above, four);
352 sum_above = _mm_srai_epi16(sum_above, 3);
353 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
354 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
355 dc_store_8xh(&row, 32, dst, stride);
358 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
359 const uint8_t *above, const uint8_t *left) {
360 (void)left;
361 __m128i sum_above = dc_sum_16(above);
362 const __m128i eight = _mm_set1_epi16((uint16_t)8);
363 sum_above = _mm_add_epi16(sum_above, eight);
364 sum_above = _mm_srai_epi16(sum_above, 4);
365 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
366 sum_above = _mm_shufflelo_epi16(sum_above, 0);
367 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
368 dc_store_16xh(&row, 8, dst, stride);
371 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
372 const uint8_t *above,
373 const uint8_t *left) {
374 (void)left;
375 __m128i sum_above = dc_sum_16(above);
376 const __m128i eight = _mm_set1_epi16((uint16_t)8);
377 sum_above = _mm_add_epi16(sum_above, eight);
378 sum_above = _mm_srai_epi16(sum_above, 4);
379 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
380 sum_above = _mm_shufflelo_epi16(sum_above, 0);
381 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
382 dc_store_16xh(&row, 32, dst, stride);
385 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
386 const uint8_t *above,
387 const uint8_t *left) {
388 (void)left;
389 __m128i sum_above = dc_sum_16(above);
390 const __m128i eight = _mm_set1_epi16((uint16_t)8);
391 sum_above = _mm_add_epi16(sum_above, eight);
392 sum_above = _mm_srai_epi16(sum_above, 4);
393 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
394 sum_above = _mm_shufflelo_epi16(sum_above, 0);
395 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
396 dc_store_16xh(&row, 64, dst, stride);
399 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
400 const uint8_t *above,
401 const uint8_t *left) {
402 (void)left;
403 __m128i sum_above = dc_sum_32(above);
404 const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
405 sum_above = _mm_add_epi16(sum_above, sixteen);
406 sum_above = _mm_srai_epi16(sum_above, 5);
407 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
408 sum_above = _mm_shufflelo_epi16(sum_above, 0);
409 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
410 dc_store_32xh(&row, 16, dst, stride);
413 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
414 const uint8_t *above,
415 const uint8_t *left) {
416 (void)left;
417 __m128i sum_above = dc_sum_32(above);
418 const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
419 sum_above = _mm_add_epi16(sum_above, sixteen);
420 sum_above = _mm_srai_epi16(sum_above, 5);
421 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
422 sum_above = _mm_shufflelo_epi16(sum_above, 0);
423 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
424 dc_store_32xh(&row, 64, dst, stride);
427 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
428 const uint8_t *above,
429 const uint8_t *left) {
430 (void)left;
431 __m128i sum_above = dc_sum_64(above);
432 const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
433 sum_above = _mm_add_epi16(sum_above, thirtytwo);
434 sum_above = _mm_srai_epi16(sum_above, 6);
435 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
436 sum_above = _mm_shufflelo_epi16(sum_above, 0);
437 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
438 dc_store_64xh(&row, 64, dst, stride);
441 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
442 const uint8_t *above,
443 const uint8_t *left) {
444 (void)left;
445 __m128i sum_above = dc_sum_64(above);
446 const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
447 sum_above = _mm_add_epi16(sum_above, thirtytwo);
448 sum_above = _mm_srai_epi16(sum_above, 6);
449 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
450 sum_above = _mm_shufflelo_epi16(sum_above, 0);
451 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
452 dc_store_64xh(&row, 32, dst, stride);
455 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
456 const uint8_t *above,
457 const uint8_t *left) {
458 (void)left;
459 __m128i sum_above = dc_sum_64(above);
460 const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
461 sum_above = _mm_add_epi16(sum_above, thirtytwo);
462 sum_above = _mm_srai_epi16(sum_above, 6);
463 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
464 sum_above = _mm_shufflelo_epi16(sum_above, 0);
465 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
466 dc_store_64xh(&row, 16, dst, stride);
// -----------------------------------------------------------------------------
// DC_LEFT
472 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
473 const uint8_t *above, const uint8_t *left) {
474 (void)above;
475 __m128i sum_left = dc_sum_8(left);
476 const __m128i four = _mm_set1_epi16((uint16_t)4);
477 sum_left = _mm_add_epi16(sum_left, four);
478 sum_left = _mm_srai_epi16(sum_left, 3);
479 sum_left = _mm_shufflelo_epi16(sum_left, 0);
480 sum_left = _mm_packus_epi16(sum_left, sum_left);
482 const uint32_t pred = _mm_cvtsi128_si32(sum_left);
483 dc_store_4xh(pred, 8, dst, stride);
486 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
487 const uint8_t *above,
488 const uint8_t *left) {
489 (void)above;
490 __m128i sum_left = dc_sum_16(left);
491 const __m128i eight = _mm_set1_epi16((uint16_t)8);
492 sum_left = _mm_add_epi16(sum_left, eight);
493 sum_left = _mm_srai_epi16(sum_left, 4);
494 sum_left = _mm_shufflelo_epi16(sum_left, 0);
495 sum_left = _mm_packus_epi16(sum_left, sum_left);
497 const uint32_t pred = _mm_cvtsi128_si32(sum_left);
498 dc_store_4xh(pred, 16, dst, stride);
501 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
502 const uint8_t *above, const uint8_t *left) {
503 (void)above;
504 __m128i sum_left = dc_sum_4(left);
505 const __m128i two = _mm_set1_epi16((uint16_t)2);
506 sum_left = _mm_add_epi16(sum_left, two);
507 sum_left = _mm_srai_epi16(sum_left, 2);
508 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
509 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
510 dc_store_8xh(&row, 4, dst, stride);
513 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
514 const uint8_t *above,
515 const uint8_t *left) {
516 (void)above;
517 __m128i sum_left = dc_sum_16(left);
518 const __m128i eight = _mm_set1_epi16((uint16_t)8);
519 sum_left = _mm_add_epi16(sum_left, eight);
520 sum_left = _mm_srai_epi16(sum_left, 4);
521 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
522 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
523 dc_store_8xh(&row, 16, dst, stride);
526 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
527 const uint8_t *above,
528 const uint8_t *left) {
529 (void)above;
530 __m128i sum_left = dc_sum_32(left);
531 const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
532 sum_left = _mm_add_epi16(sum_left, sixteen);
533 sum_left = _mm_srai_epi16(sum_left, 5);
534 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
535 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
536 dc_store_8xh(&row, 32, dst, stride);
539 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
540 const uint8_t *above,
541 const uint8_t *left) {
542 (void)above;
543 __m128i sum_left = dc_sum_8(left);
544 const __m128i four = _mm_set1_epi16((uint16_t)4);
545 sum_left = _mm_add_epi16(sum_left, four);
546 sum_left = _mm_srai_epi16(sum_left, 3);
547 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
548 sum_left = _mm_shufflelo_epi16(sum_left, 0);
549 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
550 dc_store_16xh(&row, 8, dst, stride);
553 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
554 const uint8_t *above,
555 const uint8_t *left) {
556 (void)above;
557 __m128i sum_left = dc_sum_32(left);
558 const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
559 sum_left = _mm_add_epi16(sum_left, sixteen);
560 sum_left = _mm_srai_epi16(sum_left, 5);
561 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
562 sum_left = _mm_shufflelo_epi16(sum_left, 0);
563 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
564 dc_store_16xh(&row, 32, dst, stride);
567 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
568 const uint8_t *above,
569 const uint8_t *left) {
570 (void)above;
571 __m128i sum_left = dc_sum_64(left);
572 const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
573 sum_left = _mm_add_epi16(sum_left, thirtytwo);
574 sum_left = _mm_srai_epi16(sum_left, 6);
575 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
576 sum_left = _mm_shufflelo_epi16(sum_left, 0);
577 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
578 dc_store_16xh(&row, 64, dst, stride);
581 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
582 const uint8_t *above,
583 const uint8_t *left) {
584 (void)above;
585 __m128i sum_left = dc_sum_16(left);
586 const __m128i eight = _mm_set1_epi16((uint16_t)8);
587 sum_left = _mm_add_epi16(sum_left, eight);
588 sum_left = _mm_srai_epi16(sum_left, 4);
589 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
590 sum_left = _mm_shufflelo_epi16(sum_left, 0);
591 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
592 dc_store_32xh(&row, 16, dst, stride);
595 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
596 const uint8_t *above,
597 const uint8_t *left) {
598 (void)above;
599 __m128i sum_left = dc_sum_64(left);
600 const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
601 sum_left = _mm_add_epi16(sum_left, thirtytwo);
602 sum_left = _mm_srai_epi16(sum_left, 6);
603 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
604 sum_left = _mm_shufflelo_epi16(sum_left, 0);
605 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
606 dc_store_32xh(&row, 64, dst, stride);
609 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
610 const uint8_t *above,
611 const uint8_t *left) {
612 (void)above;
613 __m128i sum_left = dc_sum_64(left);
614 const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
615 sum_left = _mm_add_epi16(sum_left, thirtytwo);
616 sum_left = _mm_srai_epi16(sum_left, 6);
617 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
618 sum_left = _mm_shufflelo_epi16(sum_left, 0);
619 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
620 dc_store_64xh(&row, 64, dst, stride);
623 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
624 const uint8_t *above,
625 const uint8_t *left) {
626 (void)above;
627 __m128i sum_left = dc_sum_32(left);
628 const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
629 sum_left = _mm_add_epi16(sum_left, sixteen);
630 sum_left = _mm_srai_epi16(sum_left, 5);
631 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
632 sum_left = _mm_shufflelo_epi16(sum_left, 0);
633 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
634 dc_store_64xh(&row, 32, dst, stride);
637 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
638 const uint8_t *above,
639 const uint8_t *left) {
640 (void)above;
641 __m128i sum_left = dc_sum_16(left);
642 const __m128i eight = _mm_set1_epi16((uint16_t)8);
643 sum_left = _mm_add_epi16(sum_left, eight);
644 sum_left = _mm_srai_epi16(sum_left, 4);
645 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
646 sum_left = _mm_shufflelo_epi16(sum_left, 0);
647 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
648 dc_store_64xh(&row, 16, dst, stride);
// -----------------------------------------------------------------------------
// DC_128
// DC_128 4x8: neutral prediction, every sample is 128.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_store_4xh(0x80808080U, 8, dst, stride);
}
// DC_128 4x16: neutral prediction, every sample is 128.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_store_4xh(0x80808080U, 16, dst, stride);
}
// DC_128 8x4: neutral prediction, every sample is 128.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&fill, 4, dst, stride);
}
// DC_128 8x16: neutral prediction, every sample is 128.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&fill, 16, dst, stride);
}
// DC_128 8x32: neutral prediction, every sample is 128.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&fill, 32, dst, stride);
}
// DC_128 16x8: neutral prediction, every sample is 128.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&fill, 8, dst, stride);
}
// DC_128 16x32: neutral prediction, every sample is 128.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&fill, 32, dst, stride);
}
// DC_128 16x64: neutral prediction, every sample is 128.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&fill, 64, dst, stride);
}
// DC_128 32x16: neutral prediction, every sample is 128.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&fill, 16, dst, stride);
}
// DC_128 32x64: neutral prediction, every sample is 128.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&fill, 64, dst, stride);
}
// DC_128 64x64: neutral prediction, every sample is 128.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&fill, 64, dst, stride);
}
// DC_128 64x32: neutral prediction, every sample is 128.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&fill, 32, dst, stride);
}
// DC_128 64x16: neutral prediction, every sample is 128.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i fill = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&fill, 16, dst, stride);
}
// -----------------------------------------------------------------------------
// V_PRED
// V 4x8: replicate the 4 above samples into each of the 8 rows.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  uint32_t top4;
  // memcpy load: alignment- and aliasing-safe; compiles to one 32-bit mov.
  memcpy(&top4, above, sizeof(top4));
  dc_store_4xh(top4, 8, dst, stride);
}
// V 4x16: replicate the 4 above samples into each of the 16 rows.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  uint32_t top4;
  // memcpy load: alignment- and aliasing-safe; compiles to one 32-bit mov.
  memcpy(&top4, above, sizeof(top4));
  dc_store_4xh(top4, 16, dst, stride);
}
// V 8x4: replicate the 8 above samples into each of the 4 rows.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 4, dst, stride);
}
// V 8x16: replicate the 8 above samples into each of the 16 rows.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 16, dst, stride);
}
// V 8x32: replicate the 8 above samples into each of the 32 rows.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 32, dst, stride);
}
// V 16x8: replicate the 16 above samples into each of the 8 rows.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 8, dst, stride);
}
// V 16x32: replicate the 16 above samples into each of the 32 rows.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 32, dst, stride);
}
// V 16x64: replicate the 16 above samples into each of the 64 rows.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 64, dst, stride);
}
824 static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
825 const uint8_t *above, int height) {
826 const __m128i row0 = _mm_load_si128((__m128i const *)above);
827 const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
828 for (int i = 0; i < height; ++i) {
829 _mm_store_si128((__m128i *)dst, row0);
830 _mm_store_si128((__m128i *)(dst + 16), row1);
831 dst += stride;
// V 32x16: thin wrapper; V prediction ignores the left column.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 16);
}
// V 32x64: thin wrapper; V prediction ignores the left column.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 64);
}
847 static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
848 const uint8_t *above, int height) {
849 const __m128i row0 = _mm_load_si128((__m128i const *)above);
850 const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
851 const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
852 const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
853 for (int i = 0; i < height; ++i) {
854 _mm_store_si128((__m128i *)dst, row0);
855 _mm_store_si128((__m128i *)(dst + 16), row1);
856 _mm_store_si128((__m128i *)(dst + 32), row2);
857 _mm_store_si128((__m128i *)(dst + 48), row3);
858 dst += stride;
// V 64x64: thin wrapper; V prediction ignores the left column.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 64);
}
// V 64x32: thin wrapper; V prediction ignores the left column.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 32);
}
// V 64x16: thin wrapper; V prediction ignores the left column.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 16);
}
// -----------------------------------------------------------------------------
// H_PRED
// Store the low 32 bits of `row` to dst. memcpy makes the unaligned
// uint8_t-buffer store well-defined; it compiles to a single 32-bit mov.
static void h_pred_store_4(uint8_t *dst, __m128i row) {
  const uint32_t v = (uint32_t)_mm_cvtsi128_si32(row);
  memcpy(dst, &v, sizeof(v));
}

// H 4x8: row r is filled with left[r].
//
// The 8 left samples are duplicated bytewise (l0 l0 l1 l1 ... l7 l7) so each
// 16-bit word holds one sample twice; shufflelo then broadcasts one word
// across the low 4 bytes. The original spelled out both halves; the loop
// handles rows 0-3, shifts the high half down, then handles rows 4-7.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  for (int half = 0; half < 2; ++half) {
    h_pred_store_4(dst, _mm_shufflelo_epi16(left_col, 0x00));
    dst += stride;
    h_pred_store_4(dst, _mm_shufflelo_epi16(left_col, 0x55));
    dst += stride;
    h_pred_store_4(dst, _mm_shufflelo_epi16(left_col, 0xaa));
    dst += stride;
    h_pred_store_4(dst, _mm_shufflelo_epi16(left_col, 0xff));
    dst += stride;
    // Bring samples l4..l7 into the low lane for the second pass.
    left_col = _mm_unpackhi_epi64(left_col, left_col);
  }
}
// Store 4 rows of 4 pixels each; the low four 16-bit lanes of |px| each hold
// one left pixel duplicated into both bytes, so a shufflelo broadcast gives a
// full 4-byte row.
static inline void h_pred_4x16_store4(__m128i px, uint8_t *dst,
                                      ptrdiff_t stride) {
  *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(px, 0));
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(px, 0x55));
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(px, 0xaa));
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_shufflelo_epi16(px, 0xff));
}

void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction: row r is left[r] replicated across 4 pixels.
  // |left| must be 16-byte aligned (aligned load below).
  (void)above;
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
  // Duplicate each left byte into both bytes of a 16-bit lane.
  const __m128i lo = _mm_unpacklo_epi8(left_col, left_col);
  const __m128i hi = _mm_unpackhi_epi8(left_col, left_col);

  h_pred_4x16_store4(lo, dst, stride);
  dst += stride << 2;
  h_pred_4x16_store4(_mm_unpackhi_epi64(lo, lo), dst, stride);
  dst += stride << 2;
  h_pred_4x16_store4(hi, dst, stride);
  dst += stride << 2;
  h_pred_4x16_store4(_mm_unpackhi_epi64(hi, hi), dst, stride);
}
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction: row r is left[r] replicated across 8 pixels.
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  // Each left byte duplicated into both bytes of a 16-bit lane; shufflelo
  // then broadcasts one pixel into the low 8 bytes.
  const __m128i dup = _mm_unpacklo_epi8(left_col, left_col);
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0x55));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0xaa));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(dup, 0xff));
}
993 static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
994 const uint8_t *above, const uint8_t *left,
995 int count) {
996 (void)above;
997 for (int i = 0; i < count; ++i) {
998 const __m128i left_col = _mm_load_si128((__m128i const *)left);
999 __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
1000 __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
1002 __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
1003 __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1004 __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1005 __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1006 _mm_storel_epi64((__m128i *)dst, row0);
1007 dst += stride;
1008 _mm_storel_epi64((__m128i *)dst, row1);
1009 dst += stride;
1010 _mm_storel_epi64((__m128i *)dst, row2);
1011 dst += stride;
1012 _mm_storel_epi64((__m128i *)dst, row3);
1013 dst += stride;
1015 left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
1016 row0 = _mm_shufflelo_epi16(left_col_low, 0);
1017 row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1018 row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1019 row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1020 _mm_storel_epi64((__m128i *)dst, row0);
1021 dst += stride;
1022 _mm_storel_epi64((__m128i *)dst, row1);
1023 dst += stride;
1024 _mm_storel_epi64((__m128i *)dst, row2);
1025 dst += stride;
1026 _mm_storel_epi64((__m128i *)dst, row3);
1027 dst += stride;
1029 row0 = _mm_shufflelo_epi16(left_col_high, 0);
1030 row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1031 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1032 row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1033 _mm_storel_epi64((__m128i *)dst, row0);
1034 dst += stride;
1035 _mm_storel_epi64((__m128i *)dst, row1);
1036 dst += stride;
1037 _mm_storel_epi64((__m128i *)dst, row2);
1038 dst += stride;
1039 _mm_storel_epi64((__m128i *)dst, row3);
1040 dst += stride;
1042 left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
1043 row0 = _mm_shufflelo_epi16(left_col_high, 0);
1044 row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1045 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1046 row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1047 _mm_storel_epi64((__m128i *)dst, row0);
1048 dst += stride;
1049 _mm_storel_epi64((__m128i *)dst, row1);
1050 dst += stride;
1051 _mm_storel_epi64((__m128i *)dst, row2);
1052 dst += stride;
1053 _mm_storel_epi64((__m128i *)dst, row3);
1054 dst += stride;
1055 left += 16;
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // 8x16: a single 16-row pass over the left column.
  h_predictor_8x16xc(dst, stride, above, left, 1);
}
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // 8x32: two 16-row passes over the left column.
  h_predictor_8x16xc(dst, stride, above, left, 2);
}
1069 static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
1070 ptrdiff_t stride) {
1071 int i;
1072 for (i = 0; i < h; ++i) {
1073 _mm_store_si128((__m128i *)dst, row[i]);
1074 dst += stride;
1078 static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
1079 const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
1080 const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
1081 const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
1082 const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
1084 row[0] = _mm_unpacklo_epi64(u0, u0);
1085 row[1] = _mm_unpacklo_epi64(u1, u1);
1086 row[2] = _mm_unpacklo_epi64(u2, u2);
1087 row[3] = _mm_unpacklo_epi64(u3, u3);
1090 static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
1091 const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
1092 const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
1093 const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
1094 const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
1096 row[0] = _mm_unpackhi_epi64(u0, u0);
1097 row[1] = _mm_unpackhi_epi64(u1, u1);
1098 row[2] = _mm_unpackhi_epi64(u2, u2);
1099 row[3] = _mm_unpackhi_epi64(u3, u3);
1102 // Process 16x8, first 4 rows
1103 // Use first 8 bytes of left register: xxxxxxxx33221100
1104 static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
1105 ptrdiff_t stride) {
1106 __m128i row[4];
1107 repeat_low_4pixels(left, row);
1108 h_pred_store_16xh(row, 4, dst, stride);
1111 // Process 16x8, second 4 rows
1112 // Use second 8 bytes of left register: 77665544xxxxxxxx
1113 static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
1114 ptrdiff_t stride) {
1115 __m128i row[4];
1116 repeat_high_4pixels(left, row);
1117 h_pred_store_16xh(row, 4, dst, stride);
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction, 16 wide x 8 tall: duplicate the 8 left pixels
  // into 16-bit lanes, then render two groups of 4 rows.
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
  h_prediction_16x8_2(&left_col_8p, dst + (stride << 2), stride);
}
1130 static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
1131 const uint8_t *left, int count) {
1132 int i = 0;
1133 do {
1134 const __m128i left_col = _mm_load_si128((const __m128i *)left);
1135 const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
1136 h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
1137 dst += stride << 2;
1138 h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
1139 dst += stride << 2;
1141 const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
1142 h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
1143 dst += stride << 2;
1144 h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
1145 dst += stride << 2;
1147 left += 16;
1148 i++;
1149 } while (i < count);
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // 16x32: two 16-row passes over the left column.
  (void)above;
  h_predictor_16xh(dst, stride, left, 2);
}
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // 16x64: four 16-row passes over the left column.
  (void)above;
  h_predictor_16xh(dst, stride, left, 4);
}
1164 static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
1165 ptrdiff_t stride) {
1166 int i;
1167 for (i = 0; i < h; ++i) {
1168 _mm_store_si128((__m128i *)dst, row[i]);
1169 _mm_store_si128((__m128i *)(dst + 16), row[i]);
1170 dst += stride;
1174 // Process 32x8, first 4 rows
1175 // Use first 8 bytes of left register: xxxxxxxx33221100
1176 static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
1177 ptrdiff_t stride) {
1178 __m128i row[4];
1179 repeat_low_4pixels(left, row);
1180 h_pred_store_32xh(row, 4, dst, stride);
1183 // Process 32x8, second 4 rows
1184 // Use second 8 bytes of left register: 77665544xxxxxxxx
1185 static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
1186 ptrdiff_t stride) {
1187 __m128i row[4];
1188 repeat_high_4pixels(left, row);
1189 h_pred_store_32xh(row, 4, dst, stride);
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction, 32 wide x 16 tall: 16 left pixels, rendered as
  // four groups of 4 rows.
  (void)above;
  const __m128i left_col = _mm_load_si128((const __m128i *)left);
  const __m128i lo = _mm_unpacklo_epi8(left_col, left_col);
  const __m128i hi = _mm_unpackhi_epi8(left_col, left_col);
  h_prediction_32x8_1(&lo, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&lo, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_1(&hi, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&hi, dst, stride);
}
1211 static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
1212 const uint8_t *left, int height) {
1213 int i = height >> 2;
1214 do {
1215 __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
1216 left4 = _mm_unpacklo_epi8(left4, left4);
1217 left4 = _mm_unpacklo_epi8(left4, left4);
1218 const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
1219 const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
1220 _mm_store_si128((__m128i *)dst, r0);
1221 _mm_store_si128((__m128i *)(dst + 16), r0);
1222 _mm_store_si128((__m128i *)(dst + stride), r1);
1223 _mm_store_si128((__m128i *)(dst + stride + 16), r1);
1224 const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
1225 const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
1226 _mm_store_si128((__m128i *)(dst + stride * 2), r2);
1227 _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
1228 _mm_store_si128((__m128i *)(dst + stride * 3), r3);
1229 _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
1230 left += 4;
1231 dst += stride * 4;
1232 } while (--i);
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // 32x64: 64 rows, each replicating one left pixel.
  (void)above;
  h_predictor_32xh(dst, stride, left, 64);
}
1241 static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
1242 const uint8_t *left, int height) {
1243 int i = height >> 2;
1244 do {
1245 __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
1246 left4 = _mm_unpacklo_epi8(left4, left4);
1247 left4 = _mm_unpacklo_epi8(left4, left4);
1248 const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
1249 const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
1250 _mm_store_si128((__m128i *)dst, r0);
1251 _mm_store_si128((__m128i *)(dst + 16), r0);
1252 _mm_store_si128((__m128i *)(dst + 32), r0);
1253 _mm_store_si128((__m128i *)(dst + 48), r0);
1254 _mm_store_si128((__m128i *)(dst + stride), r1);
1255 _mm_store_si128((__m128i *)(dst + stride + 16), r1);
1256 _mm_store_si128((__m128i *)(dst + stride + 32), r1);
1257 _mm_store_si128((__m128i *)(dst + stride + 48), r1);
1258 const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
1259 const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
1260 _mm_store_si128((__m128i *)(dst + stride * 2), r2);
1261 _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
1262 _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
1263 _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
1264 _mm_store_si128((__m128i *)(dst + stride * 3), r3);
1265 _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
1266 _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
1267 _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
1268 left += 4;
1269 dst += stride * 4;
1270 } while (--i);
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // 64x64: 64 rows, each replicating one left pixel.
  (void)above;
  h_predictor_64xh(dst, stride, left, 64);
}
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // 64x32: 32 rows, each replicating one left pixel.
  (void)above;
  h_predictor_64xh(dst, stride, left, 32);
}
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // 64x16: 16 rows, each replicating one left pixel.
  (void)above;
  h_predictor_64xh(dst, stride, left, 16);
}