/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "config/aom_dsp_rtcd.h"

static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; i += 2) {
    *(uint32_t *)dst = dc;
    dst += stride;
    *(uint32_t *)dst = dc;
    dst += stride;
  }
}

static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  int i;
  for (i = 0; i < height; ++i) {
    _mm_storel_epi64((__m128i *)dst, *row);
    dst += stride;
  }
}

static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  int i;
  for (i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, *row);
    dst += stride;
  }
}

static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  int i;
  for (i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, *row);
    _mm_store_si128((__m128i *)(dst + 16), *row);
    dst += stride;
  }
}

static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, *row);
    _mm_store_si128((__m128i *)(dst + 16), *row);
    _mm_store_si128((__m128i *)(dst + 32), *row);
    _mm_store_si128((__m128i *)(dst + 48), *row);
    dst += stride;
  }
}

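// The dc_sum_* helpers compute the horizontal sum of 4/8/16/32/64 reference
// bytes. _mm_sad_epu8 against a zero vector adds 8 absolute byte values into
// each 64-bit lane, so one SAD (plus an add of the high lane for the wider
// variants) yields the sum of the border pixels.
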
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  x = _mm_unpacklo_epi8(x, zero);
  return _mm_sad_epu8(x, zero);
}

static INLINE __m128i dc_sum_8(const uint8_t *ref) {
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  return _mm_sad_epu8(x, zero);
}

static INLINE __m128i dc_sum_16(const uint8_t *ref) {
  __m128i x = _mm_load_si128((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  x = _mm_sad_epu8(x, zero);
  const __m128i high = _mm_unpackhi_epi64(x, x);
  return _mm_add_epi16(x, high);
}

static INLINE __m128i dc_sum_32(const uint8_t *ref) {
  __m128i x0 = _mm_load_si128((__m128i const *)ref);
  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
  const __m128i zero = _mm_setzero_si128();
  x0 = _mm_sad_epu8(x0, zero);
  x1 = _mm_sad_epu8(x1, zero);
  x0 = _mm_add_epi16(x0, x1);
  const __m128i high = _mm_unpackhi_epi64(x0, x0);
  return _mm_add_epi16(x0, high);
}

static INLINE __m128i dc_sum_64(const uint8_t *ref) {
  __m128i x0 = _mm_load_si128((__m128i const *)ref);
  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
  const __m128i zero = _mm_setzero_si128();
  x0 = _mm_sad_epu8(x0, zero);
  x1 = _mm_sad_epu8(x1, zero);
  x2 = _mm_sad_epu8(x2, zero);
  x3 = _mm_sad_epu8(x3, zero);
  x0 = _mm_add_epi16(x0, x1);
  x2 = _mm_add_epi16(x2, x3);
  x0 = _mm_add_epi16(x0, x2);
  const __m128i high = _mm_unpackhi_epi64(x0, x0);
  return _mm_add_epi16(x0, high);
}

#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

static INLINE int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  const int interm = num >> shift1;
  return interm * multiplier >> DC_SHIFT2;
}
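
// For rectangular blocks the DC sum is divided by (width + height), which is
// not a power of two. The division is done as a fixed-point multiply:
// divide_using_multiply_shift(sum, shift1, mult) computes
//   ((sum >> shift1) * mult) >> DC_SHIFT2.
// DC_MULTIPLIER_1X2 (0x5556 ~= 65536 / 3) covers w + h = 4*3, 8*3, 16*3, ...
// and DC_MULTIPLIER_1X4 (0x3334 ~= 65536 / 5) covers w + h = 4*5, 8*5, ...
// e.g. for a 4x8 block: (sum + 6) / 12 ~= (((sum + 6) >> 2) * 0x5556) >> 16,
// with the callers adding (w + h) / 2 beforehand for rounding.
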
// -----------------------------------------------------------------------------
// DC_PRED

void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_8(left);
  __m128i sum_above = dc_sum_4(above);
  sum_above = _mm_add_epi16(sum_left, sum_above);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 6;
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);

  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  const uint32_t pred = _mm_cvtsi128_si32(row);
  dc_store_4xh(pred, 8, dst, stride);
}

void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_16(left);
  __m128i sum_above = dc_sum_4(above);
  sum_above = _mm_add_epi16(sum_left, sum_above);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 10;
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);

  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  const uint32_t pred = _mm_cvtsi128_si32(row);
  dc_store_4xh(pred, 16, dst, stride);
}

void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_4(left);
  __m128i sum_above = dc_sum_8(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 6;
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);

  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_16(left);
  __m128i sum_above = dc_sum_8(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 12;
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_32(left);
  __m128i sum_above = dc_sum_8(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 20;
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_8xh(&row, 32, dst, stride);
}

void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_4(left);
  __m128i sum_above = dc_sum_16(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 10;
  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_16xh(&row, 4, dst, stride);
}

void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_8(left);
  __m128i sum_above = dc_sum_16(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 12;
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_32(left);
  __m128i sum_above = dc_sum_16(above);
  sum_above = _mm_add_epi16(sum_left, sum_above);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 24;
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_64(left);
  __m128i sum_above = dc_sum_16(above);
  sum_above = _mm_add_epi16(sum_left, sum_above);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 40;
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_16xh(&row, 64, dst, stride);
}

void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_32(above);
  const __m128i sum_left = dc_sum_8(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 20;
  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_32xh(&row, 8, dst, stride);
}

void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_32(above);
  const __m128i sum_left = dc_sum_16(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 24;
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_32(above);
  const __m128i sum_left = dc_sum_64(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 48;
  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_64(above);
  const __m128i sum_left = dc_sum_64(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 64;
  sum /= 128;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_64(above);
  const __m128i sum_left = dc_sum_32(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 48;
  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_64(above);
  const __m128i sum_left = dc_sum_16(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 40;
  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// DC_TOP
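// The DC_TOP predictors average only the above row: the sum of the top
// neighbors is rounded with half the width, shifted down by log2(width),
// and the resulting byte is broadcast across the whole block.
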
void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_4(above);
  const __m128i two = _mm_set1_epi16((int16_t)2);
  sum_above = _mm_add_epi16(sum_above, two);
  sum_above = _mm_srai_epi16(sum_above, 2);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  sum_above = _mm_packus_epi16(sum_above, sum_above);

  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
  dc_store_4xh(pred, 8, dst, stride);
}

void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_4(above);
  const __m128i two = _mm_set1_epi16((int16_t)2);
  sum_above = _mm_add_epi16(sum_above, two);
  sum_above = _mm_srai_epi16(sum_above, 2);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  sum_above = _mm_packus_epi16(sum_above, sum_above);

  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
  dc_store_4xh(pred, 16, dst, stride);
}

void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_8(above);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_above = _mm_add_epi16(sum_above, four);
  sum_above = _mm_srai_epi16(sum_above, 3);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_8(above);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_above = _mm_add_epi16(sum_above, four);
  sum_above = _mm_srai_epi16(sum_above, 3);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_8(above);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_above = _mm_add_epi16(sum_above, four);
  sum_above = _mm_srai_epi16(sum_above, 3);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  dc_store_8xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_16(above);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_above = _mm_add_epi16(sum_above, eight);
  sum_above = _mm_srai_epi16(sum_above, 4);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_16xh(&row, 4, dst, stride);
}

void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_16(above);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_above = _mm_add_epi16(sum_above, eight);
  sum_above = _mm_srai_epi16(sum_above, 4);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_16(above);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_above = _mm_add_epi16(sum_above, eight);
  sum_above = _mm_srai_epi16(sum_above, 4);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_16(above);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_above = _mm_add_epi16(sum_above, eight);
  sum_above = _mm_srai_epi16(sum_above, 4);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_16xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_32(above);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_above = _mm_add_epi16(sum_above, sixteen);
  sum_above = _mm_srai_epi16(sum_above, 5);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_32xh(&row, 8, dst, stride);
}

void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_32(above);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_above = _mm_add_epi16(sum_above, sixteen);
  sum_above = _mm_srai_epi16(sum_above, 5);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_32(above);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_above = _mm_add_epi16(sum_above, sixteen);
  sum_above = _mm_srai_epi16(sum_above, 5);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_64(above);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
  sum_above = _mm_srai_epi16(sum_above, 6);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_64(above);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
  sum_above = _mm_srai_epi16(sum_above, 6);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_64(above);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
  sum_above = _mm_srai_epi16(sum_above, 6);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// DC_LEFT
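// DC_LEFT mirrors DC_TOP: only the left column is averaged (rounded with half
// the height, shifted down by log2(height)) and broadcast across the block.
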
void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_8(left);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_left = _mm_add_epi16(sum_left, four);
  sum_left = _mm_srai_epi16(sum_left, 3);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  sum_left = _mm_packus_epi16(sum_left, sum_left);

  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
  dc_store_4xh(pred, 8, dst, stride);
}

void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  sum_left = _mm_packus_epi16(sum_left, sum_left);

  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
  dc_store_4xh(pred, 16, dst, stride);
}

void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_4(left);
  const __m128i two = _mm_set1_epi16((uint16_t)2);
  sum_left = _mm_add_epi16(sum_left, two);
  sum_left = _mm_srai_epi16(sum_left, 2);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_32(left);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_left = _mm_add_epi16(sum_left, sixteen);
  sum_left = _mm_srai_epi16(sum_left, 5);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  dc_store_8xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_4(left);
  const __m128i two = _mm_set1_epi16((uint16_t)2);
  sum_left = _mm_add_epi16(sum_left, two);
  sum_left = _mm_srai_epi16(sum_left, 2);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_16xh(&row, 4, dst, stride);
}

void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_8(left);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_left = _mm_add_epi16(sum_left, four);
  sum_left = _mm_srai_epi16(sum_left, 3);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_32(left);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_left = _mm_add_epi16(sum_left, sixteen);
  sum_left = _mm_srai_epi16(sum_left, 5);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_64(left);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  sum_left = _mm_srai_epi16(sum_left, 6);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_16xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_8(left);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_left = _mm_add_epi16(sum_left, four);
  sum_left = _mm_srai_epi16(sum_left, 3);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_32xh(&row, 8, dst, stride);
}

void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_64(left);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  sum_left = _mm_srai_epi16(sum_left, 6);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_64(left);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  sum_left = _mm_srai_epi16(sum_left, 6);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_32(left);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_left = _mm_add_epi16(sum_left, sixteen);
  sum_left = _mm_srai_epi16(sum_left, 5);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// DC_128
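// DC_128 ignores both borders and fills the block with the mid-grey value
// 128 (0x80 in every byte).
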
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const uint32_t pred = 0x80808080;
  dc_store_4xh(pred, 8, dst, stride);
}

void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const uint32_t pred = 0x80808080;
  dc_store_4xh(pred, 16, dst, stride);
}

void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 4, dst, stride);
}

void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 8, dst, stride);
}

void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// V_PRED
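// V_PRED copies the row of above neighbors into every row of the block.
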
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4xh(pred, 8, dst, stride);
}

void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4xh(pred, 16, dst, stride);
}

void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 32, dst, stride);
}

void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 4, dst, stride);
}

void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 64, dst, stride);
}

static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i row0 = _mm_load_si128((__m128i const *)above);
  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
  for (int i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, row0);
    _mm_store_si128((__m128i *)(dst + 16), row1);
    dst += stride;
  }
}

void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 8);
}

void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 16);
}

void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_32xh(dst, stride, above, 64);
}

static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i row0 = _mm_load_si128((__m128i const *)above);
  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
  for (int i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, row0);
    _mm_store_si128((__m128i *)(dst + 16), row1);
    _mm_store_si128((__m128i *)(dst + 32), row2);
    _mm_store_si128((__m128i *)(dst + 48), row3);
    dst += stride;
  }
}

void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 64);
}

void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 32);
}

void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_predictor_64xh(dst, stride, above, 16);
}

// -----------------------------------------------------------------------------
// H_PRED
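// H_PRED fills each row of the block with its left neighbor. The left bytes
// are duplicated into 16-bit lanes with _mm_unpacklo_epi8 and then broadcast
// with the shuffle intrinsics, four rows at a time.
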
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;
  left_col = _mm_unpackhi_epi64(left_col, left_col);
  row0 = _mm_shufflelo_epi16(left_col, 0);
  row1 = _mm_shufflelo_epi16(left_col, 0x55);
  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
}

void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);

  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;

  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
  row0 = _mm_shufflelo_epi16(left_col_low, 0);
  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;

  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;

  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
}

void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}

static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int i = 0; i < count; ++i) {
    const __m128i left_col = _mm_load_si128((__m128i const *)left);
    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);

    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
    _mm_storel_epi64((__m128i *)dst, row0);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row1);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row2);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row3);
    dst += stride;

    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
    row0 = _mm_shufflelo_epi16(left_col_low, 0);
    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
    _mm_storel_epi64((__m128i *)dst, row0);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row1);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row2);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row3);
    dst += stride;

    row0 = _mm_shufflelo_epi16(left_col_high, 0);
    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
    _mm_storel_epi64((__m128i *)dst, row0);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row1);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row2);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row3);
    dst += stride;

    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
    row0 = _mm_shufflelo_epi16(left_col_high, 0);
    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
    _mm_storel_epi64((__m128i *)dst, row0);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row1);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row2);
    dst += stride;
    _mm_storel_epi64((__m128i *)dst, row3);
    dst += stride;
    left += 16;
  }
}

void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 1);
}

void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, 2);
}

static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int i;
  for (i = 0; i < h; ++i) {
    _mm_store_si128((__m128i *)dst, row[i]);
    dst += stride;
  }
}

static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);

  row[0] = _mm_unpacklo_epi64(u0, u0);
  row[1] = _mm_unpacklo_epi64(u1, u1);
  row[2] = _mm_unpacklo_epi64(u2, u2);
  row[3] = _mm_unpacklo_epi64(u3, u3);
}

static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);

  row[0] = _mm_unpackhi_epi64(u0, u0);
  row[1] = _mm_unpackhi_epi64(u1, u1);
  row[2] = _mm_unpackhi_epi64(u2, u2);
  row[3] = _mm_unpackhi_epi64(u3, u3);
}

// Process 16x8, first 4 rows
// Use first 8 bytes of left register: xxxxxxxx33221100
static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_low_4pixels(left, row);
  h_pred_store_16xh(row, 4, dst, stride);
}

// Process 16x8, second 4 rows
// Use second 8 bytes of left register: 77665544xxxxxxxx
static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_high_4pixels(left, row);
  h_pred_store_16xh(row, 4, dst, stride);
}

void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
}

void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_16x8_2(&left_col_8p, dst, stride);
}

static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  int i = 0;
  do {
    const __m128i left_col = _mm_load_si128((const __m128i *)left);
    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
    dst += stride << 2;
    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
    dst += stride << 2;

    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
    dst += stride << 2;
    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
    dst += stride << 2;

    left += 16;
    i++;
  } while (i < count);
}

void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_16xh(dst, stride, left, 2);
}

void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_16xh(dst, stride, left, 4);
}

static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int i;
  for (i = 0; i < h; ++i) {
    _mm_store_si128((__m128i *)dst, row[i]);
    _mm_store_si128((__m128i *)(dst + 16), row[i]);
    dst += stride;
  }
}

// Process 32x8, first 4 rows
// Use first 8 bytes of left register: xxxxxxxx33221100
static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_low_4pixels(left, row);
  h_pred_store_32xh(row, 4, dst, stride);
}

// Process 32x8, second 4 rows
// Use second 8 bytes of left register: 77665544xxxxxxxx
static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_high_4pixels(left, row);
  h_pred_store_32xh(row, 4, dst, stride);
}

void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  __m128i left_col, left_col_8p;
  (void)above;

  left_col = _mm_load_si128((const __m128i *)left);

  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
}

void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  __m128i left_col, left_col_8p;
  (void)above;

  left_col = _mm_load_si128((const __m128i *)left);

  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
  dst += stride << 2;

  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
}

static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}

void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_32xh(dst, stride, left, 64);
}

static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}

void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_64xh(dst, stride, left, 64);
}

void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_64xh(dst, stride, left, 32);
}

void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  h_predictor_64xh(dst, stride, left, 16);
}