Add sse2/avx2 version of aom_dc_predictor_wxh()
[aom.git] / aom_dsp / x86 / intrapred_sse2.c
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "./aom_dsp_rtcd.h"

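// Store helpers: each dc_store_WxH() call replicates a single prediction
// value (broadcast across *row, or packed into a uint32_t for the 4-wide
// case) into every row of a W x H destination block.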
static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
  int i;
  for (i = 0; i < 4; ++i) {
    *(uint32_t *)dst = dc;
    dst += stride;
    *(uint32_t *)dst = dc;
    dst += stride;
  }
}

static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  int i;
  for (i = 0; i < height; ++i) {
    _mm_storel_epi64((__m128i *)dst, *row);
    dst += stride;
  }
}

static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  int i;
  for (i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, *row);
    dst += stride;
  }
}

static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  int i;
  for (i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, *row);
    _mm_store_si128((__m128i *)(dst + 16), *row);
    dst += stride;
  }
}

static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm_store_si128((__m128i *)dst, *row);
    _mm_store_si128((__m128i *)(dst + 16), *row);
    _mm_store_si128((__m128i *)(dst + 32), *row);
    _mm_store_si128((__m128i *)(dst + 48), *row);
    dst += stride;
  }
}

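// Sum helpers: _mm_sad_epu8() against an all-zero register adds the 8-bit
// pixels in groups of eight, leaving each group's total in the low 16 bits of
// a 64-bit lane. dc_sum_4() first zero-extends its load so that only the four
// pixels of interest land in the low lane; the wider variants add the SAD
// results of the individual 16-byte loads and then fold the high lane into
// the low one.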
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  x = _mm_unpacklo_epi8(x, zero);
  return _mm_sad_epu8(x, zero);
}

static INLINE __m128i dc_sum_8(const uint8_t *ref) {
  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  return _mm_sad_epu8(x, zero);
}

static INLINE __m128i dc_sum_16(const uint8_t *ref) {
  __m128i x = _mm_load_si128((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  x = _mm_sad_epu8(x, zero);
  const __m128i high = _mm_unpackhi_epi64(x, x);
  return _mm_add_epi16(x, high);
}

static INLINE __m128i dc_sum_32(const uint8_t *ref) {
  __m128i x0 = _mm_load_si128((__m128i const *)ref);
  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
  const __m128i zero = _mm_setzero_si128();
  x0 = _mm_sad_epu8(x0, zero);
  x1 = _mm_sad_epu8(x1, zero);
  x0 = _mm_add_epi16(x0, x1);
  const __m128i high = _mm_unpackhi_epi64(x0, x0);
  return _mm_add_epi16(x0, high);
}

static INLINE __m128i dc_sum_64(const uint8_t *ref) {
  __m128i x0 = _mm_load_si128((__m128i const *)ref);
  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
  const __m128i zero = _mm_setzero_si128();
  x0 = _mm_sad_epu8(x0, zero);
  x1 = _mm_sad_epu8(x1, zero);
  x2 = _mm_sad_epu8(x2, zero);
  x3 = _mm_sad_epu8(x3, zero);
  x0 = _mm_add_epi16(x0, x1);
  x2 = _mm_add_epi16(x2, x3);
  x0 = _mm_add_epi16(x0, x2);
  const __m128i high = _mm_unpackhi_epi64(x0, x0);
  return _mm_add_epi16(x0, high);
}

// -----------------------------------------------------------------------------
// DC_PRED

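// Rectangular DC: average all width + height edge pixels with rounding,
//   dc = (sum_above + sum_left + (w + h) / 2) / (w + h),
// then broadcast that byte across the block. For example, for 4x8 the code
// below computes (sum + 6) / 12, i.e. the rounded mean of the 12 edge pixels.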
void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_8(left);
  __m128i sum_above = dc_sum_4(above);
  sum_above = _mm_add_epi16(sum_left, sum_above);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 6;
  sum /= 12;

  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  const uint32_t pred = _mm_cvtsi128_si32(row);
  dc_store_4x8(pred, dst, stride);
}

void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_4(left);
  __m128i sum_above = dc_sum_8(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 6;
  sum /= 12;

  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_16(left);
  __m128i sum_above = dc_sum_8(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 12;
  sum /= 24;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_8(left);
  __m128i sum_above = dc_sum_16(above);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 12;
  sum /= 24;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m128i sum_left = dc_sum_32(left);
  __m128i sum_above = dc_sum_16(above);
  sum_above = _mm_add_epi16(sum_left, sum_above);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 24;
  sum /= 48;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_32(above);
  const __m128i sum_left = dc_sum_16(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 24;
  sum /= 48;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_32(above);
  const __m128i sum_left = dc_sum_64(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 48;
  sum /= 96;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_64(above);
  const __m128i sum_left = dc_sum_64(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 64;
  sum /= 128;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_64(above);
  const __m128i sum_left = dc_sum_32(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 48;
  sum /= 96;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  __m128i sum_above = dc_sum_64(above);
  const __m128i sum_left = dc_sum_16(left);
  sum_above = _mm_add_epi16(sum_above, sum_left);

  uint32_t sum = _mm_cvtsi128_si32(sum_above);
  sum += 40;
  sum /= 80;
  const __m128i row = _mm_set1_epi8((uint8_t)sum);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// DC_TOP

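// DC_TOP: average only the pixels above the block. Because the width is a
// power of two, the rounded average is (sum + width / 2) >> log2(width); the
// low byte of the 16-bit result is then duplicated and broadcast to fill a
// full register (or packed into a uint32_t for the 4-wide case) before
// storing.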
void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_4(above);
  const __m128i two = _mm_set1_epi16((int16_t)2);
  sum_above = _mm_add_epi16(sum_above, two);
  sum_above = _mm_srai_epi16(sum_above, 2);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  sum_above = _mm_packus_epi16(sum_above, sum_above);

  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
  dc_store_4x8(pred, dst, stride);
}

void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_8(above);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_above = _mm_add_epi16(sum_above, four);
  sum_above = _mm_srai_epi16(sum_above, 3);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_8(above);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_above = _mm_add_epi16(sum_above, four);
  sum_above = _mm_srai_epi16(sum_above, 3);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_16(above);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_above = _mm_add_epi16(sum_above, eight);
  sum_above = _mm_srai_epi16(sum_above, 4);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_16(above);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_above = _mm_add_epi16(sum_above, eight);
  sum_above = _mm_srai_epi16(sum_above, 4);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_32(above);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_above = _mm_add_epi16(sum_above, sixteen);
  sum_above = _mm_srai_epi16(sum_above, 5);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_32(above);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_above = _mm_add_epi16(sum_above, sixteen);
  sum_above = _mm_srai_epi16(sum_above, 5);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_64(above);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
  sum_above = _mm_srai_epi16(sum_above, 6);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_64(above);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
  sum_above = _mm_srai_epi16(sum_above, 6);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  __m128i sum_above = dc_sum_64(above);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_above = _mm_add_epi16(sum_above, thirtytwo);
  sum_above = _mm_srai_epi16(sum_above, 6);
  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
  sum_above = _mm_shufflelo_epi16(sum_above, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// DC_LEFT

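// DC_LEFT: same as DC_TOP, but the average is taken over the left-column
// pixels and the shift is log2(height).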
void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_8(left);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_left = _mm_add_epi16(sum_left, four);
  sum_left = _mm_srai_epi16(sum_left, 3);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  sum_left = _mm_packus_epi16(sum_left, sum_left);

  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
  dc_store_4x8(pred, dst, stride);
}

void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_4(left);
  const __m128i two = _mm_set1_epi16((uint16_t)2);
  sum_left = _mm_add_epi16(sum_left, two);
  sum_left = _mm_srai_epi16(sum_left, 2);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_8(left);
  const __m128i four = _mm_set1_epi16((uint16_t)4);
  sum_left = _mm_add_epi16(sum_left, four);
  sum_left = _mm_srai_epi16(sum_left, 3);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_32(left);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_left = _mm_add_epi16(sum_left, sixteen);
  sum_left = _mm_srai_epi16(sum_left, 5);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_64(left);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  sum_left = _mm_srai_epi16(sum_left, 6);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_64(left);
  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
  sum_left = _mm_add_epi16(sum_left, thirtytwo);
  sum_left = _mm_srai_epi16(sum_left, 6);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_32(left);
  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
  sum_left = _mm_add_epi16(sum_left, sixteen);
  sum_left = _mm_srai_epi16(sum_left, 5);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  __m128i sum_left = dc_sum_16(left);
  const __m128i eight = _mm_set1_epi16((uint16_t)8);
  sum_left = _mm_add_epi16(sum_left, eight);
  sum_left = _mm_srai_epi16(sum_left, 4);
  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
  sum_left = _mm_shufflelo_epi16(sum_left, 0);
  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// DC_128

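// DC_128: neither edge is available, so every pixel is predicted with the
// mid-grey value 128 (1 << (8 - 1) for 8-bit content).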
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const uint32_t pred = 0x80808080;
  dc_store_4x8(pred, dst, stride);
}

void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_32xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m128i row = _mm_set1_epi8((uint8_t)128);
  dc_store_64xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// V_PRED

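// V_PRED: every row of the prediction is a copy of the row of pixels
// immediately above the block.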
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32_t pred = *(uint32_t *)above;
  (void)left;
  dc_store_4x8(pred, dst, stride);
}

void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 4, dst, stride);
}

void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
  (void)left;
  dc_store_8xh(&row, 16, dst, stride);
}

void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 8, dst, stride);
}

void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row = _mm_load_si128((__m128i const *)above);
  (void)left;
  dc_store_16xh(&row, 32, dst, stride);
}

void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m128i row0 = _mm_load_si128((__m128i const *)above);
  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
  (void)left;
  int i;
  for (i = 0; i < 16; ++i) {
    _mm_store_si128((__m128i *)dst, row0);
    _mm_store_si128((__m128i *)(dst + 16), row1);
    dst += stride;
  }
}

// -----------------------------------------------------------------------------
// H_PRED

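// H_PRED: each row of the prediction is filled with the left-column pixel of
// that row. The left pixels are first duplicated into 16-bit pairs with
// _mm_unpacklo_epi8/_mm_unpackhi_epi8 so that a single _mm_shufflelo_epi16 or
// _mm_shufflehi_epi16 broadcast yields eight identical bytes per row.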
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
  dst += stride;
  left_col = _mm_unpackhi_epi64(left_col, left_col);
  row0 = _mm_shufflelo_epi16(left_col, 0);
  row1 = _mm_shufflelo_epi16(left_col, 0x55);
  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col, 0xff);
  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
  dst += stride;
  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
}

void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}

void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_load_si128((__m128i const *)left);
  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);

  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst += stride;

  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
  row0 = _mm_shufflelo_epi16(left_col_low, 0);
  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst += stride;

  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
  dst += stride;

  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
  row0 = _mm_shufflelo_epi16(left_col_high, 0);
  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}

static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int i;
  for (i = 0; i < h; ++i) {
    _mm_store_si128((__m128i *)dst, row[i]);
    dst += stride;
  }
}

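// Given eight left pixels duplicated into 16-bit pairs (b7b7 b6b6 ... b0b0),
// repeat_low_4pixels() expands pixels 0-3 into four registers of identical
// bytes, and repeat_high_4pixels() does the same for pixels 4-7.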
static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);

  row[0] = _mm_unpacklo_epi64(u0, u0);
  row[1] = _mm_unpacklo_epi64(u1, u1);
  row[2] = _mm_unpacklo_epi64(u2, u2);
  row[3] = _mm_unpacklo_epi64(u3, u3);
}

static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);

  row[0] = _mm_unpackhi_epi64(u0, u0);
  row[1] = _mm_unpackhi_epi64(u1, u1);
  row[2] = _mm_unpackhi_epi64(u2, u2);
  row[3] = _mm_unpackhi_epi64(u3, u3);
}

// Process 16x8, first 4 rows
// Use first 8 bytes of left register: xxxxxxxx33221100
static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_low_4pixels(left, row);
  h_pred_store_16xh(row, 4, dst, stride);
}

// Process 16x8, second 4 rows
// Use second 8 bytes of left register: 77665544xxxxxxxx
static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_high_4pixels(left, row);
  h_pred_store_16xh(row, 4, dst, stride);
}

void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_16x8_2(&left_col_8p, dst, stride);
}

void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  __m128i left_col, left_col_8p;
  (void)above;
  int i = 0;

  do {
    left_col = _mm_load_si128((const __m128i *)left);
    left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
    h_prediction_16x8_1(&left_col_8p, dst, stride);
    dst += stride << 2;
    h_prediction_16x8_2(&left_col_8p, dst, stride);
    dst += stride << 2;

    left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
    h_prediction_16x8_1(&left_col_8p, dst, stride);
    dst += stride << 2;
    h_prediction_16x8_2(&left_col_8p, dst, stride);
    dst += stride << 2;

    left += 16;
    i++;
  } while (i < 2);
}

static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int i;
  for (i = 0; i < h; ++i) {
    _mm_store_si128((__m128i *)dst, row[i]);
    _mm_store_si128((__m128i *)(dst + 16), row[i]);
    dst += stride;
  }
}

// Process 32x8, first 4 rows
// Use first 8 bytes of left register: xxxxxxxx33221100
static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_low_4pixels(left, row);
  h_pred_store_32xh(row, 4, dst, stride);
}

// Process 32x8, second 4 rows
// Use second 8 bytes of left register: 77665544xxxxxxxx
static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i row[4];
  repeat_high_4pixels(left, row);
  h_pred_store_32xh(row, 4, dst, stride);
}

void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  __m128i left_col, left_col_8p;
  (void)above;

  left_col = _mm_load_si128((const __m128i *)left);

  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
  dst += stride << 2;

  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
}