/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}
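
// Scalar sketch of the Paeth rule the vector code above implements (helper
// added for reference; `paeth_ref` is illustrative and not part of the aom
// API). Each output pixel is the neighbor (left, top, or top-left) whose
// value is closest to base = top + left - topleft, with ties resolved in
// that order.
static INLINE int paeth_ref(int left, int top, int topleft) {
  const int base = top + left - topleft;
  const int pl = base > left ? base - left : left - base;
  const int pt = base > top ? base - top : top - base;
  const int ptl = base > topleft ? base - topleft : topleft - base;
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}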

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
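
  // The shuffle control 0x8000 has bytes (0x00, 0x80) in each 16-bit lane:
  // _mm_shuffle_epi8 copies left byte 0 into every low byte and, because the
  // control's high bit is set, zeroes every high byte, i.e. it broadcasts
  // left[i] as a 16-bit value. Adding 1 per row advances to left[i + 1].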
  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16(0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED
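
// Both directions blended:
//   pred(x, y) = (above[x] * w_h[y] + left[bh - 1] * (256 - w_h[y]) +
//                 left[y] * w_w[x] + above[bw - 1] * (256 - w_w[x]) +
//                 256) >> 9
// with w_h = sm_weight_arrays + bh, w_w = sm_weight_arrays + bw and
// 256 == 1 << sm_weight_log2_scale (see the C reference in
// aom_dsp/intrapred.c).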

// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
  pixels[1] = _mm_loadl_epi64((const __m128i *)left);

  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
// weights[2]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
                                  __m128i *weights) {
  __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
  const __m128i zero = _mm_setzero_si128();

  weights[0] = _mm_unpacklo_epi8(t, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weights[1] = _mm_sub_epi16(d, weights[0]);
  weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);

  if (height == 8) {
    t = _mm_srli_si128(t, 4);
    weights[0] = _mm_unpacklo_epi8(t, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
                                   int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);
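
  // d's bytes (0x00, 0x01) make _mm_shuffle_epi8 broadcast the first 16-bit
  // weight to every lane; adding 0x0202 per row steps to the next weight.
  // rep broadcasts left[i] per row, as in the Paeth code above.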
  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, weight[2]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i weights[3];
  load_weight_w4(sm_weight_arrays, 4, weights);

  smooth_pred_4xh(pixels, weights, 4, dst, stride);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i weights[3];
  load_weight_w4(sm_weight_arrays, 8, weights);

  smooth_pred_4xh(pixels, weights, 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
  pixels[2] = _mm_load_si128((const __m128i *)left);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();

  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);

  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
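
  // rep 0x8008 has bytes (0x08, 0x80): the byte broadcast starts at left
  // byte 8, so the second_half pass covers rows 8..15 of a 16-tall block.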
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  const __m128i dup16 =
      _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
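
    // Interleaving top with weights_w lets one madd produce, per pixel,
    // top[x] * weights_y + weights_w[x] * left[y]; the remaining terms,
    // (scale - weights_y) * bottom_left and (scale - weights_x) * top_right,
    // are added separately below.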
    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED
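
// Vertical-only blend:
//   pred(x, y) = (above[x] * w_h[y] +
//                 left[bh - 1] * (256 - w_h[y]) + 128) >> 8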

// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height == 4) {
    const __m128i weight =
        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(d, weights[2]);
  }
}

static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 4, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 4, weights);

  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
}

void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 8, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 8, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
}

void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 16, &pixels);

  __m128i weights[4];
  load_weight_v_w4(sm_weight_arrays, 16, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 4, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 4, wh);

  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
}

void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 8, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 8, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
}

void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 16, pixels);

  __m128i wh[4];
  load_weight_v_w8(sm_weight_arrays, 16, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
}

void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 32, pixels);

  __m128i wh[8];
  load_weight_v_w8(sm_weight_arrays, 32, wh);

  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
}

static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i dup16 =
      _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED
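
// Horizontal-only blend:
//   pred(x, y) = (left[y] * w_w[x] +
//                 above[bw - 1] * (256 - w_w[x]) + 128) >> 8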

// pixels[0]: left vector
// pixels[1]: right_pred vector
static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  if (height == 4)
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  else if (height == 8)
    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
}

// weights[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  (void)height;
  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
  const __m128i zero = _mm_setzero_si128();

  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
}

static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}

void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;
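
  // Shift left[8..15] into the low eight bytes so the same byte broadcast
  // covers the second eight rows.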
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

// pixels[0]: left vector
// pixels[1]: right_pred vector
// pixels[2]: left vector + 16
// pixels[3]: right_pred vector
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[0] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[0] = _mm_load_si128((const __m128i *)left);
    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[3] = pixels[1];
  }
}

// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_w) {
  (void)height;
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
}

static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}

void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}

void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}

void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}

static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
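
    // wx_lo/wx_hi pair (scale - w_w[x], w_w[x]) with tr_ly = (top_right,
    // left[y]), so each madd below yields
    // (scale - w_w[x]) * top_right + w_w[x] * left[y].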
    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}