2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
12 #include <tmmintrin.h>
14 #include "./aom_dsp_rtcd.h"
15 #include "aom_dsp/intrapred_common.h"
17 // -----------------------------------------------------------------------------
20 // Return 8 16-bit pixels in one row
21 static INLINE __m128i
paeth_8x1_pred(const __m128i
*left
, const __m128i
*top
,
22 const __m128i
*topleft
) {
23 const __m128i base
= _mm_sub_epi16(_mm_add_epi16(*top
, *left
), *topleft
);
25 __m128i pl
= _mm_abs_epi16(_mm_sub_epi16(base
, *left
));
26 __m128i pt
= _mm_abs_epi16(_mm_sub_epi16(base
, *top
));
27 __m128i ptl
= _mm_abs_epi16(_mm_sub_epi16(base
, *topleft
));
29 __m128i mask1
= _mm_cmpgt_epi16(pl
, pt
);
30 mask1
= _mm_or_si128(mask1
, _mm_cmpgt_epi16(pl
, ptl
));
31 __m128i mask2
= _mm_cmpgt_epi16(pt
, ptl
);
33 pl
= _mm_andnot_si128(mask1
, *left
);
35 ptl
= _mm_and_si128(mask2
, *topleft
);
36 pt
= _mm_andnot_si128(mask2
, *top
);
37 pt
= _mm_or_si128(pt
, ptl
);
38 pt
= _mm_and_si128(mask1
, pt
);
40 return _mm_or_si128(pl
, pt
);
43 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
44 const uint8_t *above
, const uint8_t *left
) {
45 __m128i l
= _mm_loadl_epi64((const __m128i
*)left
);
46 const __m128i t
= _mm_loadl_epi64((const __m128i
*)above
);
47 const __m128i zero
= _mm_setzero_si128();
48 const __m128i t16
= _mm_unpacklo_epi8(t
, zero
);
49 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
50 __m128i rep
= _mm_set1_epi16(0x8000);
51 const __m128i one
= _mm_set1_epi16(1);
54 for (i
= 0; i
< 4; ++i
) {
55 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
56 const __m128i row
= paeth_8x1_pred(&l16
, &t16
, &tl16
);
58 *(uint32_t *)dst
= _mm_cvtsi128_si32(_mm_packus_epi16(row
, row
));
60 rep
= _mm_add_epi16(rep
, one
);
64 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
65 const uint8_t *above
, const uint8_t *left
) {
66 __m128i l
= _mm_loadl_epi64((const __m128i
*)left
);
67 const __m128i t
= _mm_loadl_epi64((const __m128i
*)above
);
68 const __m128i zero
= _mm_setzero_si128();
69 const __m128i t16
= _mm_unpacklo_epi8(t
, zero
);
70 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
71 __m128i rep
= _mm_set1_epi16(0x8000);
72 const __m128i one
= _mm_set1_epi16(1);
75 for (i
= 0; i
< 8; ++i
) {
76 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
77 const __m128i row
= paeth_8x1_pred(&l16
, &t16
, &tl16
);
79 *(uint32_t *)dst
= _mm_cvtsi128_si32(_mm_packus_epi16(row
, row
));
81 rep
= _mm_add_epi16(rep
, one
);
85 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
86 const uint8_t *above
, const uint8_t *left
) {
87 __m128i l
= _mm_loadl_epi64((const __m128i
*)left
);
88 const __m128i t
= _mm_loadl_epi64((const __m128i
*)above
);
89 const __m128i zero
= _mm_setzero_si128();
90 const __m128i t16
= _mm_unpacklo_epi8(t
, zero
);
91 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
92 __m128i rep
= _mm_set1_epi16(0x8000);
93 const __m128i one
= _mm_set1_epi16(1);
96 for (i
= 0; i
< 4; ++i
) {
97 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
98 const __m128i row
= paeth_8x1_pred(&l16
, &t16
, &tl16
);
100 _mm_storel_epi64((__m128i
*)dst
, _mm_packus_epi16(row
, row
));
102 rep
= _mm_add_epi16(rep
, one
);
106 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
107 const uint8_t *above
, const uint8_t *left
) {
108 __m128i l
= _mm_loadl_epi64((const __m128i
*)left
);
109 const __m128i t
= _mm_loadl_epi64((const __m128i
*)above
);
110 const __m128i zero
= _mm_setzero_si128();
111 const __m128i t16
= _mm_unpacklo_epi8(t
, zero
);
112 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
113 __m128i rep
= _mm_set1_epi16(0x8000);
114 const __m128i one
= _mm_set1_epi16(1);
117 for (i
= 0; i
< 8; ++i
) {
118 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
119 const __m128i row
= paeth_8x1_pred(&l16
, &t16
, &tl16
);
121 _mm_storel_epi64((__m128i
*)dst
, _mm_packus_epi16(row
, row
));
123 rep
= _mm_add_epi16(rep
, one
);
127 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
128 const uint8_t *above
, const uint8_t *left
) {
129 __m128i l
= _mm_load_si128((const __m128i
*)left
);
130 const __m128i t
= _mm_loadl_epi64((const __m128i
*)above
);
131 const __m128i zero
= _mm_setzero_si128();
132 const __m128i t16
= _mm_unpacklo_epi8(t
, zero
);
133 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
134 __m128i rep
= _mm_set1_epi16(0x8000);
135 const __m128i one
= _mm_set1_epi16(1);
138 for (i
= 0; i
< 16; ++i
) {
139 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
140 const __m128i row
= paeth_8x1_pred(&l16
, &t16
, &tl16
);
142 _mm_storel_epi64((__m128i
*)dst
, _mm_packus_epi16(row
, row
));
144 rep
= _mm_add_epi16(rep
, one
);
148 // Return 16 8-bit pixels in one row
149 static INLINE __m128i
paeth_16x1_pred(const __m128i
*left
, const __m128i
*top0
,
151 const __m128i
*topleft
) {
152 const __m128i p0
= paeth_8x1_pred(left
, top0
, topleft
);
153 const __m128i p1
= paeth_8x1_pred(left
, top1
, topleft
);
154 return _mm_packus_epi16(p0
, p1
);
157 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
158 const uint8_t *above
, const uint8_t *left
) {
159 __m128i l
= _mm_loadl_epi64((const __m128i
*)left
);
160 const __m128i t
= _mm_load_si128((const __m128i
*)above
);
161 const __m128i zero
= _mm_setzero_si128();
162 const __m128i top0
= _mm_unpacklo_epi8(t
, zero
);
163 const __m128i top1
= _mm_unpackhi_epi8(t
, zero
);
164 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
165 __m128i rep
= _mm_set1_epi16(0x8000);
166 const __m128i one
= _mm_set1_epi16(1);
169 for (i
= 0; i
< 8; ++i
) {
170 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
171 const __m128i row
= paeth_16x1_pred(&l16
, &top0
, &top1
, &tl16
);
173 _mm_store_si128((__m128i
*)dst
, row
);
175 rep
= _mm_add_epi16(rep
, one
);
179 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
180 const uint8_t *above
,
181 const uint8_t *left
) {
182 __m128i l
= _mm_load_si128((const __m128i
*)left
);
183 const __m128i t
= _mm_load_si128((const __m128i
*)above
);
184 const __m128i zero
= _mm_setzero_si128();
185 const __m128i top0
= _mm_unpacklo_epi8(t
, zero
);
186 const __m128i top1
= _mm_unpackhi_epi8(t
, zero
);
187 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
188 __m128i rep
= _mm_set1_epi16(0x8000);
189 const __m128i one
= _mm_set1_epi16(1);
192 for (i
= 0; i
< 16; ++i
) {
193 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
194 const __m128i row
= paeth_16x1_pred(&l16
, &top0
, &top1
, &tl16
);
196 _mm_store_si128((__m128i
*)dst
, row
);
198 rep
= _mm_add_epi16(rep
, one
);
202 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst
, ptrdiff_t stride
,
203 const uint8_t *above
,
204 const uint8_t *left
) {
205 __m128i l
= _mm_load_si128((const __m128i
*)left
);
206 const __m128i t
= _mm_load_si128((const __m128i
*)above
);
207 const __m128i zero
= _mm_setzero_si128();
208 const __m128i top0
= _mm_unpacklo_epi8(t
, zero
);
209 const __m128i top1
= _mm_unpackhi_epi8(t
, zero
);
210 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
211 __m128i rep
= _mm_set1_epi16(0x8000);
212 const __m128i one
= _mm_set1_epi16(1);
216 for (i
= 0; i
< 16; ++i
) {
217 l16
= _mm_shuffle_epi8(l
, rep
);
218 const __m128i row
= paeth_16x1_pred(&l16
, &top0
, &top1
, &tl16
);
220 _mm_store_si128((__m128i
*)dst
, row
);
222 rep
= _mm_add_epi16(rep
, one
);
225 l
= _mm_load_si128((const __m128i
*)(left
+ 16));
226 rep
= _mm_set1_epi16(0x8000);
227 for (i
= 0; i
< 16; ++i
) {
228 l16
= _mm_shuffle_epi8(l
, rep
);
229 const __m128i row
= paeth_16x1_pred(&l16
, &top0
, &top1
, &tl16
);
231 _mm_store_si128((__m128i
*)dst
, row
);
233 rep
= _mm_add_epi16(rep
, one
);
237 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst
, ptrdiff_t stride
,
238 const uint8_t *above
,
239 const uint8_t *left
) {
240 const __m128i t
= _mm_load_si128((const __m128i
*)above
);
241 const __m128i zero
= _mm_setzero_si128();
242 const __m128i top0
= _mm_unpacklo_epi8(t
, zero
);
243 const __m128i top1
= _mm_unpackhi_epi8(t
, zero
);
244 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
245 const __m128i one
= _mm_set1_epi16(1);
247 for (int j
= 0; j
< 4; ++j
) {
248 const __m128i l
= _mm_load_si128((const __m128i
*)(left
+ j
* 16));
249 __m128i rep
= _mm_set1_epi16(0x8000);
250 for (int i
= 0; i
< 16; ++i
) {
251 const __m128i l16
= _mm_shuffle_epi8(l
, rep
);
252 const __m128i row
= paeth_16x1_pred(&l16
, &top0
, &top1
, &tl16
);
253 _mm_store_si128((__m128i
*)dst
, row
);
255 rep
= _mm_add_epi16(rep
, one
);
260 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
261 const uint8_t *above
,
262 const uint8_t *left
) {
263 const __m128i a
= _mm_load_si128((const __m128i
*)above
);
264 const __m128i b
= _mm_load_si128((const __m128i
*)(above
+ 16));
265 const __m128i zero
= _mm_setzero_si128();
266 const __m128i al
= _mm_unpacklo_epi8(a
, zero
);
267 const __m128i ah
= _mm_unpackhi_epi8(a
, zero
);
268 const __m128i bl
= _mm_unpacklo_epi8(b
, zero
);
269 const __m128i bh
= _mm_unpackhi_epi8(b
, zero
);
271 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
272 __m128i rep
= _mm_set1_epi16(0x8000);
273 const __m128i one
= _mm_set1_epi16(1);
274 __m128i l
= _mm_load_si128((const __m128i
*)left
);
278 for (i
= 0; i
< 16; ++i
) {
279 l16
= _mm_shuffle_epi8(l
, rep
);
280 const __m128i r32l
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
281 const __m128i r32h
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
283 _mm_store_si128((__m128i
*)dst
, r32l
);
284 _mm_store_si128((__m128i
*)(dst
+ 16), r32h
);
286 rep
= _mm_add_epi16(rep
, one
);
290 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst
, ptrdiff_t stride
,
291 const uint8_t *above
,
292 const uint8_t *left
) {
293 const __m128i a
= _mm_load_si128((const __m128i
*)above
);
294 const __m128i b
= _mm_load_si128((const __m128i
*)(above
+ 16));
295 const __m128i zero
= _mm_setzero_si128();
296 const __m128i al
= _mm_unpacklo_epi8(a
, zero
);
297 const __m128i ah
= _mm_unpackhi_epi8(a
, zero
);
298 const __m128i bl
= _mm_unpacklo_epi8(b
, zero
);
299 const __m128i bh
= _mm_unpackhi_epi8(b
, zero
);
301 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
302 __m128i rep
= _mm_set1_epi16(0x8000);
303 const __m128i one
= _mm_set1_epi16(1);
304 __m128i l
= _mm_load_si128((const __m128i
*)left
);
308 for (i
= 0; i
< 16; ++i
) {
309 l16
= _mm_shuffle_epi8(l
, rep
);
310 const __m128i r32l
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
311 const __m128i r32h
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
313 _mm_store_si128((__m128i
*)dst
, r32l
);
314 _mm_store_si128((__m128i
*)(dst
+ 16), r32h
);
316 rep
= _mm_add_epi16(rep
, one
);
319 rep
= _mm_set1_epi16(0x8000);
320 l
= _mm_load_si128((const __m128i
*)(left
+ 16));
321 for (i
= 0; i
< 16; ++i
) {
322 l16
= _mm_shuffle_epi8(l
, rep
);
323 const __m128i r32l
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
324 const __m128i r32h
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
326 _mm_store_si128((__m128i
*)dst
, r32l
);
327 _mm_store_si128((__m128i
*)(dst
+ 16), r32h
);
329 rep
= _mm_add_epi16(rep
, one
);
333 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst
, ptrdiff_t stride
,
334 const uint8_t *above
,
335 const uint8_t *left
) {
336 const __m128i a
= _mm_load_si128((const __m128i
*)above
);
337 const __m128i b
= _mm_load_si128((const __m128i
*)(above
+ 16));
338 const __m128i zero
= _mm_setzero_si128();
339 const __m128i al
= _mm_unpacklo_epi8(a
, zero
);
340 const __m128i ah
= _mm_unpackhi_epi8(a
, zero
);
341 const __m128i bl
= _mm_unpacklo_epi8(b
, zero
);
342 const __m128i bh
= _mm_unpackhi_epi8(b
, zero
);
344 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
345 const __m128i one
= _mm_set1_epi16(1);
349 for (j
= 0; j
< 4; ++j
) {
350 const __m128i l
= _mm_load_si128((const __m128i
*)(left
+ j
* 16));
351 __m128i rep
= _mm_set1_epi16(0x8000);
352 for (i
= 0; i
< 16; ++i
) {
353 l16
= _mm_shuffle_epi8(l
, rep
);
354 const __m128i r32l
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
355 const __m128i r32h
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
357 _mm_store_si128((__m128i
*)dst
, r32l
);
358 _mm_store_si128((__m128i
*)(dst
+ 16), r32h
);
360 rep
= _mm_add_epi16(rep
, one
);
365 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst
, ptrdiff_t stride
,
366 const uint8_t *above
,
367 const uint8_t *left
) {
368 const __m128i a
= _mm_load_si128((const __m128i
*)above
);
369 const __m128i b
= _mm_load_si128((const __m128i
*)(above
+ 16));
370 const __m128i c
= _mm_load_si128((const __m128i
*)(above
+ 32));
371 const __m128i d
= _mm_load_si128((const __m128i
*)(above
+ 48));
372 const __m128i zero
= _mm_setzero_si128();
373 const __m128i al
= _mm_unpacklo_epi8(a
, zero
);
374 const __m128i ah
= _mm_unpackhi_epi8(a
, zero
);
375 const __m128i bl
= _mm_unpacklo_epi8(b
, zero
);
376 const __m128i bh
= _mm_unpackhi_epi8(b
, zero
);
377 const __m128i cl
= _mm_unpacklo_epi8(c
, zero
);
378 const __m128i ch
= _mm_unpackhi_epi8(c
, zero
);
379 const __m128i dl
= _mm_unpacklo_epi8(d
, zero
);
380 const __m128i dh
= _mm_unpackhi_epi8(d
, zero
);
382 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
383 const __m128i one
= _mm_set1_epi16(1);
387 for (j
= 0; j
< 2; ++j
) {
388 const __m128i l
= _mm_load_si128((const __m128i
*)(left
+ j
* 16));
389 __m128i rep
= _mm_set1_epi16(0x8000);
390 for (i
= 0; i
< 16; ++i
) {
391 l16
= _mm_shuffle_epi8(l
, rep
);
392 const __m128i r0
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
393 const __m128i r1
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
394 const __m128i r2
= paeth_16x1_pred(&l16
, &cl
, &ch
, &tl16
);
395 const __m128i r3
= paeth_16x1_pred(&l16
, &dl
, &dh
, &tl16
);
397 _mm_store_si128((__m128i
*)dst
, r0
);
398 _mm_store_si128((__m128i
*)(dst
+ 16), r1
);
399 _mm_store_si128((__m128i
*)(dst
+ 32), r2
);
400 _mm_store_si128((__m128i
*)(dst
+ 48), r3
);
402 rep
= _mm_add_epi16(rep
, one
);
407 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst
, ptrdiff_t stride
,
408 const uint8_t *above
,
409 const uint8_t *left
) {
410 const __m128i a
= _mm_load_si128((const __m128i
*)above
);
411 const __m128i b
= _mm_load_si128((const __m128i
*)(above
+ 16));
412 const __m128i c
= _mm_load_si128((const __m128i
*)(above
+ 32));
413 const __m128i d
= _mm_load_si128((const __m128i
*)(above
+ 48));
414 const __m128i zero
= _mm_setzero_si128();
415 const __m128i al
= _mm_unpacklo_epi8(a
, zero
);
416 const __m128i ah
= _mm_unpackhi_epi8(a
, zero
);
417 const __m128i bl
= _mm_unpacklo_epi8(b
, zero
);
418 const __m128i bh
= _mm_unpackhi_epi8(b
, zero
);
419 const __m128i cl
= _mm_unpacklo_epi8(c
, zero
);
420 const __m128i ch
= _mm_unpackhi_epi8(c
, zero
);
421 const __m128i dl
= _mm_unpacklo_epi8(d
, zero
);
422 const __m128i dh
= _mm_unpackhi_epi8(d
, zero
);
424 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
425 const __m128i one
= _mm_set1_epi16(1);
429 for (j
= 0; j
< 4; ++j
) {
430 const __m128i l
= _mm_load_si128((const __m128i
*)(left
+ j
* 16));
431 __m128i rep
= _mm_set1_epi16(0x8000);
432 for (i
= 0; i
< 16; ++i
) {
433 l16
= _mm_shuffle_epi8(l
, rep
);
434 const __m128i r0
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
435 const __m128i r1
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
436 const __m128i r2
= paeth_16x1_pred(&l16
, &cl
, &ch
, &tl16
);
437 const __m128i r3
= paeth_16x1_pred(&l16
, &dl
, &dh
, &tl16
);
439 _mm_store_si128((__m128i
*)dst
, r0
);
440 _mm_store_si128((__m128i
*)(dst
+ 16), r1
);
441 _mm_store_si128((__m128i
*)(dst
+ 32), r2
);
442 _mm_store_si128((__m128i
*)(dst
+ 48), r3
);
444 rep
= _mm_add_epi16(rep
, one
);
449 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
450 const uint8_t *above
,
451 const uint8_t *left
) {
452 const __m128i a
= _mm_load_si128((const __m128i
*)above
);
453 const __m128i b
= _mm_load_si128((const __m128i
*)(above
+ 16));
454 const __m128i c
= _mm_load_si128((const __m128i
*)(above
+ 32));
455 const __m128i d
= _mm_load_si128((const __m128i
*)(above
+ 48));
456 const __m128i zero
= _mm_setzero_si128();
457 const __m128i al
= _mm_unpacklo_epi8(a
, zero
);
458 const __m128i ah
= _mm_unpackhi_epi8(a
, zero
);
459 const __m128i bl
= _mm_unpacklo_epi8(b
, zero
);
460 const __m128i bh
= _mm_unpackhi_epi8(b
, zero
);
461 const __m128i cl
= _mm_unpacklo_epi8(c
, zero
);
462 const __m128i ch
= _mm_unpackhi_epi8(c
, zero
);
463 const __m128i dl
= _mm_unpacklo_epi8(d
, zero
);
464 const __m128i dh
= _mm_unpackhi_epi8(d
, zero
);
466 const __m128i tl16
= _mm_set1_epi16((uint16_t)above
[-1]);
467 const __m128i one
= _mm_set1_epi16(1);
471 const __m128i l
= _mm_load_si128((const __m128i
*)left
);
472 __m128i rep
= _mm_set1_epi16(0x8000);
473 for (i
= 0; i
< 16; ++i
) {
474 l16
= _mm_shuffle_epi8(l
, rep
);
475 const __m128i r0
= paeth_16x1_pred(&l16
, &al
, &ah
, &tl16
);
476 const __m128i r1
= paeth_16x1_pred(&l16
, &bl
, &bh
, &tl16
);
477 const __m128i r2
= paeth_16x1_pred(&l16
, &cl
, &ch
, &tl16
);
478 const __m128i r3
= paeth_16x1_pred(&l16
, &dl
, &dh
, &tl16
);
480 _mm_store_si128((__m128i
*)dst
, r0
);
481 _mm_store_si128((__m128i
*)(dst
+ 16), r1
);
482 _mm_store_si128((__m128i
*)(dst
+ 32), r2
);
483 _mm_store_si128((__m128i
*)(dst
+ 48), r3
);
485 rep
= _mm_add_epi16(rep
, one
);
489 // -----------------------------------------------------------------------------
492 // pixels[0]: above and below_pred interleave vector
493 // pixels[1]: left vector
494 // pixels[2]: right_pred vector
495 static INLINE
void load_pixel_w4(const uint8_t *above
, const uint8_t *left
,
496 int height
, __m128i
*pixels
) {
497 __m128i d
= _mm_loadl_epi64((const __m128i
*)above
);
498 pixels
[2] = _mm_set1_epi16((uint16_t)above
[3]);
499 pixels
[1] = _mm_loadl_epi64((const __m128i
*)left
);
501 const __m128i bp
= _mm_set1_epi16((uint16_t)left
[height
- 1]);
502 const __m128i zero
= _mm_setzero_si128();
503 d
= _mm_unpacklo_epi8(d
, zero
);
504 pixels
[0] = _mm_unpacklo_epi16(d
, bp
);
507 // weights[0]: weights_h vector
508 // weights[1]: scale - weights_h vecotr
509 // weights[2]: weights_w and scale - weights_w interleave vector
510 static INLINE
void load_weight_w4(const uint8_t *weight_array
, int height
,
512 __m128i t
= _mm_loadu_si128((const __m128i
*)&weight_array
[4]);
513 const __m128i zero
= _mm_setzero_si128();
515 weights
[0] = _mm_unpacklo_epi8(t
, zero
);
516 const __m128i d
= _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
517 weights
[1] = _mm_sub_epi16(d
, weights
[0]);
518 weights
[2] = _mm_unpacklo_epi16(weights
[0], weights
[1]);
521 t
= _mm_srli_si128(t
, 4);
522 weights
[0] = _mm_unpacklo_epi8(t
, zero
);
523 weights
[1] = _mm_sub_epi16(d
, weights
[0]);
527 static INLINE
void smooth_pred_4xh(const __m128i
*pixel
, const __m128i
*weight
,
528 int h
, uint8_t *dst
, ptrdiff_t stride
) {
529 const __m128i round
= _mm_set1_epi32((1 << sm_weight_log2_scale
));
530 const __m128i one
= _mm_set1_epi16(1);
531 const __m128i inc
= _mm_set1_epi16(0x202);
532 const __m128i gat
= _mm_set1_epi32(0xc080400);
533 __m128i rep
= _mm_set1_epi16(0x8000);
534 __m128i d
= _mm_set1_epi16(0x100);
537 for (i
= 0; i
< h
; ++i
) {
538 const __m128i wg_wg
= _mm_shuffle_epi8(weight
[0], d
);
539 const __m128i sc_sc
= _mm_shuffle_epi8(weight
[1], d
);
540 const __m128i wh_sc
= _mm_unpacklo_epi16(wg_wg
, sc_sc
);
541 __m128i s
= _mm_madd_epi16(pixel
[0], wh_sc
);
543 __m128i b
= _mm_shuffle_epi8(pixel
[1], rep
);
544 b
= _mm_unpacklo_epi16(b
, pixel
[2]);
545 __m128i sum
= _mm_madd_epi16(b
, weight
[2]);
547 sum
= _mm_add_epi32(s
, sum
);
548 sum
= _mm_add_epi32(sum
, round
);
549 sum
= _mm_srai_epi32(sum
, 1 + sm_weight_log2_scale
);
551 sum
= _mm_shuffle_epi8(sum
, gat
);
552 *(uint32_t *)dst
= _mm_cvtsi128_si32(sum
);
555 rep
= _mm_add_epi16(rep
, one
);
556 d
= _mm_add_epi16(d
, inc
);
560 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
561 const uint8_t *above
, const uint8_t *left
) {
563 load_pixel_w4(above
, left
, 4, pixels
);
566 load_weight_w4(sm_weight_arrays
, 4, weights
);
568 smooth_pred_4xh(pixels
, weights
, 4, dst
, stride
);
571 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
572 const uint8_t *above
, const uint8_t *left
) {
574 load_pixel_w4(above
, left
, 8, pixels
);
577 load_weight_w4(sm_weight_arrays
, 8, weights
);
579 smooth_pred_4xh(pixels
, weights
, 8, dst
, stride
);
582 // pixels[0]: above and below_pred interleave vector, first half
583 // pixels[1]: above and below_pred interleave vector, second half
584 // pixels[2]: left vector
585 // pixels[3]: right_pred vector
586 static INLINE
void load_pixel_w8(const uint8_t *above
, const uint8_t *left
,
587 int height
, __m128i
*pixels
) {
588 __m128i d
= _mm_loadl_epi64((const __m128i
*)above
);
589 pixels
[3] = _mm_set1_epi16((uint16_t)above
[7]);
590 pixels
[2] = _mm_load_si128((const __m128i
*)left
);
591 const __m128i bp
= _mm_set1_epi16((uint16_t)left
[height
- 1]);
592 const __m128i zero
= _mm_setzero_si128();
594 d
= _mm_unpacklo_epi8(d
, zero
);
595 pixels
[0] = _mm_unpacklo_epi16(d
, bp
);
596 pixels
[1] = _mm_unpackhi_epi16(d
, bp
);
599 // weight_h[0]: weight_h vector
600 // weight_h[1]: scale - weight_h vector
601 // weight_h[2]: same as [0], second half for height = 16 only
602 // weight_h[3]: same as [1], second half for height = 16 only
603 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
604 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
605 static INLINE
void load_weight_w8(const uint8_t *weight_array
, int height
,
606 __m128i
*weight_h
, __m128i
*weight_w
) {
607 const __m128i zero
= _mm_setzero_si128();
608 const int we_offset
= height
< 8 ? 4 : 8;
609 __m128i we
= _mm_loadu_si128((const __m128i
*)&weight_array
[we_offset
]);
610 weight_h
[0] = _mm_unpacklo_epi8(we
, zero
);
612 const __m128i d
= _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
613 weight_h
[1] = _mm_sub_epi16(d
, weight_h
[0]);
616 we
= _mm_srli_si128(we
, 4);
617 __m128i tmp1
= _mm_unpacklo_epi8(we
, zero
);
618 __m128i tmp2
= _mm_sub_epi16(d
, tmp1
);
619 weight_w
[0] = _mm_unpacklo_epi16(tmp1
, tmp2
);
620 weight_w
[1] = _mm_unpackhi_epi16(tmp1
, tmp2
);
622 weight_w
[0] = _mm_unpacklo_epi16(weight_h
[0], weight_h
[1]);
623 weight_w
[1] = _mm_unpackhi_epi16(weight_h
[0], weight_h
[1]);
627 we
= _mm_loadu_si128((const __m128i
*)&weight_array
[16]);
628 weight_h
[0] = _mm_unpacklo_epi8(we
, zero
);
629 weight_h
[1] = _mm_sub_epi16(d
, weight_h
[0]);
630 weight_h
[2] = _mm_unpackhi_epi8(we
, zero
);
631 weight_h
[3] = _mm_sub_epi16(d
, weight_h
[2]);
635 static INLINE
void smooth_pred_8xh(const __m128i
*pixels
, const __m128i
*wh
,
636 const __m128i
*ww
, int h
, uint8_t *dst
,
637 ptrdiff_t stride
, int second_half
) {
638 const __m128i round
= _mm_set1_epi32((1 << sm_weight_log2_scale
));
639 const __m128i one
= _mm_set1_epi16(1);
640 const __m128i inc
= _mm_set1_epi16(0x202);
641 const __m128i gat
= _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
643 __m128i rep
= second_half
? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
644 __m128i d
= _mm_set1_epi16(0x100);
647 for (i
= 0; i
< h
; ++i
) {
648 const __m128i wg_wg
= _mm_shuffle_epi8(wh
[0], d
);
649 const __m128i sc_sc
= _mm_shuffle_epi8(wh
[1], d
);
650 const __m128i wh_sc
= _mm_unpacklo_epi16(wg_wg
, sc_sc
);
651 __m128i s0
= _mm_madd_epi16(pixels
[0], wh_sc
);
652 __m128i s1
= _mm_madd_epi16(pixels
[1], wh_sc
);
654 __m128i b
= _mm_shuffle_epi8(pixels
[2], rep
);
655 b
= _mm_unpacklo_epi16(b
, pixels
[3]);
656 __m128i sum0
= _mm_madd_epi16(b
, ww
[0]);
657 __m128i sum1
= _mm_madd_epi16(b
, ww
[1]);
659 s0
= _mm_add_epi32(s0
, sum0
);
660 s0
= _mm_add_epi32(s0
, round
);
661 s0
= _mm_srai_epi32(s0
, 1 + sm_weight_log2_scale
);
663 s1
= _mm_add_epi32(s1
, sum1
);
664 s1
= _mm_add_epi32(s1
, round
);
665 s1
= _mm_srai_epi32(s1
, 1 + sm_weight_log2_scale
);
667 sum0
= _mm_packus_epi16(s0
, s1
);
668 sum0
= _mm_shuffle_epi8(sum0
, gat
);
669 _mm_storel_epi64((__m128i
*)dst
, sum0
);
672 rep
= _mm_add_epi16(rep
, one
);
673 d
= _mm_add_epi16(d
, inc
);
677 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
678 const uint8_t *above
, const uint8_t *left
) {
680 load_pixel_w8(above
, left
, 4, pixels
);
682 __m128i wh
[4], ww
[2];
683 load_weight_w8(sm_weight_arrays
, 4, wh
, ww
);
685 smooth_pred_8xh(pixels
, wh
, ww
, 4, dst
, stride
, 0);
688 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
689 const uint8_t *above
, const uint8_t *left
) {
691 load_pixel_w8(above
, left
, 8, pixels
);
693 __m128i wh
[4], ww
[2];
694 load_weight_w8(sm_weight_arrays
, 8, wh
, ww
);
696 smooth_pred_8xh(pixels
, wh
, ww
, 8, dst
, stride
, 0);
699 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
700 const uint8_t *above
,
701 const uint8_t *left
) {
703 load_pixel_w8(above
, left
, 16, pixels
);
705 __m128i wh
[4], ww
[2];
706 load_weight_w8(sm_weight_arrays
, 16, wh
, ww
);
708 smooth_pred_8xh(pixels
, wh
, ww
, 8, dst
, stride
, 0);
710 smooth_pred_8xh(pixels
, &wh
[2], ww
, 8, dst
, stride
, 1);
713 static INLINE
void smooth_predictor_wxh(uint8_t *dst
, ptrdiff_t stride
,
714 const uint8_t *above
,
715 const uint8_t *left
, uint32_t bw
,
717 const uint8_t *const sm_weights_w
= sm_weight_arrays
+ bw
;
718 const uint8_t *const sm_weights_h
= sm_weight_arrays
+ bh
;
719 const __m128i zero
= _mm_setzero_si128();
720 const __m128i scale_value
=
721 _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
722 const __m128i bottom_left
= _mm_cvtsi32_si128((uint32_t)left
[bh
- 1]);
723 const __m128i dup16
=
724 _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100);
725 const __m128i top_right
=
726 _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above
[bw
- 1]), dup16
);
727 const __m128i gat
= _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
728 const __m128i round
= _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale
));
730 for (uint32_t y
= 0; y
< bh
; ++y
) {
731 const __m128i weights_y
= _mm_cvtsi32_si128((uint32_t)sm_weights_h
[y
]);
732 const __m128i left_y
= _mm_cvtsi32_si128((uint32_t)left
[y
]);
733 const __m128i scale_m_weights_y
= _mm_sub_epi16(scale_value
, weights_y
);
734 __m128i pred_scaled_bl
= _mm_mullo_epi16(scale_m_weights_y
, bottom_left
);
736 _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y
, left_y
), 0);
737 pred_scaled_bl
= _mm_add_epi32(pred_scaled_bl
, round
);
738 pred_scaled_bl
= _mm_shuffle_epi32(pred_scaled_bl
, 0);
740 for (uint32_t x
= 0; x
< bw
; x
+= 8) {
741 const __m128i top_x
= _mm_loadl_epi64((const __m128i
*)(above
+ x
));
742 const __m128i weights_x
=
743 _mm_loadl_epi64((const __m128i
*)(sm_weights_w
+ x
));
744 const __m128i tw_x
= _mm_unpacklo_epi8(top_x
, weights_x
);
745 const __m128i tw_x_lo
= _mm_unpacklo_epi8(tw_x
, zero
);
746 const __m128i tw_x_hi
= _mm_unpackhi_epi8(tw_x
, zero
);
748 __m128i pred_lo
= _mm_madd_epi16(tw_x_lo
, wl_y
);
749 __m128i pred_hi
= _mm_madd_epi16(tw_x_hi
, wl_y
);
751 const __m128i scale_m_weights_x
=
752 _mm_sub_epi16(scale_value
, _mm_unpacklo_epi8(weights_x
, zero
));
753 const __m128i swxtr
= _mm_mullo_epi16(scale_m_weights_x
, top_right
);
754 const __m128i swxtr_lo
= _mm_unpacklo_epi16(swxtr
, zero
);
755 const __m128i swxtr_hi
= _mm_unpackhi_epi16(swxtr
, zero
);
757 pred_lo
= _mm_add_epi32(pred_lo
, pred_scaled_bl
);
758 pred_hi
= _mm_add_epi32(pred_hi
, pred_scaled_bl
);
760 pred_lo
= _mm_add_epi32(pred_lo
, swxtr_lo
);
761 pred_hi
= _mm_add_epi32(pred_hi
, swxtr_hi
);
763 pred_lo
= _mm_srai_epi32(pred_lo
, (1 + sm_weight_log2_scale
));
764 pred_hi
= _mm_srai_epi32(pred_hi
, (1 + sm_weight_log2_scale
));
766 __m128i pred
= _mm_packus_epi16(pred_lo
, pred_hi
);
767 pred
= _mm_shuffle_epi8(pred
, gat
);
768 _mm_storel_epi64((__m128i
*)(dst
+ x
), pred
);
// Smooth intra predictor, 16x8. Restored the dropped closing brace.
void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}
// Smooth intra predictor, 16x16. Restored the dropped closing brace.
void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}
// Smooth intra predictor, 16x32. Restored the dropped closing brace.
void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}
// Smooth intra predictor, 32x16. Restored the dropped closing brace.
void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}
// Smooth intra predictor, 32x32. Restored the dropped closing brace.
void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}
// Smooth intra predictor, 32x64. Restored the dropped closing brace.
void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}
// Smooth intra predictor, 64x64. Restored the dropped closing brace.
void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}
// Smooth intra predictor, 64x32. Restored the dropped closing brace.
void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}
// Smooth intra predictor, 64x16. Restored the dropped closing brace.
void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}
// Smooth intra predictor, 16x64. Restored the dropped closing brace.
void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}
834 // -----------------------------------------------------------------------------
837 // pixels[0]: above and below_pred interleave vector
838 static INLINE
void load_pixel_v_w4(const uint8_t *above
, const uint8_t *left
,
839 int height
, __m128i
*pixels
) {
840 const __m128i zero
= _mm_setzero_si128();
841 __m128i d
= _mm_cvtsi32_si128(((const uint32_t *)above
)[0]);
842 const __m128i bp
= _mm_set1_epi16((uint16_t)left
[height
- 1]);
843 d
= _mm_unpacklo_epi8(d
, zero
);
844 pixels
[0] = _mm_unpacklo_epi16(d
, bp
);
847 // weights[0]: weights_h vector
848 // weights[1]: scale - weights_h vector
849 static INLINE
void load_weight_v_w4(const uint8_t *weight_array
, int height
,
851 const __m128i zero
= _mm_setzero_si128();
852 const __m128i d
= _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
855 const __m128i weight
=
856 _mm_cvtsi32_si128(((const uint32_t *)weight_array
)[1]);
857 weights
[0] = _mm_unpacklo_epi8(weight
, zero
);
858 weights
[1] = _mm_sub_epi16(d
, weights
[0]);
859 } else if (height
== 8) {
860 const __m128i weight
= _mm_loadl_epi64((const __m128i
*)&weight_array
[8]);
861 weights
[0] = _mm_unpacklo_epi8(weight
, zero
);
862 weights
[1] = _mm_sub_epi16(d
, weights
[0]);
864 const __m128i weight
= _mm_loadu_si128((const __m128i
*)&weight_array
[16]);
865 weights
[0] = _mm_unpacklo_epi8(weight
, zero
);
866 weights
[1] = _mm_sub_epi16(d
, weights
[0]);
867 weights
[2] = _mm_unpackhi_epi8(weight
, zero
);
868 weights
[3] = _mm_sub_epi16(d
, weights
[2]);
872 static INLINE
void smooth_v_pred_4xh(const __m128i
*pixel
,
873 const __m128i
*weight
, int h
, uint8_t *dst
,
875 const __m128i pred_round
= _mm_set1_epi32((1 << (sm_weight_log2_scale
- 1)));
876 const __m128i inc
= _mm_set1_epi16(0x202);
877 const __m128i gat
= _mm_set1_epi32(0xc080400);
878 __m128i d
= _mm_set1_epi16(0x100);
880 for (int i
= 0; i
< h
; ++i
) {
881 const __m128i wg_wg
= _mm_shuffle_epi8(weight
[0], d
);
882 const __m128i sc_sc
= _mm_shuffle_epi8(weight
[1], d
);
883 const __m128i wh_sc
= _mm_unpacklo_epi16(wg_wg
, sc_sc
);
884 __m128i sum
= _mm_madd_epi16(pixel
[0], wh_sc
);
885 sum
= _mm_add_epi32(sum
, pred_round
);
886 sum
= _mm_srai_epi32(sum
, sm_weight_log2_scale
);
887 sum
= _mm_shuffle_epi8(sum
, gat
);
888 *(uint32_t *)dst
= _mm_cvtsi128_si32(sum
);
890 d
= _mm_add_epi16(d
, inc
);
894 void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
895 const uint8_t *above
,
896 const uint8_t *left
) {
898 load_pixel_v_w4(above
, left
, 4, &pixels
);
901 load_weight_v_w4(sm_weight_arrays
, 4, weights
);
903 smooth_v_pred_4xh(&pixels
, weights
, 4, dst
, stride
);
906 void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
907 const uint8_t *above
,
908 const uint8_t *left
) {
910 load_pixel_v_w4(above
, left
, 8, &pixels
);
913 load_weight_v_w4(sm_weight_arrays
, 8, weights
);
915 smooth_v_pred_4xh(&pixels
, weights
, 8, dst
, stride
);
918 void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
919 const uint8_t *above
,
920 const uint8_t *left
) {
922 load_pixel_v_w4(above
, left
, 16, &pixels
);
925 load_weight_v_w4(sm_weight_arrays
, 16, weights
);
927 smooth_v_pred_4xh(&pixels
, weights
, 8, dst
, stride
);
929 smooth_v_pred_4xh(&pixels
, &weights
[2], 8, dst
, stride
);
932 // pixels[0]: above and below_pred interleave vector, first half
933 // pixels[1]: above and below_pred interleave vector, second half
934 static INLINE
void load_pixel_v_w8(const uint8_t *above
, const uint8_t *left
,
935 int height
, __m128i
*pixels
) {
936 const __m128i zero
= _mm_setzero_si128();
937 __m128i d
= _mm_loadl_epi64((const __m128i
*)above
);
938 const __m128i bp
= _mm_set1_epi16((uint16_t)left
[height
- 1]);
939 d
= _mm_unpacklo_epi8(d
, zero
);
940 pixels
[0] = _mm_unpacklo_epi16(d
, bp
);
941 pixels
[1] = _mm_unpackhi_epi16(d
, bp
);
944 // weight_h[0]: weight_h vector
945 // weight_h[1]: scale - weight_h vector
946 // weight_h[2]: same as [0], offset 8
947 // weight_h[3]: same as [1], offset 8
948 // weight_h[4]: same as [0], offset 16
949 // weight_h[5]: same as [1], offset 16
950 // weight_h[6]: same as [0], offset 24
951 // weight_h[7]: same as [1], offset 24
952 static INLINE
void load_weight_v_w8(const uint8_t *weight_array
, int height
,
954 const __m128i zero
= _mm_setzero_si128();
955 const __m128i d
= _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
958 const int offset
= height
< 8 ? 4 : 8;
959 const __m128i weight
=
960 _mm_loadu_si128((const __m128i
*)&weight_array
[offset
]);
961 weight_h
[0] = _mm_unpacklo_epi8(weight
, zero
);
962 weight_h
[1] = _mm_sub_epi16(d
, weight_h
[0]);
963 } else if (height
== 16) {
964 const __m128i weight
= _mm_loadu_si128((const __m128i
*)&weight_array
[16]);
965 weight_h
[0] = _mm_unpacklo_epi8(weight
, zero
);
966 weight_h
[1] = _mm_sub_epi16(d
, weight_h
[0]);
967 weight_h
[2] = _mm_unpackhi_epi8(weight
, zero
);
968 weight_h
[3] = _mm_sub_epi16(d
, weight_h
[2]);
970 const __m128i weight_lo
=
971 _mm_loadu_si128((const __m128i
*)&weight_array
[32]);
972 weight_h
[0] = _mm_unpacklo_epi8(weight_lo
, zero
);
973 weight_h
[1] = _mm_sub_epi16(d
, weight_h
[0]);
974 weight_h
[2] = _mm_unpackhi_epi8(weight_lo
, zero
);
975 weight_h
[3] = _mm_sub_epi16(d
, weight_h
[2]);
976 const __m128i weight_hi
=
977 _mm_loadu_si128((const __m128i
*)&weight_array
[32 + 16]);
978 weight_h
[4] = _mm_unpacklo_epi8(weight_hi
, zero
);
979 weight_h
[5] = _mm_sub_epi16(d
, weight_h
[4]);
980 weight_h
[6] = _mm_unpackhi_epi8(weight_hi
, zero
);
981 weight_h
[7] = _mm_sub_epi16(d
, weight_h
[6]);
985 static INLINE
void smooth_v_pred_8xh(const __m128i
*pixels
, const __m128i
*wh
,
986 int h
, uint8_t *dst
, ptrdiff_t stride
) {
987 const __m128i pred_round
= _mm_set1_epi32((1 << (sm_weight_log2_scale
- 1)));
988 const __m128i inc
= _mm_set1_epi16(0x202);
989 const __m128i gat
= _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
990 __m128i d
= _mm_set1_epi16(0x100);
992 for (int i
= 0; i
< h
; ++i
) {
993 const __m128i wg_wg
= _mm_shuffle_epi8(wh
[0], d
);
994 const __m128i sc_sc
= _mm_shuffle_epi8(wh
[1], d
);
995 const __m128i wh_sc
= _mm_unpacklo_epi16(wg_wg
, sc_sc
);
996 __m128i s0
= _mm_madd_epi16(pixels
[0], wh_sc
);
997 __m128i s1
= _mm_madd_epi16(pixels
[1], wh_sc
);
999 s0
= _mm_add_epi32(s0
, pred_round
);
1000 s0
= _mm_srai_epi32(s0
, sm_weight_log2_scale
);
1002 s1
= _mm_add_epi32(s1
, pred_round
);
1003 s1
= _mm_srai_epi32(s1
, sm_weight_log2_scale
);
1005 __m128i sum01
= _mm_packus_epi16(s0
, s1
);
1006 sum01
= _mm_shuffle_epi8(sum01
, gat
);
1007 _mm_storel_epi64((__m128i
*)dst
, sum01
);
1010 d
= _mm_add_epi16(d
, inc
);
1014 void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1015 const uint8_t *above
,
1016 const uint8_t *left
) {
1018 load_pixel_v_w8(above
, left
, 4, pixels
);
1021 load_weight_v_w8(sm_weight_arrays
, 4, wh
);
1023 smooth_v_pred_8xh(pixels
, wh
, 4, dst
, stride
);
1026 void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1027 const uint8_t *above
,
1028 const uint8_t *left
) {
1030 load_pixel_v_w8(above
, left
, 8, pixels
);
1033 load_weight_v_w8(sm_weight_arrays
, 8, wh
);
1035 smooth_v_pred_8xh(pixels
, wh
, 8, dst
, stride
);
1038 void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1039 const uint8_t *above
,
1040 const uint8_t *left
) {
1042 load_pixel_v_w8(above
, left
, 16, pixels
);
1045 load_weight_v_w8(sm_weight_arrays
, 16, wh
);
1047 smooth_v_pred_8xh(pixels
, wh
, 8, dst
, stride
);
1049 smooth_v_pred_8xh(pixels
, &wh
[2], 8, dst
, stride
);
1052 void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1053 const uint8_t *above
,
1054 const uint8_t *left
) {
1056 load_pixel_v_w8(above
, left
, 32, pixels
);
1059 load_weight_v_w8(sm_weight_arrays
, 32, wh
);
1061 smooth_v_pred_8xh(pixels
, &wh
[0], 8, dst
, stride
);
1063 smooth_v_pred_8xh(pixels
, &wh
[2], 8, dst
, stride
);
1065 smooth_v_pred_8xh(pixels
, &wh
[4], 8, dst
, stride
);
1067 smooth_v_pred_8xh(pixels
, &wh
[6], 8, dst
, stride
);
1070 static INLINE
void smooth_v_predictor_wxh(uint8_t *dst
, ptrdiff_t stride
,
1071 const uint8_t *above
,
1072 const uint8_t *left
, uint32_t bw
,
1074 const uint8_t *const sm_weights_h
= sm_weight_arrays
+ bh
;
1075 const __m128i zero
= _mm_setzero_si128();
1076 const __m128i scale_value
=
1077 _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
1078 const __m128i dup16
=
1079 _mm_set_epi32(0x01000100, 0x01000100, 0x01000100, 0x01000100);
1080 const __m128i bottom_left
=
1081 _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left
[bh
- 1]), dup16
);
1082 const __m128i gat
= _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1083 const __m128i round
=
1084 _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale
- 1)));
1086 for (uint32_t y
= 0; y
< bh
; ++y
) {
1087 const __m128i weights_y
= _mm_cvtsi32_si128((uint32_t)sm_weights_h
[y
]);
1088 const __m128i scale_m_weights_y
=
1089 _mm_shuffle_epi8(_mm_sub_epi16(scale_value
, weights_y
), dup16
);
1090 const __m128i wl_y
=
1091 _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y
, bottom_left
), 0);
1093 for (uint32_t x
= 0; x
< bw
; x
+= 8) {
1094 const __m128i top_x
= _mm_loadl_epi64((const __m128i
*)(above
+ x
));
1096 const __m128i tw_x
= _mm_unpacklo_epi8(top_x
, zero
);
1097 const __m128i tw_x_lo
= _mm_unpacklo_epi16(tw_x
, scale_m_weights_y
);
1098 const __m128i tw_x_hi
= _mm_unpackhi_epi16(tw_x
, scale_m_weights_y
);
1099 // top_x * weights_y + scale_m_weights_y * bottom_left
1100 __m128i pred_lo
= _mm_madd_epi16(tw_x_lo
, wl_y
);
1101 __m128i pred_hi
= _mm_madd_epi16(tw_x_hi
, wl_y
);
1103 pred_lo
= _mm_add_epi32(pred_lo
, round
);
1104 pred_hi
= _mm_add_epi32(pred_hi
, round
);
1105 pred_lo
= _mm_srai_epi32(pred_lo
, sm_weight_log2_scale
);
1106 pred_hi
= _mm_srai_epi32(pred_hi
, sm_weight_log2_scale
);
1108 __m128i pred
= _mm_packus_epi16(pred_lo
, pred_hi
);
1109 pred
= _mm_shuffle_epi8(pred
, gat
);
1110 _mm_storel_epi64((__m128i
*)(dst
+ x
), pred
);
// Thin SMOOTH_V wrappers: each forwards to the generic kernel with its
// block dimensions.
void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}
1188 // -----------------------------------------------------------------------------
1191 // pixels[0]: left vector
1192 // pixels[1]: right_pred vector
1193 static INLINE
void load_pixel_h_w4(const uint8_t *above
, const uint8_t *left
,
1194 int height
, __m128i
*pixels
) {
1196 pixels
[0] = _mm_cvtsi32_si128(((const uint32_t *)left
)[0]);
1197 else if (height
== 8)
1198 pixels
[0] = _mm_loadl_epi64(((const __m128i
*)left
));
1200 pixels
[0] = _mm_loadu_si128(((const __m128i
*)left
));
1201 pixels
[1] = _mm_set1_epi16((uint16_t)above
[3]);
1204 // weights[0]: weights_w and scale - weights_w interleave vector
1205 static INLINE
void load_weight_h_w4(const uint8_t *weight_array
, int height
,
1208 const __m128i t
= _mm_loadu_si128((const __m128i
*)&weight_array
[4]);
1209 const __m128i zero
= _mm_setzero_si128();
1211 const __m128i weights_0
= _mm_unpacklo_epi8(t
, zero
);
1212 const __m128i d
= _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
1213 const __m128i weights_1
= _mm_sub_epi16(d
, weights_0
);
1214 weights
[0] = _mm_unpacklo_epi16(weights_0
, weights_1
);
1217 static INLINE
void smooth_h_pred_4xh(const __m128i
*pixel
,
1218 const __m128i
*weight
, int h
, uint8_t *dst
,
1220 const __m128i pred_round
= _mm_set1_epi32((1 << (sm_weight_log2_scale
- 1)));
1221 const __m128i one
= _mm_set1_epi16(1);
1222 const __m128i gat
= _mm_set1_epi32(0xc080400);
1223 __m128i rep
= _mm_set1_epi16(0x8000);
1225 for (int i
= 0; i
< h
; ++i
) {
1226 __m128i b
= _mm_shuffle_epi8(pixel
[0], rep
);
1227 b
= _mm_unpacklo_epi16(b
, pixel
[1]);
1228 __m128i sum
= _mm_madd_epi16(b
, weight
[0]);
1230 sum
= _mm_add_epi32(sum
, pred_round
);
1231 sum
= _mm_srai_epi32(sum
, sm_weight_log2_scale
);
1233 sum
= _mm_shuffle_epi8(sum
, gat
);
1234 *(uint32_t *)dst
= _mm_cvtsi128_si32(sum
);
1237 rep
= _mm_add_epi16(rep
, one
);
1241 void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1242 const uint8_t *above
,
1243 const uint8_t *left
) {
1245 load_pixel_h_w4(above
, left
, 4, pixels
);
1248 load_weight_h_w4(sm_weight_arrays
, 4, &weights
);
1250 smooth_h_pred_4xh(pixels
, &weights
, 4, dst
, stride
);
1253 void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1254 const uint8_t *above
,
1255 const uint8_t *left
) {
1257 load_pixel_h_w4(above
, left
, 8, pixels
);
1260 load_weight_h_w4(sm_weight_arrays
, 8, &weights
);
1262 smooth_h_pred_4xh(pixels
, &weights
, 8, dst
, stride
);
1265 void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1266 const uint8_t *above
,
1267 const uint8_t *left
) {
1269 load_pixel_h_w4(above
, left
, 16, pixels
);
1272 load_weight_h_w4(sm_weight_arrays
, 8, &weights
);
1274 smooth_h_pred_4xh(pixels
, &weights
, 8, dst
, stride
);
1277 pixels
[0] = _mm_srli_si128(pixels
[0], 8);
1278 smooth_h_pred_4xh(pixels
, &weights
, 8, dst
, stride
);
1281 // pixels[0]: left vector
1282 // pixels[1]: right_pred vector
1283 // pixels[2]: left vector + 16
1284 // pixels[3]: right_pred vector
1285 static INLINE
void load_pixel_h_w8(const uint8_t *above
, const uint8_t *left
,
1286 int height
, __m128i
*pixels
) {
1287 pixels
[1] = _mm_set1_epi16((uint16_t)above
[7]);
1290 pixels
[0] = _mm_cvtsi32_si128(((const uint32_t *)left
)[0]);
1291 } else if (height
== 8) {
1292 pixels
[0] = _mm_loadl_epi64((const __m128i
*)left
);
1293 } else if (height
== 16) {
1294 pixels
[0] = _mm_load_si128((const __m128i
*)left
);
1296 pixels
[0] = _mm_load_si128((const __m128i
*)left
);
1297 pixels
[2] = _mm_load_si128((const __m128i
*)(left
+ 16));
1298 pixels
[3] = pixels
[1];
1302 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
1303 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
1304 static INLINE
void load_weight_h_w8(const uint8_t *weight_array
, int height
,
1305 __m128i
*weight_w
) {
1307 const __m128i zero
= _mm_setzero_si128();
1308 const __m128i d
= _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
1309 const __m128i we
= _mm_loadu_si128((const __m128i
*)&weight_array
[8]);
1310 const __m128i tmp1
= _mm_unpacklo_epi8(we
, zero
);
1311 const __m128i tmp2
= _mm_sub_epi16(d
, tmp1
);
1312 weight_w
[0] = _mm_unpacklo_epi16(tmp1
, tmp2
);
1313 weight_w
[1] = _mm_unpackhi_epi16(tmp1
, tmp2
);
1316 static INLINE
void smooth_h_pred_8xh(const __m128i
*pixels
, const __m128i
*ww
,
1317 int h
, uint8_t *dst
, ptrdiff_t stride
,
1319 const __m128i pred_round
= _mm_set1_epi32((1 << (sm_weight_log2_scale
- 1)));
1320 const __m128i one
= _mm_set1_epi16(1);
1321 const __m128i gat
= _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1322 __m128i rep
= second_half
? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
1324 for (int i
= 0; i
< h
; ++i
) {
1325 __m128i b
= _mm_shuffle_epi8(pixels
[0], rep
);
1326 b
= _mm_unpacklo_epi16(b
, pixels
[1]);
1327 __m128i sum0
= _mm_madd_epi16(b
, ww
[0]);
1328 __m128i sum1
= _mm_madd_epi16(b
, ww
[1]);
1330 sum0
= _mm_add_epi32(sum0
, pred_round
);
1331 sum0
= _mm_srai_epi32(sum0
, sm_weight_log2_scale
);
1333 sum1
= _mm_add_epi32(sum1
, pred_round
);
1334 sum1
= _mm_srai_epi32(sum1
, sm_weight_log2_scale
);
1336 sum0
= _mm_packus_epi16(sum0
, sum1
);
1337 sum0
= _mm_shuffle_epi8(sum0
, gat
);
1338 _mm_storel_epi64((__m128i
*)dst
, sum0
);
1341 rep
= _mm_add_epi16(rep
, one
);
1345 void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1346 const uint8_t *above
,
1347 const uint8_t *left
) {
1349 load_pixel_h_w8(above
, left
, 4, pixels
);
1352 load_weight_h_w8(sm_weight_arrays
, 4, ww
);
1354 smooth_h_pred_8xh(pixels
, ww
, 4, dst
, stride
, 0);
1357 void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1358 const uint8_t *above
,
1359 const uint8_t *left
) {
1361 load_pixel_h_w8(above
, left
, 8, pixels
);
1364 load_weight_h_w8(sm_weight_arrays
, 8, ww
);
1366 smooth_h_pred_8xh(pixels
, ww
, 8, dst
, stride
, 0);
1369 void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1370 const uint8_t *above
,
1371 const uint8_t *left
) {
1373 load_pixel_h_w8(above
, left
, 16, pixels
);
1376 load_weight_h_w8(sm_weight_arrays
, 16, ww
);
1378 smooth_h_pred_8xh(pixels
, ww
, 8, dst
, stride
, 0);
1380 smooth_h_pred_8xh(pixels
, ww
, 8, dst
, stride
, 1);
1383 void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1384 const uint8_t *above
,
1385 const uint8_t *left
) {
1387 load_pixel_h_w8(above
, left
, 32, pixels
);
1390 load_weight_h_w8(sm_weight_arrays
, 32, ww
);
1392 smooth_h_pred_8xh(&pixels
[0], ww
, 8, dst
, stride
, 0);
1394 smooth_h_pred_8xh(&pixels
[0], ww
, 8, dst
, stride
, 1);
1396 smooth_h_pred_8xh(&pixels
[2], ww
, 8, dst
, stride
, 0);
1398 smooth_h_pred_8xh(&pixels
[2], ww
, 8, dst
, stride
, 1);
1401 static INLINE
void smooth_h_predictor_wxh(uint8_t *dst
, ptrdiff_t stride
,
1402 const uint8_t *above
,
1403 const uint8_t *left
, uint32_t bw
,
1405 const uint8_t *const sm_weights_w
= sm_weight_arrays
+ bw
;
1406 const __m128i zero
= _mm_setzero_si128();
1407 const __m128i scale_value
=
1408 _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale
));
1409 const __m128i top_right
= _mm_cvtsi32_si128((uint32_t)above
[bw
- 1]);
1410 const __m128i gat
= _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
1411 const __m128i pred_round
= _mm_set1_epi32((1 << (sm_weight_log2_scale
- 1)));
1413 for (uint32_t y
= 0; y
< bh
; ++y
) {
1414 const __m128i left_y
= _mm_cvtsi32_si128((uint32_t)left
[y
]);
1415 const __m128i tr_ly
=
1416 _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right
, left_y
), 0);
1418 for (uint32_t x
= 0; x
< bw
; x
+= 8) {
1419 const __m128i weights_x
=
1420 _mm_loadl_epi64((const __m128i
*)(sm_weights_w
+ x
));
1421 const __m128i weights_xw
= _mm_unpacklo_epi8(weights_x
, zero
);
1422 const __m128i scale_m_weights_x
= _mm_sub_epi16(scale_value
, weights_xw
);
1423 const __m128i wx_lo
= _mm_unpacklo_epi16(scale_m_weights_x
, weights_xw
);
1424 const __m128i wx_hi
= _mm_unpackhi_epi16(scale_m_weights_x
, weights_xw
);
1425 __m128i pred_lo
= _mm_madd_epi16(wx_lo
, tr_ly
);
1426 __m128i pred_hi
= _mm_madd_epi16(wx_hi
, tr_ly
);
1428 pred_lo
= _mm_add_epi32(pred_lo
, pred_round
);
1429 pred_hi
= _mm_add_epi32(pred_hi
, pred_round
);
1431 pred_lo
= _mm_srai_epi32(pred_lo
, sm_weight_log2_scale
);
1432 pred_hi
= _mm_srai_epi32(pred_hi
, sm_weight_log2_scale
);
1434 __m128i pred
= _mm_packus_epi16(pred_lo
, pred_hi
);
1435 pred
= _mm_shuffle_epi8(pred
, gat
);
1436 _mm_storel_epi64((__m128i
*)(dst
+ x
), pred
);
// Thin SMOOTH_H wrappers: each forwards to the generic kernel with its
// block dimensions.
void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}
1508 void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst
, ptrdiff_t stride
,
1509 const uint8_t *above
,
1510 const uint8_t *left
) {
1511 smooth_h_predictor_wxh(dst
, stride
, above
, left
, 64, 16);