/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

//------------------------------------------------------------------------------
// DC 4x4

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_top = vcombine_u16(p1, p1);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_left = vcombine_u16(p1, p1);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 3);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 2);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 2);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 4; ++i) {
      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
    }
  }
}

void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_4x4(dst, stride, above, left, 1, 1);
}

void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_4x4(dst, stride, NULL, left, 0, 1);
}

void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_4x4(dst, stride, above, NULL, 1, 0);
}

void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_4x4(dst, stride, NULL, NULL, 0, 0);
}

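// For reference, the NEON dc_* helpers above are equivalent to the scalar
// sketch below: average the available border pixels with rounding, or fall
// back to 0x80 when neither border is available. The dc_4x4_ref helper is an
// illustrative addition only (not part of the codec and not called anywhere).
static INLINE void dc_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left,
                              int do_above, int do_left) {
  const int count = 4 * (do_above + do_left);
  int sum = 0, r, c;
  uint8_t dc = 0x80;
  for (c = 0; do_above && c < 4; ++c) sum += above[c];
  for (r = 0; do_left && r < 4; ++r) sum += left[r];
  // Rounded average; matches vrshrn_n_u16(sum, 3) resp. (sum, 2) above.
  if (count) dc = (uint8_t)((sum + (count >> 1)) / count);
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dst[r * stride + c] = dc;
  }
}
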
//------------------------------------------------------------------------------
// DC 8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}

void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
}

void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);
}

void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
}

void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_8x8(dst, stride, NULL, NULL, 0, 0);
}

//------------------------------------------------------------------------------
// DC 16x16

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left row
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}

void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_16x16(dst, stride, above, left, 1, 1);
}

void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_16x16(dst, stride, NULL, left, 0, 1);
}

void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_16x16(dst, stride, above, NULL, 1, 0);
}

void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_16x16(dst, stride, NULL, NULL, 0, 0);
}

//------------------------------------------------------------------------------
// DC 32x32

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left row
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}

void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1, 1);
}

void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_32x32(dst, stride, NULL, left, 0, 1);
}

void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_32x32(dst, stride, above, NULL, 1, 0);
}

void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_32x32(dst, stride, NULL, NULL, 0, 0);
}

// -----------------------------------------------------------------------------

void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}

void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}

void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t A0 = vld1q_u8(above);  // top row
  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
  uint8x16_t row = vrhaddq_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 15; ++i) {
    vst1q_u8(dst + i * stride, row);
    row = vextq_u8(row, above_right, 1);
  }
  vst1q_u8(dst + i * stride, row);
}

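// For reference, each pixel produced by the d45 predictors above is the
// 3-tap rounded filter (x + 2 * y + z + 2) >> 2 applied along the top row;
// the vhadd_u8/vrhadd_u8 pair computes the same rounding without widening to
// 16 bits. A scalar helper for comparison (illustrative only; the name is an
// addition and the function is not used anywhere):
static INLINE uint8_t d45_avg3_ref(uint8_t x, uint8_t y, uint8_t z) {
  return (uint8_t)((x + 2 * y + z + 2) >> 2);
}
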
// -----------------------------------------------------------------------------

void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}

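// For reference, d135 applies the same (x + 2 * y + z + 2) >> 2 filter along
// the merged border L K J I X A B C D (reversed left column, top-left pixel,
// top row) and copies the results down the 135-degree diagonals. A scalar
// sketch of the 4x4 case (illustrative only; AVG3_REF and d135_4x4_ref are
// added names, not part of the codec):
#define AVG3_REF(x, y, z) (((x) + 2 * (y) + (z) + 2) >> 2)
static INLINE void d135_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int r, c;
  // First row and first column come straight from the filtered border.
  dst[0] = AVG3_REF(left[0], above[-1], above[0]);
  for (c = 1; c < 4; ++c) dst[c] = AVG3_REF(above[c - 2], above[c - 1], above[c]);
  dst[stride] = AVG3_REF(above[-1], left[0], left[1]);
  for (r = 2; r < 4; ++r)
    dst[r * stride] = AVG3_REF(left[r - 2], left[r - 1], left[r]);
  // Remaining pixels are copied down the 135-degree diagonal.
  for (r = 1; r < 4; ++r)
    for (c = 1; c < 4; ++c) dst[r * stride + c] = dst[(r - 1) * stride + c - 1];
}
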
#if !HAVE_NEON_ASM

void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint32x2_t d0u32 = vdup_n_u32(0);
  (void)left;

  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
  for (i = 0; i < 4; i++, dst += stride)
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}

void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride)
    vst1_u8(dst, d0u8);
}

void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  for (i = 0; i < 16; i++, dst += stride)
    vst1q_u8(dst, q0u8);
}

void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  q1u8 = vld1q_u8(above + 16);
  for (i = 0; i < 32; i++, dst += stride) {
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
}

void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}

void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;

  d1u64 = vld1_u64((const uint64_t *)left);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
}

void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  q1u8 = vld1q_u8(left);
  d2u8 = vget_low_u8(q1u8);
  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
    q0u8 = vdupq_lane_u8(d2u8, 0);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 1);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 2);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 3);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 4);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 5);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 6);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 7);
    vst1q_u8(dst, q0u8);
    dst += stride;
  }
}

void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
    }
  }
}

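// For reference, the H (horizontal) predictors above replicate each left-column
// pixel across its row, one vdup(q)_lane_u8 broadcast per row. A scalar sketch
// for an arbitrary block size bs (illustrative only; h_predictor_ref is an
// added name and is not called anywhere):
static INLINE void h_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *left) {
  int r, c;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c) dst[r * stride + c] = left[r];
  }
}
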
void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d2u32 = vdup_n_u32(0);

  d0u8 = vld1_dup_u8(above - 1);
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
}

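// For reference, the TM (TrueMotion) predictors compute, per pixel,
//   dst[r][c] = clip8(left[r] + above[c] - above[-1])
// which is what the widening subtract (vsubl_u8), add, and saturating narrow
// (vqmovun_s16) sequence implements. A scalar sketch of the 4x4 case
// (illustrative only; tm_4x4_ref is an added name, not used by the codec):
static INLINE void tm_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}
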
void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int j;
  uint16x8_t q0u16, q3u16, q10u16;
  int16x8_t q0s16;
  uint16x4_t d20u16;
  uint8x8_t d0u8, d2u8, d30u8;

  d0u8 = vld1_dup_u8(above - 1);
  d30u8 = vld1_u8(left);
  d2u8 = vld1_u8(above);
  q10u16 = vmovl_u8(d30u8);
  q3u16 = vsubl_u8(d2u8, d0u8);
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 1);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 2);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 3);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
  }
}

void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
  uint8x16_t q0u8, q1u8;
  int16x8_t q0s16, q1s16, q8s16, q11s16;
  uint16x4_t d20u16;
  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  for (k = 0; k < 2; k++, left += 8) {
    d18u8 = vld1_u8(left);
    q10u16 = vmovl_u8(d18u8);
    d20u16 = vget_low_u16(q10u16);
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;
    }
  }
}

void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u8 = vld1q_u8(above + 16);
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;
    }
  }
}
#endif  // !HAVE_NEON_ASM