From e1506184f16d3ea8b5f199c6c93c277bafaec9e2 Mon Sep 17 00:00:00 2001 From: alalaw01 Date: Tue, 29 Apr 2014 16:55:48 +0000 Subject: [PATCH] * config/aarch64/arm_neon.h (vzip1_f32, vzip1_p8, vzip1_p16, vzip1_s8, vzip1_s16, vzip1_s32, vzip1_u8, vzip1_u16, vzip1_u32, vzip1q_f32, vzip1q_f64, vzip1q_p8, vzip1q_p16, vzip1q_s8, vzip1q_s16, vzip1q_s32, vzip1q_s64, vzip1q_u8, vzip1q_u16, vzip1q_u32, vzip1q_u64, vzip2_f32, vzip2_p8, vzip2_p16, vzip2_s8, vzip2_s16, vzip2_s32, vzip2_u8, vzip2_u16, vzip2_u32, vzip2q_f32, vzip2q_f64, vzip2q_p8, vzip2q_p16, vzip2q_s8, vzip2q_s16, vzip2q_s32, vzip2q_s64, vzip2q_u8, vzip2q_u16, vzip2q_u32, vzip2q_u64): Replace inline __asm__ with __builtin_shuffle. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@209906 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 11 + gcc/config/aarch64/arm_neon.h | 900 ++++++++++++++++++++---------------------- 2 files changed, 449 insertions(+), 462 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c62e2a2922c..7b4bb549d8a 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2014-04-29 Alan Lawrence + + * config/aarch64/arm_neon.h (vzip1_f32, vzip1_p8, vzip1_p16, vzip1_s8, + vzip1_s16, vzip1_s32, vzip1_u8, vzip1_u16, vzip1_u32, vzip1q_f32, + vzip1q_f64, vzip1q_p8, vzip1q_p16, vzip1q_s8, vzip1q_s16, vzip1q_s32, + vzip1q_s64, vzip1q_u8, vzip1q_u16, vzip1q_u32, vzip1q_u64, vzip2_f32, + vzip2_p8, vzip2_p16, vzip2_s8, vzip2_s16, vzip2_s32, vzip2_u8, + vzip2_u16, vzip2_u32, vzip2q_f32, vzip2q_f64, vzip2q_p8, vzip2q_p16, + vzip2q_s8, vzip2q_s16, vzip2q_s32, vzip2q_s64, vzip2q_u8, vzip2q_u16, + vzip2q_u32, vzip2q_u64): Replace inline __asm__ with __builtin_shuffle. + 2014-04-29 David Malcolm * tree-cfg.c (dump_function_to_file): Dump the return type of diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index e5c5057bc3d..fa5766787e9 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -13661,468 +13661,6 @@ vuzp2q_u64 (uint64x2_t a, uint64x2_t b) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vzip1_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("zip1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vzip1_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("zip1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vzip1_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("zip1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vzip1_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("zip1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vzip1_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("zip1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vzip1_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("zip1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vzip1_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("zip1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vzip1_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("zip1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vzip1_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("zip1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vzip1q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("zip1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vzip1q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("zip1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vzip1q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("zip1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vzip1q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("zip1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vzip1q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("zip1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vzip1q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("zip1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vzip1q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("zip1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vzip1q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("zip1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vzip1q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("zip1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vzip1q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("zip1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vzip1q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("zip1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vzip1q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("zip1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vzip2_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("zip2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vzip2_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("zip2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vzip2_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("zip2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vzip2_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("zip2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vzip2_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("zip2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vzip2_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("zip2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vzip2_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("zip2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vzip2_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("zip2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vzip2_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("zip2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vzip2q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("zip2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vzip2q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("zip2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vzip2q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("zip2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vzip2q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("zip2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vzip2q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("zip2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vzip2q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("zip2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vzip2q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("zip2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vzip2q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("zip2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vzip2q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("zip2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vzip2q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("zip2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vzip2q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("zip2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vzip2q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("zip2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - /* End of temporary inline asm implementations. */ /* Start of temporary inline asm for vldn, vstn and friends. */ @@ -25711,6 +25249,444 @@ __INTERLEAVE_LIST (uzp) /* vzip */ +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vzip1_f32 (float32x2_t __a, float32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vzip1_p8 (poly8x8_t __a, poly8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vzip1_p16 (poly16x4_t __a, poly16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vzip1_s8 (int8x8_t __a, int8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vzip1_s16 (int16x4_t __a, int16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vzip1_s32 (int32x2_t __a, int32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vzip1_u8 (uint8x8_t __a, uint8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vzip1_u16 (uint16x4_t __a, uint16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vzip1_u32 (uint32x2_t __a, uint32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vzip1q_f32 (float32x4_t __a, float32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vzip1q_f64 (float64x2_t __a, float64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vzip1q_p8 (poly8x16_t __a, poly8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vzip1q_p16 (poly16x8_t __a, poly16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vzip1q_s8 (int8x16_t __a, int8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vzip1q_s16 (int16x8_t __a, int16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vzip1q_s32 (int32x4_t __a, int32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vzip1q_s64 (int64x2_t __a, int64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vzip1q_u8 (uint8x16_t __a, uint8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vzip1q_u16 (uint16x8_t __a, uint16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vzip1q_u32 (uint32x4_t __a, uint32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vzip2_f32 (float32x2_t __a, float32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vzip2_p8 (poly8x8_t __a, poly8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vzip2_p16 (poly16x4_t __a, poly16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vzip2_s8 (int8x8_t __a, int8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vzip2_s16 (int16x4_t __a, int16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vzip2_s32 (int32x2_t __a, int32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vzip2_u8 (uint8x8_t __a, uint8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vzip2_u16 (uint16x4_t __a, uint16x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vzip2_u32 (uint32x2_t __a, uint32x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vzip2q_f32 (float32x4_t __a, float32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); +#endif +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vzip2q_f64 (float64x2_t __a, float64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vzip2q_p8 (poly8x16_t __a, poly8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); +#endif +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vzip2q_p16 (poly16x8_t __a, poly16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); +#endif +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vzip2q_s8 (int8x16_t __a, int8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); +#endif +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vzip2q_s16 (int16x8_t __a, int16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); +#endif +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vzip2q_s32 (int32x4_t __a, int32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); +#endif +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vzip2q_s64 (int64x2_t __a, int64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vzip2q_u8 (uint8x16_t __a, uint8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); +#endif +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vzip2q_u16 (uint16x8_t __a, uint16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); +#endif +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vzip2q_u32 (uint32x4_t __a, uint32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); +#endif +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (zip) #undef __INTERLEAVE_LIST -- 2.11.4.GIT