From 779aea46cc3b6b1a62f989ee0c04e68803733ba0 Mon Sep 17 00:00:00 2001
From: James Greenhalgh
Date: Mon, 16 Sep 2013 09:50:21 +0000
Subject: [PATCH] [AArch64] Implement vmul_lane_<16,32,64> intrinsics in C

gcc/
	* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
	(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_mul3_elt_to_128df): Likewise.
	(aarch64_mul3_elt_to_64v2df): Likewise.
	* config/aarch64/iterators.md (VEL): Also handle DFmode.
	(VMUL): New.
	(VMUL_CHANGE_NLANES): Likewise.
	(h_con): Likewise.
	(f): Likewise.
	* config/aarch64/arm_neon.h
	(vmul_lane_<16,32,64>): Convert to C implementation.

gcc/testsuite/
	* gcc.target/aarch64/mul_intrinsic_1.c: New.
	* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.

From-SVN: r202624
---
 gcc/ChangeLog                                      |  14 +
 gcc/config/aarch64/aarch64-simd.md                 |  53 +++
 gcc/config/aarch64/arm_neon.h                      | 438 +++++++--------
 gcc/config/aarch64/iterators.md                    |  24 +-
 gcc/testsuite/ChangeLog                            |   5 +
 .../gcc.target/aarch64/fmul_intrinsic_1.c          | 116 ++++++
 gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c |  83 ++++
 7 files changed, 446 insertions(+), 287 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f25d92e6fe..aaab5ece335 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,19 @@
 2013-09-16 James Greenhalgh
 
+	* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
+	(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
+	(aarch64_mul3_elt_to_128df): Likewise.
+	(aarch64_mul3_elt_to_64v2df): Likewise.
+	* config/aarch64/iterators.md (VEL): Also handle DFmode.
+	(VMUL): New.
+	(VMUL_CHANGE_NLANES): Likewise.
+	(h_con): Likewise.
+	(f): Likewise.
+	* config/aarch64/arm_neon.h
+	(vmul_lane_<16,32,64>): Convert to C implementation.
+
+2013-09-16 James Greenhalgh
+
 	* config/aarch64/arm_neon.h
 	(vcvtx_high_f32_f64): Fix parameters.
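The conversion means the lane-multiply intrinsics become plain C arithmetic on
vector types (a multiply by an extracted lane), which the new aarch64-simd.md
patterns below then recognise and emit as a single by-element multiply.  As a
usage sketch (not part of the patch; the function names are invented purely
for illustration):

    #include <arm_neon.h>

    /* Multiply both lanes of x by lane 1 of scale.  With this patch the
       intrinsic is ordinary C, so it type-checks, folds and schedules like
       any other expression; fmul_intrinsic_1.c below expects it to assemble
       to a single by-element multiply, e.g. "fmul v0.2s, v1.2s, v2.s[1]".  */
    float32x2_t
    scale_by_lane_1 (float32x2_t x, float32x2_t scale)
    {
      return vmul_lane_f32 (x, scale, 1);
    }

    /* The quad-word integer forms work the same way; mul_intrinsic_1.c below
       expects something like "mul v0.8h, v0.8h, v1.h[2]" here.  */
    int16x8_t
    scale_q_by_lane_2 (int16x8_t x, int16x4_t scale)
    {
      return vmulq_lane_s16 (x, scale, 2);
    }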
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9805197a22b..04d5794ffca 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -582,6 +582,59 @@
    (set_attr "simd_mode" "<MODE>")]
 )
 
+(define_insn "*aarch64_mul3_elt<mode>"
+ [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (mult:VMUL
+      (vec_duplicate:VMUL
+	  (vec_select:<VEL>
+	    (match_operand:VMUL 1 "register_operand" "<h_con>")
+	    (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VMUL 3 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
+  [(set_attr "simd_type" "simd_<f>mul_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
+  [(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w")
+     (mult:VMUL_CHANGE_NLANES
+       (vec_duplicate:VMUL_CHANGE_NLANES
+	  (vec_select:<VEL>
+	    (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+	    (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
+  [(set_attr "simd_type" "simd_<f>mul_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mul3_elt_to_128df"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+     (mult:V2DF
+       (vec_duplicate:V2DF
+	 (match_operand:DF 2 "register_operand" "w"))
+      (match_operand:V2DF 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "fmul\\t%0.2d, %1.2d, %2.d[0]"
+  [(set_attr "simd_type" "simd_fmul_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "*aarch64_mul3_elt_to_64v2df"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+     (mult:DF
+       (vec_select:DF
+	 (match_operand:V2DF 1 "register_operand" "w")
+	 (parallel [(match_operand:SI 2 "immediate_operand")]))
+       (match_operand:DF 3 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "fmul\\t%0.2d, %3.2d, %1.d[%2]"
+  [(set_attr "simd_type" "simd_fmul_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
 (define_insn "neg<mode>2"
   [(set (match_operand:VDQ 0 "register_operand" "=w")
 	(neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 23b11167913..6c9dd79a695 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9501,136 +9501,6 @@ vmovq_n_u64 (uint64_t a)
 {
   return result;
 }
 
-#define vmul_lane_f32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmul %0.2s,%1.2s,%2.s[%3]"                             \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_s16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("mul %0.4h,%1.4h,%2.h[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_s32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("mul %0.2s,%1.2s,%2.s[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_u16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("mul %0.4h,%1.4h,%2.h[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_u32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("mul %0.2s, %1.2s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                :
"w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmul_laneq_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("fmul %0.2s, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmul_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mul %0.4h, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmul_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mul %0.2s, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmul_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mul %0.4h, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmul_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mul %0.2s, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmul_n_f32 (float32x2_t a, float32_t b) { @@ -10149,162 +10019,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b) return result; } -#define vmulq_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x2_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmul %0.4s, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x1_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmul %0.2d,%1.2d,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mul %0.8h,%1.8h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mul %0.4s,%1.4s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mul %0.8h,%1.8h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mul %0.4s, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_laneq_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmul %0.4s, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_laneq_f64(a, b, c) \ - 
__extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmul %0.2d,%1.2d,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mul %0.8h, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mul %0.4s, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mul %0.8h, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulq_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mul %0.4s, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmulq_n_f32 (float32x4_t a, float32_t b) { @@ -21435,6 +21149,158 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) return a - b * c; } +/* vmul_lane */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_f32 (__b, __lane); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane) +{ + return __a * __b; +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_s16 (__b, __lane); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_s32 (__b, __lane); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_u16 (__b, __lane); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_u32 (__b, __lane); +} + +/* vmul_laneq */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_f32 (__b, __lane); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_f64 (__b, __lane); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_s16 (__b, __lane); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane) +{ + 
return __a * __aarch64_vgetq_lane_s32 (__b, __lane); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_u16 (__b, __lane); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_u32 (__b, __lane); +} + +/* vmulq_lane */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_f32 (__b, __lane); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane) +{ + return __a * __b; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_s16 (__b, __lane); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_s32 (__b, __lane); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_u16 (__b, __lane); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_u32 (__b, __lane); +} + +/* vmulq_laneq */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_f32 (__b, __lane); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_f64 (__b, __lane); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_s16 (__b, __lane); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_s32 (__b, __lane); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_u16 (__b, __lane); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane) +{ + return __a * __aarch64_vgetq_lane_u32 (__b, __lane); +} + /* vqabs */ __extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index ffe125b5583..a6b3117c8a2 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -169,6 +169,12 @@ ;; Double scalar modes (define_mode_iterator DX [DI DF]) +;; Modes available for mul lane operations. +(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) + +;; Modes available for mul lane operations changing lane count. 
+(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF]) + ;; ------------------------------------------------------------------ ;; Unspec enumerations for Advance SIMD. These could well go into ;; aarch64.md but for their use in int_iterators here. @@ -358,7 +364,7 @@ (V2SI "SI") (V4SI "SI") (DI "DI") (V2DI "DI") (V2SF "SF") (V4SF "SF") - (V2DF "DF") + (V2DF "DF") (DF "DF") (SI "SI") (HI "HI") (QI "QI")]) @@ -541,6 +547,22 @@ (V2SF "to_128") (V4SF "to_64") (DF "to_128") (V2DF "to_64")]) +;; For certain vector-by-element multiplication instructions we must +;; constrain the HI cases to use only V0-V15. This is covered by +;; the 'x' constraint. All other modes may use the 'w' constraint. +(define_mode_attr h_con [(V2SI "w") (V4SI "w") + (V4HI "x") (V8HI "x") + (V2SF "w") (V4SF "w") + (V2DF "w") (DF "w")]) + +;; Defined to 'f' for types whose element type is a float type. +(define_mode_attr f [(V8QI "") (V16QI "") + (V4HI "") (V8HI "") + (V2SI "") (V4SI "") + (DI "") (V2DI "") + (V2SF "f") (V4SF "f") + (V2DF "f") (DF "f")]) + ;; ------------------------------------------------------------------- ;; Code Iterators ;; ------------------------------------------------------------------- diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6b9b3bc205b..17ae8ee1550 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-09-16 James Greenhalgh + + * gcc.target/aarch64/mul_intrinsic_1.c: New. + * gcc.target/aarch64/fmul_intrinsic_1.c: Likewise. + 2013-09-16 Richard Biener * gcc.dg/tree-ssa/ldist-22.c: New testcase. diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c new file mode 100644 index 00000000000..f6e32f4bf77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c @@ -0,0 +1,116 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DELTA 0.0001 +extern void abort (void); +extern double fabs (double); + +#define TEST_VMUL(q1, q2, size, in1_lanes, in2_lanes) \ +static void \ +test_vmul##q1##_lane##q2##_f##size (float##size##_t * res, \ + const float##size##_t *in1, \ + const float##size##_t *in2) \ +{ \ + float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \ + float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \ + float##size##x##in2_lanes##_t c; \ + if (in2_lanes > 1) \ + { \ + c = vld1##q2##_f##size (in2); \ + a = vmul##q1##_lane##q2##_f##size (b, c, 1); \ + } \ + else \ + { \ + c = vld1##q2##_f##size (in2 + 1); \ + a = vmul##q1##_lane##q2##_f##size (b, c, 0); \ + } \ + vst1##q1##_f##size (res, a); \ +} + +#define BUILD_VARS(width, n_lanes, n_half_lanes) \ +TEST_VMUL ( , , width, n_half_lanes, n_half_lanes) \ +TEST_VMUL (q, , width, n_lanes, n_half_lanes) \ +TEST_VMUL ( , q, width, n_half_lanes, n_lanes) \ +TEST_VMUL (q, q, width, n_lanes, n_lanes) + +BUILD_VARS (32, 4, 2) +BUILD_VARS (64, 2, 1) + +#define POOL2 {0.0, 1.0} +#define POOL4 {0.0, 1.0, 2.0, 3.0} +#define EMPTY2 {0.0, 0.0} +#define EMPTY4 {0.0, 0.0, 0.0, 0.0} + +#define BUILD_TEST(size, lanes) \ +static void \ +test_f##size (void) \ +{ \ + int i; \ + float##size##_t pool[lanes] = POOL##lanes; \ + float##size##_t res[lanes] = EMPTY##lanes; \ + float##size##_t res2[lanes] = EMPTY##lanes; \ + float##size##_t res3[lanes] = EMPTY##lanes; \ + float##size##_t res4[lanes] = EMPTY##lanes; \ + \ + /* Avoid constant folding the multiplication. 
*/ \ + asm volatile ("" : : : "memory"); \ + test_vmul_lane_f##size (res, pool, pool); \ + /* Avoid fusing multiplication and subtraction. */ \ + asm volatile ("" : :"Q" (res) : "memory"); \ + for (i = 0; i < lanes / 2; i++) \ + if (fabs (res[i] - pool[i]) > DELTA) \ + abort (); \ + \ + test_vmulq_lane_f##size (res2, pool, pool); \ + /* Avoid fusing multiplication and subtraction. */ \ + asm volatile ("" : :"Q" (res2) : "memory"); \ + for (i = 0; i < lanes; i++) \ + if (fabs (res2[i] - pool[i]) > DELTA) \ + abort (); \ + \ + test_vmul_laneq_f##size (res3, pool, pool); \ + /* Avoid fusing multiplication and subtraction. */ \ + asm volatile ("" : :"Q" (res3) : "memory"); \ + for (i = 0; i < lanes / 2; i++) \ + if (fabs (res3[i] - pool[i]) > DELTA) \ + abort (); \ + \ + test_vmulq_laneq_f##size (res4, pool, pool); \ + /* Avoid fusing multiplication and subtraction. */ \ + asm volatile ("" : :"Q" (res4) : "memory"); \ + for (i = 0; i < lanes; i++) \ + if (fabs (res4[i] - pool[i]) > DELTA) \ + abort (); \ +} + +BUILD_TEST (32, 4) +BUILD_TEST (64, 2) + +int +main (int argc, char **argv) +{ + test_f32 (); + test_f64 (); + return 0; +} + +/* vmul_laneq_f32. + vmul_lane_f32. */ +/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */ + +/* vmulq_lane_f32. + vmulq_laneq_f32. */ +/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */ + +/* vmul_lane_f64. */ +/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 1 } } */ + +/* vmul_laneq_f64. + vmulq_lane_f64. + vmulq_laneq_f64. */ +/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */ + +/* { dg-final { cleanup-saved-temps } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c new file mode 100644 index 00000000000..dabe10e15e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c @@ -0,0 +1,83 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +extern void abort (void); + +#define MAPs(size, xx) int##size##xx##_t +#define MAPu(size, xx) uint##size##xx##_t + + +#define TEST_VMUL(q, su, size, in1_lanes, in2_lanes) \ +static void \ +test_vmulq_lane##q##_##su##size (MAP##su (size, ) * res, \ + const MAP##su(size, ) *in1, \ + const MAP##su(size, ) *in2) \ +{ \ + MAP##su (size, x##in1_lanes) a = vld1q_##su##size (in1); \ + MAP##su (size, x##in2_lanes) b = vld1##q##_##su##size (in2); \ + a = vmulq_lane##q##_##su##size (a, b, 1); \ + vst1q_##su##size (res, a); \ +} + +#define BUILD_VARS(width, n_lanes, n_half_lanes) \ +TEST_VMUL (, s, width, n_lanes, n_half_lanes) \ +TEST_VMUL (q, s, width, n_lanes, n_lanes) \ +TEST_VMUL (, u, width, n_lanes, n_half_lanes) \ +TEST_VMUL (q, u, width, n_lanes, n_lanes) \ + +BUILD_VARS (32, 4, 2) +BUILD_VARS (16, 8, 4) + +#define POOL4 {0, 1, 2, 3} +#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7} +#define EMPTY4 {0, 0, 0, 0} +#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0} + +#define BUILD_TEST(su, size, lanes) \ +static void \ +test_##su##size (void) \ +{ \ + int i; \ + MAP##su (size,) pool[lanes] = POOL##lanes; \ + MAP##su (size,) res[lanes] = EMPTY##lanes; \ + MAP##su (size,) res2[lanes] = EMPTY##lanes; \ + \ + /* Forecfully avoid optimization. 
*/ \ + asm volatile ("" : : : "memory"); \ + test_vmulq_lane_##su##size (res, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (res[i] != pool[i]) \ + abort (); \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vmulq_laneq_##su##size (res2, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (res2[i] != pool[i]) \ + abort (); \ +} + +#undef BUILD_VARS +#define BUILD_VARS(size, lanes) \ +BUILD_TEST (s, size, lanes) \ +BUILD_TEST (u, size, lanes) + +BUILD_VARS (32, 4) +BUILD_VARS (16, 8) + +int +main (int argc, char **argv) +{ + test_s32 (); + test_u32 (); + test_s16 (); + test_u16 (); + return 0; +} + +/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 4 } } */ +/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[\[0-9\]+\\\]" 4 } } */ +/* { dg-final { cleanup-saved-temps } } */ + -- 2.11.4.GIT
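A note on the new h_con mode attribute in iterators.md above: the by-element
forms of MUL with 16-bit elements can only encode their indexed register in
v0-v15, so the HImode-element cases select the "x" register constraint while
all other modes keep "w" (the removed asm macros expressed the same
restriction with an "x" input constraint).  A small sketch, again
illustrative rather than part of the patch, with made-up function names:

    #include <arm_neon.h>

    /* 16-bit elements: the indexed operand of "mul v0.8h, v1.8h, vM.h[3]"
       must be one of v0-v15, which the patch conveys to the register
       allocator through the <h_con> attribute expanding to "x".  */
    int16x8_t
    mul_h_elements_by_lane_3 (int16x8_t x, int16x8_t y)
    {
      return vmulq_laneq_s16 (x, y, 3);
    }

    /* 32-bit elements have no such encoding restriction, so <h_con> stays
       "w" and any of v0-v31 may be the indexed operand.  */
    uint32x4_t
    mul_s_elements_by_lane_0 (uint32x4_t x, uint32x2_t y)
    {
      return vmulq_lane_u32 (x, y, 0);
    }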