diff options
author | Yvan Roux <yvan.roux@linaro.org> | 2016-09-04 13:15:56 +0200 |
---|---|---|
committer | Yvan Roux <yvan.roux@linaro.org> | 2016-09-07 22:08:36 +0200 |
commit | af0581ebfc447933ecc98748f05024bd0db2df2a (patch) | |
tree | a744b4f6cb0ffb066a43ff914b2311521214bec6 | |
parent | e59b2ff1fdebf862212b8cefd8e58a7ee73fabe0 (diff) |
gcc/
Backport from trunk r237882.
2016-06-30 James Greenhalgh <james.greenhalgh@arm.com>
Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/aarch64/aarch64-simd.md (*aarch64_simd_vec_copy_lane<mode>):
New define_insn.
(*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>): Likewise.
gcc/testsuite/
Backport from trunk r237882.
2016-06-30 James Greenhalgh <james.greenhalgh@arm.com>
Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/aarch64/vget_set_lane_1.c: New test.
gcc/
Backport from trunk r237883.
2016-06-30 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/arm_neon.h (vcopyq_lane_f32, vcopyq_lane_f64,
vcopyq_lane_p8, vcopyq_lane_p16, vcopyq_lane_s8, vcopyq_lane_s16,
vcopyq_lane_s32, vcopyq_lane_s64, vcopyq_lane_u8, vcopyq_lane_u16,
vcopyq_lane_u32, vcopyq_lane_u64): Reimplement in C.
(vcopy_lane_f32, vcopy_lane_f64, vcopy_lane_p8, vcopy_lane_p16,
vcopy_lane_s8, vcopy_lane_s16, vcopy_lane_s32, vcopy_lane_s64,
vcopy_lane_u8, vcopy_lane_u16, vcopy_lane_u32, vcopy_lane_u64,
vcopy_laneq_f32, vcopy_laneq_f64, vcopy_laneq_p8, vcopy_laneq_p16,
vcopy_laneq_s8, vcopy_laneq_s16, vcopy_laneq_s32, vcopy_laneq_s64,
vcopy_laneq_u8, vcopy_laneq_u16, vcopy_laneq_u32, vcopy_laneq_u64,
vcopyq_laneq_f32, vcopyq_laneq_f64, vcopyq_laneq_p8, vcopyq_laneq_p16,
vcopyq_laneq_s8, vcopyq_laneq_s16, vcopyq_laneq_s32, vcopyq_laneq_s64,
vcopyq_laneq_u8, vcopyq_laneq_u16, vcopyq_laneq_u32, vcopyq_laneq_u64):
New intrinsics.
gcc/testsuite/
Backport from trunk r237883.
2016-06-30 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/vect_copy_lane_1.c: New test.
Change-Id: Iea96b070d229db7d5525615dc976a1b05320485c
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 43 | ||||
-rw-r--r-- | gcc/config/aarch64/arm_neon.h | 548 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c | 86 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c | 72 |
4 files changed, 593 insertions, 156 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c6af9f36d76..0bf3ac8a875 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -546,6 +546,49 @@ [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_1reg<q>")] ) +(define_insn "*aarch64_simd_vec_copy_lane<mode>" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_merge:VALL + (vec_duplicate:VALL + (vec_select:<VEL> + (match_operand:VALL 3 "register_operand" "w") + (parallel + [(match_operand:SI 4 "immediate_operand" "i")]))) + (match_operand:VALL 1 "register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_SIMD" + { + int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2]))); + operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); + operands[4] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[4]))); + + return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; + } + [(set_attr "type" "neon_ins<q>")] +) + +(define_insn "*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_merge:VALL + (vec_duplicate:VALL + (vec_select:<VEL> + (match_operand:<VSWAP_WIDTH> 3 "register_operand" "w") + (parallel + [(match_operand:SI 4 "immediate_operand" "i")]))) + (match_operand:VALL 1 "register_operand" "0") + (match_operand:SI 2 "immediate_operand" "i")))] + "TARGET_SIMD" + { + int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2]))); + operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); + operands[4] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, + INTVAL (operands[4]))); + + return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; + } + [(set_attr "type" "neon_ins<q>")] +) + (define_insn "aarch64_simd_lshr<mode>" [(set (match_operand:VDQ_I 0 "register_operand" "=w") (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index d6e510c8bc4..6477c902e49 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -5822,162 +5822,6 @@ vaddlvq_u32 (uint32x4_t a) return result; } -#define vcopyq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("ins %0.s[%2], %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_f64(a, b, c, d) \ - __extension__ \ - ({ \ - float64x2_t c_ = (c); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("ins %0.d[%2], %3.d[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_p8(a, b, c, d) \ - __extension__ \ - ({ \ - poly8x16_t c_ = (c); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("ins %0.b[%2], %3.b[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_p16(a, b, c, d) \ - __extension__ \ - ({ \ - poly16x8_t c_ = (c); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("ins %0.h[%2], %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_s8(a, b, c, d) \ - __extension__ \ - ({ \ - int8x16_t c_ = (c); \ - int8x16_t a_ = (a); \ - int8x16_t result; \ - __asm__ ("ins %0.b[%2], %3.b[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("ins %0.h[%2], %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("ins %0.s[%2], %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_s64(a, b, c, d) \ - __extension__ \ - ({ \ - int64x2_t c_ = (c); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("ins %0.d[%2], %3.d[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_u8(a, b, c, d) \ - __extension__ \ - ({ \ - uint8x16_t c_ = (c); \ - uint8x16_t a_ = (a); \ - uint8x16_t result; \ - __asm__ ("ins %0.b[%2], %3.b[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("ins %0.h[%2], %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("ins %0.s[%2], %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcopyq_lane_u64(a, b, c, d) \ - __extension__ \ - ({ \ - uint64x2_t c_ = (c); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("ins %0.d[%2], %3.d[%4]" \ - : "=w"(result) \ - : "0"(a_), "i"(b), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vcvtx_f32_f64 (float64x2_t a) { @@ -12376,6 +12220,398 @@ vcntq_u8 (uint8x16_t __a) return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a); } +/* vcopy_lane. */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcopy_lane_f32 (float32x2_t __a, const int __lane1, + float32x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vcopy_lane_f64 (float64x1_t __a, const int __lane1, + float64x1_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vcopy_lane_p8 (poly8x8_t __a, const int __lane1, + poly8x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vcopy_lane_p16 (poly16x4_t __a, const int __lane1, + poly16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vcopy_lane_s8 (int8x8_t __a, const int __lane1, + int8x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vcopy_lane_s16 (int16x4_t __a, const int __lane1, + int16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcopy_lane_s32 (int32x2_t __a, const int __lane1, + int32x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vcopy_lane_s64 (int64x1_t __a, const int __lane1, + int64x1_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcopy_lane_u8 (uint8x8_t __a, const int __lane1, + uint8x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcopy_lane_u16 (uint16x4_t __a, const int __lane1, + uint16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcopy_lane_u32 (uint32x2_t __a, const int __lane1, + uint32x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcopy_lane_u64 (uint64x1_t __a, const int __lane1, + uint64x1_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +/* vcopy_laneq. */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcopy_laneq_f32 (float32x2_t __a, const int __lane1, + float32x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vcopy_laneq_f64 (float64x1_t __a, const int __lane1, + float64x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vcopy_laneq_p8 (poly8x8_t __a, const int __lane1, + poly8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vcopy_laneq_p16 (poly16x4_t __a, const int __lane1, + poly16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vcopy_laneq_s8 (int8x8_t __a, const int __lane1, + int8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vcopy_laneq_s16 (int16x4_t __a, const int __lane1, + int16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcopy_laneq_s32 (int32x2_t __a, const int __lane1, + int32x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vcopy_laneq_s64 (int64x1_t __a, const int __lane1, + int64x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcopy_laneq_u8 (uint8x8_t __a, const int __lane1, + uint8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcopy_laneq_u16 (uint16x4_t __a, const int __lane1, + uint16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcopy_laneq_u32 (uint32x2_t __a, const int __lane1, + uint32x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcopy_laneq_u64 (uint64x1_t __a, const int __lane1, + uint64x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +/* vcopyq_lane. */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcopyq_lane_f32 (float32x4_t __a, const int __lane1, + float32x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vcopyq_lane_f64 (float64x2_t __a, const int __lane1, + float64x1_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vcopyq_lane_p8 (poly8x16_t __a, const int __lane1, + poly8x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vcopyq_lane_p16 (poly16x8_t __a, const int __lane1, + poly16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vcopyq_lane_s8 (int8x16_t __a, const int __lane1, + int8x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vcopyq_lane_s16 (int16x8_t __a, const int __lane1, + int16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcopyq_lane_s32 (int32x4_t __a, const int __lane1, + int32x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcopyq_lane_s64 (int64x2_t __a, const int __lane1, + int64x1_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcopyq_lane_u8 (uint8x16_t __a, const int __lane1, + uint8x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcopyq_lane_u16 (uint16x8_t __a, const int __lane1, + uint16x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcopyq_lane_u32 (uint32x4_t __a, const int __lane1, + uint32x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcopyq_lane_u64 (uint64x2_t __a, const int __lane1, + uint64x1_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +/* vcopyq_laneq. */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcopyq_laneq_f32 (float32x4_t __a, const int __lane1, + float32x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vcopyq_laneq_f64 (float64x2_t __a, const int __lane1, + float64x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vcopyq_laneq_p8 (poly8x16_t __a, const int __lane1, + poly8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1, + poly16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vcopyq_laneq_s8 (int8x16_t __a, const int __lane1, + int8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vcopyq_laneq_s16 (int16x8_t __a, const int __lane1, + int16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcopyq_laneq_s32 (int32x4_t __a, const int __lane1, + int32x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcopyq_laneq_s64 (int64x2_t __a, const int __lane1, + int64x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcopyq_laneq_u8 (uint8x16_t __a, const int __lane1, + uint8x16_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcopyq_laneq_u16 (uint16x8_t __a, const int __lane1, + uint16x8_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcopyq_laneq_u32 (uint32x4_t __a, const int __lane1, + uint32x4_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcopyq_laneq_u64 (uint64x2_t __a, const int __lane1, + uint64x2_t __b, const int __lane2) +{ + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); +} + /* vcvt (double -> float). */ __extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) diff --git a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c new file mode 100644 index 00000000000..e144def8386 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c @@ -0,0 +1,86 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include "arm_neon.h" + +#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2) \ +TYPE1 __attribute__((noinline,noclone)) \ +test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \ +{ \ + return vcopy##Q1##_lane##Q2##_##SUFFIX (a, INDEX1, b, INDEX2); \ +} + +/* vcopy_lane. */ +BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6) +BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6) +BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2) +BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2) +BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) +BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) +BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */ +BUILD_TEST (int64x1_t, int64x1_t, , , s64, 0, 0) +BUILD_TEST (uint64x1_t, uint64x1_t, , , u64, 0, 0) +BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0) +/* { dg-final { scan-assembler-times "fmov\\td0, d1" 3 } } */ + +/* vcopy_laneq. */ + +BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) +BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15) +BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7) +BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7) +BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3) +BUILD_TEST (int32x2_t, int32x4_t, , q, s32, 1, 3) +BUILD_TEST (uint32x2_t, uint32x4_t, , q, u32, 1, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */ +BUILD_TEST (float64x1_t, float64x2_t, , q, f64, 0, 1) +BUILD_TEST (int64x1_t, int64x2_t, , q, s64, 0, 1) +BUILD_TEST (uint64x1_t, uint64x2_t, , q, u64, 0, 1) +/* XFAIL due to PR 71307. */ +/* { dg-final { scan-assembler-times "dup\\td0, v1.d\\\[1\\\]" 3 { xfail *-*-* } } } */ + +/* vcopyq_lane. */ +BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7) +BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7) +BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3) +BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3) +BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1) +BUILD_TEST (int32x4_t, int32x2_t, q, , s32, 3, 1) +BUILD_TEST (uint32x4_t, uint32x2_t, q, , u32, 3, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0) +BUILD_TEST (int64x2_t, int64x1_t, q, , s64, 1, 0) +BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 3 } } */ + +/* vcopyq_laneq. */ + +BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15) +BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15) +BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7) +BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7) +BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3) +BUILD_TEST (int32x4_t, int32x4_t, q, q, s32, 2, 3) +BUILD_TEST (uint32x4_t, uint32x4_t, q, q, u32, 2, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1) +BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1) +BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c new file mode 100644 index 00000000000..07a77de3192 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_neon.h" + +#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2) \ +TYPE1 __attribute__((noinline,noclone)) \ +test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \ +{ \ + return vset##Q1##_lane_##SUFFIX (vget##Q2##_lane_##SUFFIX (b, INDEX2),\ + a, INDEX1); \ +} + +BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6) +BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6) +BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2) +BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2) +BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) +BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) +BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */ + +BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) +BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15) +BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7) +BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7) +BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3) +BUILD_TEST (int32x2_t, int32x4_t, , q, s32, 1, 3) +BUILD_TEST (uint32x2_t, uint32x4_t, , q, u32, 1, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */ + +BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7) +BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7) +BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3) +BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3) +BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1) +BUILD_TEST (int32x4_t, int32x2_t, q, , s32, 3, 1) +BUILD_TEST (uint32x4_t, uint32x2_t, q, , u32, 3, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0) +BUILD_TEST (int64x2_t, int64x1_t, q, , s64, 1, 0) +BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 3 } } */ + +BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15) +BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15) +BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15) +/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */ +BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7) +BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7) +BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7) +/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */ +BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3) +BUILD_TEST (int32x4_t, int32x4_t, q, q, s32, 2, 3) +BUILD_TEST (uint32x4_t, uint32x4_t, q, q, u32, 2, 3) +/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */ +BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1) +BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1) +BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1) +/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */ |