author     Matthew Gretton-Dann <matthew.gretton-dann@linaro.org>   2013-05-14 09:30:32 +0000
committer  Matthew Gretton-Dann <matthew.gretton-dann@linaro.org>   2013-05-14 09:30:32 +0000
commit     a6002864cd18f9049b5511293d4d84a6a4ea4e73
tree       4ac0a2e47b7948b71a66fd00c4d1ba1bfef2fbb6 /gcc/config
parent     e118fade5f65318cb59db4b7593d0019e2302ed2
Backport from trunk r198490-198496.
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/linaro/gcc-4_8-branch@198868 138bc75d-0d04-0410-961f-82ee72b054a4
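
For context: the user-visible effect of this backport is that the AArch64 comparison intrinsics in arm_neon.h (vceq*, vcge*, vcgt*, vcle*, vclt*, the absolute-value forms vcage*/vcagt*/vcale*/vcalt*, and the newly added compare-against-zero forms vceqz*/vcgez*/vcgtz*/vclez*) are expressed as ordinary C comparisons or __builtin_aarch64_cm* builtins instead of opaque inline asm, so GCC can fold and optimise them. A minimal usage sketch, not part of the commit; the function names are illustrative, and the instructions named in the comments assume an AArch64 target at -O2:

#include <arm_neon.h>

/* Lane-wise integer equality: all-ones lanes where equal, zero
   elsewhere.  Expected to compile to a single cmeq.  */
uint32x2_t
eq_mask (int32x2_t a, int32x2_t b)
{
  return vceq_s32 (a, b);
}

/* Absolute compare |a| >= |b|.  Expected to compile to facge.  */
uint32x2_t
abs_ge_mask (float32x2_t a, float32x2_t b)
{
  return vcage_f32 (a, b);
}

/* Compare against zero; the vceqz* family is new in this patch.
   Expected to compile to fcmeq against #0.  */
uint32x2_t
zero_mask (float32x2_t a)
{
  return vceqz_f32 (a);
}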
Diffstat (limited to 'gcc/config')
-rw-r--r--   gcc/config/aarch64/aarch64-builtins.c        |   20
-rw-r--r--   gcc/config/aarch64/aarch64-simd-builtins.def |   14
-rw-r--r--   gcc/config/aarch64/aarch64-simd.md           |  289
-rw-r--r--   gcc/config/aarch64/aarch64.md                |    2
-rw-r--r--   gcc/config/aarch64/arm_neon.h                | 1736
-rw-r--r--   gcc/config/aarch64/iterators.md              |   64
-rw-r--r--   gcc/config/aarch64/predicates.md             |    5
7 files changed, 1545 insertions(+), 585 deletions(-)
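
Most of the arm_neon.h churn below follows one pattern: each comparison intrinsic that previously wrapped inline asm or a DImode-only builtin is rewritten as a plain C expression yielding an all-ones/all-zeros mask, which the mid-end can simplify directly; the vector forms keep calling __builtin_aarch64_cm* builtins, which the new cmeq/cmge/cmgt/cmtst cases in aarch64_fold_builtin now fold to ordinary comparison trees. An illustrative before/after, copied from the hunks below; the _old/_new suffixes are added here only for side-by-side comparison and are not in the source:

/* Before: a DImode builtin the mid-end cannot see through.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_s64_old (int64x1_t __a, int64x1_t __b)
{
  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
}

/* After: an ordinary comparison producing -1 (all ones) or 0.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_s64_new (int64x1_t __a, int64x1_t __b)
{
  return __a == __b ? -1ll : 0ll;
}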
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 53d2c6ad557..3016f256869 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -191,6 +191,9 @@ typedef struct
 #define BUILTIN_VALL(T, N, MAP) \
   VAR10 (T, N, MAP, v8qi, v16qi, v4hi, v8hi, v2si, \
          v4si, v2di, v2sf, v4sf, v2df)
+#define BUILTIN_VALLDI(T, N, MAP) \
+  VAR11 (T, N, MAP, v8qi, v16qi, v4hi, v8hi, v2si, \
+         v4si, v2di, v2sf, v4sf, v2df, di)
 #define BUILTIN_VB(T, N, MAP) \
   VAR2 (T, N, MAP, v8qi, v16qi)
 #define BUILTIN_VD(T, N, MAP) \
@@ -1314,11 +1317,26 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args,
       BUILTIN_VDQF (UNOP, abs, 2)
        return fold_build1 (ABS_EXPR, type, args[0]);
        break;
+      BUILTIN_VALLDI (BINOP, cmge, 0)
+       return fold_build2 (GE_EXPR, type, args[0], args[1]);
+       break;
+      BUILTIN_VALLDI (BINOP, cmgt, 0)
+       return fold_build2 (GT_EXPR, type, args[0], args[1]);
+       break;
+      BUILTIN_VALLDI (BINOP, cmeq, 0)
+       return fold_build2 (EQ_EXPR, type, args[0], args[1]);
+       break;
+      BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+       {
+         tree and_node = fold_build2 (BIT_AND_EXPR, type, args[0], args[1]);
+         tree vec_zero_node = build_zero_cst (type);
+         return fold_build2 (NE_EXPR, type, and_node, vec_zero_node);
+         break;
+       }
       VAR1 (UNOP, floatv2si, 2, v2sf)
       VAR1 (UNOP, floatv4si, 2, v4sf)
       VAR1 (UNOP, floatv2di, 2, v2df)
        return fold_build1 (FLOAT_EXPR, type, args[0]);
-      break;
     default:
       break;
     }
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 6093341b199..620406b449d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -224,14 +224,14 @@ BUILTIN_VSDQ_I (SHIFTIMM, uqshl_n, 0)
 
   /* Implemented by aarch64_cm<cmp><mode>. */
-  BUILTIN_VSDQ_I_DI (BINOP, cmeq, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmge, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmgt, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmle, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmlt, 0)
+  BUILTIN_VALLDI (BINOP, cmeq, 0)
+  BUILTIN_VALLDI (BINOP, cmge, 0)
+  BUILTIN_VALLDI (BINOP, cmgt, 0)
+  BUILTIN_VALLDI (BINOP, cmle, 0)
+  BUILTIN_VALLDI (BINOP, cmlt, 0)
   /* Implemented by aarch64_cm<cmp><mode>. */
-  BUILTIN_VSDQ_I_DI (BINOP, cmhs, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmhi, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, cmgeu, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
   BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
 
   /* Implemented by aarch64_<fmaxmin><mode>.
*/ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index a88d4cca05d..2b252540519 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -21,7 +21,7 @@ ; Main data types used by the insntructions -(define_attr "simd_mode" "unknown,none,V8QI,V16QI,V4HI,V8HI,V2SI,V4SI,V2DI,V2SF,V4SF,V2DF,OI,CI,XI,DI,DF,SI,HI,QI" +(define_attr "simd_mode" "unknown,none,V8QI,V16QI,V4HI,V8HI,V2SI,V4SI,V2DI,V2SF,V4SF,V2DF,OI,CI,XI,DI,DF,SI,SF,HI,QI" (const_string "unknown")) @@ -1683,11 +1683,13 @@ (match_operator 3 "comparison_operator" [(match_operand:VDQ 4 "register_operand") (match_operand:VDQ 5 "nonmemory_operand")]) - (match_operand:VDQ 1 "register_operand") - (match_operand:VDQ 2 "register_operand")))] + (match_operand:VDQ 1 "nonmemory_operand") + (match_operand:VDQ 2 "nonmemory_operand")))] "TARGET_SIMD" { int inverse = 0, has_zero_imm_form = 0; + rtx op1 = operands[1]; + rtx op2 = operands[2]; rtx mask = gen_reg_rtx (<MODE>mode); switch (GET_CODE (operands[3])) @@ -1728,12 +1730,12 @@ case LTU: case GEU: - emit_insn (gen_aarch64_cmhs<mode> (mask, operands[4], operands[5])); + emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[4], operands[5])); break; case LEU: case GTU: - emit_insn (gen_aarch64_cmhi<mode> (mask, operands[4], operands[5])); + emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[4], operands[5])); break; case NE: @@ -1746,11 +1748,26 @@ } if (inverse) - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[1], - operands[2])); + { + op1 = operands[2]; + op2 = operands[1]; + } + + /* If we have (a = (b CMP c) ? -1 : 0); + Then we can simply move the generated mask. */ + + if (op1 == CONSTM1_RTX (<V_cmp_result>mode) + && op2 == CONST0_RTX (<V_cmp_result>mode)) + emit_move_insn (operands[0], mask); + else + { + if (!REG_P (op1)) + op1 = force_reg (<MODE>mode, op1); + if (!REG_P (op2)) + op2 = force_reg (<MODE>mode, op2); + emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, + op1, op2)); + } DONE; }) @@ -1761,12 +1778,14 @@ (match_operator 3 "comparison_operator" [(match_operand:VDQF 4 "register_operand") (match_operand:VDQF 5 "nonmemory_operand")]) - (match_operand:VDQF 1 "register_operand") - (match_operand:VDQF 2 "register_operand")))] + (match_operand:VDQF 1 "nonmemory_operand") + (match_operand:VDQF 2 "nonmemory_operand")))] "TARGET_SIMD" { int inverse = 0; int swap_bsl_operands = 0; + rtx op1 = operands[1]; + rtx op2 = operands[2]; rtx mask = gen_reg_rtx (<V_cmp_result>mode); rtx tmp = gen_reg_rtx (<V_cmp_result>mode); @@ -1888,11 +1907,27 @@ } if (swap_bsl_operands) - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[1], - operands[2])); + { + op1 = operands[2]; + op2 = operands[1]; + } + + /* If we have (a = (b CMP c) ? -1 : 0); + Then we can simply move the generated mask. 
*/ + + if (op1 == CONSTM1_RTX (<V_cmp_result>mode) + && op2 == CONST0_RTX (<V_cmp_result>mode)) + emit_move_insn (operands[0], mask); + else + { + if (!REG_P (op1)) + op1 = force_reg (<MODE>mode, op1); + if (!REG_P (op2)) + op2 = force_reg (<MODE>mode, op2); + emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, + op1, op2)); + } + DONE; }) @@ -1902,8 +1937,8 @@ (match_operator 3 "comparison_operator" [(match_operand:VALL 4 "register_operand") (match_operand:VALL 5 "nonmemory_operand")]) - (match_operand:VALL 1 "register_operand") - (match_operand:VALL 2 "register_operand")))] + (match_operand:VALL 1 "nonmemory_operand") + (match_operand:VALL 2 "nonmemory_operand")))] "TARGET_SIMD" { emit_insn (gen_aarch64_vcond_internal<mode> (operands[0], operands[1], @@ -1912,6 +1947,22 @@ DONE; }) +(define_expand "vcond<v_cmp_result><mode>" + [(set (match_operand:<V_cmp_result> 0 "register_operand") + (if_then_else:<V_cmp_result> + (match_operator 3 "comparison_operator" + [(match_operand:VDQF 4 "register_operand") + (match_operand:VDQF 5 "nonmemory_operand")]) + (match_operand:<V_cmp_result> 1 "nonmemory_operand") + (match_operand:<V_cmp_result> 2 "nonmemory_operand")))] + "TARGET_SIMD" +{ + emit_insn (gen_aarch64_vcond_internal<v_cmp_result> ( + operands[0], operands[1], + operands[2], operands[3], + operands[4], operands[5])); + DONE; +}) (define_expand "vcondu<mode><mode>" [(set (match_operand:VDQ 0 "register_operand") @@ -1919,8 +1970,8 @@ (match_operator 3 "comparison_operator" [(match_operand:VDQ 4 "register_operand") (match_operand:VDQ 5 "nonmemory_operand")]) - (match_operand:VDQ 1 "register_operand") - (match_operand:VDQ 2 "register_operand")))] + (match_operand:VDQ 1 "nonmemory_operand") + (match_operand:VDQ 2 "nonmemory_operand")))] "TARGET_SIMD" { emit_insn (gen_aarch64_vcond_internal<mode> (operands[0], operands[1], @@ -3146,48 +3197,198 @@ ) -;; cm(eq|ge|le|lt|gt) +;; cm(eq|ge|gt|lt|le) +;; Note, we have constraints for Dz and Z as different expanders +;; have different ideas of what should be passed to this pattern. -(define_insn "aarch64_cm<cmp><mode>" +(define_insn "aarch64_cm<optab><mode>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w") - (unspec:<V_cmp_result> - [(match_operand:VSDQ_I_DI 1 "register_operand" "w,w") - (match_operand:VSDQ_I_DI 2 "aarch64_simd_reg_or_zero" "w,Z")] - VCMP_S))] + (neg:<V_cmp_result> + (COMPARISONS:<V_cmp_result> + (match_operand:VDQ 1 "register_operand" "w,w") + (match_operand:VDQ 2 "aarch64_simd_reg_or_zero" "w,ZDz") + )))] "TARGET_SIMD" "@ - cm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype> - cm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #0" + cm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype> + cm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #0" [(set_attr "simd_type" "simd_cmp") (set_attr "simd_mode" "<MODE>")] ) -;; cm(hs|hi|tst) +(define_insn_and_split "aarch64_cm<optab>di" + [(set (match_operand:DI 0 "register_operand" "=w,w,r") + (neg:DI + (COMPARISONS:DI + (match_operand:DI 1 "register_operand" "w,w,r") + (match_operand:DI 2 "aarch64_simd_reg_or_zero" "w,ZDz,r") + )))] + "TARGET_SIMD" + "@ + cm<n_optab>\t%d0, %d<cmp_1>, %d<cmp_2> + cm<optab>\t%d0, %d1, #0 + #" + "reload_completed + /* We need to prevent the split from + happening in the 'w' constraint cases. 
*/ + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (neg:DI + (COMPARISONS:DI + (match_operand 3 "cc_register" "") + (const_int 0))))] + { + enum machine_mode mode = SELECT_CC_MODE (<CMP>, operands[1], operands[2]); + rtx cc_reg = aarch64_gen_compare_reg (<CMP>, operands[1], operands[2]); + rtx comparison = gen_rtx_<CMP> (mode, operands[1], operands[2]); + emit_insn (gen_cstoredi_neg (operands[0], comparison, cc_reg)); + DONE; + } + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "DI")] +) + +;; cm(hs|hi) -(define_insn "aarch64_cm<cmp><mode>" +(define_insn "aarch64_cm<optab><mode>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") - (unspec:<V_cmp_result> - [(match_operand:VSDQ_I_DI 1 "register_operand" "w") - (match_operand:VSDQ_I_DI 2 "register_operand" "w")] - VCMP_U))] + (neg:<V_cmp_result> + (UCOMPARISONS:<V_cmp_result> + (match_operand:VDQ 1 "register_operand" "w") + (match_operand:VDQ 2 "register_operand" "w") + )))] "TARGET_SIMD" - "cm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" + "cm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>" [(set_attr "simd_type" "simd_cmp") (set_attr "simd_mode" "<MODE>")] ) -;; fcm(eq|ge|le|lt|gt) +(define_insn_and_split "aarch64_cm<optab>di" + [(set (match_operand:DI 0 "register_operand" "=w,r") + (neg:DI + (UCOMPARISONS:DI + (match_operand:DI 1 "register_operand" "w,r") + (match_operand:DI 2 "aarch64_simd_reg_or_zero" "w,r") + )))] + "TARGET_SIMD" + "@ + cm<n_optab>\t%d0, %d<cmp_1>, %d<cmp_2> + #" + "reload_completed + /* We need to prevent the split from + happening in the 'w' constraint cases. */ + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (neg:DI + (UCOMPARISONS:DI + (match_operand 3 "cc_register" "") + (const_int 0))))] + { + enum machine_mode mode = SELECT_CC_MODE (<CMP>, operands[1], operands[2]); + rtx cc_reg = aarch64_gen_compare_reg (<CMP>, operands[1], operands[2]); + rtx comparison = gen_rtx_<CMP> (mode, operands[1], operands[2]); + emit_insn (gen_cstoredi_neg (operands[0], comparison, cc_reg)); + DONE; + } + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "DI")] +) + +;; cmtst + +(define_insn "aarch64_cmtst<mode>" + [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") + (neg:<V_cmp_result> + (ne:<V_cmp_result> + (and:VDQ + (match_operand:VDQ 1 "register_operand" "w") + (match_operand:VDQ 2 "register_operand" "w")) + (vec_duplicate:<V_cmp_result> (const_int 0)))))] + "TARGET_SIMD" + "cmtst\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "<MODE>")] +) + +(define_insn_and_split "aarch64_cmtstdi" + [(set (match_operand:DI 0 "register_operand" "=w,r") + (neg:DI + (ne:DI + (and:DI + (match_operand:DI 1 "register_operand" "w,r") + (match_operand:DI 2 "register_operand" "w,r")) + (const_int 0))))] + "TARGET_SIMD" + "@ + cmtst\t%d0, %d1, %d2 + #" + "reload_completed + /* We need to prevent the split from + happening in the 'w' constraint cases. 
*/ + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (reg:CC_NZ CC_REGNUM) + (compare:CC_NZ + (and:DI (match_dup 1) + (match_dup 2)) + (const_int 0))) + (set (match_dup 0) + (neg:DI + (ne:DI + (match_operand 3 "cc_register" "") + (const_int 0))))] + { + rtx and_tree = gen_rtx_AND (DImode, operands[1], operands[2]); + enum machine_mode mode = SELECT_CC_MODE (NE, and_tree, const0_rtx); + rtx cc_reg = aarch64_gen_compare_reg (NE, and_tree, const0_rtx); + rtx comparison = gen_rtx_NE (mode, and_tree, const0_rtx); + emit_insn (gen_cstoredi_neg (operands[0], comparison, cc_reg)); + DONE; + } + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "DI")] +) + +;; fcm(eq|ge|gt|le|lt) -(define_insn "aarch64_cm<cmp><mode>" +(define_insn "aarch64_cm<optab><mode>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w") - (unspec:<V_cmp_result> - [(match_operand:VDQF 1 "register_operand" "w,w") - (match_operand:VDQF 2 "aarch64_simd_reg_or_zero" "w,Dz")] - VCMP_S))] + (neg:<V_cmp_result> + (COMPARISONS:<V_cmp_result> + (match_operand:VALLF 1 "register_operand" "w,w") + (match_operand:VALLF 2 "aarch64_simd_reg_or_zero" "w,YDz") + )))] "TARGET_SIMD" "@ - fcm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype> - fcm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0" + fcm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype> + fcm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0" + [(set_attr "simd_type" "simd_fcmp") + (set_attr "simd_mode" "<MODE>")] +) + +;; fac(ge|gt) +;; Note we can also handle what would be fac(le|lt) by +;; generating fac(ge|gt). + +(define_insn "*aarch64_fac<optab><mode>" + [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") + (neg:<V_cmp_result> + (FAC_COMPARISONS:<V_cmp_result> + (abs:VALLF (match_operand:VALLF 1 "register_operand" "w")) + (abs:VALLF (match_operand:VALLF 2 "register_operand" "w")) + )))] + "TARGET_SIMD" + "fac<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>" [(set_attr "simd_type" "simd_fcmp") (set_attr "simd_mode" "<MODE>")] ) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 08f86cf10fb..80e202a88cd 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -2409,7 +2409,7 @@ (set_attr "mode" "SI")] ) -(define_insn "*cstore<mode>_neg" +(define_insn "cstore<mode>_neg" [(set (match_operand:ALLI 0 "register_operand" "=r") (neg:ALLI (match_operator:ALLI 1 "aarch64_comparison_operator" [(match_operand 2 "cc_register" "") (const_int 0)])))] diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 8db224930fd..29e4d64e052 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -4545,50 +4545,6 @@ vabsq_s64 (int64x2_t a) return result; } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vacged_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("facge %d0,%d1,%d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vacges_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("facge %s0,%s1,%s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vacgtd_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("facgt %d0,%d1,%d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static 
__inline float32_t __attribute__ ((__always_inline__)) -vacgts_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("facgt %s0,%s1,%s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline int16_t __attribute__ ((__always_inline__)) vaddlv_s8 (int8x8_t a) { @@ -5062,358 +5018,6 @@ vbslq_u64 (uint64x2_t a, uint64x2_t b, uint64x2_t c) return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcage_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facge %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcageq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facge %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcageq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facge %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcagt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facgt %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcagtq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facgt %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcagtq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facgt %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcale_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facge %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcaleq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facge %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcaleq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facge %0.2d, %2.2d, %1.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcalt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facgt %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcaltq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facgt %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcaltq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facgt %0.2d, %2.2d, %1.2d" - : "=w"(result) - : 
"w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceq_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmeq %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceq_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmeq %d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vceqd_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("fcmeq %d0,%d1,%d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmeq %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmeq %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vceqs_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("fcmeq %s0,%s1,%s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vceqzd_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcmeq %d0,%d1,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vceqzs_f32 (float32_t a) -{ - float32_t result; - __asm__ ("fcmeq %s0,%s1,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcge_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmge %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcge_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmge %d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgeq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmge %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgeq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmge %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmgt %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmgt 
%d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmgt %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmgt %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmge %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmge %d0, %d2, %d1" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmge %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmge %0.2d, %2.2d, %1.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vcls_s8 (int8x8_t a) { @@ -5480,50 +5084,6 @@ vclsq_s32 (int32x4_t a) return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmgt %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmgt %d0, %d2, %d1" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmgt %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmgt %0.2d, %2.2d, %1.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vclz_s8 (int8x8_t a) { @@ -18848,7 +18408,123 @@ vaddvq_f64 (float64x2_t __a) return vgetq_lane_f64 (t, 0); } -/* vceq */ +/* vcage */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcages_f32 (float32_t __a, float32_t __b) +{ + return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? 
-1 : 0; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcage_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) >= vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcageq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) >= vabsq_f32 (__b); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcaged_f64 (float64_t __a, float64_t __b) +{ + return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcageq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) >= vabsq_f64 (__b); +} + +/* vcagt */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcagts_f32 (float32_t __a, float32_t __b) +{ + return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcagt_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) > vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcagtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) > vabsq_f32 (__b); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcagtd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcagtq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) > vabsq_f64 (__b); +} + +/* vcale */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcale_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) <= vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaleq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) <= vabsq_f32 (__b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcaleq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) <= vabsq_f64 (__b); +} + +/* vcalt */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcalt_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) < vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaltq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) < vabsq_f32 (__b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcaltq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) < vabsq_f64 (__b); +} + +/* vceq - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_f64 (float64x1_t __a, float64x1_t __b) +{ + return __a == __b ? -1ll : 0ll; +} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vceq_p8 (poly8x8_t __a, poly8x8_t __b) @@ -18878,7 +18554,7 @@ vceq_s32 (int32x2_t __a, int32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceq_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); + return __a == __b ? 
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -18905,8 +18581,19 @@ vceq_u32 (uint32x2_t __a, uint32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceq_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi ((int64x1_t) __a, - (int64x1_t) __b); + return __a == __b ? -1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -18968,25 +18655,243 @@ vceqq_u64 (uint64x2_t __a, uint64x2_t __b) (int64x2_t) __b); } +/* vceq - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vceqs_f32 (float32_t __a, float32_t __b) +{ + return __a == __b ? -1 : 0; +} + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceqd_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); + return __a == __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceqd_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); + return __a == __b ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vceqd_f64 (float64_t __a, float64_t __b) +{ + return __a == __b ? -1ll : 0ll; +} + +/* vceqz - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_f32 (float32x2_t __a) +{ + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_f64 (float64x1_t __a) +{ + return __a == 0.0 ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_p8 (poly8x8_t __a) +{ + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceqz_s16 (int16x4_t __a) +{ + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_s32 (int32x2_t __a) +{ + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_s64 (int64x1_t __a) +{ + return __a == 0ll ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceqz_u16 (uint16x4_t __a) +{ + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_u32 (uint32x2_t __a) +{ + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_u64 (uint64x1_t __a) +{ + return __a == 0ll ? -1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_f32 (float32x4_t __a) +{ + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_f64 (float64x2_t __a) +{ + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_p8 (poly8x16_t __a) +{ + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_s8 (int8x16_t __a) +{ + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqzq_s16 (int16x8_t __a) +{ + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_s32 (int32x4_t __a) +{ + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_s64 (int64x2_t __a) +{ + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqzq_u16 (uint16x8_t __a) +{ + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_u32 (uint32x4_t __a) +{ + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_u64 (uint64x2_t __a) +{ + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +/* vceqz - scalar. 
*/ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vceqzs_f32 (float32_t __a) +{ + return __a == 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceqzd_s64 (int64x1_t __a) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, 0); + return __a == 0 ? -1ll : 0ll; } -/* vcge */ +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqzd_u64 (int64x1_t __a) +{ + return __a == 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vceqzd_f64 (float64_t __a) +{ + return __a == 0.0 ? -1ll : 0ll; +} + +/* vcge - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcge_f64 (float64x1_t __a, float64x1_t __b) +{ + return __a >= __b ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + (int8x8_t) __b); +} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcge_s8 (int8x8_t __a, int8x8_t __b) @@ -19009,35 +18914,53 @@ vcge_s32 (int32x2_t __a, int32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcge_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b); + return __a >= __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcge_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vcge_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __a, + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) vcge_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __a, + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, (int32x2_t) __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcge_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, - (int64x1_t) __b); + return __a >= __b ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -19067,51 +18990,268 @@ vcgeq_s64 (int64x2_t __a, int64x2_t __b) __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vcgeq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) vcgeq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __a, + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) vcgeq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __a, + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vcgeq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __a, + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, (int64x2_t) __b); } +/* vcge - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcges_f32 (float32_t __a, float32_t __b) +{ + return __a >= __b ? -1 : 0; +} + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcged_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b); + return __a >= __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcged_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, - (int64x1_t) __b); + return __a >= __b ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcged_f64 (float64_t __a, float64_t __b) +{ + return __a >= __b ? -1ll : 0ll; +} + +/* vcgez - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_f32 (float32x2_t __a) +{ + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_f64 (float64x1_t __a) +{ + return __a >= 0.0 ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_p8 (poly8x8_t __a) +{ + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgez_s16 (int16x4_t __a) +{ + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_s32 (int32x2_t __a) +{ + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_s64 (int64x1_t __a) +{ + return __a >= 0ll ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgez_u16 (uint16x4_t __a) +{ + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_u32 (uint32x2_t __a) +{ + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_u64 (uint64x1_t __a) +{ + return __a >= 0ll ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_f32 (float32x4_t __a) +{ + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgezq_f64 (float64x2_t __a) +{ + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_p8 (poly8x16_t __a) +{ + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_s8 (int8x16_t __a) +{ + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgezq_s16 (int16x8_t __a) +{ + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_s32 (int32x4_t __a) +{ + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgezq_s64 (int64x2_t __a) +{ + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgezq_u16 (uint16x8_t __a) +{ + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_u32 (uint32x4_t __a) +{ + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgezq_u64 (uint64x2_t __a) +{ + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +/* vcgez - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgezs_f32 (float32_t __a) +{ + return __a >= 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgezd_s64 (int64x1_t __a) { - return (uint64x1_t) __builtin_aarch64_cmgedi (__a, 0); + return __a >= 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgezd_u64 (int64x1_t __a) +{ + return __a >= 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgezd_f64 (float64_t __a) +{ + return __a >= 0.0 ? -1ll : 0ll; +} + +/* vcgt - vector. 
*/ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgt_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgt_f64 (float64x1_t __a, float64x1_t __b) +{ + return __a > __b ? -1ll : 0ll; } -/* vcgt */ +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgt_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcgt_s8 (int8x8_t __a, int8x8_t __b) @@ -19134,35 +19274,53 @@ vcgt_s32 (int32x2_t __a, int32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgt_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b); + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcgt_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vcgt_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __a, + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) vcgt_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __a, + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, (int32x2_t) __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgt_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, - (int64x1_t) __b); + return __a > __b ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -19192,51 +19350,268 @@ vcgtq_s64 (int64x2_t __a, int64x2_t __b) __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __a, + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __a, + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vcgtq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __a, + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, (int64x2_t) __b); } +/* vcgt - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgts_f32 (float32_t __a, float32_t __b) +{ + return __a > __b ? -1 : 0; +} + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgtd_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b); + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgtd_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, - (int64x1_t) __b); + return __a > __b ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgtd_f64 (float64_t __a, float64_t __b) +{ + return __a > __b ? -1ll : 0ll; +} + +/* vcgtz - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgtz_f32 (float32x2_t __a) +{ + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtz_f64 (float64x1_t __a) +{ + return __a > 0.0 ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgtz_p8 (poly8x8_t __a) +{ + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgtz_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgtz_s16 (int16x4_t __a) +{ + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgtz_s32 (int32x2_t __a) +{ + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtz_s64 (int64x1_t __a) +{ + return __a > 0ll ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgtz_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgtz_u16 (uint16x4_t __a) +{ + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgtz_u32 (uint32x2_t __a) +{ + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtz_u64 (uint64x1_t __a) +{ + return __a > 0ll ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtzq_f32 (float32x4_t __a) +{ + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtzq_f64 (float64x2_t __a) +{ + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtzq_p8 (poly8x16_t __a) +{ + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtzq_s8 (int8x16_t __a) +{ + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgtzq_s16 (int16x8_t __a) +{ + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtzq_s32 (int32x4_t __a) +{ + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtzq_s64 (int64x2_t __a) +{ + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtzq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgtzq_u16 (uint16x8_t __a) +{ + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtzq_u32 (uint32x4_t __a) +{ + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtzq_u64 (uint64x2_t __a) +{ + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +/* vcgtz - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgtzs_f32 (float32_t __a) +{ + return __a > 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgtzd_s64 (int64x1_t __a) { - return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, 0); + return __a > 0 ? -1ll : 0ll; } -/* vcle */ +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtzd_u64 (int64x1_t __a) +{ + return __a > 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgtzd_f64 (float64_t __a) +{ + return __a > 0.0 ? -1ll : 0ll; +} + +/* vcle - vector. 
+/* vcle - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmgev2sf (__b, __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcle_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcle_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __b,
+                                                 (int8x8_t) __a);
+}
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vcle_s8 (int8x8_t __a, int8x8_t __b)
@@ -19259,35 +19634,53 @@ vcle_s32 (int32x2_t __a, int32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcle_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  return __a <= __b ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vcle_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __b,
+  return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __b,
                                                  (int8x8_t) __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vcle_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __b,
+  return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __b,
                                                   (int16x4_t) __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vcle_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __b,
+  return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __b,
                                                   (int32x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcle_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __b,
-                                                (int64x1_t) __a);
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_cmgev4sf (__b, __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcleq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmgev2df (__b, __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcleq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __b,
+                                                   (int8x16_t) __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -19317,44 +19710,211 @@ vcleq_s64 (int64x2_t __a, int64x2_t __b)
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __b,
+  return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __b,
                                                    (int8x16_t) __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __b,
+  return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __b,
                                                   (int16x8_t) __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __b,
+  return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __b,
                                                   (int32x4_t) __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __b,
+  return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __b,
                                                   (int64x2_t) __a);
 }
 
+/* vcle - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcles_f32 (float32_t __a, float32_t __b)
+{
+  return __a <= __b ? -1 : 0;
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcled_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcled_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcled_f64 (float64_t __a, float64_t __b)
+{
+  return __a <= __b ? -1ll : 0ll;
+}
+
+/* vclez - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclez_f32 (float32x2_t __a)
+{
+  float32x2_t __b = {0.0f, 0.0f};
+  return (uint32x2_t) __builtin_aarch64_cmlev2sf (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclez_f64 (float64x1_t __a)
+{
+  return __a <= 0.0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclez_p8 (poly8x8_t __a)
+{
+  poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmlev8qi ((int8x8_t) __a,
+                                                 (int8x8_t) __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclez_s8 (int8x8_t __a)
+{
+  int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmlev8qi (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclez_s16 (int16x4_t __a)
+{
+  int16x4_t __b = {0, 0, 0, 0};
+  return (uint16x4_t) __builtin_aarch64_cmlev4hi (__a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclez_s32 (int32x2_t __a)
+{
+  int32x2_t __b = {0, 0};
+  return (uint32x2_t) __builtin_aarch64_cmlev2si (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclez_s64 (int64x1_t __a)
+{
+  return __a <= 0ll ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclez_u64 (uint64x1_t __a)
+{
+  return __a <= 0ll ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclezq_f32 (float32x4_t __a)
+{
+  float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f};
+  return (uint32x4_t) __builtin_aarch64_cmlev4sf (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vclezq_f64 (float64x2_t __a)
+{
+  float64x2_t __b = {0.0, 0.0};
+  return (uint64x2_t) __builtin_aarch64_cmlev2df (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclezq_p8 (poly8x16_t __a)
+{
+  poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmlev16qi ((int8x16_t) __a,
+                                                   (int8x16_t) __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclezq_s8 (int8x16_t __a)
+{
+  int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                   0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmlev16qi (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclezq_s16 (int16x8_t __a)
+{
+  int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint16x8_t) __builtin_aarch64_cmlev8hi (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclezq_s32 (int32x4_t __a)
+{
+  int32x4_t __b = {0, 0, 0, 0};
+  return (uint32x4_t) __builtin_aarch64_cmlev4si (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vclezq_s64 (int64x2_t __a)
+{
+  int64x2_t __b = {0, 0};
+  return (uint64x2_t) __builtin_aarch64_cmlev2di (__a, __b);
+}
+
+/* vclez - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vclezs_f32 (float32_t __a)
+{
+  return __a <= 0.0f ? -1 : 0;
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vclezd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmledi (__a, 0);
+  return __a <= 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclezd_u64 (uint64x1_t __a)
+{
+  return __a <= 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vclezd_f64 (float64_t __a)
+{
+  return __a <= 0.0 ? -1ll : 0ll;
+}
+
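Another editorial sketch (not part of the patch): vclez* marks lanes that are less than or equal to zero, so clearing exactly those lanes gives a ReLU-style clamp. The helper name is hypothetical; vbic_s16 and vreinterpret_s16_u16 are pre-existing arm_neon.h intrinsics.

#include <arm_neon.h>

/* Clamp non-positive lanes to zero, keeping positive lanes unchanged.
   vclez_s16 marks lanes where x <= 0; vbic_s16 (a, b) computes a & ~b,
   clearing the marked lanes.  */
int16x4_t
clamp_nonpositive (int16x4_t x)
{
  uint16x4_t le_zero = vclez_s16 (x);
  return vbic_s16 (x, vreinterpret_s16_u16 (le_zero));
}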
+/* vclt - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__b, __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclt_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return __a < __b ? -1ll : 0ll;
 }
 
-/* vclt */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclt_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __b,
+                                                 (int8x8_t) __a);
+}
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vclt_s8 (int8x8_t __a, int8x8_t __b)
@@ -19377,35 +19937,53 @@ vclt_s32 (int32x2_t __a, int32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vclt_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  return __a < __b ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vclt_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __b,
+  return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __b,
                                                  (int8x8_t) __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vclt_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __b,
+  return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __b,
                                                   (int16x4_t) __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vclt_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __b,
+  return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __b,
                                                   (int32x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vclt_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __b,
-                                                (int64x1_t) __a);
+  return __a < __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__b, __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcltq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmgtv2df (__b, __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __b,
+                                                   (int8x16_t) __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -19435,41 +20013,183 @@ vcltq_s64 (int64x2_t __a, int64x2_t __b)
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __b,
+  return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __b,
                                                    (int8x16_t) __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __b,
+  return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __b,
                                                   (int16x8_t) __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __b,
+  return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __b,
                                                   (int32x4_t) __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __b,
+  return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __b,
                                                   (int64x2_t) __a);
 }
 
+/* vclt - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vclts_f32 (float32_t __a, float32_t __b)
+{
+  return __a < __b ? -1 : 0;
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcltd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  return __a < __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return __a < __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcltd_f64 (float64_t __a, float64_t __b)
+{
+  return __a < __b ? -1ll : 0ll;
+}
+
+/* vcltz - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcltz_f32 (float32x2_t __a)
+{
+  float32x2_t __b = {0.0f, 0.0f};
+  return (uint32x2_t) __builtin_aarch64_cmltv2sf (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltz_f64 (float64x1_t __a)
+{
+  return __a < 0.0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcltz_p8 (poly8x8_t __a)
+{
+  poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmltv8qi ((int8x8_t) __a,
+                                                 (int8x8_t) __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcltz_s8 (int8x8_t __a)
+{
+  int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmltv8qi (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcltz_s16 (int16x4_t __a)
+{
+  int16x4_t __b = {0, 0, 0, 0};
+  return (uint16x4_t) __builtin_aarch64_cmltv4hi (__a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcltz_s32 (int32x2_t __a)
+{
+  int32x2_t __b = {0, 0};
+  return (uint32x2_t) __builtin_aarch64_cmltv2si (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltz_s64 (int64x1_t __a)
+{
+  return __a < 0ll ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltzq_f32 (float32x4_t __a)
+{
+  float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f};
+  return (uint32x4_t) __builtin_aarch64_cmltv4sf (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcltzq_f64 (float64x2_t __a)
+{
+  float64x2_t __b = {0.0, 0.0};
+  return (uint64x2_t) __builtin_aarch64_cmltv2df (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltzq_p8 (poly8x16_t __a)
+{
+  poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmltv16qi ((int8x16_t) __a,
+                                                   (int8x16_t) __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltzq_s8 (int8x16_t __a)
+{
+  int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                   0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmltv16qi (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltzq_s16 (int16x8_t __a)
+{
+  int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint16x8_t) __builtin_aarch64_cmltv8hi (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltzq_s32 (int32x4_t __a)
+{
+  int32x4_t __b = {0, 0, 0, 0};
+  return (uint32x4_t) __builtin_aarch64_cmltv4si (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcltzq_s64 (int64x2_t __a)
+{
+  int64x2_t __b = {0, 0};
+  return (uint64x2_t) __builtin_aarch64_cmltv2di (__a, __b);
+}
+
+/* vcltz - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcltzs_f32 (float32_t __a)
+{
+  return __a < 0.0f ? -1 : 0;
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcltzd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmltdi (__a, 0);
+  return __a < 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltzd_u64 (uint64x1_t __a)
+{
+  return __a < 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcltzd_f64 (float64_t __a)
+{
+  return __a < 0.0 ? -1ll : 0ll;
 }
 
 /* vcvt (double -> float). */
@@ -24953,7 +25673,7 @@ vtst_s32 (int32x2_t __a, int32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtst_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi (__a, __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -24980,8 +25700,7 @@ vtst_u32 (uint32x2_t __a, uint32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtst_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi ((int64x1_t) __a,
-                                                 (int64x1_t) __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -25039,14 +25758,13 @@ vtstq_u64 (uint64x2_t __a, uint64x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtstd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi (__a, __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtstd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi ((int64x1_t) __a,
-                                                 (int64x1_t) __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 /* vuqadd */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d774c4c8cbc..00e315d920c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -83,6 +83,9 @@
 ;; Vector Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
 
+;; All Float modes.
+(define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
+
 ;; Vector Float modes with 2 elements.
 (define_mode_iterator V2F [V2SF V2DF])
 
@@ -213,13 +216,6 @@
     UNSPEC_URSHL	; Used in aarch64-simd.md.
     UNSPEC_SQRSHL	; Used in aarch64-simd.md.
     UNSPEC_UQRSHL	; Used in aarch64-simd.md.
-    UNSPEC_CMEQ		; Used in aarch64-simd.md.
-    UNSPEC_CMLE		; Used in aarch64-simd.md.
-    UNSPEC_CMLT		; Used in aarch64-simd.md.
-    UNSPEC_CMGE		; Used in aarch64-simd.md.
-    UNSPEC_CMGT		; Used in aarch64-simd.md.
-    UNSPEC_CMHS		; Used in aarch64-simd.md.
-    UNSPEC_CMHI		; Used in aarch64-simd.md.
     UNSPEC_SSLI		; Used in aarch64-simd.md.
     UNSPEC_USLI		; Used in aarch64-simd.md.
    UNSPEC_SSRI		; Used in aarch64-simd.md.
@@ -227,7 +223,6 @@
     UNSPEC_SSHLL	; Used in aarch64-simd.md.
     UNSPEC_USHLL	; Used in aarch64-simd.md.
     UNSPEC_ADDP	; Used in aarch64-simd.md.
-    UNSPEC_CMTST	; Used in aarch64-simd.md.
     UNSPEC_FMAX	; Used in aarch64-simd.md.
     UNSPEC_FMIN	; Used in aarch64-simd.md.
     UNSPEC_TBL		; Used in vector permute patterns.
@@ -253,6 +248,7 @@
 ;; For scalar usage of vector/FP registers
 (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
+		     (SF "s") (DF "d")
 		     (V8QI "") (V16QI "")
 		     (V4HI "") (V8HI "")
 		     (V2SI "") (V4SI "")
@@ -307,7 +303,8 @@
 		      (V4SF ".4s") (V2DF ".2d")
 		      (DI "") (SI "")
 		      (HI "") (QI "")
-		      (TI "")])
+		      (TI "") (SF "")
+		      (DF "")])
 
 ;; Register suffix narrowed modes for VQN.
 (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
@@ -446,7 +443,8 @@
 		      (V2SI "V2SI") (V4SI "V4SI")
 		      (DI "DI") (V2DI "V2DI")
 		      (V2SF "V2SI") (V4SF "V4SI")
-		      (V2DF "V2DI")])
+		      (V2DF "V2DI") (DF "DI")
+		      (SF "SI")])
 
 ;; Lower case mode of results of comparison operations.
 (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
 		      (V2SI "v2si") (V4SI "v4si")
 		      (DI "di") (V2DI "v2di")
 		      (V2SF "v2si") (V4SF "v4si")
-		      (V2DF "v2di")])
+		      (V2DF "v2di") (DF "di")
+		      (SF "si")])
 
 ;; Vm for lane instructions is restricted to FP_LO_REGS.
 (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x")
@@ -548,6 +547,15 @@
 ;; Code iterator for signed variants of vector saturating binary ops.
 (define_code_iterator SBINQOPS [ss_plus ss_minus])
 
+;; Comparison operators for <F>CM.
+(define_code_iterator COMPARISONS [lt le eq ge gt])
+
+;; Unsigned comparison operators.
+(define_code_iterator UCOMPARISONS [ltu leu geu gtu])
+
+;; Comparison operators for the FAC (floating-point absolute compare)
+;; instructions.
+(define_code_iterator FAC_COMPARISONS [lt le ge gt])
+
 ;; -------------------------------------------------------------------
 ;; Code Attributes
 ;; -------------------------------------------------------------------
@@ -580,7 +588,28 @@
 	     (eq "eq")
 	     (ne "ne")
 	     (lt "lt")
-	     (ge "ge")])
+	     (ge "ge")
+	     (le "le")
+	     (gt "gt")
+	     (ltu "ltu")
+	     (leu "leu")
+	     (geu "geu")
+	     (gtu "gtu")])
+
+;; For comparison operators we use the FCM* and CM* instructions.
+;; As there are no CMLE or CMLT instructions which act on 3 vector
+;; operands, we must use CMGE or CMGT and swap the order of the
+;; source operands.
+
+(define_code_attr n_optab [(lt "gt") (le "ge") (eq "eq") (ge "ge") (gt "gt")
+			   (ltu "hi") (leu "hs") (geu "hs") (gtu "hi")])
+(define_code_attr cmp_1 [(lt "2") (le "2") (eq "1") (ge "1") (gt "1")
+			 (ltu "2") (leu "2") (geu "1") (gtu "1")])
+(define_code_attr cmp_2 [(lt "1") (le "1") (eq "2") (ge "2") (gt "2")
+			 (ltu "1") (leu "1") (geu "2") (gtu "2")])
+
+(define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT")
+		       (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")])
 
 (define_code_attr fix_trunc_optab [(fix "fix_trunc")
 				   (unsigned_fix "fixuns_trunc")])
@@ -693,11 +722,6 @@
 			    UNSPEC_SQSHRN UNSPEC_UQSHRN
 			    UNSPEC_SQRSHRN UNSPEC_UQRSHRN])
 
-(define_int_iterator VCMP_S [UNSPEC_CMEQ UNSPEC_CMGE UNSPEC_CMGT
-			     UNSPEC_CMLE UNSPEC_CMLT])
-
-(define_int_iterator VCMP_U [UNSPEC_CMHS UNSPEC_CMHI UNSPEC_CMTST])
-
 (define_int_iterator PERMUTE [UNSPEC_ZIP1 UNSPEC_ZIP2
 			      UNSPEC_TRN1 UNSPEC_TRN2
 			      UNSPEC_UZP1 UNSPEC_UZP2])
@@ -784,12 +808,6 @@
 		       (UNSPEC_RADDHN2 "add")
 		       (UNSPEC_RSUBHN2 "sub")])
 
-(define_int_attr cmp [(UNSPEC_CMGE "ge") (UNSPEC_CMGT "gt")
-		      (UNSPEC_CMLE "le") (UNSPEC_CMLT "lt")
-		      (UNSPEC_CMEQ "eq")
-		      (UNSPEC_CMHS "hs") (UNSPEC_CMHI "hi")
-		      (UNSPEC_CMTST "tst")])
-
 (define_int_attr offsetlr [(UNSPEC_SSLI "1") (UNSPEC_USLI "1")
 			   (UNSPEC_SSRI "0") (UNSPEC_USRI "0")])
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 8f80b202811..8514e8f8fbd 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -31,6 +31,11 @@
   (ior (match_operand 0 "register_operand")
        (match_test "op == const0_rtx"))))
 
+(define_predicate "aarch64_reg_or_fp_zero"
+  (and (match_code "reg,subreg,const_double")
+       (ior (match_operand 0 "register_operand")
+	    (match_test "aarch64_float_const_zero_rtx_p (op)"))))
+
 (define_predicate "aarch64_reg_zero_or_m1_or_1"
   (and (match_code "reg,subreg,const_int")
        (ior (match_operand 0 "register_operand")
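Editorial coda (not part of the patch): replacing the UNSPEC comparison patterns with genuine RTL comparison codes, together with the new aarch64_reg_or_fp_zero predicate, means a floating-point compare against zero no longer has to materialize a zero vector in a register. A minimal sketch using an intrinsic defined earlier in this patch; the function name is illustrative, and the exact assembly depends on the surrounding code.

#include <arm_neon.h>

/* Lane mask of strictly positive elements.  With this series applied,
   GCC should be able to fold the zero operand into the compare and emit
   a single "fcmgt v0.4s, v0.4s, #0.0" rather than loading zero first.  */
uint32x4_t
positive_mask (float32x4_t x)
{
  return vcgtzq_f32 (x);
}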