author     Matthew Gretton-Dann <matthew.gretton-dann@linaro.org>   2013-05-14 09:30:32 +0000
committer  Matthew Gretton-Dann <matthew.gretton-dann@linaro.org>   2013-05-14 09:30:32 +0000
commit     a6002864cd18f9049b5511293d4d84a6a4ea4e73
tree       4ac0a2e47b7948b71a66fd00c4d1ba1bfef2fbb6 /gcc/config
parent     e118fade5f65318cb59db4b7593d0019e2302ed2
Backport from trunk r198490-198496.
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/linaro/gcc-4_8-branch@198868 138bc75d-0d04-0410-961f-82ee72b054a4
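
For context: the user-visible effect of this backport is that the AArch64 comparison intrinsics in arm_neon.h (vceq*, vcge*, vcgt*, vcle*, vclt*, the absolute-value forms vcage*/vcagt*/vcale*/vcalt*, and the newly added compare-against-zero forms vceqz*/vcgez*/vcgtz*/vclez*) are expressed as ordinary C comparisons or __builtin_aarch64_cm* builtins instead of opaque inline asm, so GCC can fold and optimise them. A minimal usage sketch, not part of the commit; the function names are illustrative, and the instructions named in the comments assume an AArch64 target at -O2:

#include <arm_neon.h>

/* Lane-wise integer equality: all-ones lanes where equal, zero
   elsewhere.  Expected to compile to a single cmeq.  */
uint32x2_t
eq_mask (int32x2_t a, int32x2_t b)
{
  return vceq_s32 (a, b);
}

/* Absolute compare |a| >= |b|.  Expected to compile to facge.  */
uint32x2_t
abs_ge_mask (float32x2_t a, float32x2_t b)
{
  return vcage_f32 (a, b);
}

/* Compare against zero; the vceqz* family is new in this patch.
   Expected to compile to fcmeq against #0.  */
uint32x2_t
zero_mask (float32x2_t a)
{
  return vceqz_f32 (a);
}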
Diffstat (limited to 'gcc/config')
-rw-r--r--   gcc/config/aarch64/aarch64-builtins.c        |   20
-rw-r--r--   gcc/config/aarch64/aarch64-simd-builtins.def |   14
-rw-r--r--   gcc/config/aarch64/aarch64-simd.md           |  289
-rw-r--r--   gcc/config/aarch64/aarch64.md                |    2
-rw-r--r--   gcc/config/aarch64/arm_neon.h                | 1736
-rw-r--r--   gcc/config/aarch64/iterators.md              |   64
-rw-r--r--   gcc/config/aarch64/predicates.md             |    5
7 files changed, 1545 insertions(+), 585 deletions(-)
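
Most of the arm_neon.h churn below follows one pattern: each comparison intrinsic that previously wrapped inline asm or a DImode-only builtin is rewritten as a plain C expression yielding an all-ones/all-zeros mask, which the mid-end can simplify directly; the vector forms keep calling __builtin_aarch64_cm* builtins, which the new cmeq/cmge/cmgt/cmtst cases in aarch64_fold_builtin now fold to ordinary comparison trees. An illustrative before/after, copied from the hunks below; the _old/_new suffixes are added here only for side-by-side comparison and are not in the source:

/* Before: a DImode builtin the mid-end cannot see through.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_s64_old (int64x1_t __a, int64x1_t __b)
{
  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
}

/* After: an ordinary comparison producing -1 (all ones) or 0.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_s64_new (int64x1_t __a, int64x1_t __b)
{
  return __a == __b ? -1ll : 0ll;
}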
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 53d2c6ad557..3016f256869 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -191,6 +191,9 @@ typedef struct
 #define BUILTIN_VALL(T, N, MAP) \
   VAR10 (T, N, MAP, v8qi, v16qi, v4hi, v8hi, v2si, \
          v4si, v2di, v2sf, v4sf, v2df)
+#define BUILTIN_VALLDI(T, N, MAP) \
+  VAR11 (T, N, MAP, v8qi, v16qi, v4hi, v8hi, v2si, \
+         v4si, v2di, v2sf, v4sf, v2df, di)
 #define BUILTIN_VB(T, N, MAP) \
   VAR2 (T, N, MAP, v8qi, v16qi)
 #define BUILTIN_VD(T, N, MAP) \
@@ -1314,11 +1317,26 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args,
       BUILTIN_VDQF (UNOP, abs, 2)
        return fold_build1 (ABS_EXPR, type, args[0]);
        break;
+      BUILTIN_VALLDI (BINOP, cmge, 0)
+       return fold_build2 (GE_EXPR, type, args[0], args[1]);
+       break;
+      BUILTIN_VALLDI (BINOP, cmgt, 0)
+       return fold_build2 (GT_EXPR, type, args[0], args[1]);
+       break;
+      BUILTIN_VALLDI (BINOP, cmeq, 0)
+       return fold_build2 (EQ_EXPR, type, args[0], args[1]);
+       break;
+      BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+       {
+         tree and_node = fold_build2 (BIT_AND_EXPR, type, args[0], args[1]);
+         tree vec_zero_node = build_zero_cst (type);
+         return fold_build2 (NE_EXPR, type, and_node, vec_zero_node);
+         break;
+       }
       VAR1 (UNOP, floatv2si, 2, v2sf)
       VAR1 (UNOP, floatv4si, 2, v4sf)
       VAR1 (UNOP, floatv2di, 2, v2df)
        return fold_build1 (FLOAT_EXPR, type, args[0]);
-      break;
     default:
       break;
     }
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 6093341b199..620406b449d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -224,14 +224,14 @@ BUILTIN_VSDQ_I (SHIFTIMM, uqshl_n, 0)
 
   /* Implemented by aarch64_cm<cmp><mode>. */
-  BUILTIN_VSDQ_I_DI (BINOP, cmeq, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmge, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmgt, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmle, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmlt, 0)
+  BUILTIN_VALLDI (BINOP, cmeq, 0)
+  BUILTIN_VALLDI (BINOP, cmge, 0)
+  BUILTIN_VALLDI (BINOP, cmgt, 0)
+  BUILTIN_VALLDI (BINOP, cmle, 0)
+  BUILTIN_VALLDI (BINOP, cmlt, 0)
   /* Implemented by aarch64_cm<cmp><mode>. */
-  BUILTIN_VSDQ_I_DI (BINOP, cmhs, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmhi, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, cmgeu, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
   BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
 
   /* Implemented by aarch64_<fmaxmin><mode>.
*/ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index a88d4cca05d..2b252540519 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -21,7 +21,7 @@ ; Main data types used by the insntructions -(define_attr "simd_mode" "unknown,none,V8QI,V16QI,V4HI,V8HI,V2SI,V4SI,V2DI,V2SF,V4SF,V2DF,OI,CI,XI,DI,DF,SI,HI,QI" +(define_attr "simd_mode" "unknown,none,V8QI,V16QI,V4HI,V8HI,V2SI,V4SI,V2DI,V2SF,V4SF,V2DF,OI,CI,XI,DI,DF,SI,SF,HI,QI" (const_string "unknown")) @@ -1683,11 +1683,13 @@ (match_operator 3 "comparison_operator" [(match_operand:VDQ 4 "register_operand") (match_operand:VDQ 5 "nonmemory_operand")]) - (match_operand:VDQ 1 "register_operand") - (match_operand:VDQ 2 "register_operand")))] + (match_operand:VDQ 1 "nonmemory_operand") + (match_operand:VDQ 2 "nonmemory_operand")))] "TARGET_SIMD" { int inverse = 0, has_zero_imm_form = 0; + rtx op1 = operands[1]; + rtx op2 = operands[2]; rtx mask = gen_reg_rtx (<MODE>mode); switch (GET_CODE (operands[3])) @@ -1728,12 +1730,12 @@ case LTU: case GEU: - emit_insn (gen_aarch64_cmhs<mode> (mask, operands[4], operands[5])); + emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[4], operands[5])); break; case LEU: case GTU: - emit_insn (gen_aarch64_cmhi<mode> (mask, operands[4], operands[5])); + emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[4], operands[5])); break; case NE: @@ -1746,11 +1748,26 @@ } if (inverse) - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[1], - operands[2])); + { + op1 = operands[2]; + op2 = operands[1]; + } + + /* If we have (a = (b CMP c) ? -1 : 0); + Then we can simply move the generated mask. */ + + if (op1 == CONSTM1_RTX (<V_cmp_result>mode) + && op2 == CONST0_RTX (<V_cmp_result>mode)) + emit_move_insn (operands[0], mask); + else + { + if (!REG_P (op1)) + op1 = force_reg (<MODE>mode, op1); + if (!REG_P (op2)) + op2 = force_reg (<MODE>mode, op2); + emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, + op1, op2)); + } DONE; }) @@ -1761,12 +1778,14 @@ (match_operator 3 "comparison_operator" [(match_operand:VDQF 4 "register_operand") (match_operand:VDQF 5 "nonmemory_operand")]) - (match_operand:VDQF 1 "register_operand") - (match_operand:VDQF 2 "register_operand")))] + (match_operand:VDQF 1 "nonmemory_operand") + (match_operand:VDQF 2 "nonmemory_operand")))] "TARGET_SIMD" { int inverse = 0; int swap_bsl_operands = 0; + rtx op1 = operands[1]; + rtx op2 = operands[2]; rtx mask = gen_reg_rtx (<V_cmp_result>mode); rtx tmp = gen_reg_rtx (<V_cmp_result>mode); @@ -1888,11 +1907,27 @@ } if (swap_bsl_operands) - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[2], - operands[1])); - else - emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, operands[1], - operands[2])); + { + op1 = operands[2]; + op2 = operands[1]; + } + + /* If we have (a = (b CMP c) ? -1 : 0); + Then we can simply move the generated mask. 
*/ + + if (op1 == CONSTM1_RTX (<V_cmp_result>mode) + && op2 == CONST0_RTX (<V_cmp_result>mode)) + emit_move_insn (operands[0], mask); + else + { + if (!REG_P (op1)) + op1 = force_reg (<MODE>mode, op1); + if (!REG_P (op2)) + op2 = force_reg (<MODE>mode, op2); + emit_insn (gen_aarch64_simd_bsl<mode> (operands[0], mask, + op1, op2)); + } + DONE; }) @@ -1902,8 +1937,8 @@ (match_operator 3 "comparison_operator" [(match_operand:VALL 4 "register_operand") (match_operand:VALL 5 "nonmemory_operand")]) - (match_operand:VALL 1 "register_operand") - (match_operand:VALL 2 "register_operand")))] + (match_operand:VALL 1 "nonmemory_operand") + (match_operand:VALL 2 "nonmemory_operand")))] "TARGET_SIMD" { emit_insn (gen_aarch64_vcond_internal<mode> (operands[0], operands[1], @@ -1912,6 +1947,22 @@ DONE; }) +(define_expand "vcond<v_cmp_result><mode>" + [(set (match_operand:<V_cmp_result> 0 "register_operand") + (if_then_else:<V_cmp_result> + (match_operator 3 "comparison_operator" + [(match_operand:VDQF 4 "register_operand") + (match_operand:VDQF 5 "nonmemory_operand")]) + (match_operand:<V_cmp_result> 1 "nonmemory_operand") + (match_operand:<V_cmp_result> 2 "nonmemory_operand")))] + "TARGET_SIMD" +{ + emit_insn (gen_aarch64_vcond_internal<v_cmp_result> ( + operands[0], operands[1], + operands[2], operands[3], + operands[4], operands[5])); + DONE; +}) (define_expand "vcondu<mode><mode>" [(set (match_operand:VDQ 0 "register_operand") @@ -1919,8 +1970,8 @@ (match_operator 3 "comparison_operator" [(match_operand:VDQ 4 "register_operand") (match_operand:VDQ 5 "nonmemory_operand")]) - (match_operand:VDQ 1 "register_operand") - (match_operand:VDQ 2 "register_operand")))] + (match_operand:VDQ 1 "nonmemory_operand") + (match_operand:VDQ 2 "nonmemory_operand")))] "TARGET_SIMD" { emit_insn (gen_aarch64_vcond_internal<mode> (operands[0], operands[1], @@ -3146,48 +3197,198 @@ ) -;; cm(eq|ge|le|lt|gt) +;; cm(eq|ge|gt|lt|le) +;; Note, we have constraints for Dz and Z as different expanders +;; have different ideas of what should be passed to this pattern. -(define_insn "aarch64_cm<cmp><mode>" +(define_insn "aarch64_cm<optab><mode>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w") - (unspec:<V_cmp_result> - [(match_operand:VSDQ_I_DI 1 "register_operand" "w,w") - (match_operand:VSDQ_I_DI 2 "aarch64_simd_reg_or_zero" "w,Z")] - VCMP_S))] + (neg:<V_cmp_result> + (COMPARISONS:<V_cmp_result> + (match_operand:VDQ 1 "register_operand" "w,w") + (match_operand:VDQ 2 "aarch64_simd_reg_or_zero" "w,ZDz") + )))] "TARGET_SIMD" "@ - cm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype> - cm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #0" + cm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype> + cm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #0" [(set_attr "simd_type" "simd_cmp") (set_attr "simd_mode" "<MODE>")] ) -;; cm(hs|hi|tst) +(define_insn_and_split "aarch64_cm<optab>di" + [(set (match_operand:DI 0 "register_operand" "=w,w,r") + (neg:DI + (COMPARISONS:DI + (match_operand:DI 1 "register_operand" "w,w,r") + (match_operand:DI 2 "aarch64_simd_reg_or_zero" "w,ZDz,r") + )))] + "TARGET_SIMD" + "@ + cm<n_optab>\t%d0, %d<cmp_1>, %d<cmp_2> + cm<optab>\t%d0, %d1, #0 + #" + "reload_completed + /* We need to prevent the split from + happening in the 'w' constraint cases. 
*/ + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (neg:DI + (COMPARISONS:DI + (match_operand 3 "cc_register" "") + (const_int 0))))] + { + enum machine_mode mode = SELECT_CC_MODE (<CMP>, operands[1], operands[2]); + rtx cc_reg = aarch64_gen_compare_reg (<CMP>, operands[1], operands[2]); + rtx comparison = gen_rtx_<CMP> (mode, operands[1], operands[2]); + emit_insn (gen_cstoredi_neg (operands[0], comparison, cc_reg)); + DONE; + } + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "DI")] +) + +;; cm(hs|hi) -(define_insn "aarch64_cm<cmp><mode>" +(define_insn "aarch64_cm<optab><mode>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") - (unspec:<V_cmp_result> - [(match_operand:VSDQ_I_DI 1 "register_operand" "w") - (match_operand:VSDQ_I_DI 2 "register_operand" "w")] - VCMP_U))] + (neg:<V_cmp_result> + (UCOMPARISONS:<V_cmp_result> + (match_operand:VDQ 1 "register_operand" "w") + (match_operand:VDQ 2 "register_operand" "w") + )))] "TARGET_SIMD" - "cm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" + "cm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>" [(set_attr "simd_type" "simd_cmp") (set_attr "simd_mode" "<MODE>")] ) -;; fcm(eq|ge|le|lt|gt) +(define_insn_and_split "aarch64_cm<optab>di" + [(set (match_operand:DI 0 "register_operand" "=w,r") + (neg:DI + (UCOMPARISONS:DI + (match_operand:DI 1 "register_operand" "w,r") + (match_operand:DI 2 "aarch64_simd_reg_or_zero" "w,r") + )))] + "TARGET_SIMD" + "@ + cm<n_optab>\t%d0, %d<cmp_1>, %d<cmp_2> + #" + "reload_completed + /* We need to prevent the split from + happening in the 'w' constraint cases. */ + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (reg:CC CC_REGNUM) + (compare:CC + (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (neg:DI + (UCOMPARISONS:DI + (match_operand 3 "cc_register" "") + (const_int 0))))] + { + enum machine_mode mode = SELECT_CC_MODE (<CMP>, operands[1], operands[2]); + rtx cc_reg = aarch64_gen_compare_reg (<CMP>, operands[1], operands[2]); + rtx comparison = gen_rtx_<CMP> (mode, operands[1], operands[2]); + emit_insn (gen_cstoredi_neg (operands[0], comparison, cc_reg)); + DONE; + } + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "DI")] +) + +;; cmtst + +(define_insn "aarch64_cmtst<mode>" + [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") + (neg:<V_cmp_result> + (ne:<V_cmp_result> + (and:VDQ + (match_operand:VDQ 1 "register_operand" "w") + (match_operand:VDQ 2 "register_operand" "w")) + (vec_duplicate:<V_cmp_result> (const_int 0)))))] + "TARGET_SIMD" + "cmtst\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "<MODE>")] +) + +(define_insn_and_split "aarch64_cmtstdi" + [(set (match_operand:DI 0 "register_operand" "=w,r") + (neg:DI + (ne:DI + (and:DI + (match_operand:DI 1 "register_operand" "w,r") + (match_operand:DI 2 "register_operand" "w,r")) + (const_int 0))))] + "TARGET_SIMD" + "@ + cmtst\t%d0, %d1, %d2 + #" + "reload_completed + /* We need to prevent the split from + happening in the 'w' constraint cases. 
*/ + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (reg:CC_NZ CC_REGNUM) + (compare:CC_NZ + (and:DI (match_dup 1) + (match_dup 2)) + (const_int 0))) + (set (match_dup 0) + (neg:DI + (ne:DI + (match_operand 3 "cc_register" "") + (const_int 0))))] + { + rtx and_tree = gen_rtx_AND (DImode, operands[1], operands[2]); + enum machine_mode mode = SELECT_CC_MODE (NE, and_tree, const0_rtx); + rtx cc_reg = aarch64_gen_compare_reg (NE, and_tree, const0_rtx); + rtx comparison = gen_rtx_NE (mode, and_tree, const0_rtx); + emit_insn (gen_cstoredi_neg (operands[0], comparison, cc_reg)); + DONE; + } + [(set_attr "simd_type" "simd_cmp") + (set_attr "simd_mode" "DI")] +) + +;; fcm(eq|ge|gt|le|lt) -(define_insn "aarch64_cm<cmp><mode>" +(define_insn "aarch64_cm<optab><mode>" [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w") - (unspec:<V_cmp_result> - [(match_operand:VDQF 1 "register_operand" "w,w") - (match_operand:VDQF 2 "aarch64_simd_reg_or_zero" "w,Dz")] - VCMP_S))] + (neg:<V_cmp_result> + (COMPARISONS:<V_cmp_result> + (match_operand:VALLF 1 "register_operand" "w,w") + (match_operand:VALLF 2 "aarch64_simd_reg_or_zero" "w,YDz") + )))] "TARGET_SIMD" "@ - fcm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype> - fcm<cmp>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0" + fcm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype> + fcm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0" + [(set_attr "simd_type" "simd_fcmp") + (set_attr "simd_mode" "<MODE>")] +) + +;; fac(ge|gt) +;; Note we can also handle what would be fac(le|lt) by +;; generating fac(ge|gt). + +(define_insn "*aarch64_fac<optab><mode>" + [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w") + (neg:<V_cmp_result> + (FAC_COMPARISONS:<V_cmp_result> + (abs:VALLF (match_operand:VALLF 1 "register_operand" "w")) + (abs:VALLF (match_operand:VALLF 2 "register_operand" "w")) + )))] + "TARGET_SIMD" + "fac<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>" [(set_attr "simd_type" "simd_fcmp") (set_attr "simd_mode" "<MODE>")] ) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 08f86cf10fb..80e202a88cd 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -2409,7 +2409,7 @@ (set_attr "mode" "SI")] ) -(define_insn "*cstore<mode>_neg" +(define_insn "cstore<mode>_neg" [(set (match_operand:ALLI 0 "register_operand" "=r") (neg:ALLI (match_operator:ALLI 1 "aarch64_comparison_operator" [(match_operand 2 "cc_register" "") (const_int 0)])))] diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 8db224930fd..29e4d64e052 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -4545,50 +4545,6 @@ vabsq_s64 (int64x2_t a) return result; } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vacged_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("facge %d0,%d1,%d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vacges_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("facge %s0,%s1,%s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vacgtd_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("facgt %d0,%d1,%d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static 
__inline float32_t __attribute__ ((__always_inline__)) -vacgts_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("facgt %s0,%s1,%s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline int16_t __attribute__ ((__always_inline__)) vaddlv_s8 (int8x8_t a) { @@ -5062,358 +5018,6 @@ vbslq_u64 (uint64x2_t a, uint64x2_t b, uint64x2_t c) return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcage_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facge %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcageq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facge %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcageq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facge %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcagt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facgt %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcagtq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facgt %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcagtq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facgt %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcale_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facge %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcaleq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facge %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcaleq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facge %0.2d, %2.2d, %1.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcalt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("facgt %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcaltq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("facgt %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcaltq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("facgt %0.2d, %2.2d, %1.2d" - : "=w"(result) - : 
"w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceq_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmeq %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceq_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmeq %d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vceqd_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("fcmeq %d0,%d1,%d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmeq %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmeq %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vceqs_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("fcmeq %s0,%s1,%s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vceqzd_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcmeq %d0,%d1,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vceqzs_f32 (float32_t a) -{ - float32_t result; - __asm__ ("fcmeq %s0,%s1,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcge_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmge %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcge_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmge %d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgeq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmge %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgeq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmge %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmgt %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmgt 
%d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmgt %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmgt %0.2d, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmge %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmge %d0, %d2, %d1" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmge %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmge %0.2d, %2.2d, %1.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vcls_s8 (int8x8_t a) { @@ -5480,50 +5084,6 @@ vclsq_s32 (int32x4_t a) return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_f32 (float32x2_t a, float32x2_t b) -{ - uint32x2_t result; - __asm__ ("fcmgt %0.2s, %2.2s, %1.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_f64 (float64x1_t a, float64x1_t b) -{ - uint64x1_t result; - __asm__ ("fcmgt %d0, %d2, %d1" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_f32 (float32x4_t a, float32x4_t b) -{ - uint32x4_t result; - __asm__ ("fcmgt %0.4s, %2.4s, %1.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_f64 (float64x2_t a, float64x2_t b) -{ - uint64x2_t result; - __asm__ ("fcmgt %0.2d, %2.2d, %1.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vclz_s8 (int8x8_t a) { @@ -18848,7 +18408,123 @@ vaddvq_f64 (float64x2_t __a) return vgetq_lane_f64 (t, 0); } -/* vceq */ +/* vcage */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcages_f32 (float32_t __a, float32_t __b) +{ + return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? 
-1 : 0; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcage_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) >= vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcageq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) >= vabsq_f32 (__b); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcaged_f64 (float64_t __a, float64_t __b) +{ + return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcageq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) >= vabsq_f64 (__b); +} + +/* vcagt */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcagts_f32 (float32_t __a, float32_t __b) +{ + return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcagt_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) > vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcagtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) > vabsq_f32 (__b); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcagtd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcagtq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) > vabsq_f64 (__b); +} + +/* vcale */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcale_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) <= vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaleq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) <= vabsq_f32 (__b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcaleq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) <= vabsq_f64 (__b); +} + +/* vcalt */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcalt_f32 (float32x2_t __a, float32x2_t __b) +{ + return vabs_f32 (__a) < vabs_f32 (__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaltq_f32 (float32x4_t __a, float32x4_t __b) +{ + return vabsq_f32 (__a) < vabsq_f32 (__b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcaltq_f64 (float64x2_t __a, float64x2_t __b) +{ + return vabsq_f64 (__a) < vabsq_f64 (__b); +} + +/* vceq - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_f64 (float64x1_t __a, float64x1_t __b) +{ + return __a == __b ? -1ll : 0ll; +} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vceq_p8 (poly8x8_t __a, poly8x8_t __b) @@ -18878,7 +18554,7 @@ vceq_s32 (int32x2_t __a, int32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceq_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); + return __a == __b ? 
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) @@ -18905,8 +18581,19 @@ vceq_u32 (uint32x2_t __a, uint32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceq_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi ((int64x1_t) __a, - (int64x1_t) __b); + return __a == __b ? -1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -18968,25 +18655,243 @@ vceqq_u64 (uint64x2_t __a, uint64x2_t __b) (int64x2_t) __b); } +/* vceq - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vceqs_f32 (float32_t __a, float32_t __b) +{ + return __a == __b ? -1 : 0; +} + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceqd_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); + return __a == __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceqd_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); + return __a == __b ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vceqd_f64 (float64_t __a, float64_t __b) +{ + return __a == __b ? -1ll : 0ll; +} + +/* vceqz - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_f32 (float32x2_t __a) +{ + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_f64 (float64x1_t __a) +{ + return __a == 0.0 ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_p8 (poly8x8_t __a) +{ + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceqz_s16 (int16x4_t __a) +{ + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_s32 (int32x2_t __a) +{ + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_s64 (int64x1_t __a) +{ + return __a == 0ll ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceqz_u16 (uint16x4_t __a) +{ + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_u32 (uint32x2_t __a) +{ + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_u64 (uint64x1_t __a) +{ + return __a == 0ll ? -1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_f32 (float32x4_t __a) +{ + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_f64 (float64x2_t __a) +{ + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_p8 (poly8x16_t __a) +{ + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_s8 (int8x16_t __a) +{ + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqzq_s16 (int16x8_t __a) +{ + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_s32 (int32x4_t __a) +{ + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_s64 (int64x2_t __a) +{ + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqzq_u16 (uint16x8_t __a) +{ + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_u32 (uint32x4_t __a) +{ + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_u64 (uint64x2_t __a) +{ + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +/* vceqz - scalar. 
*/ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vceqzs_f32 (float32_t __a) +{ + return __a == 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vceqzd_s64 (int64x1_t __a) { - return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, 0); + return __a == 0 ? -1ll : 0ll; } -/* vcge */ +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqzd_u64 (int64x1_t __a) +{ + return __a == 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vceqzd_f64 (float64_t __a) +{ + return __a == 0.0 ? -1ll : 0ll; +} + +/* vcge - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcge_f64 (float64x1_t __a, float64x1_t __b) +{ + return __a >= __b ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + (int8x8_t) __b); +} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcge_s8 (int8x8_t __a, int8x8_t __b) @@ -19009,35 +18914,53 @@ vcge_s32 (int32x2_t __a, int32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcge_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b); + return __a >= __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcge_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vcge_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __a, + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) vcge_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __a, + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, (int32x2_t) __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcge_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, - (int64x1_t) __b); + return __a >= __b ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -19067,51 +18990,268 @@ vcgeq_s64 (int64x2_t __a, int64x2_t __b) __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vcgeq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) vcgeq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __a, + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) vcgeq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __a, + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vcgeq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __a, + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, (int64x2_t) __b); } +/* vcge - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcges_f32 (float32_t __a, float32_t __b) +{ + return __a >= __b ? -1 : 0; +} + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcged_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b); + return __a >= __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcged_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, - (int64x1_t) __b); + return __a >= __b ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcged_f64 (float64_t __a, float64_t __b) +{ + return __a >= __b ? -1ll : 0ll; +} + +/* vcgez - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_f32 (float32x2_t __a) +{ + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_f64 (float64x1_t __a) +{ + return __a >= 0.0 ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_p8 (poly8x8_t __a) +{ + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgez_s16 (int16x4_t __a) +{ + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_s32 (int32x2_t __a) +{ + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_s64 (int64x1_t __a) +{ + return __a >= 0ll ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgez_u16 (uint16x4_t __a) +{ + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_u32 (uint32x2_t __a) +{ + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_u64 (uint64x1_t __a) +{ + return __a >= 0ll ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_f32 (float32x4_t __a) +{ + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgezq_f64 (float64x2_t __a) +{ + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_p8 (poly8x16_t __a) +{ + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_s8 (int8x16_t __a) +{ + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgezq_s16 (int16x8_t __a) +{ + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_s32 (int32x4_t __a) +{ + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgezq_s64 (int64x2_t __a) +{ + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgezq_u16 (uint16x8_t __a) +{ + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_u32 (uint32x4_t __a) +{ + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgezq_u64 (uint64x2_t __a) +{ + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +/* vcgez - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgezs_f32 (float32_t __a) +{ + return __a >= 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgezd_s64 (int64x1_t __a) { - return (uint64x1_t) __builtin_aarch64_cmgedi (__a, 0); + return __a >= 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgezd_u64 (int64x1_t __a) +{ + return __a >= 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgezd_f64 (float64_t __a) +{ + return __a >= 0.0 ? -1ll : 0ll; +} + +/* vcgt - vector. 
*/ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgt_f32 (float32x2_t __a, float32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgt_f64 (float64x1_t __a, float64x1_t __b) +{ + return __a > __b ? -1ll : 0ll; } -/* vcgt */ +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgt_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcgt_s8 (int8x8_t __a, int8x8_t __b) @@ -19134,35 +19274,53 @@ vcgt_s32 (int32x2_t __a, int32x2_t __b) __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgt_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b); + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vcgt_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vcgt_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __a, + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) vcgt_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __a, + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, (int32x2_t) __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgt_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, - (int64x1_t) __b); + return __a > __b ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtq_f32 (float32x4_t __a, float32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) @@ -19192,51 +19350,268 @@ vcgtq_s64 (int64x2_t __a, int64x2_t __b) __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __a, + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __a, + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vcgtq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __a, + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, (int64x2_t) __b); } +/* vcgt - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgts_f32 (float32_t __a, float32_t __b) +{ + return __a > __b ? -1 : 0; +} + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgtd_s64 (int64x1_t __a, int64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b); + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgtd_u64 (uint64x1_t __a, uint64x1_t __b) { - return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, - (int64x1_t) __b); + return __a > __b ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgtd_f64 (float64_t __a, float64_t __b) +{ + return __a > __b ? -1ll : 0ll; +} + +/* vcgtz - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgtz_f32 (float32x2_t __a) +{ + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtz_f64 (float64x1_t __a) +{ + return __a > 0.0 ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgtz_p8 (poly8x8_t __a) +{ + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgtz_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgtz_s16 (int16x4_t __a) +{ + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgtz_s32 (int32x2_t __a) +{ + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtz_s64 (int64x1_t __a) +{ + return __a > 0ll ? -1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgtz_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgtz_u16 (uint16x4_t __a) +{ + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgtz_u32 (uint32x2_t __a) +{ + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtz_u64 (uint64x1_t __a) +{ + return __a > 0ll ? 
-1ll : 0ll; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtzq_f32 (float32x4_t __a) +{ + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtzq_f64 (float64x2_t __a) +{ + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtzq_p8 (poly8x16_t __a) +{ + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtzq_s8 (int8x16_t __a) +{ + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgtzq_s16 (int16x8_t __a) +{ + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtzq_s32 (int32x4_t __a) +{ + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtzq_s64 (int64x2_t __a) +{ + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgtzq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgtzq_u16 (uint16x8_t __a) +{ + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgtzq_u32 (uint32x4_t __a) +{ + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgtzq_u64 (uint64x2_t __a) +{ + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +/* vcgtz - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgtzs_f32 (float32_t __a) +{ + return __a > 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vcgtzd_s64 (int64x1_t __a) { - return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, 0); + return __a > 0 ? -1ll : 0ll; } -/* vcle */ +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgtzd_u64 (int64x1_t __a) +{ + return __a > 0 ? -1ll : 0ll; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgtzd_f64 (float64_t __a) +{ + return __a > 0.0 ? -1ll : 0ll; +} + +/* vcle - vector. 
+/* vcle - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmgev2sf (__b, __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcle_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcle_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __b,
+                                                 (int8x8_t) __a);
+}
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vcle_s8 (int8x8_t __a, int8x8_t __b)
@@ -19259,35 +19634,53 @@ vcle_s32 (int32x2_t __a, int32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcle_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  return __a <= __b ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vcle_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __b,
+  return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __b,
                                                  (int8x8_t) __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vcle_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __b,
+  return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __b,
                                                   (int16x4_t) __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vcle_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __b,
+  return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __b,
                                                   (int32x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcle_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __b,
-                                                (int64x1_t) __a);
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_cmgev4sf (__b, __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcleq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmgev2df (__b, __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcleq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __b,
+                                                   (int8x16_t) __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -19317,44 +19710,211 @@ vcleq_s64 (int64x2_t __a, int64x2_t __b)
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __b,
+  return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __b,
                                                    (int8x16_t) __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __b,
+  return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __b,
                                                   (int16x8_t) __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __b,
+  return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __b,
                                                   (int32x4_t) __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __b,
+  return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __b,
                                                   (int64x2_t) __a);
 }
 
+/* vcle - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcles_f32 (float32_t __a, float32_t __b)
+{
+  return __a <= __b ? -1 : 0;
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcled_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcled_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return __a <= __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcled_f64 (float64_t __a, float64_t __b)
+{
+  return __a <= __b ? -1ll : 0ll;
+}
+
+/* vclez - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclez_f32 (float32x2_t __a)
+{
+  float32x2_t __b = {0.0f, 0.0f};
+  return (uint32x2_t) __builtin_aarch64_cmlev2sf (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclez_f64 (float64x1_t __a)
+{
+  return __a <= 0.0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclez_p8 (poly8x8_t __a)
+{
+  poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmlev8qi ((int8x8_t) __a,
+                                                 (int8x8_t) __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclez_s8 (int8x8_t __a)
+{
+  int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmlev8qi (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclez_s16 (int16x4_t __a)
+{
+  int16x4_t __b = {0, 0, 0, 0};
+  return (uint16x4_t) __builtin_aarch64_cmlev4hi (__a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclez_s32 (int32x2_t __a)
+{
+  int32x2_t __b = {0, 0};
+  return (uint32x2_t) __builtin_aarch64_cmlev2si (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclez_s64 (int64x1_t __a)
+{
+  return __a <= 0ll ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclez_u64 (uint64x1_t __a)
+{
+  return __a <= 0ll ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclezq_f32 (float32x4_t __a)
+{
+  float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f};
+  return (uint32x4_t) __builtin_aarch64_cmlev4sf (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vclezq_f64 (float64x2_t __a)
+{
+  float64x2_t __b = {0.0, 0.0};
+  return (uint64x2_t) __builtin_aarch64_cmlev2df (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclezq_p8 (poly8x16_t __a)
+{
+  poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmlev16qi ((int8x16_t) __a,
+                                                   (int8x16_t) __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclezq_s8 (int8x16_t __a)
+{
+  int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                   0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmlev16qi (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclezq_s16 (int16x8_t __a)
+{
+  int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint16x8_t) __builtin_aarch64_cmlev8hi (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclezq_s32 (int32x4_t __a)
+{
+  int32x4_t __b = {0, 0, 0, 0};
+  return (uint32x4_t) __builtin_aarch64_cmlev4si (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vclezq_s64 (int64x2_t __a)
+{
+  int64x2_t __b = {0, 0};
+  return (uint64x2_t) __builtin_aarch64_cmlev2di (__a, __b);
+}
+
+/* vclez - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vclezs_f32 (float32_t __a)
+{
+  return __a <= 0.0f ? -1 : 0;
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vclezd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmledi (__a, 0);
+  return __a <= 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclezd_u64 (uint64x1_t __a)
+{
+  return __a <= 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vclezd_f64 (float64_t __a)
+{
+  return __a <= 0.0 ? -1ll : 0ll;
+}
+
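Another editorial sketch (not part of the patch): vclez* marks lanes that are less than or equal to zero, so clearing exactly those lanes gives a ReLU-style clamp. The helper name is hypothetical; vbic_s16 and vreinterpret_s16_u16 are pre-existing arm_neon.h intrinsics.

#include <arm_neon.h>

/* Clamp non-positive lanes to zero, keeping positive lanes unchanged.
   vclez_s16 marks lanes where x <= 0; vbic_s16 (a, b) computes a & ~b,
   clearing the marked lanes.  */
int16x4_t
clamp_nonpositive (int16x4_t x)
{
  uint16x4_t le_zero = vclez_s16 (x);
  return vbic_s16 (x, vreinterpret_s16_u16 (le_zero));
}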
+/* vclt - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__b, __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vclt_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return __a < __b ? -1ll : 0ll;
 }
 
-/* vclt */
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclt_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __b,
+                                                 (int8x8_t) __a);
+}
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vclt_s8 (int8x8_t __a, int8x8_t __b)
@@ -19377,35 +19937,53 @@ vclt_s32 (int32x2_t __a, int32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vclt_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  return __a < __b ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vclt_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __b,
+  return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __b,
                                                  (int8x8_t) __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vclt_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __b,
+  return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __b,
                                                   (int16x4_t) __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vclt_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __b,
+  return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __b,
                                                   (int32x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vclt_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __b,
-                                                (int64x1_t) __a);
+  return __a < __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__b, __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcltq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmgtv2df (__b, __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __b,
+                                                   (int8x16_t) __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -19435,41 +20013,183 @@ vcltq_s64 (int64x2_t __a, int64x2_t __b)
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __b,
+  return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __b,
                                                    (int8x16_t) __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __b,
+  return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __b,
                                                   (int16x8_t) __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __b,
+  return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __b,
                                                   (int32x4_t) __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __b,
+  return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __b,
                                                   (int64x2_t) __a);
 }
 
+/* vclt - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vclts_f32 (float32_t __a, float32_t __b)
+{
+  return __a < __b ? -1 : 0;
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcltd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  return __a < __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return __a < __b ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcltd_f64 (float64_t __a, float64_t __b)
+{
+  return __a < __b ? -1ll : 0ll;
+}
+
+/* vcltz - vector. */
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcltz_f32 (float32x2_t __a)
+{
+  float32x2_t __b = {0.0f, 0.0f};
+  return (uint32x2_t) __builtin_aarch64_cmltv2sf (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltz_f64 (float64x1_t __a)
+{
+  return __a < 0.0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcltz_p8 (poly8x8_t __a)
+{
+  poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmltv8qi ((int8x8_t) __a,
+                                                 (int8x8_t) __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcltz_s8 (int8x8_t __a)
+{
+  int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x8_t) __builtin_aarch64_cmltv8qi (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcltz_s16 (int16x4_t __a)
+{
+  int16x4_t __b = {0, 0, 0, 0};
+  return (uint16x4_t) __builtin_aarch64_cmltv4hi (__a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcltz_s32 (int32x2_t __a)
+{
+  int32x2_t __b = {0, 0};
+  return (uint32x2_t) __builtin_aarch64_cmltv2si (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltz_s64 (int64x1_t __a)
+{
+  return __a < 0ll ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltzq_f32 (float32x4_t __a)
+{
+  float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f};
+  return (uint32x4_t) __builtin_aarch64_cmltv4sf (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcltzq_f64 (float64x2_t __a)
+{
+  float64x2_t __b = {0.0, 0.0};
+  return (uint64x2_t) __builtin_aarch64_cmltv2df (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltzq_p8 (poly8x16_t __a)
+{
+  poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmltv16qi ((int8x16_t) __a,
+                                                   (int8x16_t) __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltzq_s8 (int8x16_t __a)
+{
+  int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0,
+                   0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint8x16_t) __builtin_aarch64_cmltv16qi (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltzq_s16 (int16x8_t __a)
+{
+  int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return (uint16x8_t) __builtin_aarch64_cmltv8hi (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltzq_s32 (int32x4_t __a)
+{
+  int32x4_t __b = {0, 0, 0, 0};
+  return (uint32x4_t) __builtin_aarch64_cmltv4si (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcltzq_s64 (int64x2_t __a)
+{
+  int64x2_t __b = {0, 0};
+  return (uint64x2_t) __builtin_aarch64_cmltv2di (__a, __b);
+}
+
+/* vcltz - scalar. */
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcltzs_f32 (float32_t __a)
+{
+  return __a < 0.0f ? -1 : 0;
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vcltzd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmltdi (__a, 0);
+  return __a < 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcltzd_u64 (uint64x1_t __a)
+{
+  return __a < 0 ? -1ll : 0ll;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcltzd_f64 (float64_t __a)
+{
+  return __a < 0.0 ? -1ll : 0ll;
 }
 
 /* vcvt (double -> float). */
@@ -24953,7 +25673,7 @@ vtst_s32 (int32x2_t __a, int32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtst_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi (__a, __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
@@ -24980,8 +25700,7 @@ vtst_u32 (uint32x2_t __a, uint32x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtst_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi ((int64x1_t) __a,
-                                                 (int64x1_t) __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
@@ -25039,14 +25758,13 @@ vtstq_u64 (uint64x2_t __a, uint64x2_t __b)
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtstd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi (__a, __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vtstd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmtstdi ((int64x1_t) __a,
-                                                 (int64x1_t) __b);
+  return (__a & __b) ? -1ll : 0ll;
 }
 
 /* vuqadd */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d774c4c8cbc..00e315d920c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -83,6 +83,9 @@
 ;; Vector Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
 
+;; All Float modes.
+(define_mode_iterator VALLF [V2SF V4SF V2DF SF DF])
+
 ;; Vector Float modes with 2 elements.
 (define_mode_iterator V2F [V2SF V2DF])
 
@@ -213,13 +216,6 @@
     UNSPEC_URSHL	; Used in aarch64-simd.md.
     UNSPEC_SQRSHL	; Used in aarch64-simd.md.
     UNSPEC_UQRSHL	; Used in aarch64-simd.md.
-    UNSPEC_CMEQ		; Used in aarch64-simd.md.
-    UNSPEC_CMLE		; Used in aarch64-simd.md.
-    UNSPEC_CMLT		; Used in aarch64-simd.md.
-    UNSPEC_CMGE		; Used in aarch64-simd.md.
-    UNSPEC_CMGT		; Used in aarch64-simd.md.
-    UNSPEC_CMHS		; Used in aarch64-simd.md.
-    UNSPEC_CMHI		; Used in aarch64-simd.md.
     UNSPEC_SSLI		; Used in aarch64-simd.md.
     UNSPEC_USLI		; Used in aarch64-simd.md.
    UNSPEC_SSRI		; Used in aarch64-simd.md.
@@ -227,7 +223,6 @@
     UNSPEC_SSHLL	; Used in aarch64-simd.md.
     UNSPEC_USHLL	; Used in aarch64-simd.md.
     UNSPEC_ADDP	; Used in aarch64-simd.md.
-    UNSPEC_CMTST	; Used in aarch64-simd.md.
     UNSPEC_FMAX	; Used in aarch64-simd.md.
     UNSPEC_FMIN	; Used in aarch64-simd.md.
     UNSPEC_TBL		; Used in vector permute patterns.
@@ -253,6 +248,7 @@
 ;; For scalar usage of vector/FP registers
 (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
+		     (SF "s") (DF "d")
 		     (V8QI "") (V16QI "")
 		     (V4HI "") (V8HI "")
 		     (V2SI "") (V4SI "")
@@ -307,7 +303,8 @@
 		      (V4SF ".4s") (V2DF ".2d")
 		      (DI "") (SI "")
 		      (HI "") (QI "")
-		      (TI "")])
+		      (TI "") (SF "")
+		      (DF "")])
 
 ;; Register suffix narrowed modes for VQN.
 (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
@@ -446,7 +443,8 @@
 		      (V2SI "V2SI") (V4SI "V4SI")
 		      (DI "DI") (V2DI "V2DI")
 		      (V2SF "V2SI") (V4SF "V4SI")
-		      (V2DF "V2DI")])
+		      (V2DF "V2DI") (DF "DI")
+		      (SF "SI")])
 
 ;; Lower case mode of results of comparison operations.
 (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
 		      (V2SI "v2si") (V4SI "v4si")
 		      (DI "di") (V2DI "v2di")
 		      (V2SF "v2si") (V4SF "v4si")
-		      (V2DF "v2di")])
+		      (V2DF "v2di") (DF "di")
+		      (SF "si")])
 
 ;; Vm for lane instructions is restricted to FP_LO_REGS.
 (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x")
@@ -548,6 +547,15 @@
 ;; Code iterator for signed variants of vector saturating binary ops.
 (define_code_iterator SBINQOPS [ss_plus ss_minus])
 
+;; Comparison operators for <F>CM.
+(define_code_iterator COMPARISONS [lt le eq ge gt])
+
+;; Unsigned comparison operators.
+(define_code_iterator UCOMPARISONS [ltu leu geu gtu])
+
+;; Comparison operators for the FAC (floating-point absolute compare)
+;; instructions.
+(define_code_iterator FAC_COMPARISONS [lt le ge gt])
+
 ;; -------------------------------------------------------------------
 ;; Code Attributes
 ;; -------------------------------------------------------------------
@@ -580,7 +588,28 @@
 	     (eq "eq")
 	     (ne "ne")
 	     (lt "lt")
-	     (ge "ge")])
+	     (ge "ge")
+	     (le "le")
+	     (gt "gt")
+	     (ltu "ltu")
+	     (leu "leu")
+	     (geu "geu")
+	     (gtu "gtu")])
+
+;; For comparison operators we use the FCM* and CM* instructions.
+;; As there are no CMLE or CMLT instructions which act on 3 vector
+;; operands, we must use CMGE or CMGT and swap the order of the
+;; source operands.
+
+(define_code_attr n_optab [(lt "gt") (le "ge") (eq "eq") (ge "ge") (gt "gt")
+			   (ltu "hi") (leu "hs") (geu "hs") (gtu "hi")])
+(define_code_attr cmp_1 [(lt "2") (le "2") (eq "1") (ge "1") (gt "1")
+			 (ltu "2") (leu "2") (geu "1") (gtu "1")])
+(define_code_attr cmp_2 [(lt "1") (le "1") (eq "2") (ge "2") (gt "2")
+			 (ltu "1") (leu "1") (geu "2") (gtu "2")])
+
+(define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT")
+		       (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")])
 
 (define_code_attr fix_trunc_optab [(fix "fix_trunc")
 				   (unsigned_fix "fixuns_trunc")])
@@ -693,11 +722,6 @@
 			    UNSPEC_SQSHRN UNSPEC_UQSHRN
 			    UNSPEC_SQRSHRN UNSPEC_UQRSHRN])
 
-(define_int_iterator VCMP_S [UNSPEC_CMEQ UNSPEC_CMGE UNSPEC_CMGT
-			     UNSPEC_CMLE UNSPEC_CMLT])
-
-(define_int_iterator VCMP_U [UNSPEC_CMHS UNSPEC_CMHI UNSPEC_CMTST])
-
 (define_int_iterator PERMUTE [UNSPEC_ZIP1 UNSPEC_ZIP2
 			      UNSPEC_TRN1 UNSPEC_TRN2
 			      UNSPEC_UZP1 UNSPEC_UZP2])
@@ -784,12 +808,6 @@
 		       (UNSPEC_RADDHN2 "add")
 		       (UNSPEC_RSUBHN2 "sub")])
 
-(define_int_attr cmp [(UNSPEC_CMGE "ge") (UNSPEC_CMGT "gt")
-		      (UNSPEC_CMLE "le") (UNSPEC_CMLT "lt")
-		      (UNSPEC_CMEQ "eq")
-		      (UNSPEC_CMHS "hs") (UNSPEC_CMHI "hi")
-		      (UNSPEC_CMTST "tst")])
-
 (define_int_attr offsetlr [(UNSPEC_SSLI "1") (UNSPEC_USLI "1")
 			   (UNSPEC_SSRI "0") (UNSPEC_USRI "0")])
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 8f80b202811..8514e8f8fbd 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -31,6 +31,11 @@
   (ior (match_operand 0 "register_operand")
        (match_test "op == const0_rtx"))))
 
+(define_predicate "aarch64_reg_or_fp_zero"
+  (and (match_code "reg,subreg,const_double")
+       (ior (match_operand 0 "register_operand")
+	    (match_test "aarch64_float_const_zero_rtx_p (op)"))))
+
 (define_predicate "aarch64_reg_zero_or_m1_or_1"
   (and (match_code "reg,subreg,const_int")
        (ior (match_operand 0 "register_operand")
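Editorial coda (not part of the patch): replacing the UNSPEC comparison patterns with genuine RTL comparison codes, together with the new aarch64_reg_or_fp_zero predicate, means a floating-point compare against zero no longer has to materialize a zero vector in a register. A minimal sketch using an intrinsic defined earlier in this patch; the function name is illustrative, and the exact assembly depends on the surrounding code.

#include <arm_neon.h>

/* Lane mask of strictly positive elements.  With this series applied,
   GCC should be able to fold the zero operand into the compare and emit
   a single "fcmgt v0.4s, v0.4s, #0.0" rather than loading zero first.  */
uint32x4_t
positive_mask (float32x4_t x)
{
  return vcgtzq_f32 (x);
}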