aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Meissner <meissner@linux.vnet.ibm.com>2010-07-16 20:18:45 +0000
committerMichael Meissner <meissner@linux.vnet.ibm.com>2010-07-16 20:18:45 +0000
commit1e6e854870979176153252312f78480f6b690988 (patch)
tree30731bca0be35327e2cd9e9b14bb78ea468f1c95
parentc1741a5d2d2772712acd132ea1eac5ed312b115d (diff)
Merge -mrecip patches from mainline
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/ibm/gcc-4_4-branch@162265 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/ChangeLog.ibm145
-rw-r--r--gcc/config/rs6000/altivec.h2
-rw-r--r--gcc/config/rs6000/altivec.md84
-rw-r--r--gcc/config/rs6000/rs6000-builtin.def11
-rw-r--r--gcc/config/rs6000/rs6000-c.c22
-rw-r--r--gcc/config/rs6000/rs6000-protos.h5
-rw-r--r--gcc/config/rs6000/rs6000.c1179
-rw-r--r--gcc/config/rs6000/rs6000.h40
-rw-r--r--gcc/config/rs6000/rs6000.md119
-rw-r--r--gcc/config/rs6000/rs6000.opt12
-rw-r--r--gcc/config/rs6000/vector.md14
-rw-r--r--gcc/config/rs6000/vsx.md20
-rw-r--r--gcc/doc/extend.texi17
-rw-r--r--gcc/doc/invoke.texi60
-rw-r--r--gcc/testsuite/ChangeLog.ibm16
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-1.c18
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-2.c21
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-3.c22
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-4.c36
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-5.c94
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-6.c16
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-7.c16
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-test.h149
-rw-r--r--gcc/testsuite/gcc.target/powerpc/recip-test2.h432
-rw-r--r--gcc/testsuite/lib/target-supports.exp28
25 files changed, 2124 insertions, 454 deletions
diff --git a/gcc/ChangeLog.ibm b/gcc/ChangeLog.ibm
index 17ffe642f66..231fd6a6999 100644
--- a/gcc/ChangeLog.ibm
+++ b/gcc/ChangeLog.ibm
@@ -1,3 +1,148 @@
+2010-07-16 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Backport from mainline
+ 2010-07-09 Peter Bergner <bergner@vnet.ibm.com>
+
+ * config/rs6000/rs6000.c (rs6000_override_options): Fix setting of
+ default ISA flags.
+ * config/rs6000/rs6000.h (ASM_CPU_SPEC): Add -mvsx.
+
+ 2010-06-01 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ PR target/44218
+ * doc/invoke.texi (RS/6000 and PowerPC Options): Delete obsolete
+ -mswdiv option. Add -mrecip, -mrecip=<xxx>, -mrecip-precision
+ options.
+
+ * doc/extend.texi (powerpc builtins): Document vec_recip,
+ vec_rsqrt, vec_rsqrte altivec/vsx builtins.
+
+ * config/rs6000/rs6000-protos.h (rs6000_emit_swdiv): New
+ function.
+ (rs6000_emit_swrsqrt): Ditto.
+ (rs6000_emit_swdivsf): Delete.
+ (rs6000_emit_swdivdf): Ditto.
+ (rs6000_emit_swrsqrtsf): Ditto.
+
+ * config/rs6000/rs6000.c (rs6000_recip_bits): New global to
+ describe the reciprocal estimate support for each type.
+ (recip_options): Map -mrecip=<opt> into option bits.
+ (gen_2arg_fn_t): New typedef for binary rtx gen function.
+ (rs6000_debug_reg_global): If -mdebug=reg, print the state of the
+ reciprocal estimate instructions.
+ (rs6000_init_hard_regno_mode_ok): Key ws constraint off of the
+ debug -mvsx-scalar-memory switch instead of -mvsx-scalar-double.
+ Set up rs6000_recip_bits based on the -mrecip* options. Print the
+ cost information if -mdebug=cost or -mdebug=reg.
+ (rs6000_override_options): Set -mrecip-precision for power6, and
+ power7 machines. If -mvsx or -mdfp, enable various options that
+ came in previous instruction set ISAs, unless the option was
+ explicitly disabled by the command line option. Parse
+ -mrecip=<opt> options.
+ (rs6000_builtin_vectorized_function): Add support for vectorizing
+ the reciprocal estimate builtins and expansions.
+ (rs6000_handle_option): Add -mrecip, -mrecip=<opt> support.
+ (bdesc_2arg): Add reciprocal estimate builtins.
+ (bdesc_1arg): Add reciprocal square root estimate builtins.
+ (rs6000_expand_builtin): Rewrite to use a switch statement,
+ instead of multiple if/then/elses. Add reciprocal estimate
+ builtins.
+ (rs6000_init_builtins): Create declarations for reciprocal
+ estimate builtins.
+ (rs6000_preferred_reload_class): Simplify VSX preferences, if scalar
+ sized, prefer traditional floating point registers, if integer
+ vector types, prefer altivec registers. Don't actually look at
+ the memory address any more.
+ (rs6000_builtin_reciprocal): Add new builtin reciprocal estimate
+ builtins.
+ (rs6000_load_constant_and_splat): New helper function to load up
+ the constant for reciprocal estimate instructions.
+ (rs6000_emit_madd): New helper function for generating
+ multiply/add type instructions, based on the current switches.
+ (rs6000_emit_msub): Ditto.
+ (rs6000_emit_mnsub): Ditto.
+ (rs6000_emit_swdiv_high_precision): Replace rs6000_emit_swdivsf to
+ replace a divide with a reciprocal estimate and fixup, adding
+ support for machines with high precision and vectors.
+ (rs6000_emit_swdiv_low_precision): Rewrite rs6000_emit_swdivdf for
+ low precision machines.
+ (rs6000_emit_swdiv): New common function to be called to replace a
+ division with reciprocal estimate and fixup.
+ (rs6000_emit_swrsqrt): Replace rs6000_emit_swrsqrtsf. Add support
+ for double and vector types. Add support for high precision
+ machines.
+
+ * config/rs6000/rs6000.h (TARGET_FRES): New macro to say whether
+ the reciprocal estimate instructions can be generated.
+ (TARGET_FRE): Ditto.
+ (TARGET_FRSQRTES): Ditto.
+ (TARGET_FRSQRTE): Ditto.
+ (RS6000_RECIP_*): New macros for reciprocal estimate support.
+
+ * config/rs6000/vector.md (rsqrte<mode>2): New insn for reciprocal
+ square root estimate on vectors.
+ (re<mode>2): New insn for reciprocal division estimate on vectors.
+
+ * config/rs6000/rs6000-builtin.def (ALTIVEC_BUILTIN_VRSQRTFP):
+ New builtin.
+ (ALTIVEC_BUILTIN_VRECIPFP): Ditto.
+ (ALTIVEC_BUILTIN_VEC_RECIP): Ditto.
+ (ALTIVEC_BUILTIN_VEC_RSQRT): Ditto.
+ (VSX_BUILTIN_VEC_RSQRT_V4SF): Ditto.
+ (VSX_BUILTIN_VEC_RSQRT_V2DF): Ditto.
+ (RS6000_BUILTIN_RSQRT): Ditto.
+ (ALTIVEC_BUILTIN_VEC_RSQRTE): Denote that the builtin is a
+ floating point builtin.
+
+ * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
+ macros __RECIP__, __RECIPF__, __RSQRTE__, __RSQRTEF__,
+ __RECIP_PRECISION__ based on the command line switches.
+ (altivec_overloaded_builtins): Add reciprocal estimate builtins.
+
+ * config/rs6000/rs6000.opt (-mrecip): Document add support for
+ replacing division instructions with reciprocal estimate and
+ fixup.
+ (-mrecip=<opt>): New option.
+ (-mrecip-precision): Ditto.
+
+ * config/rs6000/vsx.md (UNSPEC_VSX_RSQRTE): Delete.
+ (vsx_rsqrte<mode>2): Use UNSPEC_RSQRT not UNSPEC_VSX_RSQRTE.
+ (vsx_copysignsf3): If -mvsx, use double precision cpsign on single
+ precision scalar.
+
+ * config/rs6000/altivec.md (UNSPEC_RSQRTEFP): Delete.
+ (UNSPEC_VREFP): Ditto.
+ (altivec_vnmsubfp*): Make altivec nmsub mirror the scalar and VSX
+ counterparts with regard to support of -mno-fused-madd and
+ -ffast-math.
+ (altivec_vrsqrtefp): Use common UNSPEC to allow scalar/vector
+ reciprocal estimate instructions to be generated.
+ (altivec_vrefp): Ditto.
+
+ * config/rs6000/rs6000.md (RECIPF): New iterator for reciprocal
+ estimate support.
+ (rreg): New mode attribute for reciprocal estimate support.
+ (recip<mode>3): New insn for division using reciprocal estimate
+ and fixup builtins.
+ (divide define_split): New define_split to convert floating point
+ division to use reciprocal estimate if the user used the
+ appropriate options and the split is run when we can add new
+ pseudo registers for the fixup.
+ (rsqrt<mode>2): New insn for reciprocal square root support.
+ (recipsf3): Move into recip<mode>3.
+ (recipdf3): Ditto.
+ (fres): Use TARGET_FRES.
+ (rsqrtsf2): Move into rsqrt<mode>2.
+ (rsqrtsf_internal1): Use TARGET_FRSQRTES.
+ (copysignsf3): Add support for VSX.
+ (fred): Use TARGET_FRE.
+ (fred_fpr): Ditto.
+ (rsqrtdf_internal1): New function for frsqrte instruction.
+
+ * config/rs6000/altivec.h (vec_recipdiv): Define new vector
+ builtin.
+ (vec_rsqrt): Ditto.
+
2010-06-09 Peter Bergner <bergner@vnet.ibm.com>
Backport from mainline:
diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h
index bc4f30f7cb2..5f4510adc30 100644
--- a/gcc/config/rs6000/altivec.h
+++ b/gcc/config/rs6000/altivec.h
@@ -163,6 +163,8 @@
#define vec_vpkshus __builtin_vec_vpkshus
#define vec_re __builtin_vec_re
#define vec_round __builtin_vec_round
+#define vec_recipdiv __builtin_vec_recipdiv
+#define vec_rsqrt __builtin_vec_rsqrt
#define vec_rsqrte __builtin_vec_rsqrte
#define vec_vsubfp __builtin_vec_vsubfp
#define vec_subc __builtin_vec_subc
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 6fbb7cdcdac..7bf3c660312 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -75,9 +75,7 @@
(UNSPEC_VCTSXS 154)
(UNSPEC_VLOGEFP 155)
(UNSPEC_VEXPTEFP 156)
- (UNSPEC_VRSQRTEFP 157)
- (UNSPEC_VREFP 158)
- ;; 159-162 deleted
+ ;; 157-162 deleted
(UNSPEC_VLSDOI 163)
(UNSPEC_VUPKHSB 167)
(UNSPEC_VUPKHPX 168)
@@ -141,10 +139,11 @@
(UNSPEC_VPERMHI 321)
(UNSPEC_INTERHI 322)
(UNSPEC_INTERLO 323)
- (UNSPEC_VUPKHS_V4SF 324)
- (UNSPEC_VUPKLS_V4SF 325)
- (UNSPEC_VUPKHU_V4SF 326)
- (UNSPEC_VUPKLU_V4SF 327)
+ (UNSPEC_VUPKHS_V4SF 324)
+ (UNSPEC_VUPKLS_V4SF 325)
+ (UNSPEC_VUPKHU_V4SF 326)
+ (UNSPEC_VUPKLU_V4SF 327)
+ (UNSPEC_VNMSUBFP 328)
])
(define_constants
@@ -628,11 +627,64 @@
}")
;; Fused multiply subtract
-(define_insn "altivec_vnmsubfp"
+;; Expander for the AltiVec vnmsubfp pattern.  It only selects which of the
+;; three insn patterns below is emitted:
+;;   altivec_vnmsubfp_1  fused multiply/add enabled, signed zeros honored
+;;   altivec_vnmsubfp_2  fused multiply/add enabled, signed zeros not honored
+;;   altivec_vnmsubfp_3  everything else (UNSPEC form)
+;; Operand 0 is the destination; operands 1-3 are the V4SF register inputs.
+;; The signed-zero tests use SFmode, the element mode of V4SF, so that they
+;; agree with the conditions on altivec_vnmsubfp_1 and altivec_vnmsubfp_2.
+(define_expand "altivec_vnmsubfp"
+ [(match_operand:V4SF 0 "register_operand" "")
+ (match_operand:V4SF 1 "register_operand" "")
+ (match_operand:V4SF 2 "register_operand" "")
+ (match_operand:V4SF 3 "register_operand" "")]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
+{
+ /* Signed zeros matter: keep the explicit negation of (op1 * op2 - op3).  */
+ if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (SFmode))
+ {
+ emit_insn (gen_altivec_vnmsubfp_1 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+ }
+ /* Signed zeros do not matter: use the op3 - (op1 * op2) form.  */
+ else if (TARGET_FUSED_MADD && !HONOR_SIGNED_ZEROS (SFmode))
+ {
+ emit_insn (gen_altivec_vnmsubfp_2 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+ }
+ /* Fused multiply/add is disabled: emit the UNSPEC form.  */
+ else
+ {
+ emit_insn (gen_altivec_vnmsubfp_3 (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+ }
+})
+
+;; vnmsubfp form matched when fused multiply/add is enabled and signed zeros
+;; are honored: the result is the explicit negation of (op1 * op2 - op3).
+(define_insn "altivec_vnmsubfp_1"
[(set (match_operand:V4SF 0 "register_operand" "=v")
- (neg:V4SF (minus:V4SF (mult:V4SF (match_operand:V4SF 1 "register_operand" "v")
- (match_operand:V4SF 2 "register_operand" "v"))
- (match_operand:V4SF 3 "register_operand" "v"))))]
+ (neg:V4SF
+ (minus:V4SF
+ (mult:V4SF
+ (match_operand:V4SF 1 "register_operand" "v")
+ (match_operand:V4SF 2 "register_operand" "v"))
+ (match_operand:V4SF 3 "register_operand" "v"))))]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD
+ && HONOR_SIGNED_ZEROS (SFmode)"
+ "vnmsubfp %0,%1,%2,%3"
+ [(set_attr "type" "vecfloat")])
+
+;; vnmsubfp form matched when fused multiply/add is enabled and signed zeros
+;; are not honored: the operation is written as op3 - (op1 * op2), with no
+;; explicit negation.
+(define_insn "altivec_vnmsubfp_2"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (minus:V4SF
+ (match_operand:V4SF 3 "register_operand" "v")
+ (mult:V4SF
+ (match_operand:V4SF 1 "register_operand" "v")
+ (match_operand:V4SF 2 "register_operand" "v"))))]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD
+ && !HONOR_SIGNED_ZEROS (SFmode)"
+ "vnmsubfp %0,%1,%2,%3"
+ [(set_attr "type" "vecfloat")])
+
+;; Fallback vnmsubfp form: the three inputs are wrapped in UNSPEC_VNMSUBFP
+;; rather than spelled out as arithmetic RTL.  Its only condition is
+;; VECTOR_UNIT_ALTIVEC_P (V4SFmode); it does not test TARGET_FUSED_MADD.
+(define_insn "altivec_vnmsubfp_3"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
+ (match_operand:V4SF 2 "register_operand" "v")
+ (match_operand:V4SF 3 "register_operand" "v")]
+ UNSPEC_VNMSUBFP))]
"VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
"vnmsubfp %0,%1,%2,%3"
[(set_attr "type" "vecfloat")])
@@ -1444,19 +1496,19 @@
"vexptefp %0,%1"
[(set_attr "type" "vecfloat")])
-(define_insn "altivec_vrsqrtefp"
+(define_insn "*altivec_vrsqrtefp"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
- UNSPEC_VRSQRTEFP))]
- "TARGET_ALTIVEC"
+ UNSPEC_RSQRT))]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
"vrsqrtefp %0,%1"
[(set_attr "type" "vecfloat")])
(define_insn "altivec_vrefp"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
- UNSPEC_VREFP))]
- "TARGET_ALTIVEC"
+ UNSPEC_FRES))]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
"vrefp %0,%1"
[(set_attr "type" "vecfloat")])
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def
index e66e8c4318f..f5585bf7982 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -159,6 +159,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VRFIZ, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLB, RS6000_BTC_CONST)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLH, RS6000_BTC_CONST)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLW, RS6000_BTC_CONST)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTFP, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTEFP, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLB, RS6000_BTC_CONST)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLH, RS6000_BTC_CONST)
@@ -269,6 +270,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V8HI, RS6000_BTC_CONST)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V16QI, RS6000_BTC_CONST)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V4SF, RS6000_BTC_CONST)
RS6000_BUILTIN(ALTIVEC_BUILTIN_COPYSIGN_V4SF, RS6000_BTC_CONST)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VRECIPFP, RS6000_BTC_FP_PURE)
/* Altivec overloaded builtins. */
/* For now, don't set the classification for overloaded functions.
@@ -351,10 +353,12 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKS, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKSU, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PERM, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RE, RS6000_BTC_MISC)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RECIP, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RL, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RINT, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_ROUND, RS6000_BTC_MISC)
-RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE, RS6000_BTC_MISC)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRT, RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SEL, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SL, RS6000_BTC_MISC)
RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SLD, RS6000_BTC_MISC)
@@ -959,6 +963,10 @@ RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DF, RS6000_BTC_CONST)
RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DI, RS6000_BTC_CONST)
RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DF, RS6000_BTC_CONST)
RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DI, RS6000_BTC_CONST)
+RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V4SF, RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V2DF, RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_RECIP_V4SF, RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_RECIP_V2DF, RS6000_BTC_FP_PURE)
/* VSX overloaded builtins, add the overloaded functions not present in
Altivec. */
@@ -991,4 +999,5 @@ RS6000_BUILTIN(POWER7_BUILTIN_BPERMD, RS6000_BTC_CONST)
RS6000_BUILTIN(RS6000_BUILTIN_RECIP, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(RS6000_BUILTIN_RECIPF, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(RS6000_BUILTIN_RSQRTF, RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(RS6000_BUILTIN_RSQRT, RS6000_BTC_FP_PURE)
RS6000_BUILTIN(RS6000_BUILTIN_BSWAP_HI, RS6000_BTC_CONST)
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 0afa096cb55..bf6ac4b2de2 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -363,6 +363,16 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
builtin_define ("__builtin_vsx_xvnmsubasp=__builtin_vsx_xvnmsubsp");
builtin_define ("__builtin_vsx_xvnmsubmsp=__builtin_vsx_xvnmsubsp");
}
+ if (RS6000_RECIP_HAVE_RE_P (DFmode))
+ builtin_define ("__RECIP__");
+ if (RS6000_RECIP_HAVE_RE_P (SFmode))
+ builtin_define ("__RECIPF__");
+ if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode))
+ builtin_define ("__RSQRTE__");
+ if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode))
+ builtin_define ("__RSQRTEF__");
+ if (TARGET_RECIP_PRECISION)
+ builtin_define ("__RECIP_PRECISION__");
/* Tell users they can use __builtin_bswap{16,64}. */
builtin_define ("__HAVE_BSWAP__");
@@ -460,10 +470,22 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
RS6000_BTI_void, RS6000_BTI_bool_V16QI, 0, 0 },
{ ALTIVEC_BUILTIN_VEC_RE, ALTIVEC_BUILTIN_VREFP,
RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+ { ALTIVEC_BUILTIN_VEC_RE, VSX_BUILTIN_XVREDP,
+ RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
{ ALTIVEC_BUILTIN_VEC_ROUND, ALTIVEC_BUILTIN_VRFIN,
RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+ { ALTIVEC_BUILTIN_VEC_RECIP, ALTIVEC_BUILTIN_VRECIPFP,
+ RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 },
+ { ALTIVEC_BUILTIN_VEC_RECIP, VSX_BUILTIN_RECIP_V2DF,
+ RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0 },
+ { ALTIVEC_BUILTIN_VEC_RSQRT, ALTIVEC_BUILTIN_VRSQRTFP,
+ RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+ { ALTIVEC_BUILTIN_VEC_RSQRT, VSX_BUILTIN_VEC_RSQRT_V2DF,
+ RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
{ ALTIVEC_BUILTIN_VEC_RSQRTE, ALTIVEC_BUILTIN_VRSQRTEFP,
RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+ { ALTIVEC_BUILTIN_VEC_RSQRTE, VSX_BUILTIN_XVRSQRTEDP,
+ RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
{ ALTIVEC_BUILTIN_VEC_TRUNC, ALTIVEC_BUILTIN_VRFIZ,
RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
{ ALTIVEC_BUILTIN_VEC_TRUNC, VSX_BUILTIN_XVRDPIZ,
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index b41dbf39387..6e846d75f15 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -105,9 +105,8 @@ extern void rs6000_split_compare_and_swap (rtx, rtx, rtx, rtx, rtx);
extern void rs6000_expand_compare_and_swapqhi (rtx, rtx, rtx, rtx);
extern void rs6000_split_compare_and_swapqhi (rtx, rtx, rtx, rtx, rtx, rtx);
extern void rs6000_split_lock_test_and_set (rtx, rtx, rtx, rtx);
-extern void rs6000_emit_swdivsf (rtx, rtx, rtx);
-extern void rs6000_emit_swdivdf (rtx, rtx, rtx);
-extern void rs6000_emit_swrsqrtsf (rtx, rtx);
+extern void rs6000_emit_swdiv (rtx, rtx, rtx, bool);
+extern void rs6000_emit_swrsqrt (rtx, rtx);
extern void output_toc (FILE *, rtx, int, enum machine_mode);
extern void rs6000_initialize_trampoline (rtx, rtx, rtx);
extern rtx rs6000_longcall_ref (rtx);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 13325127e87..844bee46690 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -322,6 +322,61 @@ int rs6000_vector_align[NUM_MACHINE_MODES];
/* Map selected modes to types for builtins. */
static GTY(()) tree builtin_mode_to_type[MAX_MACHINE_MODE][2];
+
+/* What modes to automatically generate reciprocal divide estimate (fre) and
+ reciprocal sqrt (frsqrte) for. */
+unsigned char rs6000_recip_bits[MAX_MACHINE_MODE];
+
+/* Masks to determine which reciprocal estimate instructions to generate
+ automatically. */
+/* Bit layout: bits 0-3 select the divide estimate and bits 4-7 select the
+ reciprocal square root estimate, one bit per mode in the order SF, DF,
+ V4SF, V2DF. These bits are what rs6000_recip_control is built from. */
+enum rs6000_recip_mask {
+ RECIP_SF_DIV = 0x001, /* Use divide estimate */
+ RECIP_DF_DIV = 0x002,
+ RECIP_V4SF_DIV = 0x004,
+ RECIP_V2DF_DIV = 0x008,
+
+ RECIP_SF_RSQRT = 0x010, /* Use reciprocal sqrt estimate. */
+ RECIP_DF_RSQRT = 0x020,
+ RECIP_V4SF_RSQRT = 0x040,
+ RECIP_V2DF_RSQRT = 0x080,
+
+ /* Various combination of flags for -mrecip=xxx. */
+ RECIP_NONE = 0,
+ /* Every divide and rsqrt bit defined above. */
+ RECIP_ALL = (RECIP_SF_DIV | RECIP_DF_DIV | RECIP_V4SF_DIV
+ | RECIP_V2DF_DIV | RECIP_SF_RSQRT | RECIP_DF_RSQRT
+ | RECIP_V4SF_RSQRT | RECIP_V2DF_RSQRT),
+
+ /* High precision machines enable everything. */
+ RECIP_HIGH_PRECISION = RECIP_ALL,
+
+ /* On low precision machines like the power5, don't enable double precision
+ reciprocal square root estimate, since it isn't accurate enough. */
+ RECIP_LOW_PRECISION = (RECIP_ALL & ~(RECIP_DF_RSQRT | RECIP_V2DF_RSQRT))
+};
+
+static unsigned int rs6000_recip_control;
+static const char *rs6000_recip_name;
+
+/* -mrecip options. */
+/* Table mapping each -mrecip=<opt> keyword to the rs6000_recip_mask bits it
+ stands for. rs6000_override_options compares each comma-separated token
+ against the string field with strcmp and then ORs the mask into
+ rs6000_recip_control, or clears it when the token is prefixed with '!'.
+ The "default" keyword is not in this table; it is handled separately in
+ rs6000_override_options. */
+static struct
+{
+ const char *string; /* option name */
+ unsigned int mask; /* mask bits to set */
+} recip_options[] = {
+ { "all", RECIP_ALL },
+ { "none", RECIP_NONE },
+ /* "div" covers both precisions; "divf"/"divd" pick single/double only. */
+ { "div", (RECIP_SF_DIV | RECIP_DF_DIV | RECIP_V4SF_DIV
+ | RECIP_V2DF_DIV) },
+ { "divf", (RECIP_SF_DIV | RECIP_V4SF_DIV) },
+ { "divd", (RECIP_DF_DIV | RECIP_V2DF_DIV) },
+ /* Same split for the reciprocal square root estimate. */
+ { "rsqrt", (RECIP_SF_RSQRT | RECIP_DF_RSQRT | RECIP_V4SF_RSQRT
+ | RECIP_V2DF_RSQRT) },
+ { "rsqrtf", (RECIP_SF_RSQRT | RECIP_V4SF_RSQRT) },
+ { "rsqrtd", (RECIP_DF_RSQRT | RECIP_V2DF_RSQRT) },
+};
+
+/* 2 argument gen function typedef. */
+typedef rtx (*gen_2arg_fn_t) (rtx, rtx, rtx);
+
/* Target cpu costs. */
@@ -1761,6 +1816,27 @@ rs6000_debug_reg_global (void)
if (nl)
fputs (nl, stderr);
+ if (rs6000_recip_control)
+ {
+ fprintf (stderr, "\nReciprocal mask = 0x%x\n", rs6000_recip_control);
+
+ for (m = 0; m < NUM_MACHINE_MODES; ++m)
+ if (rs6000_recip_bits[m])
+ {
+ fprintf (stderr,
+ "Reciprocal estimate mode: %-5s divide: %s rsqrt: %s\n",
+ GET_MODE_NAME (m),
+ (RS6000_RECIP_AUTO_RE_P (m)
+ ? "auto"
+ : (RS6000_RECIP_HAVE_RE_P (m) ? "have" : "none")),
+ (RS6000_RECIP_AUTO_RSQRTE_P (m)
+ ? "auto"
+ : (RS6000_RECIP_HAVE_RSQRTE_P (m) ? "have" : "none")));
+ }
+
+ fputs ("\n", stderr);
+ }
+
switch (rs6000_sched_costly_dep)
{
case max_dep_latency:
@@ -1968,8 +2044,9 @@ rs6000_init_hard_regno_mode_ok (void)
rs6000_constraints[RS6000_CONSTRAINT_wa] = VSX_REGS;
rs6000_constraints[RS6000_CONSTRAINT_wf] = VSX_REGS;
rs6000_constraints[RS6000_CONSTRAINT_wd] = VSX_REGS;
- if (TARGET_VSX_SCALAR_DOUBLE)
- rs6000_constraints[RS6000_CONSTRAINT_ws] = VSX_REGS;
+ rs6000_constraints[RS6000_CONSTRAINT_ws] = (TARGET_VSX_SCALAR_MEMORY
+ ? VSX_REGS
+ : FLOAT_REGS);
}
if (TARGET_ALTIVEC)
@@ -2047,8 +2124,111 @@ rs6000_init_hard_regno_mode_ok (void)
if (TARGET_E500_DOUBLE)
rs6000_class_max_nregs[DFmode][GENERAL_REGS] = 1;
+ /* Calculate which modes to automatically generate code to use the
+ reciprocal divide and square root instructions. In the future, possibly
+ automatically generate the instructions even if the user did not specify
+ -mrecip. The older machines double precision reciprocal sqrt estimate is
+ not accurate enough. */
+ memset (rs6000_recip_bits, 0, sizeof (rs6000_recip_bits));
+ if (TARGET_FRES)
+ rs6000_recip_bits[SFmode] = RS6000_RECIP_MASK_HAVE_RE;
+ if (TARGET_FRE)
+ rs6000_recip_bits[DFmode] = RS6000_RECIP_MASK_HAVE_RE;
+ if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode))
+ rs6000_recip_bits[V4SFmode] = RS6000_RECIP_MASK_HAVE_RE;
+ if (VECTOR_UNIT_VSX_P (V2DFmode))
+ rs6000_recip_bits[V2DFmode] = RS6000_RECIP_MASK_HAVE_RE;
+
+ if (TARGET_FRSQRTES)
+ rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+ if (TARGET_FRSQRTE)
+ rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+ if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode))
+ rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+ if (VECTOR_UNIT_VSX_P (V2DFmode))
+ rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+
+ if (rs6000_recip_control)
+ {
+ if (!TARGET_FUSED_MADD)
+ warning (0, "-mrecip requires -mfused-madd");
+ if (!flag_finite_math_only)
+ warning (0, "-mrecip requires -ffinite-math or -ffast-math");
+ if (flag_trapping_math)
+ warning (0, "-mrecip requires -fno-trapping-math or -ffast-math");
+ if (!flag_reciprocal_math)
+ warning (0, "-mrecip requires -freciprocal-math or -ffast-math");
+ if (TARGET_FUSED_MADD && flag_finite_math_only && !flag_trapping_math
+ && flag_reciprocal_math)
+ {
+ if (RS6000_RECIP_HAVE_RE_P (SFmode)
+ && (rs6000_recip_control & RECIP_SF_DIV) != 0)
+ rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+ if (RS6000_RECIP_HAVE_RE_P (DFmode)
+ && (rs6000_recip_control & RECIP_DF_DIV) != 0)
+ rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+ if (RS6000_RECIP_HAVE_RE_P (V4SFmode)
+ && (rs6000_recip_control & RECIP_V4SF_DIV) != 0)
+ rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+ if (RS6000_RECIP_HAVE_RE_P (V2DFmode)
+ && (rs6000_recip_control & RECIP_V2DF_DIV) != 0)
+ rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+ if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode)
+ && (rs6000_recip_control & RECIP_SF_RSQRT) != 0)
+ rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+
+ if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode)
+ && (rs6000_recip_control & RECIP_DF_RSQRT) != 0)
+ rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+
+ if (RS6000_RECIP_HAVE_RSQRTE_P (V4SFmode)
+ && (rs6000_recip_control & RECIP_V4SF_RSQRT) != 0)
+ rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+
+ if (RS6000_RECIP_HAVE_RSQRTE_P (V2DFmode)
+ && (rs6000_recip_control & RECIP_V2DF_RSQRT) != 0)
+ rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+ }
+ }
+
if (TARGET_DEBUG_REG)
rs6000_debug_reg_global ();
+
+ if (TARGET_DEBUG_COST || TARGET_DEBUG_REG)
+ fprintf (stderr,
+ "SImode variable mult cost = %d\n"
+ "SImode constant mult cost = %d\n"
+ "SImode short constant mult cost = %d\n"
+ "DImode multipliciation cost = %d\n"
+ "SImode division cost = %d\n"
+ "DImode division cost = %d\n"
+ "Simple fp operation cost = %d\n"
+ "DFmode multiplication cost = %d\n"
+ "SFmode division cost = %d\n"
+ "DFmode division cost = %d\n"
+ "cache line size = %d\n"
+ "l1 cache size = %d\n"
+ "l2 cache size = %d\n"
+ "simultaneous prefetches = %d\n"
+ "\n",
+ rs6000_cost->mulsi,
+ rs6000_cost->mulsi_const,
+ rs6000_cost->mulsi_const9,
+ rs6000_cost->muldi,
+ rs6000_cost->divsi,
+ rs6000_cost->divdi,
+ rs6000_cost->fp,
+ rs6000_cost->dmul,
+ rs6000_cost->sdiv,
+ rs6000_cost->ddiv,
+ rs6000_cost->cache_line_size,
+ rs6000_cost->l1_cache_size,
+ rs6000_cost->l2_cache_size,
+ rs6000_cost->simultaneous_prefetches);
}
#if TARGET_MACHO
@@ -2223,15 +2403,16 @@ rs6000_override_options (const char *default_cpu)
| MASK_MFCRF | MASK_POPCNTB | MASK_FPRND},
{"power6", PROCESSOR_POWER6,
POWERPC_BASE_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_PPC_GFXOPT
- | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP},
+ | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP
+ | MASK_RECIP_PRECISION},
{"power6x", PROCESSOR_POWER6,
POWERPC_BASE_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_PPC_GFXOPT
| MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP
- | MASK_MFPGPR},
+ | MASK_MFPGPR | MASK_RECIP_PRECISION},
{"power7", PROCESSOR_POWER7,
POWERPC_7400_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_MFCRF
| MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP | MASK_POPCNTD
- | MASK_VSX}, /* Don't add MASK_ISEL by default */
+ | MASK_VSX| MASK_RECIP_PRECISION}, /* Don't add MASK_ISEL by default */
{"powerpc", PROCESSOR_POWERPC, POWERPC_BASE_MASK},
{"powerpc64", PROCESSOR_POWERPC64,
POWERPC_BASE_MASK | MASK_PPC_GFXOPT | MASK_POWERPC64},
@@ -2259,7 +2440,24 @@ rs6000_override_options (const char *default_cpu)
| MASK_PPC_GFXOPT | MASK_POWERPC64 | MASK_ALTIVEC
| MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_MULHW
| MASK_DLMZB | MASK_CMPB | MASK_MFPGPR | MASK_DFP
- | MASK_POPCNTD | MASK_VSX | MASK_ISEL | MASK_NO_UPDATE)
+ | MASK_POPCNTD | MASK_VSX | MASK_ISEL | MASK_NO_UPDATE
+ | MASK_RECIP_PRECISION)
+ };
+
+ /* Masks for instructions set at various powerpc ISAs. */
+ enum {
+ ISA_2_1_MASKS = MASK_MFCRF,
+ ISA_2_2_MASKS = (ISA_2_1_MASKS | MASK_POPCNTB | MASK_FPRND),
+
+ /* For ISA 2.05, do not add MFPGPR, since it isn't in ISA 2.06, and
+ don't add ALTIVEC, since in general it isn't a win on power6. */
+ ISA_2_5_MASKS = (ISA_2_2_MASKS | MASK_CMPB | MASK_RECIP_PRECISION
+ | MASK_DFP),
+
+ /* For ISA 2.06, don't add ISEL, since in general it isn't a win, but
+ altivec is a win so enable it. */
+ ISA_2_6_MASKS = (ISA_2_5_MASKS | MASK_ALTIVEC | MASK_POPCNTD
+ | MASK_VSX | MASK_RECIP_PRECISION)
};
/* Set the pointer size. */
@@ -2394,10 +2592,17 @@ rs6000_override_options (const char *default_cpu)
warning (0, msg);
target_flags &= ~ MASK_VSX;
}
- else if (TARGET_VSX && !TARGET_ALTIVEC)
- target_flags |= MASK_ALTIVEC;
}
+ /* For the newer switches (vsx, dfp, etc.) set some of the older options,
+ unless the user explicitly used the -mno-<option> to disable the code. */
+ if (TARGET_VSX)
+ target_flags |= (ISA_2_6_MASKS & ~target_flags_explicit);
+ else if (TARGET_DFP)
+ target_flags |= (ISA_2_5_MASKS & ~target_flags_explicit);
+ else if (TARGET_ALTIVEC)
+ target_flags |= (MASK_PPC_GFXOPT & ~target_flags_explicit);
+
/* Set debug flags */
if (rs6000_debug_name)
{
@@ -2813,6 +3018,52 @@ rs6000_override_options (const char *default_cpu)
the DERAT mispredict penalty. */
TARGET_AVOID_XFORM = (rs6000_cpu == PROCESSOR_POWER6 && TARGET_CMPB);
+ /* Set the -mrecip options. */
+ if (rs6000_recip_name)
+ {
+ char *p = ASTRDUP (rs6000_recip_name);
+ char *q;
+ unsigned int mask, i;
+ bool invert;
+
+ while ((q = strtok (p, ",")) != NULL)
+ {
+ p = NULL;
+ if (*q == '!')
+ {
+ invert = true;
+ q++;
+ }
+ else
+ invert = false;
+
+ if (!strcmp (q, "default"))
+ mask = ((TARGET_RECIP_PRECISION)
+ ? RECIP_HIGH_PRECISION : RECIP_LOW_PRECISION);
+ else
+ {
+ for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+ if (!strcmp (q, recip_options[i].string))
+ {
+ mask = recip_options[i].mask;
+ break;
+ }
+
+ if (i == ARRAY_SIZE (recip_options))
+ {
+ error ("Unknown option for -mrecip=%s", q);
+ invert = false;
+ mask = 0;
+ }
+ }
+
+ if (invert)
+ rs6000_recip_control &= ~mask;
+ else
+ rs6000_recip_control |= mask;
+ }
+ }
+
rs6000_init_hard_regno_mode_ok ();
}
@@ -3127,12 +3378,10 @@ rs6000_builtin_vectorized_function (tree fndecl, tree type_out,
{
enum machine_mode in_mode, out_mode;
int in_n, out_n;
- enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
if (TREE_CODE (type_out) != VECTOR_TYPE
|| TREE_CODE (type_in) != VECTOR_TYPE
- || !TARGET_VECTORIZE_BUILTINS
- || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
+ || !TARGET_VECTORIZE_BUILTINS)
return NULL_TREE;
out_mode = TYPE_MODE (TREE_TYPE (type_out));
@@ -3140,111 +3389,151 @@ rs6000_builtin_vectorized_function (tree fndecl, tree type_out,
in_mode = TYPE_MODE (TREE_TYPE (type_in));
in_n = TYPE_VECTOR_SUBPARTS (type_in);
- switch (fn)
+ if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
{
- case BUILT_IN_COPYSIGN:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_CPSGNDP];
- break;
- case BUILT_IN_COPYSIGNF:
- if (out_mode != SFmode || out_n != 4
- || in_mode != SFmode || in_n != 4)
- break;
- if (VECTOR_UNIT_VSX_P (V4SFmode))
- return rs6000_builtin_decls[VSX_BUILTIN_CPSGNSP];
- if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
- return rs6000_builtin_decls[ALTIVEC_BUILTIN_COPYSIGN_V4SF];
- break;
- case BUILT_IN_SQRT:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTDP];
- break;
- case BUILT_IN_SQRTF:
- if (VECTOR_UNIT_VSX_P (V4SFmode)
- && out_mode == SFmode && out_n == 4
- && in_mode == SFmode && in_n == 4)
- return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTSP];
- break;
- case BUILT_IN_CEIL:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIP];
- break;
- case BUILT_IN_CEILF:
- if (out_mode != SFmode || out_n != 4
- || in_mode != SFmode || in_n != 4)
- break;
- if (VECTOR_UNIT_VSX_P (V4SFmode))
- return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIP];
- if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
- return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIP];
- break;
- case BUILT_IN_FLOOR:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIM];
- break;
- case BUILT_IN_FLOORF:
- if (out_mode != SFmode || out_n != 4
- || in_mode != SFmode || in_n != 4)
- break;
- if (VECTOR_UNIT_VSX_P (V4SFmode))
- return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIM];
- if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
- return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
- break;
- case BUILT_IN_TRUNC:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIZ];
- break;
- case BUILT_IN_TRUNCF:
- if (out_mode != SFmode || out_n != 4
- || in_mode != SFmode || in_n != 4)
- break;
- if (VECTOR_UNIT_VSX_P (V4SFmode))
- return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIZ];
- if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
- return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIZ];
- break;
- case BUILT_IN_NEARBYINT:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && flag_unsafe_math_optimizations
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRDPI];
- break;
- case BUILT_IN_NEARBYINTF:
- if (VECTOR_UNIT_VSX_P (V4SFmode)
- && flag_unsafe_math_optimizations
- && out_mode == SFmode && out_n == 4
- && in_mode == SFmode && in_n == 4)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRSPI];
- break;
- case BUILT_IN_RINT:
- if (VECTOR_UNIT_VSX_P (V2DFmode)
- && !flag_trapping_math
- && out_mode == DFmode && out_n == 2
- && in_mode == DFmode && in_n == 2)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIC];
- break;
- case BUILT_IN_RINTF:
- if (VECTOR_UNIT_VSX_P (V4SFmode)
- && !flag_trapping_math
- && out_mode == SFmode && out_n == 4
- && in_mode == SFmode && in_n == 4)
- return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIC];
- break;
- default:
- break;
+ enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
+ switch (fn)
+ {
+ case BUILT_IN_COPYSIGN:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_CPSGNDP];
+ break;
+ case BUILT_IN_COPYSIGNF:
+ if (out_mode != SFmode || out_n != 4
+ || in_mode != SFmode || in_n != 4)
+ break;
+ if (VECTOR_UNIT_VSX_P (V4SFmode))
+ return rs6000_builtin_decls[VSX_BUILTIN_CPSGNSP];
+ if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_COPYSIGN_V4SF];
+ break;
+ case BUILT_IN_SQRT:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTDP];
+ break;
+ case BUILT_IN_SQRTF:
+ if (VECTOR_UNIT_VSX_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTSP];
+ break;
+ case BUILT_IN_CEIL:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIP];
+ break;
+ case BUILT_IN_CEILF:
+ if (out_mode != SFmode || out_n != 4
+ || in_mode != SFmode || in_n != 4)
+ break;
+ if (VECTOR_UNIT_VSX_P (V4SFmode))
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIP];
+ if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIP];
+ break;
+ case BUILT_IN_FLOOR:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIM];
+ break;
+ case BUILT_IN_FLOORF:
+ if (out_mode != SFmode || out_n != 4
+ || in_mode != SFmode || in_n != 4)
+ break;
+ if (VECTOR_UNIT_VSX_P (V4SFmode))
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIM];
+ if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
+ break;
+ case BUILT_IN_TRUNC:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIZ];
+ break;
+ case BUILT_IN_TRUNCF:
+ if (out_mode != SFmode || out_n != 4
+ || in_mode != SFmode || in_n != 4)
+ break;
+ if (VECTOR_UNIT_VSX_P (V4SFmode))
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIZ];
+ if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIZ];
+ break;
+ case BUILT_IN_NEARBYINT:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && flag_unsafe_math_optimizations
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRDPI];
+ break;
+ case BUILT_IN_NEARBYINTF:
+ if (VECTOR_UNIT_VSX_P (V4SFmode)
+ && flag_unsafe_math_optimizations
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRSPI];
+ break;
+ case BUILT_IN_RINT:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && !flag_trapping_math
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIC];
+ break;
+ case BUILT_IN_RINTF:
+ if (VECTOR_UNIT_VSX_P (V4SFmode)
+ && !flag_trapping_math
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIC];
+ break;
+ default:
+ break;
+ }
}
+
+ else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+ {
+ enum rs6000_builtins fn
+ = (enum rs6000_builtins)DECL_FUNCTION_CODE (fndecl);
+ switch (fn)
+ {
+ case RS6000_BUILTIN_RSQRTF:
+ if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRSQRTFP];
+ break;
+ case RS6000_BUILTIN_RSQRT:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V2DF];
+ break;
+ case RS6000_BUILTIN_RECIPF:
+ if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRECIPFP];
+ break;
+ case RS6000_BUILTIN_RECIP:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_RECIP_V2DF];
+ break;
+ default:
+ break;
+ }
+ }
+
return NULL_TREE;
}
@@ -3604,6 +3893,13 @@ rs6000_handle_option (size_t code, const char *arg, int value)
target_flags_explicit |= MASK_SOFT_FLOAT;
rs6000_single_float = rs6000_double_float = 0;
}
+ break;
+ case OPT_mrecip:
+ rs6000_recip_name = (value) ? "default" : "none";
+ break;
+
+ case OPT_mrecip_:
+ rs6000_recip_name = arg;
break;
}
return true;
@@ -8791,6 +9087,7 @@ static struct builtin_description bdesc_2arg[] =
{ MASK_ALTIVEC, CODE_FOR_altivec_vpkshus, "__builtin_altivec_vpkshus", ALTIVEC_BUILTIN_VPKSHUS },
{ MASK_ALTIVEC, CODE_FOR_altivec_vpkuwus, "__builtin_altivec_vpkuwus", ALTIVEC_BUILTIN_VPKUWUS },
{ MASK_ALTIVEC, CODE_FOR_altivec_vpkswus, "__builtin_altivec_vpkswus", ALTIVEC_BUILTIN_VPKSWUS },
+ { MASK_ALTIVEC, CODE_FOR_recipv4sf3, "__builtin_altivec_vrecipdivfp", ALTIVEC_BUILTIN_VRECIPFP },
{ MASK_ALTIVEC, CODE_FOR_vrotlv16qi3, "__builtin_altivec_vrlb", ALTIVEC_BUILTIN_VRLB },
{ MASK_ALTIVEC, CODE_FOR_vrotlv8hi3, "__builtin_altivec_vrlh", ALTIVEC_BUILTIN_VRLH },
{ MASK_ALTIVEC, CODE_FOR_vrotlv4si3, "__builtin_altivec_vrlw", ALTIVEC_BUILTIN_VRLW },
@@ -8833,6 +9130,7 @@ static struct builtin_description bdesc_2arg[] =
{ MASK_VSX, CODE_FOR_subv2df3, "__builtin_vsx_xvsubdp", VSX_BUILTIN_XVSUBDP },
{ MASK_VSX, CODE_FOR_mulv2df3, "__builtin_vsx_xvmuldp", VSX_BUILTIN_XVMULDP },
{ MASK_VSX, CODE_FOR_divv2df3, "__builtin_vsx_xvdivdp", VSX_BUILTIN_XVDIVDP },
+ { MASK_VSX, CODE_FOR_recipv2df3, "__builtin_vsx_xvrecipdivdp", VSX_BUILTIN_RECIP_V2DF },
{ MASK_VSX, CODE_FOR_sminv2df3, "__builtin_vsx_xvmindp", VSX_BUILTIN_XVMINDP },
{ MASK_VSX, CODE_FOR_smaxv2df3, "__builtin_vsx_xvmaxdp", VSX_BUILTIN_XVMAXDP },
{ MASK_VSX, CODE_FOR_vsx_tdivv2df3_fe, "__builtin_vsx_xvtdivdp_fe", VSX_BUILTIN_XVTDIVDP_FE },
@@ -8845,6 +9143,7 @@ static struct builtin_description bdesc_2arg[] =
{ MASK_VSX, CODE_FOR_subv4sf3, "__builtin_vsx_xvsubsp", VSX_BUILTIN_XVSUBSP },
{ MASK_VSX, CODE_FOR_mulv4sf3, "__builtin_vsx_xvmulsp", VSX_BUILTIN_XVMULSP },
{ MASK_VSX, CODE_FOR_divv4sf3, "__builtin_vsx_xvdivsp", VSX_BUILTIN_XVDIVSP },
+ { MASK_VSX, CODE_FOR_recipv4sf3, "__builtin_vsx_xvrecipdivsp", VSX_BUILTIN_RECIP_V4SF },
{ MASK_VSX, CODE_FOR_sminv4sf3, "__builtin_vsx_xvminsp", VSX_BUILTIN_XVMINSP },
{ MASK_VSX, CODE_FOR_smaxv4sf3, "__builtin_vsx_xvmaxsp", VSX_BUILTIN_XVMAXSP },
{ MASK_VSX, CODE_FOR_vsx_tdivv4sf3_fe, "__builtin_vsx_xvtdivsp_fe", VSX_BUILTIN_XVTDIVSP_FE },
@@ -8961,6 +9260,7 @@ static struct builtin_description bdesc_2arg[] =
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_packsu", ALTIVEC_BUILTIN_VEC_PACKSU },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vpkswus", ALTIVEC_BUILTIN_VEC_VPKSWUS },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vpkshus", ALTIVEC_BUILTIN_VEC_VPKSHUS },
+ { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_recipdiv", ALTIVEC_BUILTIN_VEC_RECIP },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rl", ALTIVEC_BUILTIN_VEC_RL },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vrlw", ALTIVEC_BUILTIN_VEC_VRLW },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vrlh", ALTIVEC_BUILTIN_VEC_VRLH },
@@ -9290,12 +9590,13 @@ static struct builtin_description bdesc_1arg[] =
{
{ MASK_ALTIVEC, CODE_FOR_altivec_vexptefp, "__builtin_altivec_vexptefp", ALTIVEC_BUILTIN_VEXPTEFP },
{ MASK_ALTIVEC, CODE_FOR_altivec_vlogefp, "__builtin_altivec_vlogefp", ALTIVEC_BUILTIN_VLOGEFP },
- { MASK_ALTIVEC, CODE_FOR_altivec_vrefp, "__builtin_altivec_vrefp", ALTIVEC_BUILTIN_VREFP },
+ { MASK_ALTIVEC, CODE_FOR_rev4sf2, "__builtin_altivec_vrefp", ALTIVEC_BUILTIN_VREFP },
{ MASK_ALTIVEC, CODE_FOR_vector_floorv4sf2, "__builtin_altivec_vrfim", ALTIVEC_BUILTIN_VRFIM },
{ MASK_ALTIVEC, CODE_FOR_altivec_vrfin, "__builtin_altivec_vrfin", ALTIVEC_BUILTIN_VRFIN },
{ MASK_ALTIVEC, CODE_FOR_vector_ceilv4sf2, "__builtin_altivec_vrfip", ALTIVEC_BUILTIN_VRFIP },
{ MASK_ALTIVEC, CODE_FOR_vector_btruncv4sf2, "__builtin_altivec_vrfiz", ALTIVEC_BUILTIN_VRFIZ },
- { MASK_ALTIVEC, CODE_FOR_altivec_vrsqrtefp, "__builtin_altivec_vrsqrtefp", ALTIVEC_BUILTIN_VRSQRTEFP },
+ { MASK_ALTIVEC, CODE_FOR_rsqrtv4sf2, "__builtin_altivec_vrsqrtfp", ALTIVEC_BUILTIN_VRSQRTFP },
+ { MASK_ALTIVEC, CODE_FOR_rsqrtev4sf2, "__builtin_altivec_vrsqrtefp", ALTIVEC_BUILTIN_VRSQRTEFP },
{ MASK_ALTIVEC, CODE_FOR_altivec_vspltisb, "__builtin_altivec_vspltisb", ALTIVEC_BUILTIN_VSPLTISB },
{ MASK_ALTIVEC, CODE_FOR_altivec_vspltish, "__builtin_altivec_vspltish", ALTIVEC_BUILTIN_VSPLTISH },
{ MASK_ALTIVEC, CODE_FOR_altivec_vspltisw, "__builtin_altivec_vspltisw", ALTIVEC_BUILTIN_VSPLTISW },
@@ -9308,14 +9609,16 @@ static struct builtin_description bdesc_1arg[] =
{ MASK_VSX, CODE_FOR_negv2df2, "__builtin_vsx_xvnegdp", VSX_BUILTIN_XVNEGDP },
{ MASK_VSX, CODE_FOR_sqrtv2df2, "__builtin_vsx_xvsqrtdp", VSX_BUILTIN_XVSQRTDP },
- { MASK_VSX, CODE_FOR_vsx_rsqrtev2df2, "__builtin_vsx_xvrsqrtedp", VSX_BUILTIN_XVRSQRTEDP },
+ { MASK_VSX, CODE_FOR_rsqrtv2df2, "__builtin_vsx_xvrsqrtdp", VSX_BUILTIN_VEC_RSQRT_V2DF },
+ { MASK_VSX, CODE_FOR_rsqrtev2df2, "__builtin_vsx_xvrsqrtedp", VSX_BUILTIN_XVRSQRTEDP },
{ MASK_VSX, CODE_FOR_vsx_tsqrtv2df2_fe, "__builtin_vsx_xvtsqrtdp_fe", VSX_BUILTIN_XVTSQRTDP_FE },
{ MASK_VSX, CODE_FOR_vsx_tsqrtv2df2_fg, "__builtin_vsx_xvtsqrtdp_fg", VSX_BUILTIN_XVTSQRTDP_FG },
{ MASK_VSX, CODE_FOR_vsx_frev2df2, "__builtin_vsx_xvredp", VSX_BUILTIN_XVREDP },
{ MASK_VSX, CODE_FOR_negv4sf2, "__builtin_vsx_xvnegsp", VSX_BUILTIN_XVNEGSP },
{ MASK_VSX, CODE_FOR_sqrtv4sf2, "__builtin_vsx_xvsqrtsp", VSX_BUILTIN_XVSQRTSP },
- { MASK_VSX, CODE_FOR_vsx_rsqrtev4sf2, "__builtin_vsx_xvrsqrtesp", VSX_BUILTIN_XVRSQRTESP },
+ { MASK_VSX, CODE_FOR_rsqrtv4sf2, "__builtin_vsx_xvrsqrtsp", VSX_BUILTIN_VEC_RSQRT_V4SF },
+ { MASK_VSX, CODE_FOR_rsqrtev4sf2, "__builtin_vsx_xvrsqrtesp", VSX_BUILTIN_XVRSQRTESP },
{ MASK_VSX, CODE_FOR_vsx_tsqrtv4sf2_fe, "__builtin_vsx_xvtsqrtsp_fe", VSX_BUILTIN_XVTSQRTSP_FE },
{ MASK_VSX, CODE_FOR_vsx_tsqrtv4sf2_fg, "__builtin_vsx_xvtsqrtsp_fg", VSX_BUILTIN_XVTSQRTSP_FG },
{ MASK_VSX, CODE_FOR_vsx_frev4sf2, "__builtin_vsx_xvresp", VSX_BUILTIN_XVRESP },
@@ -9374,6 +9677,7 @@ static struct builtin_description bdesc_1arg[] =
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_mtvscr", ALTIVEC_BUILTIN_VEC_MTVSCR },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_re", ALTIVEC_BUILTIN_VEC_RE },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_round", ALTIVEC_BUILTIN_VEC_ROUND },
+ { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rsqrt", ALTIVEC_BUILTIN_VEC_RSQRT },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rsqrte", ALTIVEC_BUILTIN_VEC_RSQRTE },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_trunc", ALTIVEC_BUILTIN_VEC_TRUNC },
{ MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_unpackh", ALTIVEC_BUILTIN_VEC_UNPACKH },
@@ -10890,73 +11194,83 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
rtx ret;
bool success;
- if (fcode == RS6000_BUILTIN_RECIP)
+ switch (fcode)
+ {
+ case RS6000_BUILTIN_RECIP:
return rs6000_expand_binop_builtin (CODE_FOR_recipdf3, exp, target);
- if (fcode == RS6000_BUILTIN_RECIPF)
+ case RS6000_BUILTIN_RECIPF:
return rs6000_expand_binop_builtin (CODE_FOR_recipsf3, exp, target);
- if (fcode == RS6000_BUILTIN_RSQRTF)
+ case RS6000_BUILTIN_RSQRTF:
return rs6000_expand_unop_builtin (CODE_FOR_rsqrtsf2, exp, target);
- if (fcode == RS6000_BUILTIN_BSWAP_HI)
- return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target);
+ case RS6000_BUILTIN_RSQRT:
+ return rs6000_expand_unop_builtin (CODE_FOR_rsqrtdf2, exp, target);
- if (fcode == POWER7_BUILTIN_BPERMD)
- return rs6000_expand_binop_builtin (((TARGET_64BIT)
- ? CODE_FOR_bpermd_di
- : CODE_FOR_bpermd_si), exp, target);
+ case RS6000_BUILTIN_BSWAP_HI:
+ return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target);
- if (fcode == ALTIVEC_BUILTIN_MASK_FOR_LOAD
- || fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE)
- {
- int icode = (int) CODE_FOR_altivec_lvsr;
- enum machine_mode tmode = insn_data[icode].operand[0].mode;
- enum machine_mode mode = insn_data[icode].operand[1].mode;
- tree arg;
- rtx op, addr, pat;
+ case POWER7_BUILTIN_BPERMD:
+ return rs6000_expand_binop_builtin (((TARGET_64BIT)
+ ? CODE_FOR_bpermd_di
+ : CODE_FOR_bpermd_si), exp, target);
- gcc_assert (TARGET_ALTIVEC);
+ case ALTIVEC_BUILTIN_MASK_FOR_LOAD:
+ case ALTIVEC_BUILTIN_MASK_FOR_STORE:
+ {
+ int icode = (int) CODE_FOR_altivec_lvsr;
+ enum machine_mode tmode = insn_data[icode].operand[0].mode;
+ enum machine_mode mode = insn_data[icode].operand[1].mode;
+ tree arg;
+ rtx op, addr, pat;
+
+ gcc_assert (TARGET_ALTIVEC);
+
+ arg = CALL_EXPR_ARG (exp, 0);
+ gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE);
+ op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL);
+ addr = memory_address (mode, op);
+ if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE)
+ op = addr;
+ else
+ {
+ /* For the load case need to negate the address. */
+ op = gen_reg_rtx (GET_MODE (addr));
+ emit_insn (gen_rtx_SET (VOIDmode, op,
+ gen_rtx_NEG (GET_MODE (addr), addr)));
+ }
+ op = gen_rtx_MEM (mode, op);
- arg = CALL_EXPR_ARG (exp, 0);
- gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE);
- op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL);
- addr = memory_address (mode, op);
- if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE)
- op = addr;
- else
- {
- /* For the load case need to negate the address. */
- op = gen_reg_rtx (GET_MODE (addr));
- emit_insn (gen_rtx_SET (VOIDmode, op,
- gen_rtx_NEG (GET_MODE (addr), addr)));
- }
- op = gen_rtx_MEM (mode, op);
+ if (target == 0
+ || GET_MODE (target) != tmode
+ || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+ target = gen_reg_rtx (tmode);
- if (target == 0
- || GET_MODE (target) != tmode
- || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
- target = gen_reg_rtx (tmode);
-
- /*pat = gen_altivec_lvsr (target, op);*/
- pat = GEN_FCN (icode) (target, op);
- if (!pat)
- return 0;
- emit_insn (pat);
+ /*pat = gen_altivec_lvsr (target, op);*/
+ pat = GEN_FCN (icode) (target, op);
+ if (!pat)
+ return 0;
+ emit_insn (pat);
- return target;
- }
+ return target;
+ }
+ case ALTIVEC_BUILTIN_VCFUX:
+ case ALTIVEC_BUILTIN_VCFSX:
+ case ALTIVEC_BUILTIN_VCTUXS:
+ case ALTIVEC_BUILTIN_VCTSXS:
/* FIXME: There's got to be a nicer way to handle this case than
constructing a new CALL_EXPR. */
- if (fcode == ALTIVEC_BUILTIN_VCFUX
- || fcode == ALTIVEC_BUILTIN_VCFSX
- || fcode == ALTIVEC_BUILTIN_VCTUXS
- || fcode == ALTIVEC_BUILTIN_VCTSXS)
- {
if (call_expr_nargs (exp) == 1)
- exp = build_call_nary (TREE_TYPE (exp), CALL_EXPR_FN (exp),
- 2, CALL_EXPR_ARG (exp, 0), integer_zero_node);
+ {
+ exp = build_call_nary (TREE_TYPE (exp), CALL_EXPR_FN (exp),
+ 2, CALL_EXPR_ARG (exp, 0), integer_zero_node);
+ }
+ break;
+
+ default:
+ break;
}
if (TARGET_ALTIVEC)
@@ -11016,6 +11330,8 @@ build_opaque_vector_type (tree node, int nunits)
static void
rs6000_init_builtins (void)
{
+ tree ftype;
+
V2SI_type_node = build_vector_type (intSI_type_node, 2);
V2SF_type_node = build_vector_type (float_type_node, 2);
V2DI_type_node = build_vector_type (intDI_type_node, 2);
@@ -11167,29 +11483,38 @@ rs6000_init_builtins (void)
altivec_init_builtins ();
if (TARGET_ALTIVEC || TARGET_SPE || TARGET_PAIRED_FLOAT || TARGET_VSX)
rs6000_common_init_builtins ();
- if (TARGET_PPC_GFXOPT)
+ if (TARGET_FRE)
+ {
+ ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode,
+ RS6000_BUILTIN_RECIP,
+ "__builtin_recipdiv");
+ def_builtin (MASK_POPCNTB, "__builtin_recipdiv", ftype,
+ RS6000_BUILTIN_RECIP);
+ }
+ if (TARGET_FRES)
{
- tree ftype = builtin_function_type (SFmode, SFmode, SFmode, VOIDmode,
- RS6000_BUILTIN_RECIPF,
- "__builtin_recipdivf");
+ ftype = builtin_function_type (SFmode, SFmode, SFmode, VOIDmode,
+ RS6000_BUILTIN_RECIPF,
+ "__builtin_recipdivf");
def_builtin (MASK_PPC_GFXOPT, "__builtin_recipdivf", ftype,
RS6000_BUILTIN_RECIPF);
-
+ }
+ if (TARGET_FRSQRTE)
+ {
+ ftype = builtin_function_type (DFmode, DFmode, VOIDmode, VOIDmode,
+ RS6000_BUILTIN_RSQRT,
+ "__builtin_rsqrt");
+ def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrt", ftype,
+ RS6000_BUILTIN_RSQRT);
+ }
+ if (TARGET_FRSQRTES)
+ {
ftype = builtin_function_type (SFmode, SFmode, VOIDmode, VOIDmode,
RS6000_BUILTIN_RSQRTF,
"__builtin_rsqrtf");
def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrtf", ftype,
RS6000_BUILTIN_RSQRTF);
}
- if (TARGET_POPCNTB)
- {
- tree ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode,
- RS6000_BUILTIN_RECIP,
- "__builtin_recipdiv");
- def_builtin (MASK_POPCNTB, "__builtin_recipdiv", ftype,
- RS6000_BUILTIN_RECIP);
-
- }
if (TARGET_POPCNTD)
{
enum machine_mode mode = (TARGET_64BIT) ? DImode : SImode;
@@ -13685,30 +14010,16 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
if (GET_MODE_CLASS (mode) == MODE_INT && rclass == NON_SPECIAL_REGS)
return GENERAL_REGS;
- /* For VSX, prefer the traditional registers for DF if the address is of the
- form reg+offset because we can use the non-VSX loads. Prefer the Altivec
- registers if Altivec is handling the vector operations (i.e. V16QI, V8HI,
- and V4SI). */
- if (rclass == VSX_REGS && VECTOR_MEM_VSX_P (mode))
+ /* For VSX, prefer the traditional registers for 64-bit values because we can
+ use the non-VSX loads. Prefer the Altivec registers if Altivec is
+ handling the vector operations (i.e. V16QI, V8HI, and V4SI), or if we
+ prefer Altivec loads. */
+ if (rclass == VSX_REGS)
{
- if (mode == DFmode && GET_CODE (x) == MEM)
- {
- rtx addr = XEXP (x, 0);
-
- if (legitimate_indirect_address_p (addr, false)) /* reg */
- return VSX_REGS;
-
- if (legitimate_indexed_address_p (addr, false)) /* reg+reg */
- return VSX_REGS;
+ if (GET_MODE_SIZE (mode) <= 8)
+ return FLOAT_REGS;
- if (GET_CODE (addr) == PRE_MODIFY
- && legitimate_indexed_address_p (XEXP (addr, 0), false))
- return VSX_REGS;
-
- return FLOAT_REGS;
- }
-
- if (VECTOR_UNIT_ALTIVEC_P (mode))
+ if (VECTOR_UNIT_ALTIVEC_P (mode) || VECTOR_MEM_ALTIVEC_P (mode))
return ALTIVEC_REGS;
return rclass;
@@ -24967,17 +25278,41 @@ static tree
rs6000_builtin_reciprocal (unsigned int fn, bool md_fn,
bool sqrt ATTRIBUTE_UNUSED)
{
- if (! (TARGET_RECIP && TARGET_PPC_GFXOPT && !optimize_size
- && flag_finite_math_only && !flag_trapping_math
- && flag_unsafe_math_optimizations))
+ if (optimize_insn_for_size_p ())
return NULL_TREE;
if (md_fn)
- return NULL_TREE;
+ switch (fn)
+ {
+ case VSX_BUILTIN_XVSQRTDP:
+ if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode))
+ return NULL_TREE;
+
+ return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V2DF];
+
+ case VSX_BUILTIN_XVSQRTSP:
+ if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode))
+ return NULL_TREE;
+
+ return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V4SF];
+
+ default:
+ return NULL_TREE;
+ }
+
else
switch (fn)
{
+ case BUILT_IN_SQRT:
+ if (!RS6000_RECIP_AUTO_RSQRTE_P (DFmode))
+ return NULL_TREE;
+
+ return rs6000_builtin_decls[RS6000_BUILTIN_RSQRT];
+
case BUILT_IN_SQRTF:
+ if (!RS6000_RECIP_AUTO_RSQRTE_P (SFmode))
+ return NULL_TREE;
+
return rs6000_builtin_decls[RS6000_BUILTIN_RSQRTF];
default:
@@ -24985,192 +25320,300 @@ rs6000_builtin_reciprocal (unsigned int fn, bool md_fn,
}
}
-/* Newton-Raphson approximation of single-precision floating point divide n/d.
- Assumes no trapping math and finite arguments. */
+/* Load up a constant. If the mode is a vector mode, splat the value across
+ all of the vector elements. */
-void
-rs6000_emit_swdivsf (rtx dst, rtx n, rtx d)
+static rtx
+rs6000_load_constant_and_splat (enum machine_mode mode, REAL_VALUE_TYPE dconst)
+{
+ rtx reg;
+
+ if (mode == SFmode || mode == DFmode)
+ {
+ rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, mode);
+ reg = force_reg (mode, d);
+ }
+ else if (mode == V4SFmode)
+ {
+ rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, SFmode);
+ rtvec v = gen_rtvec (4, d, d, d, d);
+ reg = gen_reg_rtx (mode);
+ rs6000_expand_vector_init (reg, gen_rtx_PARALLEL (mode, v));
+ }
+ else if (mode == V2DFmode)
+ {
+ rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, DFmode);
+ rtvec v = gen_rtvec (2, d, d);
+ reg = gen_reg_rtx (mode);
+ rs6000_expand_vector_init (reg, gen_rtx_PARALLEL (mode, v));
+ }
+ else
+ gcc_unreachable ();
+
+ return reg;
+}
+
+/* Generate a FMADD instruction:
+ dst = (m1 * m2) + a
+
+ generating different RTL based on the fused multiply/add switch. */
+
+static void
+rs6000_emit_madd (rtx dst, rtx m1, rtx m2, rtx a)
+{
+ enum machine_mode mode = GET_MODE (dst);
+
+ if (!TARGET_FUSED_MADD)
+ {
+ /* For the simple ops, use the generator function, rather than assuming
+ that the RTL is standard. */
+ enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code;
+ enum insn_code acode = optab_handler (add_optab, mode)->insn_code;
+ gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode);
+ gen_2arg_fn_t gen_add = (gen_2arg_fn_t) GEN_FCN (acode);
+ rtx mreg = gen_reg_rtx (mode);
+
+ gcc_assert (mcode != CODE_FOR_nothing && acode != CODE_FOR_nothing);
+ emit_insn (gen_mul (mreg, m1, m2));
+ emit_insn (gen_add (dst, mreg, a));
+ }
+
+ else
+ emit_insn (gen_rtx_SET (VOIDmode, dst,
+ gen_rtx_PLUS (mode,
+ gen_rtx_MULT (mode, m1, m2),
+ a)));
+}
+
+/* Generate a FMSUB instruction:
+ dst = (m1 * m2) - a
+
+ generating different RTL based on the fused multiply/add switch. */
+
+static void
+rs6000_emit_msub (rtx dst, rtx m1, rtx m2, rtx a)
+{
+ enum machine_mode mode = GET_MODE (dst);
+
+ if (!TARGET_FUSED_MADD
+ || (mode == V4SFmode && VECTOR_UNIT_ALTIVEC_P (V4SFmode)))
+ {
+ /* Unfused case: emit a separate multiply followed by a subtract, using
+ the optab generator functions rather than assuming standard RTL. */
+ enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code;
+ enum insn_code scode = optab_handler (sub_optab, mode)->insn_code;
+ gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode);
+ gen_2arg_fn_t gen_sub = (gen_2arg_fn_t) GEN_FCN (scode);
+ rtx mreg = gen_reg_rtx (mode);
+
+ gcc_assert (mcode != CODE_FOR_nothing && scode != CODE_FOR_nothing);
+ emit_insn (gen_mul (mreg, m1, m2));
+ emit_insn (gen_sub (dst, mreg, a));
+ }
+
+ else
+ emit_insn (gen_rtx_SET (VOIDmode, dst,
+ gen_rtx_MINUS (mode,
+ gen_rtx_MULT (mode, m1, m2),
+ a)));
+}
+
+/* Generate a FNMSUB instruction:
+ dst = - ((m1 * m2) - a)
+
+ Which is equivalent to (except in the presence of -0.0):
+ dst = a - (m1 * m2)
+
+ generating different RTL based on the fast-math and fused multiply/add
+ switches. */
+
+static void
+rs6000_emit_nmsub (rtx dst, rtx m1, rtx m2, rtx a)
{
- rtx x0, e0, e1, y1, u0, v0, one;
+ enum machine_mode mode = GET_MODE (dst);
+
+ if (!TARGET_FUSED_MADD)
+ {
+ /* For the simple ops, use the generator function, rather than assuming
+ that the RTL is standard. */
+ enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code;
+ enum insn_code scode = optab_handler (sub_optab, mode)->insn_code;
+ gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode);
+ gen_2arg_fn_t gen_sub = (gen_2arg_fn_t) GEN_FCN (scode);
+ rtx mreg = gen_reg_rtx (mode);
- x0 = gen_reg_rtx (SFmode);
- e0 = gen_reg_rtx (SFmode);
- e1 = gen_reg_rtx (SFmode);
- y1 = gen_reg_rtx (SFmode);
- u0 = gen_reg_rtx (SFmode);
- v0 = gen_reg_rtx (SFmode);
- one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode));
+ gcc_assert (mcode != CODE_FOR_nothing && scode != CODE_FOR_nothing);
+ emit_insn (gen_mul (mreg, m1, m2));
+ emit_insn (gen_sub (dst, a, mreg));
+ }
+
+ else
+ {
+ rtx m = gen_rtx_MULT (mode, m1, m2);
+
+ if (!HONOR_SIGNED_ZEROS (mode))
+ emit_insn (gen_rtx_SET (VOIDmode, dst, gen_rtx_MINUS (mode, a, m)));
+
+ else
+ emit_insn (gen_rtx_SET (VOIDmode, dst,
+ gen_rtx_NEG (mode,
+ gen_rtx_MINUS (mode, m, a))));
+ }
+}
+
+/* Newton-Raphson approximation of floating point divide with just 2 passes
+ (either single precision floating point, or newer machines with higher
+ accuracy estimates). Support both scalar and vector divide. Assumes no
+ trapping math and finite arguments. */
+
+static void
+rs6000_emit_swdiv_high_precision (rtx dst, rtx n, rtx d)
+{
+ enum machine_mode mode = GET_MODE (dst);
+ rtx x0, e0, e1, y1, u0, v0;
+ enum insn_code code = optab_handler (smul_optab, mode)->insn_code;
+ gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
+ rtx one = rs6000_load_constant_and_splat (mode, dconst1);
+
+ gcc_assert (code != CODE_FOR_nothing);
/* x0 = 1./d estimate */
+ x0 = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (VOIDmode, x0,
- gen_rtx_UNSPEC (SFmode, gen_rtvec (1, d),
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
UNSPEC_FRES)));
- /* e0 = 1. - d * x0 */
- emit_insn (gen_rtx_SET (VOIDmode, e0,
- gen_rtx_MINUS (SFmode, one,
- gen_rtx_MULT (SFmode, d, x0))));
- /* e1 = e0 + e0 * e0 */
- emit_insn (gen_rtx_SET (VOIDmode, e1,
- gen_rtx_PLUS (SFmode,
- gen_rtx_MULT (SFmode, e0, e0), e0)));
- /* y1 = x0 + e1 * x0 */
- emit_insn (gen_rtx_SET (VOIDmode, y1,
- gen_rtx_PLUS (SFmode,
- gen_rtx_MULT (SFmode, e1, x0), x0)));
- /* u0 = n * y1 */
- emit_insn (gen_rtx_SET (VOIDmode, u0,
- gen_rtx_MULT (SFmode, n, y1)));
- /* v0 = n - d * u0 */
- emit_insn (gen_rtx_SET (VOIDmode, v0,
- gen_rtx_MINUS (SFmode, n,
- gen_rtx_MULT (SFmode, d, u0))));
- /* dst = u0 + v0 * y1 */
- emit_insn (gen_rtx_SET (VOIDmode, dst,
- gen_rtx_PLUS (SFmode,
- gen_rtx_MULT (SFmode, v0, y1), u0)));
-}
-
-/* Newton-Raphson approximation of double-precision floating point divide n/d.
- Assumes no trapping math and finite arguments. */
-void
-rs6000_emit_swdivdf (rtx dst, rtx n, rtx d)
+ e0 = gen_reg_rtx (mode);
+ rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - (d * x0) */
+
+ e1 = gen_reg_rtx (mode);
+ rs6000_emit_madd (e1, e0, e0, e0); /* e1 = (e0 * e0) + e0 */
+
+ y1 = gen_reg_rtx (mode);
+ rs6000_emit_madd (y1, e1, x0, x0); /* y1 = (e1 * x0) + x0 */
+
+ u0 = gen_reg_rtx (mode);
+ emit_insn (gen_mul (u0, n, y1)); /* u0 = n * y1 */
+
+ v0 = gen_reg_rtx (mode);
+ rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - (d * u0) */
+
+ rs6000_emit_madd (dst, v0, y1, u0); /* dst = (v0 * y1) + u0 */
+}
+
+/* Newton-Raphson approximation of floating point divide that has a low
+ precision estimate. Assumes no trapping math and finite arguments. */
+
+static void
+rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d)
{
+ enum machine_mode mode = GET_MODE (dst);
rtx x0, e0, e1, e2, y1, y2, y3, u0, v0, one;
+ enum insn_code code = optab_handler (smul_optab, mode)->insn_code;
+ gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
- x0 = gen_reg_rtx (DFmode);
- e0 = gen_reg_rtx (DFmode);
- e1 = gen_reg_rtx (DFmode);
- e2 = gen_reg_rtx (DFmode);
- y1 = gen_reg_rtx (DFmode);
- y2 = gen_reg_rtx (DFmode);
- y3 = gen_reg_rtx (DFmode);
- u0 = gen_reg_rtx (DFmode);
- v0 = gen_reg_rtx (DFmode);
- one = force_reg (DFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, DFmode));
+ gcc_assert (code != CODE_FOR_nothing);
+
+ one = rs6000_load_constant_and_splat (mode, dconst1);
/* x0 = 1./d estimate */
+ x0 = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (VOIDmode, x0,
- gen_rtx_UNSPEC (DFmode, gen_rtvec (1, d),
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
UNSPEC_FRES)));
- /* e0 = 1. - d * x0 */
- emit_insn (gen_rtx_SET (VOIDmode, e0,
- gen_rtx_MINUS (DFmode, one,
- gen_rtx_MULT (SFmode, d, x0))));
- /* y1 = x0 + e0 * x0 */
- emit_insn (gen_rtx_SET (VOIDmode, y1,
- gen_rtx_PLUS (DFmode,
- gen_rtx_MULT (DFmode, e0, x0), x0)));
- /* e1 = e0 * e0 */
- emit_insn (gen_rtx_SET (VOIDmode, e1,
- gen_rtx_MULT (DFmode, e0, e0)));
- /* y2 = y1 + e1 * y1 */
- emit_insn (gen_rtx_SET (VOIDmode, y2,
- gen_rtx_PLUS (DFmode,
- gen_rtx_MULT (DFmode, e1, y1), y1)));
- /* e2 = e1 * e1 */
- emit_insn (gen_rtx_SET (VOIDmode, e2,
- gen_rtx_MULT (DFmode, e1, e1)));
- /* y3 = y2 + e2 * y2 */
- emit_insn (gen_rtx_SET (VOIDmode, y3,
- gen_rtx_PLUS (DFmode,
- gen_rtx_MULT (DFmode, e2, y2), y2)));
- /* u0 = n * y3 */
- emit_insn (gen_rtx_SET (VOIDmode, u0,
- gen_rtx_MULT (DFmode, n, y3)));
- /* v0 = n - d * u0 */
- emit_insn (gen_rtx_SET (VOIDmode, v0,
- gen_rtx_MINUS (DFmode, n,
- gen_rtx_MULT (DFmode, d, u0))));
- /* dst = u0 + v0 * y3 */
- emit_insn (gen_rtx_SET (VOIDmode, dst,
- gen_rtx_PLUS (DFmode,
- gen_rtx_MULT (DFmode, v0, y3), u0)));
-}
-
-
-/* Newton-Raphson approximation of single-precision floating point rsqrt.
- Assumes no trapping math and finite arguments. */
+
+ e0 = gen_reg_rtx (mode);
+ rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - d * x0 */
+
+ y1 = gen_reg_rtx (mode);
+ rs6000_emit_madd (y1, e0, x0, x0); /* y1 = x0 + e0 * x0 */
+
+ e1 = gen_reg_rtx (mode);
+ emit_insn (gen_mul (e1, e0, e0)); /* e1 = e0 * e0 */
+
+ y2 = gen_reg_rtx (mode);
+ rs6000_emit_madd (y2, e1, y1, y1); /* y2 = y1 + e1 * y1 */
+
+ e2 = gen_reg_rtx (mode);
+ emit_insn (gen_mul (e2, e1, e1)); /* e2 = e1 * e1 */
+
+ y3 = gen_reg_rtx (mode);
+ rs6000_emit_madd (y3, e2, y2, y2); /* y3 = y2 + e2 * y2 */
+
+ u0 = gen_reg_rtx (mode);
+ emit_insn (gen_mul (u0, n, y3)); /* u0 = n * y3 */
+
+ v0 = gen_reg_rtx (mode);
+ rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - d * u0 */
+
+ rs6000_emit_madd (dst, v0, y3, u0); /* dst = u0 + v0 * y3 */
+}
+
+/* Newton-Raphson approximation of floating point divide DST = N/D. If NOTE_P,
+ add a reg_note saying that this was a division. Support both scalar and
+ vector divide. Assumes no trapping math and finite arguments. */
void
-rs6000_emit_swrsqrtsf (rtx dst, rtx src)
-{
- rtx x0, x1, x2, y1, u0, u1, u2, v0, v1, v2, t0,
- half, one, halfthree, c1, cond, label;
-
- x0 = gen_reg_rtx (SFmode);
- x1 = gen_reg_rtx (SFmode);
- x2 = gen_reg_rtx (SFmode);
- y1 = gen_reg_rtx (SFmode);
- u0 = gen_reg_rtx (SFmode);
- u1 = gen_reg_rtx (SFmode);
- u2 = gen_reg_rtx (SFmode);
- v0 = gen_reg_rtx (SFmode);
- v1 = gen_reg_rtx (SFmode);
- v2 = gen_reg_rtx (SFmode);
- t0 = gen_reg_rtx (SFmode);
- halfthree = gen_reg_rtx (SFmode);
- cond = gen_rtx_REG (CCFPmode, CR1_REGNO);
- label = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ());
+rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p)
+{
+ enum machine_mode mode = GET_MODE (dst);
- /* check 0.0, 1.0, NaN, Inf by testing src * src = src */
- emit_insn (gen_rtx_SET (VOIDmode, t0,
- gen_rtx_MULT (SFmode, src, src)));
+ if (RS6000_RECIP_HIGH_PRECISION_P (mode))
+ rs6000_emit_swdiv_high_precision (dst, n, d);
+ else
+ rs6000_emit_swdiv_low_precision (dst, n, d);
- emit_insn (gen_rtx_SET (VOIDmode, cond,
- gen_rtx_COMPARE (CCFPmode, t0, src)));
- c1 = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
- emit_unlikely_jump (c1, label);
+ if (note_p)
+ add_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_DIV (mode, n, d));
+}
- half = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode));
- one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode));
+/* Newton-Raphson approximation of single/double-precision floating point
+ rsqrt. Assumes no trapping math and finite arguments. */
+
+void
+rs6000_emit_swrsqrt (rtx dst, rtx src)
+{
+ enum machine_mode mode = GET_MODE (src);
+ rtx x0 = gen_reg_rtx (mode);
+ rtx y = gen_reg_rtx (mode);
+ int passes = (TARGET_RECIP_PRECISION) ? 2 : 3;
+ REAL_VALUE_TYPE dconst3_2;
+ int i;
+ rtx halfthree;
+ enum insn_code code = optab_handler (smul_optab, mode)->insn_code;
+ gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
- /* halfthree = 1.5 = 1.0 + 0.5 */
- emit_insn (gen_rtx_SET (VOIDmode, halfthree,
- gen_rtx_PLUS (SFmode, one, half)));
+ gcc_assert (code != CODE_FOR_nothing);
+
+ /* Load up the constant 1.5 either as a scalar, or as a vector. */
+ real_from_integer (&dconst3_2, VOIDmode, 3, 0, 0);
+ SET_REAL_EXP (&dconst3_2, REAL_EXP (&dconst3_2) - 1);
+
+ halfthree = rs6000_load_constant_and_splat (mode, dconst3_2);
/* x0 = rsqrt estimate */
emit_insn (gen_rtx_SET (VOIDmode, x0,
- gen_rtx_UNSPEC (SFmode, gen_rtvec (1, src),
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, src),
UNSPEC_RSQRT)));
- /* y1 = 0.5 * src = 1.5 * src - src -> fewer constants */
- emit_insn (gen_rtx_SET (VOIDmode, y1,
- gen_rtx_MINUS (SFmode,
- gen_rtx_MULT (SFmode, src, halfthree),
- src)));
-
- /* x1 = x0 * (1.5 - y1 * (x0 * x0)) */
- emit_insn (gen_rtx_SET (VOIDmode, u0,
- gen_rtx_MULT (SFmode, x0, x0)));
- emit_insn (gen_rtx_SET (VOIDmode, v0,
- gen_rtx_MINUS (SFmode,
- halfthree,
- gen_rtx_MULT (SFmode, y1, u0))));
- emit_insn (gen_rtx_SET (VOIDmode, x1,
- gen_rtx_MULT (SFmode, x0, v0)));
-
- /* x2 = x1 * (1.5 - y1 * (x1 * x1)) */
- emit_insn (gen_rtx_SET (VOIDmode, u1,
- gen_rtx_MULT (SFmode, x1, x1)));
- emit_insn (gen_rtx_SET (VOIDmode, v1,
- gen_rtx_MINUS (SFmode,
- halfthree,
- gen_rtx_MULT (SFmode, y1, u1))));
- emit_insn (gen_rtx_SET (VOIDmode, x2,
- gen_rtx_MULT (SFmode, x1, v1)));
-
- /* dst = x2 * (1.5 - y1 * (x2 * x2)) */
- emit_insn (gen_rtx_SET (VOIDmode, u2,
- gen_rtx_MULT (SFmode, x2, x2)));
- emit_insn (gen_rtx_SET (VOIDmode, v2,
- gen_rtx_MINUS (SFmode,
- halfthree,
- gen_rtx_MULT (SFmode, y1, u2))));
- emit_insn (gen_rtx_SET (VOIDmode, dst,
- gen_rtx_MULT (SFmode, x2, v2)));
+ /* y = 0.5 * src = 1.5 * src - src -> fewer constants */
+ rs6000_emit_msub (y, src, halfthree, src);
- emit_label (XEXP (label, 0));
+ for (i = 0; i < passes; i++)
+ {
+ rtx x1 = gen_reg_rtx (mode);
+ rtx u = gen_reg_rtx (mode);
+ rtx v = gen_reg_rtx (mode);
+
+ /* x1 = x0 * (1.5 - y * (x0 * x0)) */
+ emit_insn (gen_mul (u, x0, x0));
+ rs6000_emit_nmsub (v, y, u, halfthree);
+ emit_insn (gen_mul (x1, x0, v));
+ x0 = x1;
+ }
+
+ emit_move_insn (dst, x0);
+ return;
}
/* Emit popcount intrinsic on TARGET_POPCNTB (Power5) and TARGET_POPCNTD
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 6a005f68cac..c51e4184fa3 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -541,6 +541,46 @@ extern int rs6000_vector_align[];
/* E500 processors only support plain "sync", not lwsync. */
#define TARGET_NO_LWSYNC TARGET_E500
+/* Which machine supports the various reciprocal estimate instructions. */
+#define TARGET_FRES (TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT \
+ && TARGET_FPRS && TARGET_SINGLE_FLOAT)
+
+#define TARGET_FRE (TARGET_HARD_FLOAT && TARGET_FPRS \
+ && TARGET_DOUBLE_FLOAT \
+ && (TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)))
+
+#define TARGET_FRSQRTES (TARGET_HARD_FLOAT && TARGET_POPCNTB \
+ && TARGET_FPRS && TARGET_SINGLE_FLOAT)
+
+#define TARGET_FRSQRTE (TARGET_HARD_FLOAT && TARGET_FPRS \
+ && TARGET_DOUBLE_FLOAT \
+ && (TARGET_PPC_GFXOPT || VECTOR_UNIT_VSX_P (DFmode)))
+
+/* Whether the various reciprocal divide/square root estimate instructions
+ exist, and whether we should automatically generate code for the instruction
+ by default. */
+#define RS6000_RECIP_MASK_HAVE_RE 0x1 /* have RE instruction. */
+#define RS6000_RECIP_MASK_AUTO_RE 0x2 /* generate RE by default. */
+#define RS6000_RECIP_MASK_HAVE_RSQRTE 0x4 /* have RSQRTE instruction. */
+#define RS6000_RECIP_MASK_AUTO_RSQRTE 0x8 /* gen. RSQRTE by default. */
+
+extern unsigned char rs6000_recip_bits[];
+
+#define RS6000_RECIP_HAVE_RE_P(MODE) \
+ (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RE)
+
+#define RS6000_RECIP_AUTO_RE_P(MODE) \
+ (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RE)
+
+#define RS6000_RECIP_HAVE_RSQRTE_P(MODE) \
+ (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RSQRTE)
+
+#define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \
+ (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE)
+
+#define RS6000_RECIP_HIGH_PRECISION_P(MODE) \
+ ((MODE) == SFmode || (MODE) == V4SFmode || TARGET_RECIP_PRECISION)
+
/* Sometimes certain combinations of command options do not make sense
on a particular target machine. You can define a macro
`OVERRIDE_OPTIONS' to take account of this. This macro, if
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index b1aac23893d..1cc20af4bb0 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -220,6 +220,9 @@
; but on e500v2, the gpr are 64 bit registers
(define_mode_iterator DIFD [DI (DF "!TARGET_E500_DOUBLE") DD])
+;; Iterator for reciprocal estimate instructions
+(define_mode_iterator RECIPF [SF DF V4SF V2DF])
+
; Various instructions that come in SI and DI forms.
; A generic w/d attribute, for things like cmpw/cmpd.
(define_mode_attr wd [(QI "b") (HI "h") (SI "w") (DI "d")])
@@ -240,6 +243,11 @@
(define_mode_attr mptrsize [(SI "si")
(DI "di")])
+(define_mode_attr rreg [(SF "f")
+ (DF "Ws")
+ (V4SF "Wf")
+ (V2DF "Wd")])
+
;; Start with fixed-point load and store insns. Here we put only the more
;; complex forms. Basic data transfer is done later.
@@ -5544,6 +5552,45 @@
[(set_attr "type" "var_delayed_compare,delayed_compare,var_delayed_compare,delayed_compare")
(set_attr "length" "4,4,8,8")])
+;; Builtins to replace a division to generate FRE reciprocal estimate
+;; instructions and the necessary fixup instructions
+(define_expand "recip<mode>3"
+ [(match_operand:RECIPF 0 "gpc_reg_operand" "")
+ (match_operand:RECIPF 1 "gpc_reg_operand" "")
+ (match_operand:RECIPF 2 "gpc_reg_operand" "")]
+ "RS6000_RECIP_HAVE_RE_P (<MODE>mode)"
+{
+ rs6000_emit_swdiv (operands[0], operands[1], operands[2], false);
+ DONE;
+})
+
+;; Split to create division from FRE/FRES/etc. and fixup instead of the normal
+;; hardware division. This is only done before register allocation and with
+;; -ffast-math. This must appear before the divsf3/divdf3 insns.
+(define_split
+ [(set (match_operand:RECIPF 0 "gpc_reg_operand" "")
+ (div:RECIPF (match_operand 1 "gpc_reg_operand" "")
+ (match_operand 2 "gpc_reg_operand" "")))]
+ "RS6000_RECIP_AUTO_RE_P (<MODE>mode)
+ && can_create_pseudo_p () && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math && flag_reciprocal_math"
+ [(const_int 0)]
+{
+ rs6000_emit_swdiv (operands[0], operands[1], operands[2], true);
+ DONE;
+})
+
+;; Builtins to replace 1/sqrt(x) with instructions using RSQRTE and the
+;; appropriate fixup.  Only enabled when the mode has an RSQRTE instruction.
+(define_expand "rsqrt<mode>2"
+ [(match_operand:RECIPF 0 "gpc_reg_operand" "")
+ (match_operand:RECIPF 1 "gpc_reg_operand" "")]
+ "RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode)"
+{
+ rs6000_emit_swrsqrt (operands[0], operands[1]);
+ DONE;
+})
+
(define_split
[(set (match_operand:CC 3 "cc_reg_not_micro_cr0_operand" "")
(compare:CC (ashiftrt:SI (match_operand:SI 1 "gpc_reg_operand" "")
@@ -5747,22 +5794,10 @@
"{fd|fdiv} %0,%1,%2"
[(set_attr "type" "ddiv")])
-(define_expand "recipsf3"
- [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
- (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
- (match_operand:SF 2 "gpc_reg_operand" "f")]
- UNSPEC_FRES))]
- "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
- && flag_finite_math_only && !flag_trapping_math"
-{
- rs6000_emit_swdivsf (operands[0], operands[1], operands[2]);
- DONE;
-})
-
(define_insn "fres"
[(set (match_operand:SF 0 "gpc_reg_operand" "=f")
(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))]
- "TARGET_PPC_GFXOPT && flag_finite_math_only"
+ "TARGET_FRES"
"fres %0,%1"
[(set_attr "type" "fp")])
@@ -5912,23 +5947,12 @@
"fsqrt %0,%1"
[(set_attr "type" "dsqrt")])
-(define_expand "rsqrtsf2"
+(define_insn "*rsqrtsf_internal1"
[(set (match_operand:SF 0 "gpc_reg_operand" "=f")
(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
UNSPEC_RSQRT))]
- "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
- && flag_finite_math_only && !flag_trapping_math"
-{
- rs6000_emit_swrsqrtsf (operands[0], operands[1]);
- DONE;
-})
-
-(define_insn "*rsqrt_internal1"
- [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
- (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
- UNSPEC_RSQRT))]
- "TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT"
- "frsqrte %0,%1"
+ "TARGET_FRSQRTES"
+ "frsqrtes %0,%1"
[(set_attr "type" "fp")])
(define_expand "copysignsf3"
@@ -5941,9 +5965,18 @@
(match_dup 5))
(match_dup 3)
(match_dup 4)))]
- "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
- && !HONOR_NANS (SFmode) && !HONOR_SIGNED_ZEROS (SFmode)"
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
+ && ((TARGET_PPC_GFXOPT
+ && !HONOR_NANS (SFmode)
+ && !HONOR_SIGNED_ZEROS (SFmode))
+ || VECTOR_UNIT_VSX_P (DFmode))"
{
+ if (VECTOR_UNIT_VSX_P (DFmode))
+ {
+ emit_insn (gen_vsx_copysignsf3 (operands[0], operands[1], operands[2],
+ CONST0_RTX (SFmode)));
+ DONE;
+ }
operands[3] = gen_reg_rtx (SFmode);
operands[4] = gen_reg_rtx (SFmode);
operands[5] = CONST0_RTX (SFmode);
@@ -6203,31 +6236,21 @@
"{fd|fdiv} %0,%1,%2"
[(set_attr "type" "ddiv")])
-(define_expand "recipdf3"
- [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
- (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")
- (match_operand:DF 2 "gpc_reg_operand" "d")]
- UNSPEC_FRES))]
- "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_POPCNTB && !optimize_size
- && flag_finite_math_only && !flag_trapping_math"
-{
- rs6000_emit_swdivdf (operands[0], operands[1], operands[2]);
- DONE;
-})
-
-(define_expand "fred"
- [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
- (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")] UNSPEC_FRES))]
- "(TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)) && flag_finite_math_only"
- "")
-
(define_insn "*fred_fpr"
[(set (match_operand:DF 0 "gpc_reg_operand" "=f")
(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))]
- "TARGET_POPCNTB && flag_finite_math_only && !VECTOR_UNIT_VSX_P (DFmode)"
+ "TARGET_FRE && !VECTOR_UNIT_VSX_P (DFmode)"
"fre %0,%1"
[(set_attr "type" "fp")])
+(define_insn "*rsqrtdf_internal1"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
+ (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")]
+ UNSPEC_RSQRT))]
+ "TARGET_FRSQRTE && !VECTOR_UNIT_VSX_P (DFmode)"
+ "frsqrte %0,%1"
+ [(set_attr "type" "fp")])
+
(define_insn "*fmadddf4_fpr"
[(set (match_operand:DF 0 "gpc_reg_operand" "=d")
(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 63f0f8c1582..e70172a19a6 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -195,8 +195,16 @@ Target Report Var(TARGET_XL_COMPAT)
Conform more closely to IBM XLC semantics
mrecip
-Target Report Var(TARGET_RECIP)
-Generate software reciprocal sqrt for better throughput
+Target Report
+Generate software reciprocal divide and square root for better throughput.
+
+mrecip=
+Target Report RejectNegative Joined
+Generate software reciprocal divide and square root for better throughput.
+
+mrecip-precision
+Target Report Mask(RECIP_PRECISION)
+Assume that the reciprocal estimate instructions provide more accuracy.
mno-fp-in-toc
Target Report RejectNegative Var(TARGET_NO_FP_IN_TOC)
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 7f927f103d2..b1927ce2e94 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -267,6 +267,20 @@
"VECTOR_UNIT_VSX_P (<MODE>mode)"
"")
+(define_expand "rsqrte<mode>2"
+ [(set (match_operand:VEC_F 0 "vfloat_operand" "")
+ (unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "")]
+ UNSPEC_RSQRT))]
+ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
+ "")
+
+(define_expand "re<mode>2"
+ [(set (match_operand:VEC_F 0 "vfloat_operand" "")
+ (unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "f")]
+ UNSPEC_FRES))]
+ "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
+ "")
+
(define_expand "ftrunc<mode>2"
[(set (match_operand:VEC_F 0 "vfloat_operand" "")
(fix:VEC_F (match_operand:VEC_F 1 "vfloat_operand" "")))]
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 7d572a48412..213d53ae5d1 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -195,7 +195,7 @@
(UNSPEC_VSX_MSUB 511)
(UNSPEC_VSX_NMADD 512)
(UNSPEC_VSX_NMSUB 513)
- (UNSPEC_VSX_RSQRTE 514)
+ ;; 514 deleted
(UNSPEC_VSX_TDIV 515)
(UNSPEC_VSX_TSQRT 516)
(UNSPEC_VSX_XXPERMDI 517)
@@ -446,10 +446,10 @@
[(set_attr "type" "<VStype_sqrt>")
(set_attr "fp_type" "<VSfptype_sqrt>")])
-(define_insn "vsx_rsqrte<mode>2"
+(define_insn "*vsx_rsqrte<mode>2"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,?wa")
(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,wa")]
- UNSPEC_VSX_RSQRTE))]
+ UNSPEC_RSQRT))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
"x<VSv>rsqrte<VSs> %x0,%x1"
[(set_attr "type" "<VStype_simple>")
@@ -862,6 +862,20 @@
[(set_attr "type" "<VStype_simple>")
(set_attr "fp_type" "<VSfptype_simple>")])
+;; Special version of copysign for single precision that knows internally
+;; scalar single values are kept as double
+(define_insn "vsx_copysignsf3"
+ [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+ (if_then_else:SF
+ (ge:SF (match_operand:SF 2 "vsx_register_operand" "f")
+ (match_operand:SF 3 "zero_constant" "j"))
+ (abs:SF (match_operand:SF 1 "vsx_register_operand" "f"))
+ (neg:SF (abs:SF (match_dup 1)))))]
+ "VECTOR_UNIT_VSX_P (DFmode)"
+ "xscpsgndp %x0,%x2,%x1"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_addsub_d")])
+
;; For the conversions, limit the register class for the integer value to be
;; the fprs because we don't want to add the altivec registers to movdi/movsi.
;; For the unsigned tests, there isn't a generic double -> unsigned conversion
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 0d9862f52b9..dc4e462abfd 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -10664,6 +10664,10 @@ vector unsigned char vec_vrlb (vector unsigned char,
vector float vec_round (vector float);
+vector float vec_recip (vector float, vector float);
+
+vector float vec_rsqrt (vector float);
+
vector float vec_rsqrte (vector float);
vector float vec_sel (vector float, vector float, vector bool int);
@@ -11592,8 +11596,10 @@ vector double vec_or (vector bool long, vector double);
vector double vec_perm (vector double,
vector double,
vector unsigned char);
-vector float vec_rint (vector float);
vector double vec_rint (vector double);
+vector double vec_recip (vector double, vector double);
+vector double vec_rsqrt (vector double);
+vector double vec_rsqrte (vector double);
vector double vec_sel (vector double, vector double, vector bool long);
vector double vec_sel (vector double, vector double, vector unsigned long);
vector double vec_sub (vector double, vector double);
@@ -11638,6 +11644,15 @@ long __builtin_bpermd (long, long);
int __builtin_bswap16 (int);
@end smallexample
+The @code{vec_rsqrt}, @code{__builtin_rsqrt}, and
+@code{__builtin_rsqrtf} functions generate multiple instructions to
+implement the reciprocal sqrt functionality using reciprocal sqrt
+estimate instructions.
+
+The @code{__builtin_recipdiv} and @code{__builtin_recipdivf}
+functions generate multiple instructions to implement division using
+the reciprocal estimate instructions.
+
@node SPARC VIS Built-in Functions
@subsection SPARC VIS Built-in Functions
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 14da45fc6fc..0cfae8069fc 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -753,7 +753,8 @@ See RS/6000 and PowerPC Options.
-mfloat-gprs=yes -mfloat-gprs=no -mfloat-gprs=single -mfloat-gprs=double @gol
-mprototype -mno-prototype @gol
-msim -mmvme -mads -myellowknife -memb -msdata @gol
--msdata=@var{opt} -mvxworks -G @var{num} -pthread}
+-msdata=@var{opt} -mvxworks -G @var{num} -pthread @gol
+-mrecip -mrecip=@var{opt} -mno-recip -mrecip-precision -mno-recip-precision}
@emph{S/390 and zSeries Options}
@gccoptlist{-mtune=@var{cpu-type} -march=@var{cpu-type} @gol
@@ -13966,17 +13967,6 @@ values for @var{cpu_type} are used for @option{-mtune} as for
architecture, registers, and mnemonics set by @option{-mcpu}, but the
scheduling parameters set by @option{-mtune}.
-@item -mswdiv
-@itemx -mno-swdiv
-@opindex mswdiv
-@opindex mno-swdiv
-Generate code to compute division as reciprocal estimate and iterative
-refinement, creating opportunities for increased throughput. This
-feature requires: optional PowerPC Graphics instruction set for single
-precision and FRE instruction for double precision, assuming divides
-cannot generate user-visible traps, and the domain values not include
-Infinities, denormals or zero denominator.
-
@item -maltivec
@itemx -mno-altivec
@opindex maltivec
@@ -14620,6 +14610,52 @@ sequence.
Adds support for multithreading with the @dfn{pthreads} library.
This option sets flags for both the preprocessor and linker.
+@item -mrecip
+@itemx -mno-recip
+@opindex mrecip
+This option will enable GCC to use the reciprocal estimate and
+reciprocal square root estimate instructions with additional
+Newton-Raphson steps to increase precision instead of doing a divide or
+square root and divide for floating point arguments. You should use
+the @option{-ffast-math} option when using @option{-mrecip} (or at
+least @option{-funsafe-math-optimizations},
+@option{-ffinite-math-only}, @option{-freciprocal-math} and
+@option{-fno-trapping-math}). Note that while the throughput of the
+sequence is generally higher than the throughput of the non-reciprocal
+instruction, the precision of the sequence can be decreased by up to 2
+ulp (i.e. the inverse of 1.0 equals 0.99999994) for reciprocal square
+roots.
+
+@item -mrecip=@var{opt}
+@opindex mrecip=opt
+This option allows you to control which reciprocal estimate instructions
+may be used. @var{opt} is a comma separated list of options, that may
+be preceded by a @code{!} to invert the option:
+@code{all}: enable all estimate instructions,
+@code{default}: enable the default instructions, equivalent to @option{-mrecip},
+@code{none}: disable all estimate instructions, equivalent to @option{-mno-recip};
+@code{div}: enable the reciprocal approximation instructions for both single and double precision;
+@code{divf}: enable the single precision reciprocal approximation instructions;
+@code{divd}: enable the double precision reciprocal approximation instructions;
+@code{rsqrt}: enable the reciprocal square root approximation instructions for both single and double precision;
+@code{rsqrtf}: enable the single precision reciprocal square root approximation instructions;
+@code{rsqrtd}: enable the double precision reciprocal square root approximation instructions;
+
+So for example, @option{-mrecip=all,!rsqrtd} would enable
+all of the reciprocal estimate instructions, except for the
+@code{FRSQRTE}, @code{XSRSQRTEDP}, and @code{XVRSQRTEDP} instructions
+which handle the double precision reciprocal square root calculations.
+
+@item -mrecip-precision
+@itemx -mno-recip-precision
+@opindex mrecip-precision
+Assume (do not assume) that the reciprocal estimate instructions
+provide higher precision estimates than is mandated by the powerpc
+ABI. Selecting @option{-mcpu=power6} or @option{-mcpu=power7}
+automatically selects @option{-mrecip-precision}. The double
+precision square root estimate instructions are not generated by
+default on low precision machines, since they do not provide an
+estimate that converges after three steps.
@end table
@node S/390 and zSeries Options
diff --git a/gcc/testsuite/ChangeLog.ibm b/gcc/testsuite/ChangeLog.ibm
index e9d4738d529..7d105ac35aa 100644
--- a/gcc/testsuite/ChangeLog.ibm
+++ b/gcc/testsuite/ChangeLog.ibm
@@ -1,3 +1,19 @@
+2010-07-16 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Backport from mainline
+ 2010-06-01 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ PR target/44218
+ * gcc.target/powerpc/recip-1.c: New test for -mrecip support.
+ * gcc.target/powerpc/recip-2.c: Ditto.
+ * gcc.target/powerpc/recip-3.c: Ditto.
+ * gcc.target/powerpc/recip-4.c: Ditto.
+ * gcc.target/powerpc/recip-5.c: Ditto.
+ * gcc.target/powerpc/recip-6.c: Ditto.
+ * gcc.target/powerpc/recip-7.c: Ditto.
+ * gcc.target/powerpc/recip-test.h: Ditto.
+ * gcc.target/powerpc/recip-test2.h: Ditto.
+
2010-04-30 Michael Meissner <meissner@linux.vnet.ibm.com>
Backport from mainline
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-1.c b/gcc/testsuite/gcc.target/powerpc/recip-1.c
new file mode 100644
index 00000000000..d1e383dc4ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */
+/* { dg-final { scan-assembler-times "frsqrte" 2 } } */
+/* { dg-final { scan-assembler-times "fmsub" 2 } } */
+/* { dg-final { scan-assembler-times "fmul" 8 } } */
+/* { dg-final { scan-assembler-times "fnmsub" 4 } } */
+
+double
+rsqrt_d (double a)
+{
+ return 1.0 / __builtin_sqrt (a);
+}
+
+float
+rsqrt_f (float a)
+{
+ return 1.0f / __builtin_sqrtf (a);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-2.c b/gcc/testsuite/gcc.target/powerpc/recip-2.c
new file mode 100644
index 00000000000..69442733aab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power5" } */
+/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fmuls" 6 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 3 } } */
+/* { dg-final { scan-assembler-times "fsqrt" 1 } } */
+
+/* power5 frsqrte is not accurate enough, and should not be generated by
+ default for -mrecip. */
+double
+rsqrt_d (double a)
+{
+ return 1.0 / __builtin_sqrt (a);
+}
+
+float
+rsqrt_f (float a)
+{
+ return 1.0f / __builtin_sqrtf (a);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-3.c b/gcc/testsuite/gcc.target/powerpc/recip-3.c
new file mode 100644
index 00000000000..80a34e8ee59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power7" } */
+/* { dg-final { scan-assembler-times "xsrsqrtedp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmuldp" 4 } } */
+/* { dg-final { scan-assembler-times "xsnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fmuls" 4 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */
+
+double
+rsqrt_d (double a)
+{
+ return 1.0 / __builtin_sqrt (a);
+}
+
+float
+rsqrt_f (float a)
+{
+ return 1.0f / __builtin_sqrtf (a);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-4.c b/gcc/testsuite/gcc.target/powerpc/recip-4.c
new file mode 100644
index 00000000000..bd496d70e25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-4.c
@@ -0,0 +1,36 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O3 -ftree-vectorize -mrecip -ffast-math -mcpu=power7 -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvrsqrtedp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmuldp" 4 } } */
+/* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmulsp" 4 } } */
+/* { dg-final { scan-assembler-times "xvnmsub.sp" 2 } } */
+
+#define SIZE 1024
+
+extern double a_d[SIZE] __attribute__((__aligned__(32)));
+extern double b_d[SIZE] __attribute__((__aligned__(32)));
+
+void
+vectorize_rsqrt_d (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a_d[i] = 1.0 / __builtin_sqrt (b_d[i]);
+}
+
+extern float a_f[SIZE] __attribute__((__aligned__(32)));
+extern float b_f[SIZE] __attribute__((__aligned__(32)));
+
+void
+vectorize_rsqrt_f (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a_f[i] = 1.0f / __builtin_sqrtf (b_f[i]);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-5.c b/gcc/testsuite/gcc.target/powerpc/recip-5.c
new file mode 100644
index 00000000000..4a9c496201a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-5.c
@@ -0,0 +1,94 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O3 -ftree-vectorize -mrecip=all -ffast-math -mcpu=power7 -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvredp" 4 } } */
+/* { dg-final { scan-assembler-times "xvresp" 5 } } */
+/* { dg-final { scan-assembler-times "xsredp" 2 } } */
+/* { dg-final { scan-assembler-times "fres" 2 } } */
+
+#include <altivec.h>
+
+float f_recip (float a, float b) { return __builtin_recipdivf (a, b); }
+double d_recip (double a, double b) { return __builtin_recipdiv (a, b); }
+
+float f_div (float a, float b) { return a / b; }
+double d_div (double a, double b) { return a / b; }
+
+#define SIZE 1024
+
+double d_a[SIZE] __attribute__((__aligned__(32)));
+double d_b[SIZE] __attribute__((__aligned__(32)));
+double d_c[SIZE] __attribute__((__aligned__(32)));
+
+float f_a[SIZE] __attribute__((__aligned__(32)));
+float f_b[SIZE] __attribute__((__aligned__(32)));
+float f_c[SIZE] __attribute__((__aligned__(32)));
+
+void vec_f_recip (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ f_a[i] = __builtin_recipdivf (f_b[i], f_c[i]);
+}
+
+void vec_d_recip (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ d_a[i] = __builtin_recipdiv (d_b[i], d_c[i]);
+}
+
+void vec_f_div (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ f_a[i] = f_b[i] / f_c[i];
+}
+
+void vec_f_div2 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ f_a[i] = f_b[i] / 2.0f;
+}
+
+void vec_f_div53 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ f_a[i] = f_b[i] / 53.0f;
+}
+
+void vec_d_div (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ d_a[i] = d_b[i] / d_c[i];
+}
+
+void vec_d_div2 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ d_a[i] = d_b[i] / 2.0;
+}
+
+void vec_d_div53 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ d_a[i] = d_b[i] / 53.0;
+}
+
+vector float v4sf_recip1 (vector float a, vector float b) { return vec_recipdiv (a, b); }
+vector float v4sf_recip2 (vector float a, vector float b) { return __builtin_altivec_vrecipdivfp (a, b); }
+vector double v2df_recip1 (vector double a, vector double b) { return vec_recipdiv (a, b); }
+vector float v4sf_recip3 (vector float a, vector float b) { return __builtin_vsx_xvrecipdivsp (a, b); }
+vector double v2df_recip2 (vector double a, vector double b) { return __builtin_vsx_xvrecipdivdp (a, b); }
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-6.c b/gcc/testsuite/gcc.target/powerpc/recip-6.c
new file mode 100644
index 00000000000..7d71df6709d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-6.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { powerpc*-*-linux* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-mcpu=power7 -O3 -ftree-vectorize -ffast-math -mrecip=all -mrecip-precision" } */
+
+/* Check reciprocal estimate functions for accuracy. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "recip-test.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-7.c b/gcc/testsuite/gcc.target/powerpc/recip-7.c
new file mode 100644
index 00000000000..7b32ba076a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-7.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { powerpc*-*-linux* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target ppc_recip_hw } */
+/* { dg-options "-O3 -ftree-vectorize -ffast-math -mrecip -mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb" } */
+
+/* Check reciprocal estimate functions for accuracy. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "recip-test.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-test.h b/gcc/testsuite/gcc.target/powerpc/recip-test.h
new file mode 100644
index 00000000000..7a42df5757d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-test.h
@@ -0,0 +1,149 @@
+/* Check reciprocal estimate functions for accuracy. */
+
+#ifdef _ARCH_PPC64
+typedef unsigned long uns64_t;
+#define UNUM64(x) x ## L
+
+#else
+typedef unsigned long long uns64_t;
+#define UNUM64(x) x ## LL
+#endif
+
+typedef unsigned int uns32_t;
+
+#define TNAME2(x) #x
+#define TNAME(x) TNAME2(x)
+
+/*
+ * Float functions.
+ */
+
+#define TYPE float
+#define NAME(PREFIX) PREFIX ## _float
+#define UNS_TYPE uns32_t
+#define UNS_ABS __builtin_abs
+#define EXP_SIZE 8
+#define MAN_SIZE 23
+#define FABS __builtin_fabsf
+#define FMAX __builtin_fmaxf
+#define FMIN __builtin_fminf
+#define SQRT __builtin_sqrtf
+#define RMIN 1.0e-10
+#define RMAX 1.0e+10
+#define BDIV 1
+#define BRSQRT 2
+#define ASMDIV "fdivs"
+#define ASMSQRT "fsqrts"
+
+#define INIT_DIV \
+{ \
+ { 0x4fffffff }, /* 8589934080 */ \
+ { 0x4effffff }, /* 2147483520 */ \
+ { 0x40ffffff }, /* 7.99999952316284 */ \
+ { 0x3fffffff }, /* 1.99999988079071 */ \
+ { 0x417fffff }, /* 15.9999990463257 */ \
+ { 0x42ffffff }, /* 127.999992370605 */ \
+ { 0x3dffffff }, /* 0.124999992549419 */ \
+ { 0x3effffff }, /* 0.499999970197678 */ \
+}
+
+#define INIT_RSQRT \
+{ \
+ { 0x457ffffe }, /* 4096 - small amount */ \
+ { 0x4c7fffff }, /* 6.71089e+07 */ \
+ { 0x3d7fffff }, /* 0.0625 - small amount */ \
+ { 0x307ffffe }, /* 9.31322e-10 */ \
+ { 0x4c7ffffe }, /* 6.71089e+07 */ \
+ { 0x397ffffe }, /* 0.000244141 */ \
+ { 0x2e7fffff }, /* 5.82077e-11 */ \
+ { 0x2f7fffff }, /* 2.32831e-10 */ \
+}
+
+
+#include "recip-test2.h"
+
+/*
+ * Double functions.
+ */
+
+#undef TYPE
+#undef NAME
+#undef UNS_TYPE
+#undef UNS_ABS
+#undef EXP_SIZE
+#undef MAN_SIZE
+#undef FABS
+#undef FMAX
+#undef FMIN
+#undef SQRT
+#undef RMIN
+#undef RMAX
+#undef BDIV
+#undef BRSQRT
+#undef ASMDIV
+#undef ASMSQRT
+#undef INIT_DIV
+#undef INIT_RSQRT
+
+#define TYPE double
+#define NAME(PREFIX) PREFIX ## _double
+#define UNS_TYPE uns64_t
+#define UNS_ABS __builtin_imaxabs
+#define EXP_SIZE 11
+#define MAN_SIZE 52
+#define FABS __builtin_fabs
+#define FMAX __builtin_fmax
+#define FMIN __builtin_fmin
+#define SQRT __builtin_sqrt
+#define RMIN 1.0e-100
+#define RMAX 1.0e+100
+#define BDIV 1
+#define BRSQRT 2
+#define ASMDIV "fdiv"
+#define ASMSQRT "fsqrt"
+
+#define INIT_DIV \
+{ \
+ { UNUM64 (0x2b57be53f2a2f3a0) }, /* 6.78462e-100 */ \
+ { UNUM64 (0x2b35f8e8ea553e52) }, /* 1.56963e-100 */ \
+ { UNUM64 (0x2b5b9d861d2fe4fb) }, /* 7.89099e-100 */ \
+ { UNUM64 (0x2b45dc44a084e682) }, /* 3.12327e-100 */ \
+ { UNUM64 (0x2b424ce16945d777) }, /* 2.61463e-100 */ \
+ { UNUM64 (0x2b20b5023d496b50) }, /* 5.96749e-101 */ \
+ { UNUM64 (0x2b61170547f57caa) }, /* 9.76678e-100 */ \
+ { UNUM64 (0x2b543b9d498aac37) }, /* 5.78148e-100 */ \
+}
+
+#define INIT_RSQRT \
+{ \
+ { UNUM64 (0x2b616f2d8cbbc646) }, /* 9.96359e-100 */ \
+ { UNUM64 (0x2b5c4db2da0a011d) }, /* 8.08764e-100 */ \
+ { UNUM64 (0x2b55a82d5735b262) }, /* 6.1884e-100 */ \
+ { UNUM64 (0x2b50b52908258cb8) }, /* 4.77416e-100 */ \
+ { UNUM64 (0x2b363989a4fb29af) }, /* 1.58766e-100 */ \
+ { UNUM64 (0x2b508b9f6f4180a9) }, /* 4.7278e-100 */ \
+ { UNUM64 (0x2b4f7a1d48accb40) }, /* 4.49723e-100 */ \
+ { UNUM64 (0x2b1146a37372a81f) }, /* 3.08534e-101 */ \
+ { UNUM64 (0x2b33f876a8c48050) }, /* 1.42663e-100 */ \
+}
+
+#include "recip-test2.h"
+
+int
+main (int argc __attribute__((__unused__)),
+ char *argv[] __attribute__((__unused__)))
+{
+ srand48 (1); /* fixed seed; NOTE(review): no drand48-family consumer is visible in this chunk */
+ run_float (); /* NAME (run) from the first inclusion of recip-test2.h (TYPE float) */
+
+#ifdef VERBOSE
+ printf ("\n");
+#endif
+
+ run_double (); /* NAME (run) from the second inclusion of recip-test2.h (TYPE double) */
+
+ if (error_count_float != 0 || error_count_double != 0) /* any recorded mismatch fails the test */
+ abort ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-test2.h b/gcc/testsuite/gcc.target/powerpc/recip-test2.h
new file mode 100644
index 00000000000..3ec356cdfd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-test2.h
@@ -0,0 +1,432 @@
+/*
+ * Included file to common source float/double checking
+ * The following macros should be defined:
+ * TYPE -- floating point type
+ * NAME -- convert a name to include the type
+ * UNS_TYPE -- type to hold TYPE as an unsigned number
+ * EXP_SIZE -- size in bits of the exponent
+ * MAN_SIZE -- size in bits of the mantissa
+ * UNS_ABS -- absolute value for UNS_TYPE
+ * FABS -- absolute value function for TYPE
+ * FMAX -- maximum function for TYPE
+ * FMIN -- minimum function for TYPE
+ * SQRT -- square root function for TYPE
+ * RMIN -- minimum random number to generate
+ * RMAX -- maximum random number to generate
+ * ASMDIV -- assembler instruction to do divide
+ * ASMSQRT -- assembler instruction to do square root
+ * BDIV -- # of bits of inaccuracy to allow for division
+ * BRSQRT -- # of bits of inaccuracy to allow for 1/sqrt
+ * INIT_DIV -- Initial values to test 1/x against
+ * INIT_RSQRT -- Initial values to test 1/sqrt(x) against
+ */
+
+typedef union
+{
+ UNS_TYPE i; /* raw bit pattern; the INIT_* tables initialize this first member */
+ TYPE x; /* the same bits viewed as a floating point value */
+} NAME (union);
+
+/*
+ * Input/output arrays.  The inputs are sized by their INIT_* initializers.
+ */
+
+static NAME (union) NAME (div_input) [] __attribute__((__aligned__(32))) = INIT_DIV;
+static NAME (union) NAME (rsqrt_input)[] __attribute__((__aligned__(32))) = INIT_RSQRT;
+
+#define DIV_SIZE (sizeof (NAME (div_input)) / sizeof (TYPE)) /* element count; assumes sizeof (TYPE) == sizeof (NAME (union)) */
+#define RSQRT_SIZE (sizeof (NAME (rsqrt_input)) / sizeof (TYPE)) /* same assumption as DIV_SIZE */
+
+static TYPE NAME (div_expected)[DIV_SIZE] __attribute__((__aligned__(32))); /* filled by NAME (asm_div) */
+static TYPE NAME (div_output) [DIV_SIZE] __attribute__((__aligned__(32))); /* filled by NAME (scalar_div) / NAME (vector_div) */
+
+static TYPE NAME (rsqrt_expected)[RSQRT_SIZE] __attribute__((__aligned__(32))); /* filled by NAME (asm_rsqrt) */
+static TYPE NAME (rsqrt_output) [RSQRT_SIZE] __attribute__((__aligned__(32))); /* filled by NAME (scalar_rsqrt) / NAME (vector_rsqrt) */
+
+
+/*
+ * Crack a floating point number into its raw sign bit, exponent field, and mantissa field.
+ */
+
+static void
+NAME (crack) (TYPE number, unsigned int *p_sign, unsigned *p_exponent, UNS_TYPE *p_mantissa)
+{
+ NAME (union) u;
+ UNS_TYPE bits;
+
+ u.x = number; /* view the value's bit pattern through the union */
+ bits = u.i;
+
+ *p_sign = (unsigned int)((bits >> (EXP_SIZE + MAN_SIZE)) & 0x1); /* the bit just above the exponent field */
+ *p_exponent = (unsigned int)((bits >> MAN_SIZE) & ((((UNS_TYPE)1) << EXP_SIZE) - 1)); /* EXP_SIZE bits above the mantissa, returned as stored */
+ *p_mantissa = bits & ((((UNS_TYPE)1) << MAN_SIZE) - 1); /* low MAN_SIZE bits, returned as stored */
+ return;
+}
+
+
+/*
+ * Prevent optimizer from eliminating + 0.0 to remove -0.0.
+ */
+
+volatile TYPE NAME (math_diff_0) = ((TYPE) 0.0);
+
+/*
+ * Return 0 if A and B match, negative if they are significantly different, or
+ * the 1-based position of the highest mantissa bit that differs (BITS is unused).
+ */
+
+static int
+NAME (math_diff) (TYPE a, TYPE b, int bits)
+{
+ TYPE zero = NAME (math_diff_0);
+ unsigned int sign_a, sign_b;
+ unsigned int exponent_a, exponent_b;
+ UNS_TYPE mantissa_a, mantissa_b, diff;
+ int i;
+
+ /* eliminate signed zero. */
+ a += zero;
+ b += zero;
+
+ /* special case Nan. */
+ if (__builtin_isnan (a))
+ return (__builtin_isnan (b) ? 0 : -1);
+
+ if (a == b)
+ return 0;
+
+ /* special case infinity: identical infinities already returned 0 above, so B cannot match here (it may be the opposite-signed infinity). */
+ if (__builtin_isinf (a))
+ return -1;
+
+ /* punt on denormal numbers. */
+ if (!__builtin_isnormal (a) || !__builtin_isnormal (b))
+ return -1;
+
+ NAME (crack) (a, &sign_a, &exponent_a, &mantissa_a);
+ NAME (crack) (b, &sign_b, &exponent_b, &mantissa_b);
+
+ /* If the sign is different, there is no hope. */
+ if (sign_a != sign_b)
+ return -1;
+
+ /* If the exponent is off by 1, see if the values straddle the power of two,
+ and adjust things to do the mantissa check if we can. */
+ if ((exponent_a == (exponent_b+1)) || (exponent_a == (exponent_b-1)))
+ {
+ TYPE big = FMAX (a, b);
+ TYPE small = FMIN (a, b);
+ TYPE diff = FABS (a - b); /* shadows the outer integer diff inside this block */
+ unsigned int sign_big, sign_small, sign_test;
+ unsigned int exponent_big, exponent_small, exponent_test;
+ UNS_TYPE mantissa_big, mantissa_small, mantissa_test;
+
+ NAME (crack) (big, &sign_big, &exponent_big, &mantissa_big);
+ NAME (crack) (small, &sign_small, &exponent_small, &mantissa_small);
+
+ NAME (crack) (small - diff, &sign_test, &exponent_test, &mantissa_test); /* step the same distance away on the small side */
+ if ((sign_test == sign_small) && (exponent_test == exponent_small))
+ {
+ mantissa_a = mantissa_small;
+ mantissa_b = mantissa_test;
+ }
+
+ else
+ {
+ NAME (crack) (big + diff, &sign_test, &exponent_test, &mantissa_test); /* otherwise try the same step on the big side */
+ if ((sign_test == sign_big) && (exponent_test == exponent_big))
+ {
+ mantissa_a = mantissa_big;
+ mantissa_b = mantissa_test;
+ }
+
+ else
+ return -1;
+ }
+ }
+
+ else if (exponent_a != exponent_b)
+ return -1;
+
+ diff = UNS_ABS (mantissa_a - mantissa_b);
+ for (i = MAN_SIZE; i > 0; i--) /* scan from the top mantissa bit down */
+ {
+ if ((diff & ((UNS_TYPE)1) << (i-1)) != 0)
+ return i;
+ }
+
+ return -1; /* no differing bit found within the low MAN_SIZE bits */
+}
+
+
+/*
+ * Turn off inlining to make code inspection easier.
+ */
+
+static void NAME (asm_div) (void) __attribute__((__noinline__));
+static void NAME (vector_div) (void) __attribute__((__noinline__));
+static void NAME (scalar_div) (void) __attribute__((__noinline__));
+static void NAME (asm_rsqrt) (void) __attribute__((__noinline__));
+static void NAME (vector_rsqrt) (void) __attribute__((__noinline__));
+static void NAME (scalar_rsqrt) (void) __attribute__((__noinline__));
+static void NAME (check_div) (const char *) __attribute__((__noinline__));
+static void NAME (check_rsqrt) (const char *) __attribute__((__noinline__));
+static void NAME (run) (void) __attribute__((__noinline__));
+
+
+/*
+ * Division function that might be vectorized: div_output[i] = 1.0 / div_input[i].
+ */
+
+static void
+NAME (vector_div) (void)
+{
+ size_t i;
+
+ for (i = 0; i < DIV_SIZE; i++)
+ NAME (div_output)[i] = ((TYPE) 1.0) / NAME (div_input)[i].x; /* plain loop with no barrier, left to the compiler */
+}
+
+/*
+ * Division function that is not vectorized: div_output[i] = 1.0 / div_input[i].
+ */
+
+static void
+NAME (scalar_div) (void)
+{
+ size_t i;
+
+ for (i = 0; i < DIV_SIZE; i++)
+ {
+ TYPE x = ((TYPE) 1.0) / NAME (div_input)[i].x;
+ TYPE y;
+ __asm__ ("" : "=d" (y) : "0" (x)); /* empty asm copies x to y via one register; presumably this is what keeps the loop scalar */
+ NAME (div_output)[i] = y;
+ }
+}
+
+/*
+ * Generate the division instruction via asm; the results fill div_expected.
+ */
+
+static void
+NAME (asm_div) (void)
+{
+ size_t i;
+
+ for (i = 0; i < DIV_SIZE; i++)
+ {
+ TYPE x;
+ __asm__ (ASMDIV " %0,%1,%2" /* x = 1.0 / input, using the ASMDIV instruction directly */
+ : "=d" (x)
+ : "d" ((TYPE) 1.0), "d" (NAME (div_input)[i].x));
+ NAME (div_expected)[i] = x; /* reference value later compared by NAME (check_div) */
+ }
+}
+
+/*
+ * Reciprocal square root function that might be vectorized: rsqrt_output[i] = 1.0 / SQRT (rsqrt_input[i]).
+ */
+
+static void
+NAME (vector_rsqrt) (void)
+{
+ size_t i;
+
+ for (i = 0; i < RSQRT_SIZE; i++)
+ NAME (rsqrt_output)[i] = ((TYPE) 1.0) / SQRT (NAME (rsqrt_input)[i].x); /* plain loop with no barrier, left to the compiler */
+}
+
+/*
+ * Reciprocal square root function that is not vectorized: rsqrt_output[i] = 1.0 / SQRT (rsqrt_input[i]).
+ */
+
+static void
+NAME (scalar_rsqrt) (void)
+{
+ size_t i;
+
+ for (i = 0; i < RSQRT_SIZE; i++)
+ {
+ TYPE x = ((TYPE) 1.0) / SQRT (NAME (rsqrt_input)[i].x);
+ TYPE y;
+ __asm__ ("" : "=d" (y) : "0" (x)); /* empty asm copies x to y via one register; presumably this is what keeps the loop scalar */
+ NAME (rsqrt_output)[i] = y;
+ }
+}
+
+/*
+ * Generate the 1/sqrt instructions via asm; the results fill rsqrt_expected.
+ */
+
+static void
+NAME (asm_rsqrt) (void)
+{
+ size_t i;
+
+ for (i = 0; i < RSQRT_SIZE; i++)
+ {
+ TYPE x;
+ TYPE y;
+ __asm__ (ASMSQRT " %0,%1" : "=d" (x) : "d" (NAME (rsqrt_input)[i].x)); /* x = sqrt (input) via the ASMSQRT instruction */
+ __asm__ (ASMDIV " %0,%1,%2" : "=d" (y) : "d" ((TYPE) 1.0), "d" (x)); /* y = 1.0 / x via the ASMDIV instruction */
+ NAME (rsqrt_expected)[i] = y; /* reference value later compared by NAME (check_rsqrt) */
+ }
+}
+
+
+/*
+ * Functions to abort or report errors.
+ */
+
+static int NAME (error_count) = 0;
+
+#ifdef VERBOSE
+static int NAME (max_bits_div) = 0;
+static int NAME (max_bits_rsqrt) = 0;
+#endif
+
+
+/*
+ * Compare div_expected with div_output; TEST labels the VERBOSE report lines.
+ */
+
+static void
+NAME (check_div) (const char *test)
+{
+ size_t i;
+ int b;
+
+ for (i = 0; i < DIV_SIZE; i++)
+ {
+ TYPE exp = NAME (div_expected)[i];
+ TYPE out = NAME (div_output)[i];
+ b = NAME (math_diff) (exp, out, BDIV); /* 0 = match, < 0 = significantly different, > 0 = differing bit position */
+
+#ifdef VERBOSE
+ if (b != 0)
+ {
+ NAME (union) u_in = NAME (div_input)[i];
+ NAME (union) u_exp;
+ NAME (union) u_out;
+ char explanation[64];
+ const char *p_exp;
+
+ if (b < 0)
+ p_exp = "failed";
+ else
+ {
+ p_exp = explanation;
+ sprintf (explanation, "%d bit error%s", b, (b > BDIV) ? ", failed" : ""); /* same threshold as the error count below */
+ }
+
+ u_exp.x = exp; /* unions expose the raw bits for the hex columns */
+ u_out.x = out;
+ printf ("%s %s %s for 1.0 / %g [0x%llx], expected %g [0x%llx], got %g [0x%llx]\n",
+ TNAME (TYPE), test, p_exp,
+ (double) u_in.x, (unsigned long long) u_in.i,
+ (double) exp, (unsigned long long) u_exp.i,
+ (double) out, (unsigned long long) u_out.i);
+ }
+#endif
+
+ if (b < 0 || b > BDIV) /* a difference of up to BDIV bits is tolerated */
+ NAME (error_count)++;
+
+#ifdef VERBOSE
+ if (b > NAME (max_bits_div)) /* track the worst bit difference seen */
+ NAME (max_bits_div) = b;
+#endif
+ }
+}
+
+static void
+NAME (check_rsqrt) (const char *test) /* compare rsqrt_expected with rsqrt_output; TEST labels the VERBOSE report lines */
+{
+ size_t i;
+ int b;
+
+ for (i = 0; i < RSQRT_SIZE; i++)
+ {
+ TYPE exp = NAME (rsqrt_expected)[i];
+ TYPE out = NAME (rsqrt_output)[i];
+ b = NAME (math_diff) (exp, out, BRSQRT); /* 0 = match, < 0 = significantly different, > 0 = differing bit position */
+
+#ifdef VERBOSE
+ if (b != 0)
+ {
+ NAME (union) u_in = NAME (rsqrt_input)[i];
+ NAME (union) u_exp;
+ NAME (union) u_out;
+ char explanation[64];
+ const char *p_exp;
+
+ if (b < 0)
+ p_exp = "failed";
+ else
+ {
+ p_exp = explanation;
+ sprintf (explanation, "%d bit error%s", b, (b > BRSQRT) ? ", failed" : ""); /* use the rsqrt tolerance, matching the error count below */
+ }
+
+ u_exp.x = exp; /* unions expose the raw bits for the hex columns */
+ u_out.x = out;
+ printf ("%s %s %s for 1 / sqrt (%g) [0x%llx], expected %g [0x%llx], got %g [0x%llx]\n",
+ TNAME (TYPE), test, p_exp,
+ (double) u_in.x, (unsigned long long) u_in.i,
+ (double) exp, (unsigned long long) u_exp.i,
+ (double) out, (unsigned long long) u_out.i);
+ }
+#endif
+
+ if (b < 0 || b > BRSQRT) /* a difference of up to BRSQRT bits is tolerated */
+ NAME (error_count)++;
+
+#ifdef VERBOSE
+ if (b > NAME (max_bits_rsqrt)) /* track the worst bit difference seen */
+ NAME (max_bits_rsqrt) = b;
+#endif
+ }
+}
+
+
+/*
+ * Now do everything: build the asm reference values, then check the scalar and vector variants against them.
+ */
+
+static void
+NAME (run) (void)
+{
+#ifdef VERBOSE
+ printf ("start run_%s, divide size = %ld, rsqrt size = %ld, %d bit%s for a/b, %d bit%s for 1/sqrt(a)\n",
+ TNAME (TYPE),
+ (long)DIV_SIZE,
+ (long)RSQRT_SIZE,
+ BDIV, (BDIV == 1) ? "" : "s",
+ BRSQRT, (BRSQRT == 1) ? "" : "s");
+#endif
+
+ NAME (asm_div) (); /* fills div_expected */
+
+ NAME (scalar_div) (); /* fills div_output */
+ NAME (check_div) ("scalar");
+
+ NAME (vector_div) (); /* overwrites div_output */
+ NAME (check_div) ("vector");
+
+ NAME (asm_rsqrt) (); /* fills rsqrt_expected */
+
+ NAME (scalar_rsqrt) (); /* fills rsqrt_output */
+ NAME (check_rsqrt) ("scalar");
+
+ NAME (vector_rsqrt) (); /* overwrites rsqrt_output */
+ NAME (check_rsqrt) ("vector");
+
+#ifdef VERBOSE
+ printf ("end run_%s, errors = %d, max div bits = %d, max rsqrt bits = %d\n",
+ TNAME (TYPE),
+ NAME (error_count),
+ NAME (max_bits_div),
+ NAME (max_bits_rsqrt));
+#endif
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 19d4fafa2c7..f5633014f37 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -938,6 +938,30 @@ proc check_vmx_hw_available { } {
}]
}
+proc check_ppc_recip_hw_available { } {
+ return [check_cached_effective_target ppc_recip_hw_available {
+ # Some simulators may not support FRE/FRES/FRSQRTE/FRSQRTES, so run a probe
+ # that executes all four.  EABI targets and, for now, Darwin yield 0 without probing.
+ if { [istarget powerpc-*-eabi] || [istarget powerpc*-*-eabispe] || [istarget *-*-darwin*]} {
+ expr 0
+ } else {
+ set options "-mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb"
+ check_runtime_nocache ppc_recip_hw_available {
+ volatile double d_recip, d_rsqrt, d_four = 4.0;
+ volatile float f_recip, f_rsqrt, f_four = 4.0f;
+ int main()
+ {
+ asm volatile ("fres %0,%1" : "=f" (f_recip) : "f" (f_four));
+ asm volatile ("fre %0,%1" : "=d" (d_recip) : "d" (d_four));
+ asm volatile ("frsqrtes %0,%1" : "=f" (f_rsqrt) : "f" (f_four));
+ asm volatile ("frsqrte %0,%1" : "=d" (d_rsqrt) : "d" (d_four));
+ return 0;
+ }
+ } $options
+ }
+ }]
+}
+
# Return 1 if the target supports executing AltiVec and Cell PPU
# instructions, 0 otherwise. Cache the result.
@@ -2678,6 +2702,8 @@ proc is-effective-target { arg } {
} else {
switch $arg {
"vmx_hw" { set selected [check_vmx_hw_available] }
+ "vsx_hw" { set selected [check_vsx_hw_available] }
+ "ppc_recip_hw" { set selected [check_ppc_recip_hw_available] }
"named_sections" { set selected [check_named_sections_available] }
"gc_sections" { set selected [check_gc_sections_available] }
"cxa_atexit" { set selected [check_cxa_atexit_available] }
@@ -2697,6 +2723,8 @@ proc is-effective-target-keyword { arg } {
# These have different names for their check_* procs.
switch $arg {
"vmx_hw" { return 1 }
+ "vsx_hw" { return 1 }
+ "ppc_recip_hw" { return 1 }
"named_sections" { return 1 }
"gc_sections" { return 1 }
"cxa_atexit" { return 1 }