diff options
author | Michael Meissner <meissner@linux.vnet.ibm.com> | 2010-04-01 17:20:58 +0000 |
---|---|---|
committer | Michael Meissner <meissner@linux.vnet.ibm.com> | 2010-04-01 17:20:58 +0000 |
commit | 0c831bb685c6872226b653c8802be6237cfa9bd8 (patch) | |
tree | dbcd4ef24ee3d53e1f19184e7df9d4d5f3f3ae4e | |
parent | 3c3686b95104e5042ba8d046bcd158e521f54378 (diff) |
Add -mrecip changes to make gromacs faster
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/ibm/gcc-4_4-branch@157917 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r-- | gcc/ChangeLog.ibm | 35 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000-builtin.def | 1 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000-protos.h | 2 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.c | 153 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.md | 22 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.opt | 9 | ||||
-rw-r--r-- | gcc/config/rs6000/vsx.md | 4 | ||||
-rw-r--r-- | gcc/doc/extend.texi | 1 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 24 |
9 files changed, 161 insertions, 90 deletions
diff --git a/gcc/ChangeLog.ibm b/gcc/ChangeLog.ibm index 1d582a19d8a..87216504c3c 100644 --- a/gcc/ChangeLog.ibm +++ b/gcc/ChangeLog.ibm @@ -1,5 +1,40 @@ 2010-03-31 Michael Meissner <meissner@linux.vnet.ibm.com> + * doc/extend.texi (__builtin_rsqrt): Document. + + * doc/invoke.texi (-mrecip): Document. + (-mno-recip): Ditto. + (-mrecip-passes=n): New switch to control the number of passes for + reciprocal square root estimate. + + * config/rs6000/rs6000-protos.h (rs6000_emit_swrsqrt): Rename from + rs6000_emit_swrsqrtsf. + + * config/rs6000/rs6000.opt (-mrecip): Change documentation. + (-mrecip-passes=n): New switch. + + * config/rs6000/rs6000-builtin.def (RS6000_BUILTIN_RSQRT): New + builtin. + + * config/rs6000/rs6000.c (rs6000_override_options): Don't allow + -mvsx -mno-altivec. Set rs6000_recip_passes to 2 on power6 and + power7, or 3 on other machines. Add some enum casts. + (rs6000_expand_builtin): Add __builtin_rsqrt support. + (rs6000_init_builtins): Ditto. + (rs6000_builtin_reciprocal): Ditto. + (rs6000_emit_swdivdf): Emit correct type for MULT. + (rs6000_emit_swrsqrt): Rewrite. Support both single and double + precision. Add support for reducing the number of Newton-Raphson + passes on newer machines. + + * config/rs6000/vsx.md (UNSPEC_VSX_RSQRTE): Delete. + (vsx_rsqrte<mode>2): Use UNSPEC_RSQRT. + (FP2): New iterator. + (rsqrt<mode>2): Rename from rsqrtsf2, and add DFmode support. + Call rs6000_emit_swrsqrt instead of rs6000_emit_swrsqrtsf. + (rsqrtsf_internal1): Rename from rsqrt_internal1. Add test for + TARGET_SINGLE_FLOAT for completeness. + (back ported from mainline, 2010-03-02, Jeff Law) PR middle-end/42431 * reload1.c (rtx_p, substitute_stack): Declare. 
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index e66e8c4318f..968f3321ae0 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -991,4 +991,5 @@ RS6000_BUILTIN(POWER7_BUILTIN_BPERMD, RS6000_BTC_CONST) RS6000_BUILTIN(RS6000_BUILTIN_RECIP, RS6000_BTC_FP_PURE) RS6000_BUILTIN(RS6000_BUILTIN_RECIPF, RS6000_BTC_FP_PURE) RS6000_BUILTIN(RS6000_BUILTIN_RSQRTF, RS6000_BTC_FP_PURE) +RS6000_BUILTIN(RS6000_BUILTIN_RSQRT, RS6000_BTC_FP_PURE) RS6000_BUILTIN(RS6000_BUILTIN_BSWAP_HI, RS6000_BTC_CONST) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index b41dbf39387..d6955616107 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -107,7 +107,7 @@ extern void rs6000_split_compare_and_swapqhi (rtx, rtx, rtx, rtx, rtx, rtx); extern void rs6000_split_lock_test_and_set (rtx, rtx, rtx, rtx); extern void rs6000_emit_swdivsf (rtx, rtx, rtx); extern void rs6000_emit_swdivdf (rtx, rtx, rtx); -extern void rs6000_emit_swrsqrtsf (rtx, rtx); +extern void rs6000_emit_swrsqrt (rtx, rtx); extern void output_toc (FILE *, rtx, int, enum machine_mode); extern void rs6000_initialize_trampoline (rtx, rtx, rtx); extern rtx rs6000_longcall_ref (rtx); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 6c70f2cd61b..a99a98ead79 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -2309,8 +2309,7 @@ rs6000_override_options (const char *default_cpu) } } - /* Add some warnings for VSX. Enable -maltivec unless the user explicitly - used -mno-altivec */ + /* Add some warnings for VSX. 
*/ if (TARGET_VSX) { const char *msg = NULL; @@ -2331,14 +2330,20 @@ rs6000_override_options (const char *default_cpu) msg = N_("-mvsx used with little endian code"); else if (TARGET_AVOID_XFORM > 0) msg = N_("-mvsx needs indexed addressing"); + else if (!TARGET_ALTIVEC && (target_flags_explicit & MASK_ALTIVEC)) + { + if (target_flags_explicit & MASK_VSX) + msg = N_("-mvsx and -mno-altivec are incompatible"); + else + msg = N_("-mno-altivec disables vsx"); + } if (msg) { warning (0, msg); target_flags &= ~ MASK_VSX; } - else if (TARGET_VSX && !TARGET_ALTIVEC - && (target_flags_explicit & MASK_ALTIVEC) == 0) + else if (TARGET_VSX && !TARGET_ALTIVEC) target_flags |= MASK_ALTIVEC; } @@ -2496,6 +2501,11 @@ rs6000_override_options (const char *default_cpu) || rs6000_cpu == PROCESSOR_POWER6 || rs6000_cpu == PROCESSOR_POWER7); + /* Set the default # of passes to use for -mrecip. */ + if (rs6000_recip_passes < 0) + rs6000_recip_passes = (rs6000_cpu == PROCESSOR_POWER6 + || rs6000_cpu == PROCESSOR_POWER7) ? 2 : 3; + /* Allow debug switches to override the above settings. */ if (TARGET_ALWAYS_HINT > 0) rs6000_always_hint = TARGET_ALWAYS_HINT; @@ -2524,7 +2534,8 @@ rs6000_override_options (const char *default_cpu) else if (! strcmp (rs6000_sched_costly_dep_str, "store_to_load")) rs6000_sched_costly_dep = store_to_load_dep_costly; else - rs6000_sched_costly_dep = atoi (rs6000_sched_costly_dep_str); + rs6000_sched_costly_dep = ((enum rs6000_dependence_cost) + atoi (rs6000_sched_costly_dep_str)); } /* Handle -minsert-sched-nops option. */ @@ -2540,7 +2551,8 @@ rs6000_override_options (const char *default_cpu) else if (! 
strcmp (rs6000_sched_insert_nops_str, "regroup_exact")) rs6000_sched_insert_nops = sched_finish_regroup_exact; else - rs6000_sched_insert_nops = atoi (rs6000_sched_insert_nops_str); + rs6000_sched_insert_nops = ((enum rs6000_nop_insertion) + atoi (rs6000_sched_insert_nops_str)); } #ifdef TARGET_REGNAMES @@ -10802,6 +10814,9 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, if (fcode == RS6000_BUILTIN_RSQRTF) return rs6000_expand_unop_builtin (CODE_FOR_rsqrtsf2, exp, target); + if (fcode == RS6000_BUILTIN_RSQRT) + return rs6000_expand_unop_builtin (CODE_FOR_rsqrtdf2, exp, target); + if (fcode == RS6000_BUILTIN_BSWAP_HI) return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target); @@ -11083,6 +11098,12 @@ rs6000_init_builtins (void) "__builtin_rsqrtf"); def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrtf", ftype, RS6000_BUILTIN_RSQRTF); + + ftype = builtin_function_type (DFmode, DFmode, VOIDmode, VOIDmode, + RS6000_BUILTIN_RSQRT, + "__builtin_rsqrt"); + def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrt", ftype, + RS6000_BUILTIN_RSQRT); } if (TARGET_POPCNTB) { @@ -24796,6 +24817,9 @@ rs6000_builtin_reciprocal (unsigned int fn, bool md_fn, else switch (fn) { + case BUILT_IN_SQRT: + return rs6000_builtin_decls[RS6000_BUILTIN_RSQRT]; + case BUILT_IN_SQRTF: return rs6000_builtin_decls[RS6000_BUILTIN_RSQRTF]; @@ -24875,7 +24899,7 @@ rs6000_emit_swdivdf (rtx dst, rtx n, rtx d) /* e0 = 1. - d * x0 */ emit_insn (gen_rtx_SET (VOIDmode, e0, gen_rtx_MINUS (DFmode, one, - gen_rtx_MULT (SFmode, d, x0)))); + gen_rtx_MULT (DFmode, d, x0)))); /* y1 = x0 + e0 * x0 */ emit_insn (gen_rtx_SET (VOIDmode, y1, gen_rtx_PLUS (DFmode, @@ -24908,88 +24932,63 @@ rs6000_emit_swdivdf (rtx dst, rtx n, rtx d) } -/* Newton-Raphson approximation of single-precision floating point rsqrt. - Assumes no trapping math and finite arguments. */ +/* Newton-Raphson approximation of single/double-precision floating point + rsqrt. Assumes no trapping math and finite arguments. 
*/ void -rs6000_emit_swrsqrtsf (rtx dst, rtx src) +rs6000_emit_swrsqrt (rtx dst, rtx src) { - rtx x0, x1, x2, y1, u0, u1, u2, v0, v1, v2, t0, - half, one, halfthree, c1, cond, label; + enum machine_mode mode = GET_MODE (src); + rtx x0 = gen_reg_rtx (mode); - x0 = gen_reg_rtx (SFmode); - x1 = gen_reg_rtx (SFmode); - x2 = gen_reg_rtx (SFmode); - y1 = gen_reg_rtx (SFmode); - u0 = gen_reg_rtx (SFmode); - u1 = gen_reg_rtx (SFmode); - u2 = gen_reg_rtx (SFmode); - v0 = gen_reg_rtx (SFmode); - v1 = gen_reg_rtx (SFmode); - v2 = gen_reg_rtx (SFmode); - t0 = gen_reg_rtx (SFmode); - halfthree = gen_reg_rtx (SFmode); - cond = gen_rtx_REG (CCFPmode, CR1_REGNO); - label = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ()); + gcc_assert (flag_finite_math_only && !flag_trapping_math); + gcc_assert (mode == SFmode || mode == DFmode); + + /* x0 = rsqrt estimate */ + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, src), + UNSPEC_RSQRT))); - /* check 0.0, 1.0, NaN, Inf by testing src * src = src */ - emit_insn (gen_rtx_SET (VOIDmode, t0, - gen_rtx_MULT (SFmode, src, src))); + if (rs6000_recip_passes > 0) + { + REAL_VALUE_TYPE dconst3_2; + int i; + rtx halfthree; + rtx y = gen_reg_rtx (mode); + rtx m; + rtx d; - emit_insn (gen_rtx_SET (VOIDmode, cond, - gen_rtx_COMPARE (CCFPmode, t0, src))); - c1 = gen_rtx_EQ (VOIDmode, cond, const0_rtx); - emit_unlikely_jump (c1, label); + real_from_integer (&dconst3_2, VOIDmode, 3, 0, 0); + SET_REAL_EXP (&dconst3_2, REAL_EXP (&dconst3_2) - 1); + d = CONST_DOUBLE_FROM_REAL_VALUE (dconst3_2, mode); + halfthree = force_reg (mode, d); - half = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode)); - one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode)); + /* y = 0.5 * src = 1.5 * src - src -> fewer constants */ + m = gen_rtx_MULT (mode, src, halfthree), + emit_insn (gen_rtx_SET (VOIDmode, y, gen_rtx_MINUS (mode, m, src))); - /* halfthree = 1.5 = 1.0 + 0.5 */ - emit_insn (gen_rtx_SET (VOIDmode, 
halfthree, - gen_rtx_PLUS (SFmode, one, half))); + for (i = 0; i < rs6000_recip_passes; i++) + { + rtx x1 = gen_reg_rtx (mode); + rtx u = gen_reg_rtx (mode); + rtx v = gen_reg_rtx (mode); - /* x0 = rsqrt estimate */ - emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_UNSPEC (SFmode, gen_rtvec (1, src), - UNSPEC_RSQRT))); + /* x1 = x0 * (1.5 - y * (x0 * x0)) */ + emit_insn (gen_rtx_SET (VOIDmode, u, + gen_rtx_MULT (mode, x0, x0))); - /* y1 = 0.5 * src = 1.5 * src - src -> fewer constants */ - emit_insn (gen_rtx_SET (VOIDmode, y1, - gen_rtx_MINUS (SFmode, - gen_rtx_MULT (SFmode, src, halfthree), - src))); + m = gen_rtx_MULT (mode, y, u); + emit_insn (gen_rtx_SET (VOIDmode, v, + gen_rtx_MINUS (mode, halfthree, m))); - /* x1 = x0 * (1.5 - y1 * (x0 * x0)) */ - emit_insn (gen_rtx_SET (VOIDmode, u0, - gen_rtx_MULT (SFmode, x0, x0))); - emit_insn (gen_rtx_SET (VOIDmode, v0, - gen_rtx_MINUS (SFmode, - halfthree, - gen_rtx_MULT (SFmode, y1, u0)))); - emit_insn (gen_rtx_SET (VOIDmode, x1, - gen_rtx_MULT (SFmode, x0, v0))); - - /* x2 = x1 * (1.5 - y1 * (x1 * x1)) */ - emit_insn (gen_rtx_SET (VOIDmode, u1, - gen_rtx_MULT (SFmode, x1, x1))); - emit_insn (gen_rtx_SET (VOIDmode, v1, - gen_rtx_MINUS (SFmode, - halfthree, - gen_rtx_MULT (SFmode, y1, u1)))); - emit_insn (gen_rtx_SET (VOIDmode, x2, - gen_rtx_MULT (SFmode, x1, v1))); - - /* dst = x2 * (1.5 - y1 * (x2 * x2)) */ - emit_insn (gen_rtx_SET (VOIDmode, u2, - gen_rtx_MULT (SFmode, x2, x2))); - emit_insn (gen_rtx_SET (VOIDmode, v2, - gen_rtx_MINUS (SFmode, - halfthree, - gen_rtx_MULT (SFmode, y1, u2)))); - emit_insn (gen_rtx_SET (VOIDmode, dst, - gen_rtx_MULT (SFmode, x2, v2))); + emit_insn (gen_rtx_SET (VOIDmode, x1, + gen_rtx_MULT (mode, x0, v))); + x0 = x1; + } + } - emit_label (XEXP (label, 0)); + emit_move_insn (dst, x0); + return; } /* Emit popcount intrinsic on TARGET_POPCNTB (Power5) and TARGET_POPCNTD diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 9b323e262cf..6b95e0becd2 100644 --- 
a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -213,6 +213,13 @@ (DD "TARGET_DFP") (TD "TARGET_DFP")]) +; Single/double precision +(define_mode_iterator FP2 [ + (SF "TARGET_HARD_FLOAT + && ((TARGET_FPRS && TARGET_SINGLE_FLOAT) || TARGET_E500_SINGLE)") + (DF "TARGET_HARD_FLOAT + && ((TARGET_FPRS && TARGET_DOUBLE_FLOAT) || TARGET_E500_DOUBLE)")]) + ; Various instructions that come in SI and DI forms. ; A generic w/d attribute, for things like cmpw/cmpd. (define_mode_attr wd [(QI "b") (HI "h") (SI "w") (DI "d")]) @@ -5861,22 +5868,23 @@ "fsqrt %0,%1" [(set_attr "type" "dsqrt")]) -(define_expand "rsqrtsf2" - [(set (match_operand:SF 0 "gpc_reg_operand" "=f") - (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] - UNSPEC_RSQRT))] +(define_expand "rsqrt<mode>2" + [(set (match_operand:FP2 0 "gpc_reg_operand" "") + (unspec:FP2 [(match_operand:FP2 1 "gpc_reg_operand" "")] + UNSPEC_RSQRT))] "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size && flag_finite_math_only && !flag_trapping_math" { - rs6000_emit_swrsqrtsf (operands[0], operands[1]); + rs6000_emit_swrsqrt (operands[0], operands[1]); DONE; }) -(define_insn "*rsqrt_internal1" +(define_insn "*rsqrtsf_internal1" [(set (match_operand:SF 0 "gpc_reg_operand" "=f") (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] UNSPEC_RSQRT))] - "TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT" + "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT + && TARGET_PPC_GFXOPT" "frsqrte %0,%1" [(set_attr "type" "fp")]) diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index d3fa10f856f..47da0156792 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -187,8 +187,13 @@ Target Report Var(TARGET_XL_COMPAT) Conform more closely to IBM XLC semantics mrecip -Target Report Var(TARGET_RECIP) -Generate software reciprocal sqrt for better throughput +Target Report Var(TARGET_RECIP) Init(-1) +Generate software reciprocal square root for better throughput. 
+ +mrecip-passes= +Target Report UInteger Joined Var(rs6000_recip_passes) Init(-1) +Number of fixup passes after doing the reciprocal sqrt estimate. Default is 2 +for the power6 and newer machines, and 3 for older machines. mno-fp-in-toc Target Report RejectNegative Var(TARGET_NO_FP_IN_TOC) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 7d572a48412..e6fefad232f 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -195,7 +195,7 @@ (UNSPEC_VSX_MSUB 511) (UNSPEC_VSX_NMADD 512) (UNSPEC_VSX_NMSUB 513) - (UNSPEC_VSX_RSQRTE 514) + ; 514 deleted (UNSPEC_VSX_TDIV 515) (UNSPEC_VSX_TSQRT 516) (UNSPEC_VSX_XXPERMDI 517) @@ -449,7 +449,7 @@ (define_insn "vsx_rsqrte<mode>2" [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,?wa") (unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,wa")] - UNSPEC_VSX_RSQRTE))] + UNSPEC_RSQRT))] "VECTOR_UNIT_VSX_P (<MODE>mode)" "x<VSv>rsqrte<VSs> %x0,%x1" [(set_attr "type" "<VStype_simple>") diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 0d9862f52b9..ebf53e16001 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -11634,6 +11634,7 @@ GCC provides a few other builtins on Powerpc to access certain instructions: float __builtin_recipdivf (float, float); float __builtin_rsqrtf (float); double __builtin_recipdiv (double, double); +double __builtin_rsqrt (double); long __builtin_bpermd (long, long); int __builtin_bswap16 (int); @end smallexample diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 3d4e720b004..edd8421f8bc 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -753,7 +753,8 @@ See RS/6000 and PowerPC Options. 
-mfloat-gprs=yes -mfloat-gprs=no -mfloat-gprs=single -mfloat-gprs=double @gol -mprototype -mno-prototype @gol -msim -mmvme -mads -myellowknife -memb -msdata @gol --msdata=@var{opt} -mvxworks -G @var{num} -pthread} +-msdata=@var{opt} -mvxworks -G @var{num} -pthread @gol +-mrecip -mno-recip -mrecip-passes=@var{num}} @emph{S/390 and zSeries Options} @gccoptlist{-mtune=@var{cpu-type} -march=@var{cpu-type} @gol @@ -14610,6 +14611,27 @@ when the linker is known to generate glue. Adds support for multithreading with the @dfn{pthreads} library. This option sets flags for both the preprocessor and linker. +@item -mrecip +@itemx -mno-recip +@opindex mrecip +This option will enable GCC to use FRSQRTE instruction for reciprocal +square root with an additional Newton-Raphson step to increase +precision instead of FSQRTS/FDIVS for single precision floating point +arguments. These instructions are generated only when +@option{-funsafe-math-optimizations} is enabled together with +@option{-ffinite-math-only} and @option{-fno-trapping-math}. Note that +while the throughput of the sequence is generally higher than the +throughput of the non-reciprocal instruction, the precision of the +sequence can be decreased by up to 2 ulp (i.e. the inverse of 1.0 +equals 0.99999994). + +@item -mrecip-passes=@var{num} +@opindex mrecip-passes +Control the number of Newton-Raphson passes to use after an estimate +instruction if @option{-mrecip} was used. The default is 2 on newer +machines that support higher precision estimate instruction and 3 on +older machines. + @end table @node S/390 and zSeries Options |