Add -mrecip changes to make gromacs faster

git-svn-id: https://gcc.gnu.org/svn/gcc/branches/ibm/gcc-4_4-branch@157917 138bc75d-0d04-0410-961f-82ee72b054a4
author: Michael Meissner <meissner@linux.vnet.ibm.com> 2010-04-01 17:20:58 +0000
committer: Michael Meissner <meissner@linux.vnet.ibm.com> 2010-04-01 17:20:58 +0000
commit: 0c831bb685c6872226b653c8802be6237cfa9bd8 (patch)
tree: dbcd4ef24ee3d53e1f19184e7df9d4d5f3f3ae4e
parent: 3c3686b95104e5042ba8d046bcd158e521f54378 (diff)
9 files changed, 161 insertions, 90 deletions
diff --git a/gcc/ChangeLog.ibm b/gcc/ChangeLog.ibm
index 1d582a19d8a..87216504c3c 100644
--- a/gcc/ChangeLog.ibm
+++ b/gcc/ChangeLog.ibm
@@ -1,5 +1,40 @@
 2010-03-31  Michael Meissner  <meissner@linux.vnet.ibm.com>
 
+	* doc/extend.texi (__builtin_rsqrt): Document.
+
+	* doc/invoke.texi (-mrecip): Document.
+	(-mno-recip): Ditto.
+	(-mrecip-passes=n): New switch to control the number of passes for
+	reciprocal square root estimate.
+
+	* config/rs6000/rs6000-protos.h (rs6000_emit_swrsqrt): Rename from
+	rs6000_emit_swrsqrtsf.
+
+	* config/rs6000/rs6000.opt (-mrecip): Change documentation.
+	(-mrecip-passes=n): New switch.
+
+	* config/rs6000/rs6000-builtin.def (RS6000_BUILTIN_RSQRT): New
+	builtin.
+
+	* config/rs6000/rs6000.c (rs6000_override_options): Don't allow
+	-mvsx -mno-altivec.  Set rs6000_recip_passes to 2 on power6 and
+	power7, or 3 on other machines.  Add some enum casts.
+	(rs6000_expand_builtin): Add __builtin_rsqrt support.
+	(rs6000_init_builtins): Ditto.
+	(rs6000_builtin_reciprocal): Ditto.
+	(rs6000_emit_swdivdf): Emit correct type for MULT.
+	(rs6000_emit_swrsqrt): Rewrite.  Support both single and double
+	precision.  Add support for reducing the number of Newton-Raphson
+	passes on newer machines.
+
+	* config/rs6000/vsx.md (UNSPEC_VSX_RSQRTE): Delete.
+	(vsx_rsqrte<mode>2): Use UNSPEC_RSQRT.
+	(FP2): New iterator.
+	(rsqrt<mode>2): Rename from rsqrtsf2, and add DFmode support.
+	Call rs6000_emit_swrsqrt instead of rs6000_emit_swrsqrtsf.
+	(rsqrtsf_internal1): Rename from rsqrt_internal1.  Add test for
+	TARGET_SINGLE_FLOAT for completeness.
+
 	(back ported from mainline, 2010-03-02, Jeff Law)
 	PR middle-end/42431
 	* reload1.c (rtx_p, substitute_stack): Declare.
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def
index e66e8c4318f..968f3321ae0 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -991,4 +991,5 @@ RS6000_BUILTIN(POWER7_BUILTIN_BPERMD,			RS6000_BTC_CONST)
 RS6000_BUILTIN(RS6000_BUILTIN_RECIP,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_RECIPF,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_RSQRTF,			RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(RS6000_BUILTIN_RSQRT,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_BSWAP_HI,			RS6000_BTC_CONST)
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index b41dbf39387..d6955616107 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -107,7 +107,7 @@ extern void rs6000_split_compare_and_swapqhi (rtx, rtx, rtx, rtx, rtx, rtx);
 extern void rs6000_split_lock_test_and_set (rtx, rtx, rtx, rtx);
 extern void rs6000_emit_swdivsf (rtx, rtx, rtx);
 extern void rs6000_emit_swdivdf (rtx, rtx, rtx);
-extern void rs6000_emit_swrsqrtsf (rtx, rtx);
+extern void rs6000_emit_swrsqrt (rtx, rtx);
 extern void output_toc (FILE *, rtx, int, enum machine_mode);
 extern void rs6000_initialize_trampoline (rtx, rtx, rtx);
 extern rtx rs6000_longcall_ref (rtx);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6c70f2cd61b..a99a98ead79 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -2309,8 +2309,7 @@ rs6000_override_options (const char *default_cpu)
 	}
     }
 
-  /* Add some warnings for VSX.  Enable -maltivec unless the user explicitly
-     used -mno-altivec  */
+  /* Add some warnings for VSX.  */
   if (TARGET_VSX)
     {
       const char *msg = NULL;
@@ -2331,14 +2330,20 @@ rs6000_override_options (const char *default_cpu)
 	msg = N_("-mvsx used with little endian code");
       else if (TARGET_AVOID_XFORM > 0)
 	msg = N_("-mvsx needs indexed addressing");
+      else if (!TARGET_ALTIVEC && (target_flags_explicit & MASK_ALTIVEC))
+        {
+	  if (target_flags_explicit & MASK_VSX)
+	    msg = N_("-mvsx and -mno-altivec are incompatible");
+	  else
+	    msg = N_("-mno-altivec disables vsx");
+        }
 
       if (msg)
 	{
 	  warning (0, msg);
 	  target_flags &= ~ MASK_VSX;
 	}
-      else if (TARGET_VSX && !TARGET_ALTIVEC
-	       && (target_flags_explicit & MASK_ALTIVEC) == 0)
+      else if (TARGET_VSX && !TARGET_ALTIVEC)
 	target_flags |= MASK_ALTIVEC;
     }
 
@@ -2496,6 +2501,11 @@ rs6000_override_options (const char *default_cpu)
 				 || rs6000_cpu == PROCESSOR_POWER6
 				 || rs6000_cpu == PROCESSOR_POWER7);
 
+  /* Set the default # of passes to use for -mrecip.  */
+  if (rs6000_recip_passes < 0)
+    rs6000_recip_passes = (rs6000_cpu == PROCESSOR_POWER6
+			   || rs6000_cpu == PROCESSOR_POWER7) ? 2 : 3;
+
   /* Allow debug switches to override the above settings.  */
   if (TARGET_ALWAYS_HINT > 0)
     rs6000_always_hint = TARGET_ALWAYS_HINT;
@@ -2524,7 +2534,8 @@ rs6000_override_options (const char *default_cpu)
       else if (! strcmp (rs6000_sched_costly_dep_str, "store_to_load"))
 	rs6000_sched_costly_dep = store_to_load_dep_costly;
       else
-	rs6000_sched_costly_dep = atoi (rs6000_sched_costly_dep_str);
+	rs6000_sched_costly_dep = ((enum rs6000_dependence_cost)
+				   atoi (rs6000_sched_costly_dep_str));
     }
 
   /* Handle -minsert-sched-nops option.  */
@@ -2540,7 +2551,8 @@ rs6000_override_options (const char *default_cpu)
       else if (! strcmp (rs6000_sched_insert_nops_str, "regroup_exact"))
 	rs6000_sched_insert_nops = sched_finish_regroup_exact;
       else
-	rs6000_sched_insert_nops = atoi (rs6000_sched_insert_nops_str);
+	rs6000_sched_insert_nops = ((enum rs6000_nop_insertion)
+				    atoi (rs6000_sched_insert_nops_str));
     }
 
 #ifdef TARGET_REGNAMES
@@ -10802,6 +10814,9 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
   if (fcode == RS6000_BUILTIN_RSQRTF)
       return rs6000_expand_unop_builtin (CODE_FOR_rsqrtsf2, exp, target);
 
+  if (fcode == RS6000_BUILTIN_RSQRT)
+      return rs6000_expand_unop_builtin (CODE_FOR_rsqrtdf2, exp, target);
+
   if (fcode == RS6000_BUILTIN_BSWAP_HI)
     return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target);
 
@@ -11083,6 +11098,12 @@ rs6000_init_builtins (void)
 				     "__builtin_rsqrtf");
       def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrtf", ftype,
 		   RS6000_BUILTIN_RSQRTF);
+
+      ftype = builtin_function_type (DFmode, DFmode, VOIDmode, VOIDmode,
+				     RS6000_BUILTIN_RSQRT,
+				     "__builtin_rsqrt");
+      def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrt", ftype,
+		   RS6000_BUILTIN_RSQRT);
     }
   if (TARGET_POPCNTB)
     {
@@ -24796,6 +24817,9 @@ rs6000_builtin_reciprocal (unsigned int fn, bool md_fn,
   else
     switch (fn)
       {
+      case BUILT_IN_SQRT:
+	return rs6000_builtin_decls[RS6000_BUILTIN_RSQRT];
+
       case BUILT_IN_SQRTF:
 	return rs6000_builtin_decls[RS6000_BUILTIN_RSQRTF];
 
@@ -24875,7 +24899,7 @@ rs6000_emit_swdivdf (rtx dst, rtx n, rtx d)
   /* e0 = 1. - d * x0 */
   emit_insn (gen_rtx_SET (VOIDmode, e0,
 			  gen_rtx_MINUS (DFmode, one,
-					 gen_rtx_MULT (SFmode, d, x0))));
+					 gen_rtx_MULT (DFmode, d, x0))));
   /* y1 = x0 + e0 * x0 */
   emit_insn (gen_rtx_SET (VOIDmode, y1,
 			  gen_rtx_PLUS (DFmode,
@@ -24908,88 +24932,63 @@ rs6000_emit_swdivdf (rtx dst, rtx n, rtx d)
 }
 
 
-/* Newton-Raphson approximation of single-precision floating point rsqrt.
-   Assumes no trapping math and finite arguments.  */
+/* Newton-Raphson approximation of single/double-precision floating point
+   rsqrt.  Assumes no trapping math and finite arguments.  */
 
 void
-rs6000_emit_swrsqrtsf (rtx dst, rtx src)
+rs6000_emit_swrsqrt (rtx dst, rtx src)
 {
-  rtx x0, x1, x2, y1, u0, u1, u2, v0, v1, v2, t0,
-    half, one, halfthree, c1, cond, label;
+  enum machine_mode mode = GET_MODE (src);
+  rtx x0 = gen_reg_rtx (mode);
 
-  x0 = gen_reg_rtx (SFmode);
-  x1 = gen_reg_rtx (SFmode);
-  x2 = gen_reg_rtx (SFmode);
-  y1 = gen_reg_rtx (SFmode);
-  u0 = gen_reg_rtx (SFmode);
-  u1 = gen_reg_rtx (SFmode);
-  u2 = gen_reg_rtx (SFmode);
-  v0 = gen_reg_rtx (SFmode);
-  v1 = gen_reg_rtx (SFmode);
-  v2 = gen_reg_rtx (SFmode);
-  t0 = gen_reg_rtx (SFmode);
-  halfthree = gen_reg_rtx (SFmode);
-  cond = gen_rtx_REG (CCFPmode, CR1_REGNO);
-  label = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ());
+  gcc_assert (flag_finite_math_only && !flag_trapping_math);
+  gcc_assert (mode == SFmode || mode == DFmode);
+
+  /* x0 = rsqrt estimate */
+  emit_insn (gen_rtx_SET (VOIDmode, x0,
+			  gen_rtx_UNSPEC (mode, gen_rtvec (1, src),
+					  UNSPEC_RSQRT)));
 
-  /* check 0.0, 1.0, NaN, Inf by testing src * src = src */
-  emit_insn (gen_rtx_SET (VOIDmode, t0,
-			  gen_rtx_MULT (SFmode, src, src)));
+  if (rs6000_recip_passes > 0)
+    {
+      REAL_VALUE_TYPE dconst3_2;
+      int i;
+      rtx halfthree;
+      rtx y = gen_reg_rtx (mode);
+      rtx m;
+      rtx d;
 
-  emit_insn (gen_rtx_SET (VOIDmode, cond,
-			  gen_rtx_COMPARE (CCFPmode, t0, src)));
-  c1 = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
-  emit_unlikely_jump (c1, label);
+      real_from_integer (&dconst3_2, VOIDmode, 3, 0, 0);
+      SET_REAL_EXP (&dconst3_2, REAL_EXP (&dconst3_2) - 1);
+      d = CONST_DOUBLE_FROM_REAL_VALUE (dconst3_2, mode);
+      halfthree = force_reg (mode, d);
 
-  half = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode));
-  one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode));
+      /* y = 0.5 * src = 1.5 * src - src -> fewer constants */
+      m = gen_rtx_MULT (mode, src, halfthree),
+      emit_insn (gen_rtx_SET (VOIDmode, y, gen_rtx_MINUS (mode, m, src)));
 
-  /* halfthree = 1.5 = 1.0 + 0.5 */
-  emit_insn (gen_rtx_SET (VOIDmode, halfthree,
-			  gen_rtx_PLUS (SFmode, one, half)));
+      for (i = 0; i < rs6000_recip_passes; i++)
+	{
+	  rtx x1 = gen_reg_rtx (mode);
+	  rtx u = gen_reg_rtx (mode);
+	  rtx v = gen_reg_rtx (mode);
 
-  /* x0 = rsqrt estimate */
-  emit_insn (gen_rtx_SET (VOIDmode, x0,
-			  gen_rtx_UNSPEC (SFmode, gen_rtvec (1, src),
-					  UNSPEC_RSQRT)));
+	  /* x1 = x0 * (1.5 - y * (x0 * x0)) */
+	  emit_insn (gen_rtx_SET (VOIDmode, u,
+				  gen_rtx_MULT (mode, x0, x0)));
 
-  /* y1 = 0.5 * src = 1.5 * src - src -> fewer constants */
-  emit_insn (gen_rtx_SET (VOIDmode, y1,
-			  gen_rtx_MINUS (SFmode,
-					 gen_rtx_MULT (SFmode, src, halfthree),
-					 src)));
+	  m = gen_rtx_MULT (mode, y, u);
+	  emit_insn (gen_rtx_SET (VOIDmode, v,
+				  gen_rtx_MINUS (mode, halfthree, m)));
 
-  /* x1 = x0 * (1.5 - y1 * (x0 * x0)) */
-  emit_insn (gen_rtx_SET (VOIDmode, u0,
-			  gen_rtx_MULT (SFmode, x0, x0)));
-  emit_insn (gen_rtx_SET (VOIDmode, v0,
-			  gen_rtx_MINUS (SFmode,
-					 halfthree,
-					 gen_rtx_MULT (SFmode, y1, u0))));
-  emit_insn (gen_rtx_SET (VOIDmode, x1,
-			  gen_rtx_MULT (SFmode, x0, v0)));
-
-  /* x2 = x1 * (1.5 - y1 * (x1 * x1)) */
-  emit_insn (gen_rtx_SET (VOIDmode, u1,
-			  gen_rtx_MULT (SFmode, x1, x1)));
-  emit_insn (gen_rtx_SET (VOIDmode, v1,
-			  gen_rtx_MINUS (SFmode,
-					 halfthree,
-					 gen_rtx_MULT (SFmode, y1, u1))));
-  emit_insn (gen_rtx_SET (VOIDmode, x2,
-			  gen_rtx_MULT (SFmode, x1, v1)));
-
-  /* dst = x2 * (1.5 - y1 * (x2 * x2)) */
-  emit_insn (gen_rtx_SET (VOIDmode, u2,
-			  gen_rtx_MULT (SFmode, x2, x2)));
-  emit_insn (gen_rtx_SET (VOIDmode, v2,
-			  gen_rtx_MINUS (SFmode,
-					 halfthree,
-					 gen_rtx_MULT (SFmode, y1, u2))));
-  emit_insn (gen_rtx_SET (VOIDmode, dst,
-			  gen_rtx_MULT (SFmode, x2, v2)));
+	  emit_insn (gen_rtx_SET (VOIDmode, x1,
+				  gen_rtx_MULT (mode, x0, v)));
+	  x0 = x1;
+	}
+    }
 
-  emit_label (XEXP (label, 0));
+  emit_move_insn (dst, x0);
+  return;
 }
 
 /* Emit popcount intrinsic on TARGET_POPCNTB (Power5) and TARGET_POPCNTD
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 9b323e262cf..6b95e0becd2 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -213,6 +213,13 @@
   (DD "TARGET_DFP")
   (TD "TARGET_DFP")])
 
+; Single/double precision
+(define_mode_iterator FP2 [
+  (SF "TARGET_HARD_FLOAT 
+   && ((TARGET_FPRS && TARGET_SINGLE_FLOAT) || TARGET_E500_SINGLE)")
+  (DF "TARGET_HARD_FLOAT 
+   && ((TARGET_FPRS && TARGET_DOUBLE_FLOAT) || TARGET_E500_DOUBLE)")])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI "b") (HI "h") (SI "w") (DI "d")])
@@ -5861,22 +5868,23 @@
   "fsqrt %0,%1"
   [(set_attr "type" "dsqrt")])
 
-(define_expand "rsqrtsf2"
-  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
-	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
-		   UNSPEC_RSQRT))]
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FP2 0 "gpc_reg_operand" "")
+	(unspec:FP2 [(match_operand:FP2 1 "gpc_reg_operand" "")]
+		    UNSPEC_RSQRT))]
   "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
    && flag_finite_math_only && !flag_trapping_math"
 {
-  rs6000_emit_swrsqrtsf (operands[0], operands[1]);
+  rs6000_emit_swrsqrt (operands[0], operands[1]);
   DONE;
 })
 
-(define_insn "*rsqrt_internal1"
+(define_insn "*rsqrtsf_internal1"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
 		   UNSPEC_RSQRT))]
-  "TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT"
+  "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
+   && TARGET_PPC_GFXOPT"
   "frsqrte %0,%1"
   [(set_attr "type" "fp")])
 
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index d3fa10f856f..47da0156792 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -187,8 +187,13 @@ Target Report Var(TARGET_XL_COMPAT)
 Conform more closely to IBM XLC semantics
 
 mrecip
-Target Report Var(TARGET_RECIP)
-Generate software reciprocal sqrt for better throughput
+Target Report Var(TARGET_RECIP) Init(-1)
+Generate software reciprocal square root for better throughput.
+
+mrecip-passes=
+Target Report UInteger Joined Var(rs6000_recip_passes) Init(-1)
+Number of fixup passes after doing the reciprocal sqrt estimate.  Default is 2
+for the power6 and newer machines, and 3 for older machines.
 
 mno-fp-in-toc
 Target Report RejectNegative Var(TARGET_NO_FP_IN_TOC)
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 7d572a48412..e6fefad232f 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -195,7 +195,7 @@
    (UNSPEC_VSX_MSUB		511)
    (UNSPEC_VSX_NMADD		512)
    (UNSPEC_VSX_NMSUB		513)
-   (UNSPEC_VSX_RSQRTE		514)
+   ; 514 deleted
    (UNSPEC_VSX_TDIV		515)
    (UNSPEC_VSX_TSQRT		516)
    (UNSPEC_VSX_XXPERMDI		517)
@@ -449,7 +449,7 @@
 (define_insn "vsx_rsqrte<mode>2"
   [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,?wa")
 	(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,wa")]
-		      UNSPEC_VSX_RSQRTE))]
+		      UNSPEC_RSQRT))]
   "VECTOR_UNIT_VSX_P (<MODE>mode)"
   "x<VSv>rsqrte<VSs> %x0,%x1"
   [(set_attr "type" "<VStype_simple>")
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 0d9862f52b9..ebf53e16001 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -11634,6 +11634,7 @@ GCC provides a few other builtins on Powerpc to access certain instructions:
 float __builtin_recipdivf (float, float);
 float __builtin_rsqrtf (float);
 double __builtin_recipdiv (double, double);
+double __builtin_rsqrt (double);
 long __builtin_bpermd (long, long);
 int __builtin_bswap16 (int);
 @end smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 3d4e720b004..edd8421f8bc 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -753,7 +753,8 @@ See RS/6000 and PowerPC Options.
 -mfloat-gprs=yes  -mfloat-gprs=no -mfloat-gprs=single -mfloat-gprs=double @gol
 -mprototype  -mno-prototype @gol
 -msim  -mmvme  -mads  -myellowknife  -memb  -msdata @gol
--msdata=@var{opt}  -mvxworks  -G @var{num}  -pthread}
+-msdata=@var{opt}  -mvxworks  -G @var{num}  -pthread @gol
+-mrecip -mno-recip -mrecip-passes=@var{num}}
 
 @emph{S/390 and zSeries Options}
 @gccoptlist{-mtune=@var{cpu-type}  -march=@var{cpu-type} @gol
@@ -14610,6 +14611,27 @@ when the linker is known to generate glue.
 Adds support for multithreading with the @dfn{pthreads} library.
 This option sets flags for both the preprocessor and linker.
 
+@item -mrecip
+@itemx -mno-recip
+@opindex mrecip
+This option enables GCC to use the FRSQRTE instruction for reciprocal
+square root, followed by Newton-Raphson steps to increase precision,
+instead of FSQRTS/FDIVS for single precision floating point arguments
+(or FSQRT/FDIV for double precision).  These instructions are generated
+only when @option{-funsafe-math-optimizations} is enabled together with
+@option{-ffinite-math-only} and @option{-fno-trapping-math}.  Note that
+while the throughput of the sequence is generally higher than the
+throughput of the non-reciprocal instruction, the precision of the
+sequence can be decreased by up to 2 ulp (i.e. the inverse of 1.0
+equals 0.99999994).
+
+@item -mrecip-passes=@var{num}
+@opindex mrecip-passes
+Control the number of Newton-Raphson passes to use after an estimate
+instruction if @option{-mrecip} was used.  The default is 2 on newer
+machines that support a higher-precision estimate instruction, and 3 on
+older machines.
+
 @end table
 
 @node S/390 and zSeries Options
author	Michael Meissner <meissner@linux.vnet.ibm.com>	2010-04-01 17:20:58 +0000
committer	Michael Meissner <meissner@linux.vnet.ibm.com>	2010-04-01 17:20:58 +0000
commit	0c831bb685c6872226b653c8802be6237cfa9bd8 (patch)
tree	dbcd4ef24ee3d53e1f19184e7df9d4d5f3f3ae4e
parent	3c3686b95104e5042ba8d046bcd158e521f54378 (diff)