-rw-r--r--  gcc/ChangeLog.ibm  265
-rw-r--r--  gcc/REVISION  1
-rw-r--r--  gcc/builtins.c  32
-rw-r--r--  gcc/cfgcleanup.c  123
-rw-r--r--  gcc/common.opt  4
-rw-r--r--  gcc/config/i386/i386.md  15
-rw-r--r--  gcc/config/rs6000/altivec.md  33
-rw-r--r--  gcc/config/rs6000/predicates.md  102
-rw-r--r--  gcc/config/rs6000/rs6000-protos.h  3
-rw-r--r--  gcc/config/rs6000/rs6000.c  1608
-rw-r--r--  gcc/config/rs6000/rs6000.h  17
-rw-r--r--  gcc/config/rs6000/rs6000.md  379
-rw-r--r--  gcc/config/rs6000/rs6000.opt  8
-rw-r--r--  gcc/config/rs6000/vector.md  10
-rw-r--r--  gcc/config/rs6000/vsx.md  149
-rw-r--r--  gcc/dce.c  9
-rw-r--r--  gcc/df-scan.c  16
-rw-r--r--  gcc/df.h  7
-rw-r--r--  gcc/doc/invoke.texi  11
-rw-r--r--  gcc/doc/tm.texi  63
-rw-r--r--  gcc/doc/tm.texi.in  38
-rw-r--r--  gcc/emit-rtl.h  4
-rw-r--r--  gcc/function.c  526
-rw-r--r--  gcc/function.h  4
-rw-r--r--  gcc/regrename.c  7
-rw-r--r--  gcc/sched-deps.c  28
-rw-r--r--  gcc/sched-int.h  11
-rw-r--r--  gcc/shrink-wrap.c  1010
-rw-r--r--  gcc/shrink-wrap.h  10
-rw-r--r--  gcc/target.def  57
-rw-r--r--  gcc/testsuite/ChangeLog.ibm  45
-rw-r--r--  gcc/testsuite/gcc.target/i386/pr81766.c  5
-rw-r--r--  gcc/testsuite/gcc.target/powerpc/pr71977-1.c  31
-rw-r--r--  gcc/testsuite/gcc.target/powerpc/pr71977-2.c  31
-rw-r--r--  libgcc/ChangeLog.ibm  32
35 files changed, 3982 insertions, 712 deletions
diff --git a/gcc/ChangeLog.ibm b/gcc/ChangeLog.ibm
new file mode 100644
index 00000000000..d030ebd3ec9
--- /dev/null
+++ b/gcc/ChangeLog.ibm
@@ -0,0 +1,265 @@
+2018-01-31 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 257158.
+ * REVISION: Update subversion id.
+
+2017-07-20 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 250395.
+ * REVISION: Update subversion id.
+
+2017-07-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 250101.
+ * REVISION: Update subversion id.
+
+2017-05-15 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 248065.
+ * REVISION: Update subversion id.
+
+2017-05-08 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
+
+ Backport from trunk
+ PR target/80358
+ * config/rs6000/rs6000.c (expand_block_compare): Fix boundary check.
+
+2017-04-05 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 246707.
+ * REVISION: Update subversion id.
+
+2017-03-30 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 246596.
+ * REVISION: Update subversion id.
+
+2017-03-29 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 246577.
+ * REVISION: Update subversion id.
+
+2017-03-27 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 246509.
+ * REVISION: Update subversion id.
+
+2017-03-27 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
+
+ Backport from trunk
+ PR target/79449
+ PR target/79170
+
+ * gcc.dg/strncmp-2.c: New. Test strncmp and memcmp builtin expansion
+ for reading beyond a 4k boundary.
+ * config/rs6000/rs6000.c (expand_block_compare): Make sure runtime
+ boundary crossing check and subsequent code generation agree.
+ * gcc.dg/memcmp-1.c: Improved to catch failures seen in PR 79170.
+ * config/rs6000/altivec.md (*setb_internal): Rename to setb_signed.
+ (setb_unsigned): New pattern for setb with CCUNS.
+ * config/rs6000/rs6000.c (expand_block_compare): Use a different
+ subfc./subfe sequence to avoid overflow problems. Generate a
+ shorter sequence with cmpld/setb for power9.
+ * config/rs6000/rs6000.md (subf<mode>3_carry_dot2): Add a new pattern
+ for generating subfc. instruction.
+ (cmpstrsi): Add TARGET_POPCNTD predicate as the generated sequence
+ now uses this instruction.
+ * config/rs6000/rs6000-protos.h (expand_strn_compare): Add arg.
+ * config/rs6000/rs6000.c (expand_strn_compare): Add ability to expand
+ strcmp. Fix bug where comparison didn't stop with zero byte. Fix
+ case where N arg is SIZE_MAX.
+ * config/rs6000/rs6000.md (cmpstrnsi): Args to expand_strn_compare.
+ (cmpstrsi): Add pattern.
+ * gcc.dg/memcmp-1.c: New.
+ * gcc.dg/strncmp-1.c: New.
+ * config/rs6000/rs6000-protos.h (expand_strn_compare): Declare.
+ * config/rs6000/rs6000.md (UNSPEC_CMPB): New unspec.
+ (cmpb<mode>3): New pattern for generating cmpb.
+ (cmpstrnsi): New pattern to expand strncmp ().
+ * config/rs6000/rs6000.opt (mstring-compare-inline-limit): Add a new
+ target option for controlling how much code inline expansion of
+ strncmp() will be allowed to generate.
+ * config/rs6000/rs6000.c (expand_strncmp_align_check): Generate code
+ for runtime page crossing check of strncmp () args.
+ (expand_strn_compare): Function to do builtin expansion of strncmp ().
+ * config/i386/i386.md (cmpstrnsi): New test to bail out if neither
+ string input is a string constant.
+ * builtins.c (expand_builtin_strncmp): Attempt expansion of strncmp
+ via cmpstrnsi even if neither string is constant.
+
+2017-03-14 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 246128.
+ * REVISION: Update subversion id.
+
+2017-01-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 244285.
+ * REVISION: Update subversion id.
+
+2017-01-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Backport from trunk
+ 2017-01-04 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ PR target/71977
+ PR target/70568
+ PR target/78823
+ * config/rs6000/predicates.md (sf_subreg_operand): New predicate.
+ (altivec_register_operand): Do not return true if the operand
+ contains a SUBREG mixing SImode and SFmode.
+ (vsx_register_operand): Likewise.
+ (vsx_reg_sfsubreg_ok): New predicate.
+ (vfloat_operand): Do not return true if the operand contains a
+ SUBREG mixing SImode and SFmode.
+ (vint_operand): Likewise.
+ (vlogical_operand): Likewise.
+ (gpc_reg_operand): Likewise.
+ (int_reg_operand): Likewise.
+ * config/rs6000/rs6000-protos.h (valid_sf_si_move): Add declaration.
+ * config/rs6000/rs6000.c (valid_sf_si_move): New function to
+ determine if a MOVSI or MOVSF operation contains SUBREGs that mix
+ SImode and SFmode.
+ (rs6000_emit_move_si_sf_subreg): New helper function.
+ (rs6000_emit_move): Call rs6000_emit_move_si_sf_subreg to possibly
+ fixup SUBREGs involving SImode and SFmode.
+ * config/rs6000/vsx.md (SFBOOL_*): New constants that are operand
+ numbers for the new peephole2 optimization.
+ (peephole2 for SFmode unions): New peephole2 to optimize cases in
+ the GLIBC math library that do AND/IOR/XOR operations on single
+ precision floating point.
+ * config/rs6000/rs6000.h (TARGET_NO_SF_SUBREG): New internal
+ target macros to say whether we need to avoid SUBREGs mixing
+ SImode and SFmode.
+ (TARGET_ALLOW_SF_SUBREG): Likewise.
+ * config/rs6000/rs6000.md (UNSPEC_SF_FROM_SI): New unspecs.
+ (UNSPEC_SI_FROM_SF): Likewise.
+ (iorxor): Change spacing.
+ (and_ior_xor): New iterator for AND, IOR, and XOR.
+ (movsi_from_sf): New insns for SImode/SFmode SUBREG support.
+ (movdi_from_sf_zero_ext): Likewise.
+ (mov<mode>_hardfloat, FMOVE32 iterator): Use register_operand
+ instead of gpc_reg_operand. Add SImode/SFmode SUBREG support.
+ (movsf_from_si): New insn for SImode/SFmode SUBREG support.
+ (fma<mode>4): Use gpc_reg_operand instead of register_operand.
+ (fms<mode>4): Likewise.
+ (fnma<mode>4): Likewise.
+ (fnms<mode>4): Likewise.
+ (nfma<mode>4): Likewise.
+ (nfms<mode>4): Likewise.
+
+ * config/rs6000/rs6000.h (TARGET_DIRECT_MOVE_64BIT): Define macro
+ used by the above patch in a limited form.
+ * config/rs6000/vsx.md (peephole2 for SFmode unions): Use DFmode
+ for doing direct moves instead of DImode, since GCC 6.x does not
+ support DImode in Altivec registers.
+
+2016-12-15 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 243710.
+ * REVISION: Update subversion id.
+
+2016-12-13 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 243626.
+ * REVISION: Update subversion id.
+
+2016-12-05 Segher Boessenkool <segher@kernel.crashing.org>
+
+ Backport the swdiv improvement.
+
+2016-12-05 Segher Boessenkool <segher@kernel.crashing.org>
+
+ Separate shrink-wrapping.
+
+The rs6000.c patch needed a little adjustment: GCC 6 does not have
+SAVE_MULTIPLE. I regression-tested this; everything passed (and a
+few failures no longer trigger).
+
+2016-12-05 Segher Boessenkool <segher@kernel.crashing.org>
+
+ Preparation for separate shrink-wrapping, step 2
+
+2016-12-05 Segher Boessenkool <segher@kernel.crashing.org>
+
+ Preparation for separate shrink-wrapping, step 1
+
+2016-12-01 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 243130.
+ * REVISION: Update subversion id.
+
+2016-12-01 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
+
+ Backport from mainline
+ 2016-09-23 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
+
+ * config/rs6000/rs6000.md (cmpmemsi): New define_expand.
+ * config/rs6000/rs6000.c (expand_block_compare): New function used by
+ cmpmemsi pattern to do builtin expansion of memcmp ().
+ (compute_current_alignment): Add helper function for
+ expand_block_compare used to compute alignment as the compare proceeds.
+ (select_block_compare_mode): Used by expand_block_compare to select
+ the mode used for reading the next chunk of bytes in the compare.
+ (do_load_for_compare): Used by expand_block_compare to emit the load
+ insns for the compare.
+ (rs6000_emit_dot_insn): Moved this function to avoid a forward
+ reference from expand_block_compare ().
+ * config/rs6000/rs6000-protos.h (expand_block_compare): Add a
+ prototype for this function.
+ * config/rs6000/rs6000.opt (mblock-compare-inline-limit): Add a new
+ target option for controlling how much code inline expansion of
+ memcmp() will be allowed to generate.
+ * config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED):
+ Add macro to say we can efficiently handle overlapping unaligned
+ loads.
+
+2016-11-30 Pat Haugen <pthaugen@us.ibm.com>
+
+ Backport from mainline
+ 2016-09-13 Pat Haugen <pthaugen@us.ibm.com>
+
+ PR tree-optimization/77536
+ PR rtl-optimization/68212
+ * config/rs6000/rs6000.md (div->recip splitter): Remove
+ optimize_insn_for_speed_p condition.
+
+2016-11-07 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 241939.
+ * REVISION: Update subversion id.
+
+2016-10-07 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 240874.
+ * REVISION: Update subversion id.
+
+2016-08-22 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 239677.
+ * REVISION: Update subversion id.
+
+2016-08-10 Peter Bergner <bergner@vnet.ibm.com>
+
+ Merge up to 239331.
+ * REVISION: Update subversion id.
+
+2016-08-04 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 239155.
+ * REVISION: Update subversion id.
+
+2016-07-12 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 238258.
+ * REVISION: Update subversion id.
+
+2016-06-13 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 237401.
+ * REVISION: Update subversion id.
+
+2016-04-28 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Clone branch subversion id 235611
+
diff --git a/gcc/REVISION b/gcc/REVISION
new file mode 100644
index 00000000000..2070b79bd7c
--- /dev/null
+++ b/gcc/REVISION
@@ -0,0 +1 @@
+IBM AT 10 branch, based on subversion id 257158.
diff --git a/gcc/builtins.c b/gcc/builtins.c
index c70c0f5004c..2dd5bd93881 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3903,7 +3903,7 @@ expand_builtin_strncmp (tree exp, ATTRIBUTE_UNUSED rtx target,
insn_code cmpstrn_icode = direct_optab_handler (cmpstrn_optab, SImode);
if (cmpstrn_icode != CODE_FOR_nothing)
{
- tree len, len1, len2;
+ tree len, len1, len2, len3;
rtx arg1_rtx, arg2_rtx, arg3_rtx;
rtx result;
tree fndecl, fn;
@@ -3922,14 +3922,19 @@ expand_builtin_strncmp (tree exp, ATTRIBUTE_UNUSED rtx target,
if (len2)
len2 = size_binop_loc (loc, PLUS_EXPR, ssize_int (1), len2);
+ len3 = fold_convert_loc (loc, sizetype, arg3);
+
/* If we don't have a constant length for the first, use the length
- of the second, if we know it. We don't require a constant for
+ of the second, if we know it. If neither string is constant length,
+ use the given length argument. We don't require a constant for
this case; some cost analysis could be done if both are available
but neither is constant. For now, assume they're equally cheap,
unless one has side effects. If both strings have constant lengths,
use the smaller. */
- if (!len1)
+ if (!len1 && !len2)
+ len = len3;
+ else if (!len1)
len = len2;
else if (!len2)
len = len1;
@@ -3946,23 +3951,10 @@ expand_builtin_strncmp (tree exp, ATTRIBUTE_UNUSED rtx target,
else
len = len2;
- /* If both arguments have side effects, we cannot optimize. */
- if (!len || TREE_SIDE_EFFECTS (len))
- return NULL_RTX;
-
- /* The actual new length parameter is MIN(len,arg3). */
- len = fold_build2_loc (loc, MIN_EXPR, TREE_TYPE (len), len,
- fold_convert_loc (loc, TREE_TYPE (len), arg3));
-
- /* If we don't have POINTER_TYPE, call the function. */
- if (arg1_align == 0 || arg2_align == 0)
- return NULL_RTX;
-
- /* Stabilize the arguments in case gen_cmpstrnsi fails. */
- arg1 = builtin_save_expr (arg1);
- arg2 = builtin_save_expr (arg2);
- len = builtin_save_expr (len);
-
+ /* If we are not using the given length, we must incorporate it here.
+ The actual new length parameter will be MIN(len,arg3) in this case. */
+ if (len != len3)
+ len = fold_build2_loc (loc, MIN_EXPR, TREE_TYPE (len), len, len3);
arg1_rtx = get_memory_rtx (arg1, len);
arg2_rtx = get_memory_rtx (arg2, len);
arg3_rtx = expand_normal (len);
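A minimal C model of the length now passed to cmpstrnsi by the rewritten hunk above (the function name and standalone form are illustrative assumptions, not part of the patch): len1/len2 stand for the constant string lengths plus 1 for the NUL, or 0 when unknown, and len3 is the strncmp N argument.

/* Sketch of the length selection done by expand_builtin_strncmp above.  */
static unsigned long
chosen_length (unsigned long len1, unsigned long len2, unsigned long len3)
{
  unsigned long len;
  if (!len1 && !len2)
    return len3;                       /* neither string is constant */
  if (!len1)
    len = len2;
  else if (!len2)
    len = len1;
  else
    len = (len1 < len2) ? len1 : len2; /* both constant: use the smaller */
  return (len < len3) ? len : len3;    /* MIN (len, arg3) */
}

For strncmp (s, "abc", n) this gives MIN (4, n); with two non-constant strings it now uses n directly instead of giving up and emitting a library call.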
diff --git a/gcc/cfgcleanup.c b/gcc/cfgcleanup.c
index 6e92d4cdde2..023b9d239a8 100644
--- a/gcc/cfgcleanup.c
+++ b/gcc/cfgcleanup.c
@@ -156,6 +156,7 @@ try_simplify_condjump (basic_block cbranch_block)
cbranch_dest_block = cbranch_jump_edge->dest;
if (cbranch_dest_block == EXIT_BLOCK_PTR_FOR_FN (cfun)
+ || jump_dest_block == EXIT_BLOCK_PTR_FOR_FN (cfun)
|| !can_fallthru (jump_block, cbranch_dest_block))
return false;
@@ -2605,6 +2606,35 @@ trivially_empty_bb_p (basic_block bb)
}
}
+/* Return true if BB contains just a return and possibly a USE of the
+ return value. Fill in *RET and *USE with the return and use insns
+ if any found, otherwise NULL. */
+
+static bool
+bb_is_just_return (basic_block bb, rtx_insn **ret, rtx_insn **use)
+{
+ *ret = *use = NULL;
+ rtx_insn *insn;
+
+ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+ return false;
+
+ FOR_BB_INSNS (bb, insn)
+ if (NONDEBUG_INSN_P (insn))
+ {
+ if (!*ret && ANY_RETURN_P (PATTERN (insn)))
+ *ret = insn;
+ else if (!*ret && !*use && GET_CODE (PATTERN (insn)) == USE
+ && REG_P (XEXP (PATTERN (insn), 0))
+ && REG_FUNCTION_VALUE_P (XEXP (PATTERN (insn), 0)))
+ *use = insn;
+ else
+ return false;
+ }
+
+ return !!*ret;
+}
+
/* Do simple CFG optimizations - basic block merging, simplifying of jump
instructions etc. Return nonzero if changes were made. */
@@ -2791,6 +2821,99 @@ try_optimize_cfg (int mode)
}
}
+ /* Try to change a branch to a return to just that return. */
+ rtx_insn *ret, *use;
+ if (single_succ_p (b)
+ && onlyjump_p (BB_END (b))
+ && bb_is_just_return (single_succ (b), &ret, &use))
+ {
+ if (redirect_jump (as_a <rtx_jump_insn *> (BB_END (b)),
+ PATTERN (ret), 0))
+ {
+ if (use)
+ emit_insn_before (copy_insn (PATTERN (use)),
+ BB_END (b));
+ if (dump_file)
+ fprintf (dump_file, "Changed jump %d->%d to return.\n",
+ b->index, single_succ (b)->index);
+ redirect_edge_succ (single_succ_edge (b),
+ EXIT_BLOCK_PTR_FOR_FN (cfun));
+ single_succ_edge (b)->flags &= ~EDGE_CROSSING;
+ changed_here = true;
+ }
+ }
+
+ /* Try to change a conditional branch to a return to the
+ respective conditional return. */
+ if (EDGE_COUNT (b->succs) == 2
+ && any_condjump_p (BB_END (b))
+ && bb_is_just_return (BRANCH_EDGE (b)->dest, &ret, &use))
+ {
+ if (redirect_jump (as_a <rtx_jump_insn *> (BB_END (b)),
+ PATTERN (ret), 0))
+ {
+ if (use)
+ emit_insn_before (copy_insn (PATTERN (use)),
+ BB_END (b));
+ if (dump_file)
+ fprintf (dump_file, "Changed conditional jump %d->%d "
+ "to conditional return.\n",
+ b->index, BRANCH_EDGE (b)->dest->index);
+ redirect_edge_succ (BRANCH_EDGE (b),
+ EXIT_BLOCK_PTR_FOR_FN (cfun));
+ BRANCH_EDGE (b)->flags &= ~EDGE_CROSSING;
+ changed_here = true;
+ }
+ }
+
+ /* Try to flip a conditional branch that falls through to
+ a return so that it becomes a conditional return and a
+ new jump to the original branch target. */
+ if (EDGE_COUNT (b->succs) == 2
+ && BRANCH_EDGE (b)->dest != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && any_condjump_p (BB_END (b))
+ && bb_is_just_return (FALLTHRU_EDGE (b)->dest, &ret, &use))
+ {
+ if (invert_jump (as_a <rtx_jump_insn *> (BB_END (b)),
+ JUMP_LABEL (BB_END (b)), 0))
+ {
+ basic_block new_ft = BRANCH_EDGE (b)->dest;
+ if (redirect_jump (as_a <rtx_jump_insn *> (BB_END (b)),
+ PATTERN (ret), 0))
+ {
+ if (use)
+ emit_insn_before (copy_insn (PATTERN (use)),
+ BB_END (b));
+ if (dump_file)
+ fprintf (dump_file, "Changed conditional jump "
+ "%d->%d to conditional return, adding "
+ "fall-through jump.\n",
+ b->index, BRANCH_EDGE (b)->dest->index);
+ redirect_edge_succ (BRANCH_EDGE (b),
+ EXIT_BLOCK_PTR_FOR_FN (cfun));
+ BRANCH_EDGE (b)->flags &= ~EDGE_CROSSING;
+ std::swap (BRANCH_EDGE (b)->probability,
+ FALLTHRU_EDGE (b)->probability);
+ update_br_prob_note (b);
+ basic_block jb = force_nonfallthru (FALLTHRU_EDGE (b));
+ notice_new_block (jb);
+ if (!redirect_jump (as_a <rtx_jump_insn *> (BB_END (jb)),
+ block_label (new_ft), 0))
+ gcc_unreachable ();
+ redirect_edge_succ (single_succ_edge (jb), new_ft);
+ changed_here = true;
+ }
+ else
+ {
+ /* Invert the jump back to what it was. This should
+ never fail. */
+ if (!invert_jump (as_a <rtx_jump_insn *> (BB_END (b)),
+ JUMP_LABEL (BB_END (b)), 0))
+ gcc_unreachable ();
+ }
+ }
+ }
+
/* Simplify branch over branch. */
if ((mode & CLEANUP_EXPENSIVE)
&& !(mode & CLEANUP_CFGLAYOUT)
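A small example of the kind of function the new cfgcleanup transformations above target (the function itself is hypothetical): the conditional branch into a block containing only a return can become a conditional return (for example a conditional blr on Power) rather than a jump to a separate return block.

extern int g (int);

/* With the patch, the branch on x == 0 can be turned into a conditional
   return instead of a jump to a basic block that only returns.  */
int
f (int x)
{
  if (x == 0)
    return 0;
  return g (x);
}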
diff --git a/gcc/common.opt b/gcc/common.opt
index a25557720b5..926582c6e35 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2140,6 +2140,10 @@ Common Report Var(flag_shrink_wrap) Optimization
Emit function prologues only before parts of the function that need it,
rather than at the top of the function.
+fshrink-wrap-separate
+Common Report Var(flag_shrink_wrap_separate) Init(1) Optimization
+Shrink-wrap parts of the prologue and epilogue separately.
+
fsignaling-nans
Common Report Var(flag_signaling_nans) Optimization SetByCombined
Disable optimizations observable by IEEE signaling NaNs.
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d2bfe314f71..a5ffa310cec 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16905,6 +16905,21 @@
if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
FAIL;
+ /* One of the strings must be a constant. If so, expand_builtin_strncmp()
+ will have rewritten the length arg to be the minimum of the const string
+ length and the actual length arg. If both strings are the same and
+ shorter than the length arg, repz cmpsb will not stop at the 0 byte and
+ will incorrectly base the results on chars past the 0 byte. */
+ tree t1 = MEM_EXPR (operands[1]);
+ tree t2 = MEM_EXPR (operands[2]);
+ if (!((t1 && TREE_CODE (t1) == MEM_REF
+ && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
+ && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0)) == STRING_CST)
+ || (t2 && TREE_CODE (t2) == MEM_REF
+ && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
+ && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0)) == STRING_CST)))
+ FAIL;
+
out = operands[0];
if (!REG_P (out))
out = gen_reg_rtx (SImode);
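An illustration of the failure mode the new check above guards against (hypothetical example, not from the patch): when neither operand is a constant string, expand_builtin_strncmp cannot shrink the length to the string length, and a raw repz cmpsb would keep comparing bytes past the terminating NUL.

#include <string.h>

int
compare (void)
{
  char a[8] = { 'a', 'b', 0, 'X', 'X', 'X', 'X', 'X' };
  char b[8] = { 'a', 'b', 0, 'Y', 'Y', 'Y', 'Y', 'Y' };
  /* strncmp must return 0 because the comparison stops at the NUL;
     an unconditional 8-byte repz cmpsb would compare 'X' against 'Y'.  */
  return strncmp (a, b, sizeof a);
}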
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 30b7375382a..4a1ca91d451 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -3709,3 +3709,36 @@
(match_dup 3)]
UNSPEC_BCD_ADD_SUB)
(match_dup 4)))])])
+
+;; Set operand 0 register to -1 if the LT bit (0x8) of condition
+;; register operand 1 is on. Otherwise, set operand 0 register to 1
+;; if the GT bit (0x4) of condition register operand 1 is on.
+;; Otherwise, set operand 0 to 0. Note that the result stored into
+;; register operand 0 is non-zero iff either the LT or GT bits are on
+;; within condition register operand 1.
+(define_insn "setb_signed"
+ [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
+ (if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y")
+ (const_int 0))
+ (const_int -1)
+ (if_then_else (gt (match_dup 1)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
+ "TARGET_P9_MISC"
+ "setb %0,%1"
+ [(set_attr "type" "logical")])
+
+(define_insn "setb_unsigned"
+ [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
+ (if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y")
+ (const_int 0))
+ (const_int -1)
+ (if_then_else (gtu (match_dup 1)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
+ "TARGET_P9_MISC"
+ "setb %0,%1"
+ [(set_attr "type" "logical")])
+
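For reference, the value produced by the two setb patterns above corresponds to this C expression (a sketch assuming the condition register was set by an unsigned comparison of a and b, as with cmpld before setb_unsigned):

/* -1 if a < b, 1 if a > b, 0 if equal: the memcmp-style result that
   expand_block_compare wants from cmpld + setb on power9.  */
static int
setb_model (unsigned long a, unsigned long b)
{
  return (a < b) ? -1 : (a > b) ? 1 : 0;
}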
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 3c1214f33ca..a25384680bb 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -31,12 +31,47 @@
(match_test "REGNO (op) == CTR_REGNO
|| REGNO (op) > LAST_VIRTUAL_REGISTER")))
+;; Return 1 if op is a SUBREG that is used to look at a SFmode value as
+;; an integer or vice versa.
+;;
+;; In the normal case where SFmode is in a floating point/vector register, it
+;; is stored as a DFmode and has a different format. If we don't transform the
+;; value, things that use logical operations on the values will get the wrong
+;; value.
+;;
+;; If we don't have 64-bit and direct move, this conversion will be done by
+;; store and load, instead of by fiddling with the bits within the register.
+(define_predicate "sf_subreg_operand"
+ (match_code "subreg")
+{
+ rtx inner_reg = SUBREG_REG (op);
+ machine_mode inner_mode = GET_MODE (inner_reg);
+
+ if (TARGET_ALLOW_SF_SUBREG || !REG_P (inner_reg))
+ return 0;
+
+ if ((mode == SFmode && GET_MODE_CLASS (inner_mode) == MODE_INT)
+ || (GET_MODE_CLASS (mode) == MODE_INT && inner_mode == SFmode))
+ {
+ if (INT_REGNO_P (REGNO (inner_reg)))
+ return 0;
+
+ return 1;
+ }
+ return 0;
+})
+
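The kind of source that produces these SImode/SFmode subregs is bit manipulation of a float through a union, as in the glibc math code mentioned in the ChangeLog; a hypothetical example (not taken from glibc or from the patch):

/* Produces an integer AND on the bits of an SFmode value, i.e. a
   (subreg:SI (reg:SF)) use that the predicates above must reject when the
   value could live in a VSX register in DFmode format.  */
static float
clear_sign (float x)
{
  union { float f; unsigned int i; } u;
  u.f = x;
  u.i &= 0x7fffffffu;
  return u.f;
}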
;; Return 1 if op is an Altivec register.
(define_predicate "altivec_register_operand"
(match_operand 0 "register_operand")
{
if (GET_CODE (op) == SUBREG)
- op = SUBREG_REG (op);
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
if (!REG_P (op))
return 0;
@@ -52,6 +87,27 @@
(match_operand 0 "register_operand")
{
if (GET_CODE (op) == SUBREG)
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
+
+ if (!REG_P (op))
+ return 0;
+
+ if (REGNO (op) >= FIRST_PSEUDO_REGISTER)
+ return 1;
+
+ return VSX_REGNO_P (REGNO (op));
+})
+
+;; Like vsx_register_operand, but allow SF SUBREGS
+(define_predicate "vsx_reg_sfsubreg_ok"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
op = SUBREG_REG (op);
if (!REG_P (op))
@@ -69,7 +125,12 @@
(match_operand 0 "register_operand")
{
if (GET_CODE (op) == SUBREG)
- op = SUBREG_REG (op);
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
if (!REG_P (op))
return 0;
@@ -86,7 +147,12 @@
(match_operand 0 "register_operand")
{
if (GET_CODE (op) == SUBREG)
- op = SUBREG_REG (op);
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
if (!REG_P (op))
return 0;
@@ -103,7 +169,13 @@
(match_operand 0 "register_operand")
{
if (GET_CODE (op) == SUBREG)
- op = SUBREG_REG (op);
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
+
if (!REG_P (op))
return 0;
@@ -206,6 +278,9 @@
(match_test "IN_RANGE (INTVAL (op), 0, 15)")))
;; Return 1 if op is a register that is not special.
+;; Disallow (SUBREG:SF (REG:SI)) and (SUBREG:SI (REG:SF)) on VSX systems where
+;; you need to be careful in moving a SFmode to SImode and vice versa due to
+;; the fact that SFmode is represented as DFmode in the VSX registers.
(define_predicate "gpc_reg_operand"
(match_operand 0 "register_operand")
{
@@ -213,7 +288,12 @@
return 0;
if (GET_CODE (op) == SUBREG)
- op = SUBREG_REG (op);
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
if (!REG_P (op))
return 0;
@@ -231,7 +311,8 @@
})
;; Return 1 if op is a general purpose register. Unlike gpc_reg_operand, don't
-;; allow floating point or vector registers.
+;; allow floating point or vector registers. Since vector registers are not
+;; allowed, we don't have to reject SFmode/SImode subregs.
(define_predicate "int_reg_operand"
(match_operand 0 "register_operand")
{
@@ -239,7 +320,12 @@
return 0;
if (GET_CODE (op) == SUBREG)
- op = SUBREG_REG (op);
+ {
+ if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode))
+ return 0;
+
+ op = SUBREG_REG (op);
+ }
if (!REG_P (op))
return 0;
@@ -251,6 +337,8 @@
})
;; Like int_reg_operand, but don't return true for pseudo registers
+;; We don't have to check for SF SUBREGS because pseudo registers
+;; are not allowed, and SF SUBREGs are ok within GPR registers.
(define_predicate "int_reg_operand_not_pseudo"
(match_operand 0 "register_operand")
{
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 490000dcfe8..a09470a43c3 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -74,6 +74,8 @@ extern void rs6000_expand_interleave (rtx, rtx, rtx, bool);
extern void rs6000_scale_v2df (rtx, rtx, int);
extern int expand_block_clear (rtx[]);
extern int expand_block_move (rtx[]);
+extern bool expand_block_compare (rtx[]);
+extern bool expand_strn_compare (rtx[], int);
extern const char * rs6000_output_load_multiple (rtx[]);
extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
extern bool rs6000_is_valid_and_mask (rtx, machine_mode);
@@ -147,6 +149,7 @@ extern void rs6000_fatal_bad_address (rtx);
extern rtx create_TOC_reference (rtx, rtx);
extern void rs6000_split_multireg_move (rtx, rtx);
extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
+extern bool valid_sf_si_move (rtx, rtx, machine_mode);
extern void rs6000_emit_move (rtx, rtx, machine_mode);
extern rtx rs6000_secondary_memory_needed_rtx (machine_mode);
extern machine_mode rs6000_secondary_memory_needed_mode (machine_mode);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 80391acf3cd..1dca80e01b7 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -153,6 +153,10 @@ typedef struct GTY(()) machine_function
bool split_stack_argp_used;
/* Flag if r2 setup is needed with ELFv2 ABI. */
bool r2_setup_needed;
+ /* The components already handled by separate shrink-wrapping, which should
+ not be considered by the prologue and epilogue. */
+ bool gpr_is_wrapped_separately[32];
+ bool lr_is_wrapped_separately;
} machine_function;
/* Support targetm.vectorize.builtin_mask_for_load. */
@@ -1601,6 +1605,19 @@ static const struct attribute_spec rs6000_attribute_table[] =
#undef TARGET_SET_UP_BY_PROLOGUE
#define TARGET_SET_UP_BY_PROLOGUE rs6000_set_up_by_prologue
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS rs6000_get_separate_components
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB rs6000_components_for_bb
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS rs6000_disqualify_components
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS rs6000_emit_prologue_components
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS rs6000_emit_epilogue_components
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS rs6000_set_handled_components
+
#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY rs6000_live_on_entry
@@ -9667,6 +9684,78 @@ rs6000_emit_le_vsx_move (rtx dest, rtx source, machine_mode mode)
}
}
+/* Return whether a SFmode or SImode move can be done without converting one
+ mode to another. This arises when we have:
+
+ (SUBREG:SF (REG:SI ...))
+ (SUBREG:SI (REG:SF ...))
+
+ and one of the values is in a floating point/vector register, where SFmode
+ scalars are stored in DFmode format. */
+
+bool
+valid_sf_si_move (rtx dest, rtx src, machine_mode mode)
+{
+ if (TARGET_ALLOW_SF_SUBREG)
+ return true;
+
+ if (mode != SFmode && GET_MODE_CLASS (mode) != MODE_INT)
+ return true;
+
+ if (!SUBREG_P (src) || !sf_subreg_operand (src, mode))
+ return true;
+
+ /* Allow (set (SUBREG:SI (REG:SF)) (SUBREG:SI (REG:SF))). */
+ if (SUBREG_P (dest))
+ {
+ rtx dest_subreg = SUBREG_REG (dest);
+ rtx src_subreg = SUBREG_REG (src);
+ return GET_MODE (dest_subreg) == GET_MODE (src_subreg);
+ }
+
+ return false;
+}
+
+
+/* Helper function to change moves with:
+
+ (SUBREG:SF (REG:SI)) and
+ (SUBREG:SI (REG:SF))
+
+ into separate UNSPEC insns. In the PowerPC architecture, scalar SFmode
+ values are stored as DFmode values in the VSX registers. We need to convert
+ the bits before we can use a direct move or operate on the bits in the
+ vector register as an integer type.
+
+ Skip things like (set (SUBREG:SI (...)) (SUBREG:SI (...))). */
+
+static bool
+rs6000_emit_move_si_sf_subreg (rtx dest, rtx source, machine_mode mode)
+{
+ if (TARGET_DIRECT_MOVE_64BIT && !reload_in_progress && !reload_completed
+ && !lra_in_progress
+ && (!SUBREG_P (dest) || !sf_subreg_operand (dest, mode))
+ && SUBREG_P (source) && sf_subreg_operand (source, mode))
+ {
+ rtx inner_source = SUBREG_REG (source);
+ machine_mode inner_mode = GET_MODE (inner_source);
+
+ if (mode == SImode && inner_mode == SFmode)
+ {
+ emit_insn (gen_movsi_from_sf (dest, inner_source));
+ return true;
+ }
+
+ if (mode == SFmode && inner_mode == SImode)
+ {
+ emit_insn (gen_movsf_from_si (dest, inner_source));
+ return true;
+ }
+ }
+
+ return false;
+}
+
/* Emit a move from SOURCE to DEST in mode MODE. */
void
rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
@@ -9697,6 +9786,11 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
gcc_unreachable ();
}
+ /* See if we need to special case SImode/SFmode SUBREG moves. */
+ if ((mode == SImode || mode == SFmode) && SUBREG_P (source)
+ && rs6000_emit_move_si_sf_subreg (dest, source, mode))
+ return;
+
/* Check if GCC is setting up a block move that will end up using FP
registers as temporaries. We must make sure this is acceptable. */
if (GET_CODE (operands[0]) == MEM
@@ -16081,7 +16175,7 @@ rs6000_init_builtins (void)
TYPE_NAME (V16QI_type_node) = tdecl;
tdecl = add_builtin_type ("__vector __bool char", bool_V16QI_type_node);
- TYPE_NAME ( bool_V16QI_type_node) = tdecl;
+ TYPE_NAME (bool_V16QI_type_node) = tdecl;
tdecl = add_builtin_type ("__vector unsigned short", unsigned_V8HI_type_node);
TYPE_NAME (unsigned_V8HI_type_node) = tdecl;
@@ -17938,7 +18032,1100 @@ expand_block_clear (rtx operands[])
return 1;
}
+/* Emit a potentially record-form instruction, setting DST from SRC.
+ If DOT is 0, that is all; otherwise, set CCREG to the result of the
+ signed comparison of DST with zero. If DOT is 1, the generated RTL
+ doesn't care about the DST result; if DOT is 2, it does. If CCREG
+ is CR0 do a single dot insn (as a PARALLEL); otherwise, do a SET and
+ a separate COMPARE. */
+static void
+rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg)
+{
+ if (dot == 0)
+ {
+ emit_move_insn (dst, src);
+ return;
+ }
+
+ if (cc_reg_not_cr0_operand (ccreg, CCmode))
+ {
+ emit_move_insn (dst, src);
+ emit_move_insn (ccreg, gen_rtx_COMPARE (CCmode, dst, const0_rtx));
+ return;
+ }
+
+ rtx ccset = gen_rtx_SET (ccreg, gen_rtx_COMPARE (CCmode, src, const0_rtx));
+ if (dot == 1)
+ {
+ rtx clobber = gen_rtx_CLOBBER (VOIDmode, dst);
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, clobber)));
+ }
+ else
+ {
+ rtx set = gen_rtx_SET (dst, src);
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, set)));
+ }
+}
+
+/* Figure out the correct instructions to generate to load data for
+ block compare. MODE is used for the read from memory, and
+ data is zero extended if REG is wider than MODE. If LE code
+ is being generated, bswap loads are used.
+
+ REG is the destination register to move the data into.
+ MEM is the memory block being read.
+ MODE is the mode of memory to use for the read. */
+static void
+do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
+{
+ switch (GET_MODE (reg))
+ {
+ case DImode:
+ switch (mode)
+ {
+ case QImode:
+ emit_insn (gen_zero_extendqidi2 (reg, mem));
+ break;
+ case HImode:
+ {
+ rtx src = mem;
+ if (!BYTES_BIG_ENDIAN)
+ {
+ src = gen_reg_rtx (HImode);
+ emit_insn (gen_bswaphi2 (src, mem));
+ }
+ emit_insn (gen_zero_extendhidi2 (reg, src));
+ break;
+ }
+ case SImode:
+ {
+ rtx src = mem;
+ if (!BYTES_BIG_ENDIAN)
+ {
+ src = gen_reg_rtx (SImode);
+ emit_insn (gen_bswapsi2 (src, mem));
+ }
+ emit_insn (gen_zero_extendsidi2 (reg, src));
+ }
+ break;
+ case DImode:
+ if (!BYTES_BIG_ENDIAN)
+ emit_insn (gen_bswapdi2 (reg, mem));
+ else
+ emit_insn (gen_movdi (reg, mem));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case SImode:
+ switch (mode)
+ {
+ case QImode:
+ emit_insn (gen_zero_extendqisi2 (reg, mem));
+ break;
+ case HImode:
+ {
+ rtx src = mem;
+ if (!BYTES_BIG_ENDIAN)
+ {
+ src = gen_reg_rtx (HImode);
+ emit_insn (gen_bswaphi2 (src, mem));
+ }
+ emit_insn (gen_zero_extendhisi2 (reg, src));
+ break;
+ }
+ case SImode:
+ if (!BYTES_BIG_ENDIAN)
+ emit_insn (gen_bswapsi2 (reg, mem));
+ else
+ emit_insn (gen_movsi (reg, mem));
+ break;
+ case DImode:
+ /* DImode is larger than the destination reg so is not expected. */
+ gcc_unreachable ();
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ break;
+ }
+}
+
+/* Select the mode to be used for reading the next chunk of bytes
+ in the compare.
+
+ OFFSET is the current read offset from the beginning of the block.
+ BYTES is the number of bytes remaining to be read.
+ ALIGN is the minimum alignment of the memory blocks being compared in bytes.
+ WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
+ the largest allowable mode. */
+static machine_mode
+select_block_compare_mode (unsigned HOST_WIDE_INT offset,
+ unsigned HOST_WIDE_INT bytes,
+ unsigned HOST_WIDE_INT align, bool word_mode_ok)
+{
+ /* First see if we can do a whole load unit
+ as that will be more efficient than a larger load + shift. */
+
+ /* If big, use biggest chunk.
+ If exactly chunk size, use that size.
+ If remainder can be done in one piece with shifting, do that.
+ Do largest chunk possible without violating alignment rules. */
+
+ /* The most we can read without potential page crossing. */
+ unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
+
+ if (word_mode_ok && bytes >= UNITS_PER_WORD)
+ return word_mode;
+ else if (bytes == GET_MODE_SIZE (SImode))
+ return SImode;
+ else if (bytes == GET_MODE_SIZE (HImode))
+ return HImode;
+ else if (bytes == GET_MODE_SIZE (QImode))
+ return QImode;
+ else if (bytes < GET_MODE_SIZE (SImode)
+ && offset >= GET_MODE_SIZE (SImode) - bytes)
+ /* This matches the case where we have SImode and 3 bytes
+ and offset >= 1 and permits us to move back one and overlap
+ with the previous read, thus avoiding having to shift
+ unwanted bytes off of the input. */
+ return SImode;
+ else if (word_mode_ok && bytes < UNITS_PER_WORD
+ && offset >= UNITS_PER_WORD-bytes)
+ /* Similarly, if we can use DImode it will get matched here and
+ can do an overlapping read that ends at the end of the block. */
+ return word_mode;
+ else if (word_mode_ok && maxread >= UNITS_PER_WORD)
+ /* It is safe to do all remaining in one load of largest size,
+ possibly with a shift to get rid of unwanted bytes. */
+ return word_mode;
+ else if (maxread >= GET_MODE_SIZE (SImode))
+ /* It is safe to do all remaining in one SImode load,
+ possibly with a shift to get rid of unwanted bytes. */
+ return SImode;
+ else if (bytes > GET_MODE_SIZE (SImode))
+ return SImode;
+ else if (bytes > GET_MODE_SIZE (HImode))
+ return HImode;
+
+ /* Final fallback is to do one byte. */
+ return QImode;
+}
+
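As a worked example of the selection above (assuming a 64-bit target where word_mode is DImode and word_mode_ok holds): an 18-byte compare with 1-byte known alignment asks for a mode at offsets 0, 8 and 16 with 18, 10 and 2 bytes remaining, and gets DImode, DImode and HImode respectively, matching the 18-byte sequence shown in the comment inside expand_block_compare below.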
+/* Compute the alignment of pointer+OFFSET where the original alignment
+ of pointer was BASE_ALIGN. */
+static unsigned HOST_WIDE_INT
+compute_current_alignment (unsigned HOST_WIDE_INT base_align,
+ unsigned HOST_WIDE_INT offset)
+{
+ if (offset == 0)
+ return base_align;
+ return MIN (base_align, offset & -offset);
+}
+
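The offset & -offset expression above isolates the lowest set bit of the offset, which is the largest power-of-two alignment the adjusted pointer is still guaranteed to have. For example, with base_align 8, an offset of 4 yields MIN (8, 4) = 4, while an offset of 16 keeps the full alignment of 8.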
+/* Expand a block compare operation, and return true if successful.
+ Return false if we should let the compiler generate normal code,
+ probably a memcmp call.
+
+ OPERANDS[0] is the target (result).
+ OPERANDS[1] is the first source.
+ OPERANDS[2] is the second source.
+ OPERANDS[3] is the length.
+ OPERANDS[4] is the alignment. */
+bool
+expand_block_compare (rtx operands[])
+{
+ rtx target = operands[0];
+ rtx orig_src1 = operands[1];
+ rtx orig_src2 = operands[2];
+ rtx bytes_rtx = operands[3];
+ rtx align_rtx = operands[4];
+ HOST_WIDE_INT cmp_bytes = 0;
+ rtx src1 = orig_src1;
+ rtx src2 = orig_src2;
+
+ /* This case is complicated to handle because the subtract
+ with carry instructions do not generate the 64-bit
+ carry and so we must emit code to calculate it ourselves.
+ We choose not to implement this yet. */
+ if (TARGET_32BIT && TARGET_POWERPC64)
+ return false;
+
+ /* If this is not a fixed size compare, just call memcmp. */
+ if (!CONST_INT_P (bytes_rtx))
+ return false;
+
+ /* This must be a fixed size alignment. */
+ if (!CONST_INT_P (align_rtx))
+ return false;
+
+ unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
+
+ /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff. */
+ if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
+ || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
+ return false;
+
+ gcc_assert (GET_MODE (target) == SImode);
+
+ /* Anything to move? */
+ unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
+ if (bytes == 0)
+ return true;
+
+ /* The code generated for p7 and older is not faster than glibc
+ memcmp if alignment is small and length is not short, so bail
+ out to avoid those conditions. */
+ if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+ && ((base_align == 1 && bytes > 16)
+ || (base_align == 2 && bytes > 32)))
+ return false;
+
+ rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
+ rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
+ /* P7/P8 code uses cond for subfc. but P9 uses
+ it for cmpld which needs CCUNSmode. */
+ rtx cond;
+ if (TARGET_P9_MISC)
+ cond = gen_reg_rtx (CCUNSmode);
+ else
+ cond = gen_reg_rtx (CCmode);
+
+ /* If we have an LE target without ldbrx and word_mode is DImode,
+ then we must avoid using word_mode. */
+ int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
+ && word_mode == DImode);
+
+ /* Strategy phase. How many ops will this take and should we expand it? */
+
+ unsigned HOST_WIDE_INT offset = 0;
+ machine_mode load_mode =
+ select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
+ unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
+
+ /* We don't want to generate too much code. */
+ unsigned HOST_WIDE_INT max_bytes =
+ load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
+ if (!IN_RANGE (bytes, 1, max_bytes))
+ return false;
+
+ bool generate_6432_conversion = false;
+ rtx convert_label = NULL;
+ rtx final_label = NULL;
+
+ /* Example of generated code for 18 bytes aligned 1 byte.
+ Compiled with -fno-reorder-blocks for clarity.
+ ldbrx 10,31,8
+ ldbrx 9,7,8
+ subfc. 9,9,10
+ bne 0,.L6487
+ addi 9,12,8
+ addi 5,11,8
+ ldbrx 10,0,9
+ ldbrx 9,0,5
+ subfc. 9,9,10
+ bne 0,.L6487
+ addi 9,12,16
+ lhbrx 10,0,9
+ addi 9,11,16
+ lhbrx 9,0,9
+ subf 9,9,10
+ b .L6488
+ .p2align 4,,15
+ .L6487: #convert_label
+ popcntd 9,9
+ subfe 10,10,10
+ or 9,9,10
+ .L6488: #final_label
+ extsw 10,9
+
+ We start off with DImode for two blocks that jump to the DI->SI conversion
+ if the difference is found there, then a final block of HImode that skips
+ the DI->SI conversion. */
+
+ while (bytes > 0)
+ {
+ unsigned int align = compute_current_alignment (base_align, offset);
+ if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+ load_mode = select_block_compare_mode (offset, bytes, align,
+ word_mode_ok);
+ else
+ load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
+ load_mode_size = GET_MODE_SIZE (load_mode);
+ if (bytes >= load_mode_size)
+ cmp_bytes = load_mode_size;
+ else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+ {
+ /* Move this load back so it doesn't go past the end.
+ P8/P9 can do this efficiently. */
+ unsigned int extra_bytes = load_mode_size - bytes;
+ cmp_bytes = bytes;
+ if (extra_bytes < offset)
+ {
+ offset -= extra_bytes;
+ cmp_bytes = load_mode_size;
+ bytes = cmp_bytes;
+ }
+ }
+ else
+ /* P7 and earlier can't do the overlapping load trick fast,
+ so this forces a non-overlapping load and a shift to get
+ rid of the extra bytes. */
+ cmp_bytes = bytes;
+
+ src1 = adjust_address (orig_src1, load_mode, offset);
+ src2 = adjust_address (orig_src2, load_mode, offset);
+
+ if (!REG_P (XEXP (src1, 0)))
+ {
+ rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+ src1 = replace_equiv_address (src1, src1_reg);
+ }
+ set_mem_size (src1, cmp_bytes);
+
+ if (!REG_P (XEXP (src2, 0)))
+ {
+ rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+ src2 = replace_equiv_address (src2, src2_reg);
+ }
+ set_mem_size (src2, cmp_bytes);
+
+ do_load_for_compare (tmp_reg_src1, src1, load_mode);
+ do_load_for_compare (tmp_reg_src2, src2, load_mode);
+
+ if (cmp_bytes < load_mode_size)
+ {
+ /* Shift unneeded bytes off. */
+ rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
+ if (word_mode == DImode)
+ {
+ emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
+ emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
+ }
+ else
+ {
+ emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
+ emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
+ }
+ }
+
+ int remain = bytes - cmp_bytes;
+ if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
+ {
+ /* Target is larger than load size so we don't need to
+ reduce result size. */
+
+ /* We previously did a block that needed 64->32 conversion but
+ the current block does not, so a label is needed to jump
+ to the end. */
+ if (generate_6432_conversion && !final_label)
+ final_label = gen_label_rtx ();
+
+ if (remain > 0)
+ {
+ /* This is not the last block, branch to the end if the result
+ of this subtract is not zero. */
+ if (!final_label)
+ final_label = gen_label_rtx ();
+ rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+ rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
+ rtx cr = gen_reg_rtx (CCmode);
+ rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
+ emit_insn (gen_movsi (target,
+ gen_lowpart (SImode, tmp_reg_src2)));
+ rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
+ rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
+ fin_ref, pc_rtx);
+ rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ JUMP_LABEL (j) = final_label;
+ LABEL_NUSES (final_label) += 1;
+ }
+ else
+ {
+ if (word_mode == DImode)
+ {
+ emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
+ tmp_reg_src2));
+ emit_insn (gen_movsi (target,
+ gen_lowpart (SImode, tmp_reg_src2)));
+ }
+ else
+ emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));
+
+ if (final_label)
+ {
+ rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+ rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+ JUMP_LABEL (j) = final_label;
+ LABEL_NUSES (final_label) += 1;
+ emit_barrier ();
+ }
+ }
+ }
+ else
+ {
+ /* Do we need a 64->32 conversion block? We need the 64->32
+ conversion even if target size == load_mode size because
+ the subtract generates one extra bit. */
+ generate_6432_conversion = true;
+
+ if (remain > 0)
+ {
+ if (!convert_label)
+ convert_label = gen_label_rtx ();
+
+ /* Compare to zero and branch to convert_label if not zero. */
+ rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
+ if (TARGET_P9_MISC)
+ {
+ /* Generate a compare, and convert with a setb later. */
+ rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+ tmp_reg_src2);
+ emit_insn (gen_rtx_SET (cond, cmp));
+ }
+ else
+ /* Generate a subfc. and use the longer
+ sequence for conversion. */
+ if (TARGET_64BIT)
+ emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+ tmp_reg_src1, cond));
+ else
+ emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+ tmp_reg_src1, cond));
+ rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+ rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
+ cvt_ref, pc_rtx);
+ rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ JUMP_LABEL (j) = convert_label;
+ LABEL_NUSES (convert_label) += 1;
+ }
+ else
+ {
+ /* Just do the subtract/compare. Since this is the last block
+ the convert code will be generated immediately following. */
+ if (TARGET_P9_MISC)
+ {
+ rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+ tmp_reg_src2);
+ emit_insn (gen_rtx_SET (cond, cmp));
+ }
+ else
+ if (TARGET_64BIT)
+ emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
+ tmp_reg_src1));
+ else
+ emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
+ tmp_reg_src1));
+ }
+ }
+
+ offset += cmp_bytes;
+ bytes -= cmp_bytes;
+ }
+
+ if (generate_6432_conversion)
+ {
+ if (convert_label)
+ emit_label (convert_label);
+
+ /* We need to produce DI result from sub, then convert to target SI
+ while maintaining <0 / ==0 / >0 properties. This sequence works:
+ subfc L,A,B
+ subfe H,H,H
+ popcntd L,L
+ rldimi L,H,6,0
+
+ This is an alternate one Segher cooked up if somebody
+ wants to expand this for something that doesn't have popcntd:
+ subfc L,a,b
+ subfe H,x,x
+ addic t,L,-1
+ subfe v,t,L
+ or z,v,H
+
+ And finally, p9 can just do this:
+ cmpld A,B
+ setb r */
+
+ if (TARGET_P9_MISC)
+ {
+ emit_insn (gen_setb_unsigned (target, cond));
+ }
+ else
+ {
+ if (TARGET_64BIT)
+ {
+ rtx tmp_reg_ca = gen_reg_rtx (DImode);
+ emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
+ emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
+ emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
+ emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
+ }
+ else
+ {
+ rtx tmp_reg_ca = gen_reg_rtx (SImode);
+ emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
+ emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
+ emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
+ }
+ }
+ }
+
+ if (final_label)
+ emit_label (final_label);
+
+ gcc_assert (bytes == 0);
+ return true;
+}
+
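The 64->32 conversion sequence described in the comment above can be modelled in plain C as follows (a sketch, not part of the patch; the helper name is made up, and the final cast of all-ones to int is assumed to yield -1 as it does on the targets in question):

/* Reduce the 64-bit difference a - b to an int that is negative, zero or
   positive exactly when a < b, a == b or a > b, using only popcount and
   the borrow of the subtract: the same idea as subfc/subfe/popcntd/or.  */
static int
model_6432_conversion (unsigned long long a, unsigned long long b)
{
  unsigned long long l = a - b;                  /* subfc: low 64 bits of the difference */
  unsigned long long h = (a < b) ? ~0ULL : 0;    /* subfe H,H,H: all ones iff there was a borrow */
  return (int) (__builtin_popcountll (l) | h);   /* popcntd + or */
}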
+/* Generate alignment check and branch code to set up for
+ strncmp when we don't have DI alignment.
+ STRNCMP_LABEL is the label to branch if there is a page crossing.
+ SRC is the string pointer to be examined.
+ BYTES is the max number of bytes to compare. */
+static void
+expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
+{
+ rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
+ rtx src_check = copy_addr_to_reg (XEXP (src, 0));
+ if (GET_MODE (src_check) == SImode)
+ emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
+ else
+ emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
+ rtx cond = gen_reg_rtx (CCmode);
+ emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
+ GEN_INT (4096 - bytes)));
+
+ rtx cmp_rtx = gen_rtx_LT (VOIDmode, cond, const0_rtx);
+
+ rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
+ pc_rtx, lab_ref);
+ rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ JUMP_LABEL (j) = strncmp_label;
+ LABEL_NUSES (strncmp_label) += 1;
+}
+
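The test emitted above corresponds to this C predicate (an illustrative model, mirroring the glibc power8 strncmp prologue quoted later in expand_strn_compare):

/* True if reading BYTES bytes starting at ADDR might cross a 4 KiB page
   boundary; the branch to the library-call path is taken in that case.  */
static int
might_cross_4k_page (unsigned long addr, unsigned long bytes)
{
  return (addr & 0xfff) >= 4096 - bytes;
}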
+/* Expand a string compare operation with length, and return
+ true if successful. Return false if we should let the
+ compiler generate normal code, probably a strncmp call.
+
+ OPERANDS[0] is the target (result).
+ OPERANDS[1] is the first source.
+ OPERANDS[2] is the second source.
+ If NO_LENGTH is zero, then:
+ OPERANDS[3] is the length.
+ OPERANDS[4] is the alignment in bytes.
+ If NO_LENGTH is nonzero, then:
+ OPERANDS[3] is the alignment in bytes. */
+bool
+expand_strn_compare (rtx operands[], int no_length)
+{
+ rtx target = operands[0];
+ rtx orig_src1 = operands[1];
+ rtx orig_src2 = operands[2];
+ rtx bytes_rtx, align_rtx;
+ if (no_length)
+ {
+ bytes_rtx = NULL;
+ align_rtx = operands[3];
+ }
+ else
+ {
+ bytes_rtx = operands[3];
+ align_rtx = operands[4];
+ }
+ unsigned HOST_WIDE_INT cmp_bytes = 0;
+ rtx src1 = orig_src1;
+ rtx src2 = orig_src2;
+
+ /* If we have a length, it must be constant. This simplifies things
+ a bit as we don't have to generate code to check if we've exceeded
+ the length. Later this could be expanded to handle this case. */
+ if (!no_length && !CONST_INT_P (bytes_rtx))
+ return false;
+
+ /* This must be a fixed size alignment. */
+ if (!CONST_INT_P (align_rtx))
+ return false;
+
+ unsigned int base_align = UINTVAL (align_rtx);
+ int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
+ int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
+
+ /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff. */
+ if (SLOW_UNALIGNED_ACCESS (word_mode, align1)
+ || SLOW_UNALIGNED_ACCESS (word_mode, align2))
+ return false;
+
+ gcc_assert (GET_MODE (target) == SImode);
+
+ /* If we have an LE target without ldbrx and word_mode is DImode,
+ then we must avoid using word_mode. */
+ int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
+ && word_mode == DImode);
+
+ unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
+
+ unsigned HOST_WIDE_INT offset = 0;
+ unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */
+ unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */
+ if (no_length)
+ /* Use this as a stand-in to determine the mode to use. */
+ bytes = rs6000_string_compare_inline_limit * word_mode_size;
+ else
+ bytes = UINTVAL (bytes_rtx);
+
+ machine_mode load_mode =
+ select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
+ unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
+ compare_length = rs6000_string_compare_inline_limit * load_mode_size;
+
+ /* If we have equality at the end of the last compare and we have not
+ found the end of the string, we need to call strcmp/strncmp to
+ compare the remainder. */
+ bool equality_compare_rest = false;
+
+ if (no_length)
+ {
+ bytes = compare_length;
+ equality_compare_rest = true;
+ }
+ else
+ {
+ if (bytes <= compare_length)
+ compare_length = bytes;
+ else
+ equality_compare_rest = true;
+ }
+
+ rtx result_reg = gen_reg_rtx (word_mode);
+ rtx final_move_label = gen_label_rtx ();
+ rtx final_label = gen_label_rtx ();
+ rtx begin_compare_label = NULL;
+
+ if (base_align < 8)
+ {
+ /* Generate code that checks distance to 4k boundary for this case. */
+ begin_compare_label = gen_label_rtx ();
+ rtx strncmp_label = gen_label_rtx ();
+ rtx jmp;
+
+ /* Strncmp for power8 in glibc does this:
+ rldicl r8,r3,0,52
+ cmpldi cr7,r8,4096-16
+ bgt cr7,L(pagecross) */
+
+ /* Make sure that the length we use for the alignment test and
+ the subsequent code generation are in agreement so we do not
+ go past the length we tested for a 4k boundary crossing. */
+ unsigned HOST_WIDE_INT align_test = compare_length;
+ if (align_test < 8)
+ {
+ align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
+ base_align = align_test;
+ }
+ else
+ {
+ align_test = ROUND_UP (align_test, 8);
+ base_align = 8;
+ }
+
+ if (align1 < 8)
+ expand_strncmp_align_check (strncmp_label, src1, align_test);
+ if (align2 < 8)
+ expand_strncmp_align_check (strncmp_label, src2, align_test);
+
+ /* Now generate the following sequence:
+ - branch to begin_compare
+ - strncmp_label
+ - call to strncmp
+ - branch to final_label
+ - begin_compare_label */
+
+ rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
+ jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
+ JUMP_LABEL (jmp) = begin_compare_label;
+ LABEL_NUSES (begin_compare_label) += 1;
+ emit_barrier ();
+
+ emit_label (strncmp_label);
+
+ if (!REG_P (XEXP (src1, 0)))
+ {
+ rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+ src1 = replace_equiv_address (src1, src1_reg);
+ }
+
+ if (!REG_P (XEXP (src2, 0)))
+ {
+ rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+ src2 = replace_equiv_address (src2, src2_reg);
+ }
+
+ if (no_length)
+ emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "strcmp"),
+ target, LCT_NORMAL, GET_MODE (target), 2,
+ force_reg (Pmode, XEXP (src1, 0)), Pmode,
+ force_reg (Pmode, XEXP (src2, 0)), Pmode);
+ else
+ {
+ /* -m32 -mpowerpc64 results in word_mode being DImode even
+ though otherwise it is 32-bit. The length arg to strncmp
+ is a size_t which will be the same size as pointers. */
+ rtx len_rtx;
+ if (TARGET_64BIT)
+ len_rtx = gen_reg_rtx (DImode);
+ else
+ len_rtx = gen_reg_rtx (SImode);
+
+ emit_move_insn (len_rtx, bytes_rtx);
+
+ emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "strncmp"),
+ target, LCT_NORMAL, GET_MODE (target), 3,
+ force_reg (Pmode, XEXP (src1, 0)), Pmode,
+ force_reg (Pmode, XEXP (src2, 0)), Pmode,
+ len_rtx, GET_MODE (len_rtx));
+ }
+
+ rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+ jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+ JUMP_LABEL (jmp) = final_label;
+ LABEL_NUSES (final_label) += 1;
+ emit_barrier ();
+ emit_label (begin_compare_label);
+ }
+
+ rtx cleanup_label = NULL;
+ rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
+ rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
+
+ /* Generate sequence of ld/ldbrx, cmpb to compare out
+ to the length specified. */
+ unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
+ while (bytes_to_compare > 0)
+ {
+ /* Compare sequence:
+ check each 8B with: ld/ld cmpd bne
+ If equal, use rldicr/cmpb to check for zero byte.
+ cleanup code at end:
+ cmpb get byte that differs
+ cmpb look for zero byte
+ orc combine
+ cntlzd get bit of first zero/diff byte
+ subfic convert for rldcl use
+ rldcl rldcl extract diff/zero byte
+ subf subtract for final result
+
+ The last compare can branch around the cleanup code if the
+ result is zero because the strings are exactly equal. */
+ unsigned int align = compute_current_alignment (base_align, offset);
+ if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+ load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
+ word_mode_ok);
+ else
+ load_mode = select_block_compare_mode (0, bytes_to_compare, align,
+ word_mode_ok);
+ load_mode_size = GET_MODE_SIZE (load_mode);
+ if (bytes_to_compare >= load_mode_size)
+ cmp_bytes = load_mode_size;
+ else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+ {
+ /* Move this load back so it doesn't go past the end.
+ P8/P9 can do this efficiently. */
+ unsigned int extra_bytes = load_mode_size - bytes_to_compare;
+ cmp_bytes = bytes_to_compare;
+ if (extra_bytes < offset)
+ {
+ offset -= extra_bytes;
+ cmp_bytes = load_mode_size;
+ bytes_to_compare = cmp_bytes;
+ }
+ }
+ else
+ /* P7 and earlier can't do the overlapping load trick fast,
+ so this forces a non-overlapping load and a shift to get
+ rid of the extra bytes. */
+ cmp_bytes = bytes_to_compare;
+
+ src1 = adjust_address (orig_src1, load_mode, offset);
+ src2 = adjust_address (orig_src2, load_mode, offset);
+
+ if (!REG_P (XEXP (src1, 0)))
+ {
+ rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+ src1 = replace_equiv_address (src1, src1_reg);
+ }
+ set_mem_size (src1, cmp_bytes);
+
+ if (!REG_P (XEXP (src2, 0)))
+ {
+ rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+ src2 = replace_equiv_address (src2, src2_reg);
+ }
+ set_mem_size (src2, cmp_bytes);
+
+ do_load_for_compare (tmp_reg_src1, src1, load_mode);
+ do_load_for_compare (tmp_reg_src2, src2, load_mode);
+
+ /* We must always left-align the data we read, and
+ clear any bytes to the right that are beyond the string.
+ Otherwise the cmpb sequence won't produce the correct
+ results. The beginning of the compare will be done
+ with word_mode so will not have any extra shifts or
+ clear rights. */
+
+ if (load_mode_size < word_mode_size)
+ {
+ /* Rotate left first. */
+ rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
+ if (word_mode == DImode)
+ {
+ emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
+ emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
+ }
+ else
+ {
+ emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
+ emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
+ }
+ }
+
+ if (cmp_bytes < word_mode_size)
+ {
+ /* Now clear right. This plus the rotate can be
+ turned into a rldicr instruction. */
+ HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+ rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+ if (word_mode == DImode)
+ {
+ emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
+ emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
+ }
+ else
+ {
+ emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
+ emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
+ }
+ }
+
+ /* Cases to handle. A and B are chunks of the two strings.
+ 1: Not end of comparison:
+ A != B: branch to cleanup code to compute result.
+ A == B: check for 0 byte, next block if not found.
+ 2: End of the inline comparison:
+ A != B: branch to cleanup code to compute result.
+ A == B: check for 0 byte, call strcmp/strncmp
+	 3: Compared requested N bytes:
+ A == B: branch to result 0.
+ A != B: cleanup code to compute result. */
+
+ unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
+
+ rtx dst_label;
+ if (remain > 0 || equality_compare_rest)
+ {
+ /* Branch to cleanup code, otherwise fall through to do
+ more compares. */
+ if (!cleanup_label)
+ cleanup_label = gen_label_rtx ();
+ dst_label = cleanup_label;
+ }
+ else
+ /* Branch to end and produce result of 0. */
+ dst_label = final_move_label;
+
+ rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
+ rtx cond = gen_reg_rtx (CCmode);
+
+      /* Always produce the 0 result; it is needed if
+ cmpb finds a 0 byte in this chunk. */
+ rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
+ rs6000_emit_dot_insn (result_reg, tmp, 1, cond);
+
+ rtx cmp_rtx;
+ if (remain == 0 && !equality_compare_rest)
+ cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
+ else
+ cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+
+ rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
+ lab_ref, pc_rtx);
+ rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ JUMP_LABEL (j) = dst_label;
+ LABEL_NUSES (dst_label) += 1;
+
+ if (remain > 0 || equality_compare_rest)
+ {
+ /* Generate a cmpb to test for a 0 byte and branch
+ to final result if found. */
+ rtx cmpb_zero = gen_reg_rtx (word_mode);
+ rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
+ rtx condz = gen_reg_rtx (CCmode);
+ rtx zero_reg = gen_reg_rtx (word_mode);
+ if (word_mode == SImode)
+ {
+ emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
+ emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+ if (cmp_bytes < word_mode_size)
+ {
+ /* Don't want to look at zero bytes past end. */
+ HOST_WIDE_INT mb =
+ BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+ rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+ emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
+ }
+ }
+ else
+ {
+ emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
+ emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+ if (cmp_bytes < word_mode_size)
+ {
+ /* Don't want to look at zero bytes past end. */
+ HOST_WIDE_INT mb =
+ BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+ rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+ emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
+ }
+ }
+
+ emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
+ rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
+ rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
+ lab_ref_fin, pc_rtx);
+ rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ JUMP_LABEL (j2) = final_move_label;
+ LABEL_NUSES (final_move_label) += 1;
+
+ }
+
+ offset += cmp_bytes;
+ bytes_to_compare -= cmp_bytes;
+ }
+
+ if (equality_compare_rest)
+ {
+ /* Update pointers past what has been compared already. */
+ src1 = adjust_address (orig_src1, load_mode, offset);
+ src2 = adjust_address (orig_src2, load_mode, offset);
+
+ if (!REG_P (XEXP (src1, 0)))
+ {
+ rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+ src1 = replace_equiv_address (src1, src1_reg);
+ }
+ set_mem_size (src1, cmp_bytes);
+
+ if (!REG_P (XEXP (src2, 0)))
+ {
+ rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+ src2 = replace_equiv_address (src2, src2_reg);
+ }
+ set_mem_size (src2, cmp_bytes);
+
+ /* Construct call to strcmp/strncmp to compare the rest of the string. */
+ if (no_length)
+ emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "strcmp"),
+ target, LCT_NORMAL, GET_MODE (target), 2,
+ force_reg (Pmode, XEXP (src1, 0)), Pmode,
+ force_reg (Pmode, XEXP (src2, 0)), Pmode);
+ else
+ {
+ rtx len_rtx;
+ if (TARGET_64BIT)
+ len_rtx = gen_reg_rtx (DImode);
+ else
+ len_rtx = gen_reg_rtx (SImode);
+
+ emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
+ emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "strncmp"),
+ target, LCT_NORMAL, GET_MODE (target), 3,
+ force_reg (Pmode, XEXP (src1, 0)), Pmode,
+ force_reg (Pmode, XEXP (src2, 0)), Pmode,
+ len_rtx, GET_MODE (len_rtx));
+ }
+
+ rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+ rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+ JUMP_LABEL (jmp) = final_label;
+ LABEL_NUSES (final_label) += 1;
+ emit_barrier ();
+ }
+
+ if (cleanup_label)
+ emit_label (cleanup_label);
+
+ /* Generate the final sequence that identifies the differing
+ byte and generates the final result, taking into account
+ zero bytes:
+
+ cmpb cmpb_result1, src1, src2
+ cmpb cmpb_result2, src1, zero
+     orc cmpb_result1, cmpb_result1, cmpb_result2
+ cntlzd get bit of first zero/diff byte
+ addi convert for rldcl use
+ rldcl rldcl extract diff/zero byte
+ subf subtract for final result
+ */
+
+ rtx cmpb_diff = gen_reg_rtx (word_mode);
+ rtx cmpb_zero = gen_reg_rtx (word_mode);
+ rtx rot_amt = gen_reg_rtx (word_mode);
+ rtx zero_reg = gen_reg_rtx (word_mode);
+
+ rtx rot1_1 = gen_reg_rtx (word_mode);
+ rtx rot1_2 = gen_reg_rtx (word_mode);
+ rtx rot2_1 = gen_reg_rtx (word_mode);
+ rtx rot2_2 = gen_reg_rtx (word_mode);
+
+ if (word_mode == SImode)
+ {
+ emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
+ emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
+ emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
+ emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
+ emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
+ emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
+ emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
+ gen_lowpart (SImode, rot_amt)));
+ emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
+ emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
+ gen_lowpart (SImode, rot_amt)));
+ emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
+ emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
+ }
+ else
+ {
+ emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
+ emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
+ emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
+ emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
+ emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
+ emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
+ emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
+ gen_lowpart (SImode, rot_amt)));
+ emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
+ emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
+ gen_lowpart (SImode, rot_amt)));
+ emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
+ emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
+ }
+
+ emit_label (final_move_label);
+ emit_insn (gen_movsi (target,
+ gen_lowpart (SImode, result_reg)));
+ emit_label (final_label);
+ return true;
+}
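
As a rough C model (a sketch, not code from this patch), the cleanup sequence above computes the following for one pair of left-aligned, big-endian 64-bit chunks a and b, where at least one byte differs or a holds a zero byte; cmpb64 models the cmpb instruction, and the remaining steps correspond to the cntlzd, addi, rldcl and subf insns emitted above:

#include <stdint.h>

/* Model of the cmpb instruction: 0xff in each byte where x and y match.  */
static uint64_t
cmpb64 (uint64_t x, uint64_t y)
{
  uint64_t r = 0;
  for (int i = 0; i < 8; i++)
    if (((x >> (8 * i)) & 0xff) == ((y >> (8 * i)) & 0xff))
      r |= (uint64_t) 0xff << (8 * i);
  return r;
}

static uint64_t
rotl64 (uint64_t x, unsigned int n)
{
  return (x << (n & 63)) | (x >> (-n & 63));
}

/* A and B are the chunks just loaded; the mask below is nonzero by the
   time the cleanup code runs, so clzll is well defined.  */
static int
cleanup_result (uint64_t a, uint64_t b)
{
  uint64_t mask = ~cmpb64 (a, b) | cmpb64 (a, 0);  /* cmpb, cmpb, "orc" combine */
  unsigned int rot = __builtin_clzll (mask) + 8;   /* cntlzd + addi 8 */
  unsigned int byte_a = rotl64 (a, rot) & 0xff;    /* rldcl + mask to one byte */
  unsigned int byte_b = rotl64 (b, rot) & 0xff;
  return (int) byte_a - (int) byte_b;              /* subf gives the result */
}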
+
/* Expand a block move operation, and return 1 if successful. Return 0
if we should let the compiler generate normal code.
@@ -18619,42 +19806,6 @@ rs6000_is_valid_2insn_and (rtx c, machine_mode mode)
return rs6000_is_valid_and_mask (GEN_INT (val + bit3 - bit2), mode);
}
-/* Emit a potentially record-form instruction, setting DST from SRC.
- If DOT is 0, that is all; otherwise, set CCREG to the result of the
- signed comparison of DST with zero. If DOT is 1, the generated RTL
- doesn't care about the DST result; if DOT is 2, it does. If CCREG
- is CR0 do a single dot insn (as a PARALLEL); otherwise, do a SET and
- a separate COMPARE. */
-
-static void
-rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg)
-{
- if (dot == 0)
- {
- emit_move_insn (dst, src);
- return;
- }
-
- if (cc_reg_not_cr0_operand (ccreg, CCmode))
- {
- emit_move_insn (dst, src);
- emit_move_insn (ccreg, gen_rtx_COMPARE (CCmode, dst, const0_rtx));
- return;
- }
-
- rtx ccset = gen_rtx_SET (ccreg, gen_rtx_COMPARE (CCmode, src, const0_rtx));
- if (dot == 1)
- {
- rtx clobber = gen_rtx_CLOBBER (VOIDmode, dst);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, clobber)));
- }
- else
- {
- rtx set = gen_rtx_SET (dst, src);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, set)));
- }
-}
-
/* Emit the two insns to do an AND in mode MODE, with operands OPERANDS.
If EXPAND is true, split rotate-and-mask instructions we generate to
their constituent parts as well (this is used during expand); if DOT
@@ -18968,7 +20119,7 @@ register_to_reg_type (rtx reg, bool *is_altivec)
regno = true_regnum (reg);
if (regno < 0 || regno >= FIRST_PSEUDO_REGISTER)
return PSEUDO_REG_TYPE;
- }
+ }
gcc_assert (regno >= 0);
@@ -26397,6 +27548,225 @@ rs6000_global_entry_point_needed_p (void)
return cfun->machine->r2_setup_needed;
}
+/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
+static sbitmap
+rs6000_get_separate_components (void)
+{
+ rs6000_stack_t *info = rs6000_stack_info ();
+
+ if (WORLD_SAVE_P (info))
+ return NULL;
+
+ if (TARGET_SPE_ABI)
+ return NULL;
+
+ sbitmap components = sbitmap_alloc (32);
+ bitmap_clear (components);
+
+ /* The GPRs we need saved to the frame. */
+ if ((info->savres_strategy & SAVE_INLINE_GPRS)
+ && (info->savres_strategy & REST_INLINE_GPRS))
+ {
+ int reg_size = TARGET_32BIT ? 4 : 8;
+ int offset = info->gp_save_offset;
+ if (info->push_p)
+ offset += info->total_size;
+
+ for (unsigned regno = info->first_gp_reg_save; regno < 32; regno++)
+ {
+ if (IN_RANGE (offset, -0x8000, 0x7fff)
+ && rs6000_reg_live_or_pic_offset_p (regno))
+ bitmap_set_bit (components, regno);
+
+ offset += reg_size;
+ }
+ }
+
+ /* Don't mess with the hard frame pointer. */
+ if (frame_pointer_needed)
+ bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+ /* Don't mess with the fixed TOC register. */
+ if ((TARGET_TOC && TARGET_MINIMAL_TOC)
+ || (flag_pic == 1 && DEFAULT_ABI == ABI_V4)
+ || (flag_pic && DEFAULT_ABI == ABI_DARWIN))
+ bitmap_clear_bit (components, RS6000_PIC_OFFSET_TABLE_REGNUM);
+
+ /* Optimize LR save and restore if we can. This is component 0. Any
+ out-of-line register save/restore routines need LR. */
+ if (info->lr_save_p
+ && !(flag_pic && (DEFAULT_ABI == ABI_V4 || DEFAULT_ABI == ABI_DARWIN))
+ && (info->savres_strategy & SAVE_INLINE_GPRS)
+ && (info->savres_strategy & REST_INLINE_GPRS)
+ && (info->savres_strategy & SAVE_INLINE_FPRS)
+ && (info->savres_strategy & REST_INLINE_FPRS)
+ && (info->savres_strategy & SAVE_INLINE_VRS)
+ && (info->savres_strategy & REST_INLINE_VRS))
+ {
+ int offset = info->lr_save_offset;
+ if (info->push_p)
+ offset += info->total_size;
+ if (IN_RANGE (offset, -0x8000, 0x7fff))
+ bitmap_set_bit (components, 0);
+ }
+
+ return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
+static sbitmap
+rs6000_components_for_bb (basic_block bb)
+{
+ rs6000_stack_t *info = rs6000_stack_info ();
+
+ bitmap in = DF_LIVE_IN (bb);
+ bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
+ bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+
+ sbitmap components = sbitmap_alloc (32);
+ bitmap_clear (components);
+
+ /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
+ for (unsigned regno = info->first_gp_reg_save; regno < 32; regno++)
+ if (bitmap_bit_p (in, regno)
+ || bitmap_bit_p (gen, regno)
+ || bitmap_bit_p (kill, regno))
+ bitmap_set_bit (components, regno);
+
+ /* LR needs to be saved around a bb if it is killed in that bb. */
+ if (bitmap_bit_p (in, LR_REGNO)
+ || bitmap_bit_p (gen, LR_REGNO)
+ || bitmap_bit_p (kill, LR_REGNO))
+ bitmap_set_bit (components, 0);
+
+ return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */
+static void
+rs6000_disqualify_components (sbitmap components, edge e,
+ sbitmap edge_components, bool /*is_prologue*/)
+{
+ /* Our LR pro/epilogue code moves LR via R0, so R0 had better not be
+ live where we want to place that code. */
+ if (bitmap_bit_p (edge_components, 0)
+ && bitmap_bit_p (DF_LIVE_IN (e->dest), 0))
+ {
+ if (dump_file)
+ fprintf (dump_file, "Disqualifying LR because GPR0 is live "
+ "on entry to bb %d\n", e->dest->index);
+ bitmap_clear_bit (components, 0);
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
+static void
+rs6000_emit_prologue_components (sbitmap components)
+{
+ rs6000_stack_t *info = rs6000_stack_info ();
+ rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
+ int reg_size = TARGET_32BIT ? 4 : 8;
+
+ /* Prologue for LR. */
+ if (bitmap_bit_p (components, 0))
+ {
+ rtx reg = gen_rtx_REG (Pmode, 0);
+ rtx_insn *insn = emit_move_insn (reg, gen_rtx_REG (Pmode, LR_REGNO));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_REGISTER, NULL);
+
+ int offset = info->lr_save_offset;
+ if (info->push_p)
+ offset += info->total_size;
+
+ insn = emit_insn (gen_frame_store (reg, ptr_reg, offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ rtx lr = gen_rtx_REG (Pmode, LR_REGNO);
+ rtx mem = copy_rtx (SET_DEST (single_set (insn)));
+ add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, lr));
+ }
+
+ /* Prologue for the GPRs. */
+ int offset = info->gp_save_offset;
+ if (info->push_p)
+ offset += info->total_size;
+
+ for (int i = info->first_gp_reg_save; i < 32; i++)
+ {
+ if (bitmap_bit_p (components, i))
+ {
+ rtx reg = gen_rtx_REG (Pmode, i);
+ rtx_insn *insn = emit_insn (gen_frame_store (reg, ptr_reg, offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ rtx set = copy_rtx (single_set (insn));
+ add_reg_note (insn, REG_CFA_OFFSET, set);
+ }
+
+ offset += reg_size;
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
+static void
+rs6000_emit_epilogue_components (sbitmap components)
+{
+ rs6000_stack_t *info = rs6000_stack_info ();
+ rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
+ int reg_size = TARGET_32BIT ? 4 : 8;
+
+ /* Epilogue for the GPRs. */
+ int offset = info->gp_save_offset;
+ if (info->push_p)
+ offset += info->total_size;
+
+ for (int i = info->first_gp_reg_save; i < 32; i++)
+ {
+ if (bitmap_bit_p (components, i))
+ {
+ rtx reg = gen_rtx_REG (Pmode, i);
+ rtx_insn *insn = emit_insn (gen_frame_load (reg, ptr_reg, offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ }
+
+ offset += reg_size;
+ }
+
+ /* Epilogue for LR. */
+ if (bitmap_bit_p (components, 0))
+ {
+ int offset = info->lr_save_offset;
+ if (info->push_p)
+ offset += info->total_size;
+
+ rtx reg = gen_rtx_REG (Pmode, 0);
+ rtx_insn *insn = emit_insn (gen_frame_load (reg, ptr_reg, offset));
+
+ rtx lr = gen_rtx_REG (Pmode, LR_REGNO);
+ insn = emit_move_insn (lr, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_RESTORE, lr);
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
+static void
+rs6000_set_handled_components (sbitmap components)
+{
+ rs6000_stack_t *info = rs6000_stack_info ();
+
+ for (int i = info->first_gp_reg_save; i < 32; i++)
+ if (bitmap_bit_p (components, i))
+ cfun->machine->gpr_is_wrapped_separately[i] = true;
+
+ if (bitmap_bit_p (components, 0))
+ cfun->machine->lr_is_wrapped_separately = true;
+}
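
For orientation, a hedged example (not from the patch, and the helper is hypothetical) of the kind of function these hooks improve: the call below is the only thing that needs LR and any nonvolatile GPRs saved, so with separate shrink-wrapping their save/restore can be emitted on the cold path instead of in the common prologue and epilogue:

extern void report_error (const char *msg);	/* hypothetical helper */

int
clamp_byte (int x)
{
  if (__builtin_expect (x < 0, 0))
    {
      report_error ("negative input");	/* only this path clobbers LR */
      return 0;
    }
  return x > 255 ? 255 : x;	/* fast path: no save/restore needed */
}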
+
/* Emit function prologue as insns. */
void
@@ -26654,7 +28024,8 @@ rs6000_emit_prologue (void)
}
/* If we use the link register, get it into r0. */
- if (!WORLD_SAVE_P (info) && info->lr_save_p)
+ if (!WORLD_SAVE_P (info) && info->lr_save_p
+ && !cfun->machine->lr_is_wrapped_separately)
{
rtx addr, reg, mem;
@@ -26882,13 +28253,16 @@ rs6000_emit_prologue (void)
}
else if (!WORLD_SAVE_P (info))
{
- int i;
- for (i = 0; i < 32 - info->first_gp_reg_save; i++)
- if (rs6000_reg_live_or_pic_offset_p (info->first_gp_reg_save + i))
- emit_frame_save (frame_reg_rtx, reg_mode,
- info->first_gp_reg_save + i,
- info->gp_save_offset + frame_off + reg_size * i,
- sp_off - frame_off);
+ int offset = info->gp_save_offset + frame_off;
+ for (int i = info->first_gp_reg_save; i < 32; i++)
+ {
+ if (rs6000_reg_live_or_pic_offset_p (i)
+ && !cfun->machine->gpr_is_wrapped_separately[i])
+ emit_frame_save (frame_reg_rtx, reg_mode, i, offset,
+ sp_off - frame_off);
+
+ offset += reg_size;
+ }
}
if (crtl->calls_eh_return)
@@ -27811,7 +29185,9 @@ rs6000_emit_epilogue (int sibcall)
&& (restoring_FPRs_inline
|| (strategy & REST_NOINLINE_FPRS_DOESNT_RESTORE_LR))
&& (restoring_GPRs_inline
- || info->first_fp_reg_save < 64));
+ || info->first_fp_reg_save < 64)
+ && !cfun->machine->lr_is_wrapped_separately);
+
if (WORLD_SAVE_P (info))
{
@@ -27828,7 +29204,6 @@ rs6000_emit_epilogue (int sibcall)
longer necessary. */
p = rtvec_alloc (9
- + 1
+ 32 - info->first_gp_reg_save
+ LAST_ALTIVEC_REGNO + 1 - info->first_altivec_reg_save
+ 63 + 1 - info->first_fp_reg_save);
@@ -27839,9 +29214,6 @@ rs6000_emit_epilogue (int sibcall)
j = 0;
RTVEC_ELT (p, j++) = ret_rtx;
- RTVEC_ELT (p, j++) = gen_rtx_USE (VOIDmode,
- gen_rtx_REG (Pmode,
- LR_REGNO));
RTVEC_ELT (p, j++)
= gen_rtx_USE (VOIDmode, gen_rtx_SYMBOL_REF (Pmode, alloc_rname));
/* The instruction pattern requires a clobber here;
@@ -28450,12 +29822,18 @@ rs6000_emit_epilogue (int sibcall)
}
else
{
- for (i = 0; i < 32 - info->first_gp_reg_save; i++)
- if (rs6000_reg_live_or_pic_offset_p (info->first_gp_reg_save + i))
- emit_insn (gen_frame_load
- (gen_rtx_REG (reg_mode, info->first_gp_reg_save + i),
- frame_reg_rtx,
- info->gp_save_offset + frame_off + reg_size * i));
+ int offset = info->gp_save_offset + frame_off;
+ for (i = info->first_gp_reg_save; i < 32; i++)
+ {
+ if (rs6000_reg_live_or_pic_offset_p (i)
+ && !cfun->machine->gpr_is_wrapped_separately[i])
+ {
+ rtx reg = gen_rtx_REG (reg_mode, i);
+ emit_insn (gen_frame_load (reg, frame_reg_rtx, offset));
+ }
+
+ offset += reg_size;
+ }
}
if (DEFAULT_ABI == ABI_V4 || flag_shrink_wrap)
@@ -28494,8 +29872,10 @@ rs6000_emit_epilogue (int sibcall)
|| using_load_multiple
|| rs6000_reg_live_or_pic_offset_p (i))
{
- rtx reg = gen_rtx_REG (reg_mode, i);
+ if (cfun->machine->gpr_is_wrapped_separately[i])
+ continue;
+ rtx reg = gen_rtx_REG (reg_mode, i);
cfa_restores = alloc_reg_note (REG_CFA_RESTORE, reg, cfa_restores);
}
}
@@ -28564,73 +29944,63 @@ rs6000_emit_epilogue (int sibcall)
emit_insn (gen_add3_insn (sp_reg_rtx, sp_reg_rtx, sa));
}
- if (!sibcall)
+ if (!sibcall && restoring_FPRs_inline)
{
- rtvec p;
- bool lr = (strategy & REST_NOINLINE_FPRS_DOESNT_RESTORE_LR) == 0;
- if (! restoring_FPRs_inline)
- {
- p = rtvec_alloc (4 + 64 - info->first_fp_reg_save);
- RTVEC_ELT (p, 0) = ret_rtx;
- }
- else
+ if (cfa_restores)
{
- if (cfa_restores)
- {
- /* We can't hang the cfa_restores off a simple return,
- since the shrink-wrap code sometimes uses an existing
- return. This means there might be a path from
- pre-prologue code to this return, and dwarf2cfi code
- wants the eh_frame unwinder state to be the same on
- all paths to any point. So we need to emit the
- cfa_restores before the return. For -m64 we really
- don't need epilogue cfa_restores at all, except for
- this irritating dwarf2cfi with shrink-wrap
- requirement; The stack red-zone means eh_frame info
- from the prologue telling the unwinder to restore
- from the stack is perfectly good right to the end of
- the function. */
- emit_insn (gen_blockage ());
- emit_cfa_restores (cfa_restores);
- cfa_restores = NULL_RTX;
- }
- p = rtvec_alloc (2);
- RTVEC_ELT (p, 0) = simple_return_rtx;
+ /* We can't hang the cfa_restores off a simple return,
+ since the shrink-wrap code sometimes uses an existing
+ return. This means there might be a path from
+ pre-prologue code to this return, and dwarf2cfi code
+ wants the eh_frame unwinder state to be the same on
+ all paths to any point. So we need to emit the
+ cfa_restores before the return. For -m64 we really
+ don't need epilogue cfa_restores at all, except for
+ this irritating dwarf2cfi with shrink-wrap
+ requirement; The stack red-zone means eh_frame info
+ from the prologue telling the unwinder to restore
+ from the stack is perfectly good right to the end of
+ the function. */
+ emit_insn (gen_blockage ());
+ emit_cfa_restores (cfa_restores);
+ cfa_restores = NULL_RTX;
}
- RTVEC_ELT (p, 1) = ((restoring_FPRs_inline || !lr)
- ? gen_rtx_USE (VOIDmode,
- gen_rtx_REG (Pmode, LR_REGNO))
- : gen_rtx_CLOBBER (VOIDmode,
- gen_rtx_REG (Pmode, LR_REGNO)));
+ emit_jump_insn (targetm.gen_simple_return ());
+ }
+
+ if (!sibcall && !restoring_FPRs_inline)
+ {
+ bool lr = (strategy & REST_NOINLINE_FPRS_DOESNT_RESTORE_LR) == 0;
+ rtvec p = rtvec_alloc (3 + !!lr + 64 - info->first_fp_reg_save);
+ int elt = 0;
+ RTVEC_ELT (p, elt++) = ret_rtx;
+ if (lr)
+ RTVEC_ELT (p, elt++)
+ = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNO));
- /* If we have to restore more than two FP registers, branch to the
+ /* We have to restore more than two FP registers, so branch to the
restore function. It will return to our caller. */
- if (! restoring_FPRs_inline)
- {
- int i;
- int reg;
- rtx sym;
+ int i;
+ int reg;
+ rtx sym;
- if (flag_shrink_wrap)
- cfa_restores = add_crlr_cfa_restore (info, cfa_restores);
+ if (flag_shrink_wrap)
+ cfa_restores = add_crlr_cfa_restore (info, cfa_restores);
- sym = rs6000_savres_routine_sym (info,
- SAVRES_FPR | (lr ? SAVRES_LR : 0));
- RTVEC_ELT (p, 2) = gen_rtx_USE (VOIDmode, sym);
- reg = (DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2)? 1 : 11;
- RTVEC_ELT (p, 3) = gen_rtx_USE (VOIDmode, gen_rtx_REG (Pmode, reg));
+ sym = rs6000_savres_routine_sym (info, SAVRES_FPR | (lr ? SAVRES_LR : 0));
+ RTVEC_ELT (p, elt++) = gen_rtx_USE (VOIDmode, sym);
+ reg = (DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2)? 1 : 11;
+ RTVEC_ELT (p, elt++) = gen_rtx_USE (VOIDmode, gen_rtx_REG (Pmode, reg));
- for (i = 0; i < 64 - info->first_fp_reg_save; i++)
- {
- rtx reg = gen_rtx_REG (DFmode, info->first_fp_reg_save + i);
+ for (i = 0; i < 64 - info->first_fp_reg_save; i++)
+ {
+ rtx reg = gen_rtx_REG (DFmode, info->first_fp_reg_save + i);
- RTVEC_ELT (p, i + 4)
- = gen_frame_load (reg, sp_reg_rtx, info->fp_save_offset + 8 * i);
- if (flag_shrink_wrap)
- cfa_restores = alloc_reg_note (REG_CFA_RESTORE, reg,
- cfa_restores);
- }
+ RTVEC_ELT (p, elt++)
+ = gen_frame_load (reg, sp_reg_rtx, info->fp_save_offset + 8 * i);
+ if (flag_shrink_wrap)
+ cfa_restores = alloc_reg_note (REG_CFA_RESTORE, reg, cfa_restores);
}
emit_jump_insn (gen_rtx_PARALLEL (VOIDmode, p));
@@ -29254,13 +30624,10 @@ rs6000_output_mi_thunk (FILE *file, tree thunk_fndecl ATTRIBUTE_UNUSED,
generate sibcall RTL explicitly. */
insn = emit_call_insn (
gen_rtx_PARALLEL (VOIDmode,
- gen_rtvec (4,
+ gen_rtvec (3,
gen_rtx_CALL (VOIDmode,
funexp, const0_rtx),
gen_rtx_USE (VOIDmode, const0_rtx),
- gen_rtx_USE (VOIDmode,
- gen_rtx_REG (SImode,
- LR_REGNO)),
simple_return_rtx)));
SIBLING_CALL_P (insn) = 1;
emit_barrier ();
@@ -37043,9 +38410,6 @@ rs6000_sibcall_aix (rtx value, rtx func_desc, rtx flag, rtx cookie)
/* Note use of the TOC register. */
use_reg (&CALL_INSN_FUNCTION_USAGE (insn), gen_rtx_REG (Pmode, TOC_REGNUM));
- /* We need to also mark a use of the link register since the function we
- sibling-call to will use it to return to our caller. */
- use_reg (&CALL_INSN_FUNCTION_USAGE (insn), gen_rtx_REG (Pmode, LR_REGNO));
}
/* Return whether we need to always update the saved TOC pointer when we update
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 9aba7529e86..c5904a56f26 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -602,6 +602,23 @@ extern int rs6000_vector_align[];
#define TARGET_DIRECT_MOVE_128 (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
&& TARGET_POWERPC64)
+/* This wants to be set for p8 and newer. On p7, overlapping unaligned
+ loads are slow. */
+#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
+
+/* Macro to say whether we can do optimizations where part of the calculation
+   is done in 64-bit GPRs and the result is then transferred to the vector
+ registers. Do not allow -maltivec=be for these optimizations, because it
+ adds to the complexity of the code. */
+#define TARGET_DIRECT_MOVE_64BIT (TARGET_DIRECT_MOVE \
+ && TARGET_P8_VECTOR \
+ && TARGET_POWERPC64 \
+ && (rs6000_altivec_element_order != 2))
+
+/* Whether we should avoid (SUBREG:SI (REG:SF)) and (SUBREG:SF (REG:SI)).  */
+#define TARGET_NO_SF_SUBREG TARGET_DIRECT_MOVE_64BIT
+#define TARGET_ALLOW_SF_SUBREG (!TARGET_DIRECT_MOVE_64BIT)
+
/* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
in power7, so conditionalize them on p8 features. TImode syncs need quad
memory support. */
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index bda465db833..5c7a4caf10b 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -117,6 +117,7 @@
UNSPEC_BPERM
UNSPEC_COPYSIGN
UNSPEC_PARITY
+ UNSPEC_CMPB
UNSPEC_FCTIW
UNSPEC_FCTID
UNSPEC_LFIWAX
@@ -149,6 +150,8 @@
UNSPEC_IEEE128_CONVERT
UNSPEC_SIGNBIT
UNSPEC_DOLOOP
+ UNSPEC_SF_FROM_SI
+ UNSPEC_SI_FROM_SF
])
;;
@@ -522,7 +525,8 @@
(define_code_attr return_str [(return "") (simple_return "simple_")])
; Logical operators.
-(define_code_iterator iorxor [ior xor])
+(define_code_iterator iorxor [ior xor])
+(define_code_iterator and_ior_xor [and ior xor])
; Signed/unsigned variants of ops.
(define_code_iterator any_extend [sign_extend zero_extend])
@@ -1993,6 +1997,35 @@
"subfic %0,%1,%2"
[(set_attr "type" "add")])
+(define_insn_and_split "subf<mode>3_carry_dot2"
+ [(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")
+ (compare:CC (minus:P (match_operand:P 2 "gpc_reg_operand" "r,r")
+ (match_operand:P 1 "gpc_reg_operand" "r,r"))
+ (const_int 0)))
+ (set (match_operand:P 0 "gpc_reg_operand" "=r,r")
+ (minus:P (match_dup 2)
+ (match_dup 1)))
+ (set (reg:P CA_REGNO)
+ (leu:P (match_dup 1)
+ (match_dup 2)))]
+ "<MODE>mode == Pmode"
+ "@
+ subfc. %0,%1,%2
+ #"
+ "&& reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
+ [(parallel [(set (match_dup 0)
+ (minus:P (match_dup 2)
+ (match_dup 1)))
+ (set (reg:P CA_REGNO)
+ (leu:P (match_dup 1)
+ (match_dup 2)))])
+ (set (match_dup 3)
+ (compare:CC (match_dup 0)
+ (const_int 0)))]
+ ""
+ [(set_attr "type" "add")
+ (set_attr "dot" "yes")
+ (set_attr "length" "4,8")])
(define_insn "subf<mode>3_carry"
[(set (match_operand:P 0 "gpc_reg_operand" "=r")
@@ -2236,6 +2269,13 @@
"prty<wd> %0,%1"
[(set_attr "type" "popcnt")])
+(define_insn "cmpb<mode>3"
+ [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
+ (unspec:GPR [(match_operand:GPR 1 "gpc_reg_operand" "r")
+ (match_operand:GPR 2 "gpc_reg_operand" "r")] UNSPEC_CMPB))]
+ "TARGET_CMPB"
+ "cmpb %0,%1,%2"
+ [(set_attr "type" "cmp")])
;; Since the hardware zeros the upper part of the register, save generating the
;; AND immediate if we are converting to unsigned
@@ -4289,13 +4329,15 @@
;; Split to create division from FRE/FRES/etc. and fixup instead of the normal
;; hardware division. This is only done before register allocation and with
;; -ffast-math. This must appear before the divsf3/divdf3 insns.
+;; We used to also check optimize_insn_for_speed_p (), but problems with guessed
+;; frequencies (pr68212/pr77536) made that unreliable, so it was removed.
(define_split
[(set (match_operand:RECIPF 0 "gpc_reg_operand" "")
(div:RECIPF (match_operand 1 "gpc_reg_operand" "")
(match_operand 2 "gpc_reg_operand" "")))]
"RS6000_RECIP_AUTO_RE_P (<MODE>mode)
- && can_create_pseudo_p () && optimize_insn_for_speed_p ()
- && flag_finite_math_only && !flag_trapping_math && flag_reciprocal_math"
+ && can_create_pseudo_p () && flag_finite_math_only
+ && !flag_trapping_math && flag_reciprocal_math"
[(const_int 0)]
{
rs6000_emit_swdiv (operands[0], operands[1], operands[2], true);
@@ -4421,7 +4463,15 @@
(div:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "")
(match_operand:SFDF 2 "gpc_reg_operand" "")))]
"TARGET_<MODE>_INSN && !TARGET_SIMPLE_FPU"
- "")
+{
+ if (RS6000_RECIP_AUTO_RE_P (<MODE>mode)
+ && can_create_pseudo_p () && flag_finite_math_only
+ && !flag_trapping_math && flag_reciprocal_math)
+ {
+ rs6000_emit_swdiv (operands[0], operands[1], operands[2], true);
+ DONE;
+ }
+})
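
A small hedged example (not from the patch) of what this change affects: under flags for which RS6000_RECIP_AUTO_RE_P holds (for instance -O2 -mrecip -ffast-math on a suitable CPU; the exact option spelling is an assumption here), a plain division like the one below is now expanded by the div<mode>3 expander itself through rs6000_emit_swdiv (fre/fres plus refinement steps), independent of the block-frequency check dropped above:

/* Illustrative source; compile with e.g. -O2 -mrecip -ffast-math.  */
double
scale (double num, double den)
{
  return num / den;	/* expanded as fre + Newton-Raphson steps, not fdiv */
}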
(define_insn "*div<mode>3_fpr"
[(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv2>")
@@ -6414,6 +6464,107 @@
[(set_attr "type" "*,*,load,store,*,*,*,mfjmpr,mtjmpr,*,*,fpstore,fpload")
(set_attr "length" "4,4,4,4,4,4,8,4,4,4,4,4,4")])
+;; Like movsi, but adjust a SF value to be used in a SI context, i.e.
+;; (set (reg:SI ...) (subreg:SI (reg:SF ...) 0))
+;;
+;; Because SF values are actually stored as DF values within the vector
+;; registers, we need to convert the value to the vector SF format when
+;; we need to use the bits in a union or similar cases. We only need
+;; to do this transformation when the value is a vector register. Loads,
+;; stores, and transfers within GPRs are assumed to be safe. GCC 6.x
+;; does not support SI values in vector registers, so the primary case
+;; we need to support is direct move from a vector register to a GPR.
+;;
+;; This is a more general case of reload_gpr_from_vsxsf. That insn must have
+;; no alternatives, because the call is created as part of secondary_reload,
+;; and operand #2's register class is used to allocate the temporary register.
+;; This function is called before reload, and it creates the temporary as
+;; needed.
+
+;; MR LWZ STW STFS STXSSP
+;; STXSSPX VSX->GPR
+
+(define_insn_and_split "movsi_from_sf"
+ [(set (match_operand:SI 0 "rs6000_nonimmediate_operand"
+ "=r, r, m, m, wY,
+ Z, r")
+
+ (unspec:SI [(match_operand:SF 1 "input_operand"
+ "r, m, r, f, wu,
+ wu, f")]
+ UNSPEC_SI_FROM_SF))
+
+ (clobber (match_scratch:V4SF 2
+ "=X, X, X, X, X,
+ X, wa"))]
+
+ "TARGET_NO_SF_SUBREG
+ && (register_operand (operands[0], SImode)
+ || register_operand (operands[1], SFmode))"
+ "@
+ mr %0,%1
+ lwz%U1%X1 %0,%1
+ stw%U0%X0 %1,%0
+ stfs%U0%X0 %1,%0
+ stxssp %1,%0
+ stxsspx %x1,%y0
+ #"
+ "&& reload_completed
+ && int_reg_operand (operands[0], SImode)
+ && vsx_reg_sfsubreg_ok (operands[1], SFmode)"
+ [(const_int 0)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
+
+ emit_insn (gen_vsx_xscvdpspn_scalar (op2, op1));
+ emit_insn (gen_p8_mfvsrd_4_disf (op0_di, op2));
+ emit_insn (gen_lshrdi3 (op0_di, op0_di, GEN_INT (32)));
+ DONE;
+}
+ [(set_attr "type"
+ "*, load, store, fpstore, fpstore,
+ fpstore, mftgpr")
+
+ (set_attr "length"
+ "4, 4, 4, 4, 4,
+ 4, 12")])
+
+;; movsi_from_sf with zero extension
+(define_insn_and_split "*movsi_from_sf_zero_ext"
+ [(set (match_operand:DI 0 "rs6000_nonimmediate_operand" "=r, r, r")
+ (zero_extend:DI
+ (unspec:SI [(match_operand:SF 1 "input_operand" "r, m, wu")]
+ UNSPEC_SI_FROM_SF)))
+ (clobber (match_scratch:V4SF 2 "=X, X, wa"))]
+
+ "TARGET_NO_SF_SUBREG
+ && (register_operand (operands[0], SImode)
+ || register_operand (operands[1], SFmode))"
+ "@
+ rldicl %0,%1,0,32
+ lwz%U1%X1 %0,%1
+ #"
+ "&& reload_completed
+ && int_reg_operand (operands[0], DImode)
+ && vsx_reg_sfsubreg_ok (operands[1], SFmode)"
+ [(const_int 0)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
+
+ emit_insn (gen_vsx_xscvdpspn_scalar (op2, op1));
+ emit_insn (gen_p8_mfvsrd_4_disf (op0_di, op2));
+ emit_insn (gen_lshrdi3 (op0_di, op0_di, GEN_INT (32)));
+ DONE;
+}
+ [(set_attr "type" "*, load, mftgpr")
+ (set_attr "length" "4, 4, 12")])
+
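For reference, the C idiom that produces (subreg:SI (reg:SF)) and therefore exercises movsi_from_sf and its zero-extended variant is ordinary type punning through a union (the fuller GLIBC e_powf shape appears in the vsx.md peephole comment later in this patch); a minimal sketch:

#include <stdint.h>

static inline uint32_t
float_to_bits (float f)
{
  union { float value; uint32_t word; } u;
  u.value = f;
  return u.word;	/* SF value consumed in an SI context */
}
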
;; Split a load of a large constant into the appropriate two-insn
;; sequence.
@@ -6507,8 +6658,10 @@
"")
(define_insn "*movcc_internal1"
- [(set (match_operand:CC 0 "nonimmediate_operand" "=y,x,?y,y,r,r,r,r,r,cl,r,m")
- (match_operand:CC 1 "general_operand" "y,r,r,O,x,y,r,I,h,r,m,r"))]
+ [(set (match_operand:CC 0 "nonimmediate_operand"
+ "=y,x,?y,y,r,r,r,r,r,*c*l,r,m")
+ (match_operand:CC 1 "general_operand"
+ " y,r, r,O,x,y,r,I,h, r,m,r"))]
"register_operand (operands[0], CCmode)
|| register_operand (operands[1], CCmode)"
"@
@@ -6676,6 +6829,74 @@
[(set_attr "type" "*,mtjmpr,mfjmpr,load,store,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,4,4,4,8,4")])
+;; Like movsf, but adjust a SI value to be used in a SF context, i.e.
+;; (set (reg:SF ...) (subreg:SF (reg:SI ...) 0))
+;;
+;; Because SF values are actually stored as DF values within the vector
+;; registers, we need to convert the value to the vector SF format when
+;; we need to use the bits in a union or similar cases. We only need
+;; to do this transformation when the value is in a vector register.  Loads,
+;; stores, and transfers within GPRs are assumed to be safe.
+;;
+;; This is a more general case of reload_vsx_from_gprsf. That insn must have
+;; no alternatives, because the call is created as part of secondary_reload,
+;; and operand #2's register class is used to allocate the temporary register.
+;; This function is called before reload, and it creates the temporary as
+;; needed.
+
+;; LWZ LFS LXSSP LXSSPX STW STFIWX
+;; STXSIWX GPR->VSX GPR->GPR
+(define_insn_and_split "movsf_from_si"
+ [(set (match_operand:SF 0 "rs6000_nonimmediate_operand"
+ "=!r, f, wb, wu, m, Z,
+ Z, wy, !r")
+
+ (unspec:SF [(match_operand:SI 1 "input_operand"
+ "m, m, wY, Z, r, f,
+ wu, r, r")]
+ UNSPEC_SF_FROM_SI))
+
+ (clobber (match_scratch:DI 2
+ "=X, X, X, X, X, X,
+ X, r, X"))]
+
+ "TARGET_NO_SF_SUBREG
+ && (register_operand (operands[0], SFmode)
+ || register_operand (operands[1], SImode))"
+ "@
+ lwz%U1%X1 %0,%1
+ lfs%U1%X1 %0,%1
+ lxssp %0,%1
+ lxsspx %x0,%y1
+ stw%U0%X0 %1,%0
+ stfiwx %1,%y0
+ stxsiwx %x1,%y0
+ #
+ mr %0,%1"
+
+ "&& reload_completed
+ && vsx_reg_sfsubreg_ok (operands[0], SFmode)
+ && int_reg_operand_not_pseudo (operands[1], SImode)"
+ [(const_int 0)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
+
+ /* Move SF value to upper 32-bits for xscvspdpn. */
+ emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
+ emit_insn (gen_p8_mtvsrd_sf (op0, op2));
+ emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+ DONE;
+}
+ [(set_attr "length"
+ "4, 4, 4, 4, 4, 4,
+ 4, 12, 4")
+ (set_attr "type"
+ "load, fpload, fpload, fpload, store, fpstore,
+ fpstore, vecfloat, *")])
+
;; Move 64-bit binary/decimal floating point
(define_expand "mov<mode>"
@@ -8401,6 +8622,67 @@
FAIL;
}")
+;; String compare N insn.
+;; Argument 0 is the target (result)
+;; Argument 1 is the destination
+;; Argument 2 is the source
+;; Argument 3 is the length
+;; Argument 4 is the alignment
+
+(define_expand "cmpstrnsi"
+ [(parallel [(set (match_operand:SI 0)
+ (compare:SI (match_operand:BLK 1)
+ (match_operand:BLK 2)))
+ (use (match_operand:SI 3))
+ (use (match_operand:SI 4))])]
+ "TARGET_CMPB && (BYTES_BIG_ENDIAN || TARGET_LDBRX)"
+{
+ if (expand_strn_compare (operands, 0))
+ DONE;
+ else
+ FAIL;
+})
+
+;; String compare insn.
+;; Argument 0 is the target (result)
+;; Argument 1 is the destination
+;; Argument 2 is the source
+;; Argument 3 is the alignment
+
+(define_expand "cmpstrsi"
+ [(parallel [(set (match_operand:SI 0)
+ (compare:SI (match_operand:BLK 1)
+ (match_operand:BLK 2)))
+ (use (match_operand:SI 3))])]
+ "TARGET_CMPB && (BYTES_BIG_ENDIAN || TARGET_LDBRX)"
+{
+ if (expand_strn_compare (operands, 1))
+ DONE;
+ else
+ FAIL;
+})
+
+;; Block compare insn.
+;; Argument 0 is the target (result)
+;; Argument 1 is the destination
+;; Argument 2 is the source
+;; Argument 3 is the length
+;; Argument 4 is the alignment
+
+(define_expand "cmpmemsi"
+ [(parallel [(set (match_operand:SI 0)
+ (compare:SI (match_operand:BLK 1)
+ (match_operand:BLK 2)))
+ (use (match_operand:SI 3))
+ (use (match_operand:SI 4))])]
+ "TARGET_POPCNTD"
+{
+ if (expand_block_compare (operands))
+ DONE;
+ else
+ FAIL;
+})
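
Hedged examples (not from the patch) of calls these expanders pick up when the sizes fall under the -mblock-compare-inline-limit= and -mstring-compare-inline-limit= thresholds added in rs6000.opt below:

#include <string.h>

int
compare_examples (const char *a, const char *b, const void *p, const void *q)
{
  if (memcmp (p, q, 16))	/* cmpmemsi: inline block compare */
    return 1;
  if (strncmp (a, b, 8))	/* cmpstrnsi: inline strncmp */
    return 2;
  return strcmp (a, b) != 0;	/* cmpstrsi: inline strcmp */
}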
+
;; String/block move insn.
;; Argument 0 is the destination
;; Argument 1 is the source
@@ -10353,7 +10635,6 @@
[(parallel [(call (mem:SI (match_operand 0 "address_operand" ""))
(match_operand 1 "" ""))
(use (match_operand 2 "" ""))
- (use (reg:SI LR_REGNO))
(simple_return)])]
""
"
@@ -10380,7 +10661,6 @@
(call (mem:SI (match_operand 1 "address_operand" ""))
(match_operand 2 "" "")))
(use (match_operand 3 "" ""))
- (use (reg:SI LR_REGNO))
(simple_return)])]
""
"
@@ -10402,15 +10682,10 @@
}
}")
-;; this and similar patterns must be marked as using LR, otherwise
-;; dataflow will try to delete the store into it. This is true
-;; even when the actual reg to jump to is in CTR, when LR was
-;; saved and restored around the PIC-setting BCL.
(define_insn "*sibcall_local32"
[(call (mem:SI (match_operand:SI 0 "current_file_function_operand" "s,s"))
(match_operand 1 "" "g,g"))
(use (match_operand:SI 2 "immediate_operand" "O,n"))
- (use (reg:SI LR_REGNO))
(simple_return)]
"(INTVAL (operands[2]) & CALL_LONG) == 0"
"*
@@ -10430,7 +10705,6 @@
[(call (mem:SI (match_operand:DI 0 "current_file_function_operand" "s,s"))
(match_operand 1 "" "g,g"))
(use (match_operand:SI 2 "immediate_operand" "O,n"))
- (use (reg:SI LR_REGNO))
(simple_return)]
"TARGET_64BIT && (INTVAL (operands[2]) & CALL_LONG) == 0"
"*
@@ -10451,7 +10725,6 @@
(call (mem:SI (match_operand:SI 1 "current_file_function_operand" "s,s"))
(match_operand 2 "" "g,g")))
(use (match_operand:SI 3 "immediate_operand" "O,n"))
- (use (reg:SI LR_REGNO))
(simple_return)]
"(INTVAL (operands[3]) & CALL_LONG) == 0"
"*
@@ -10472,7 +10745,6 @@
(call (mem:SI (match_operand:DI 1 "current_file_function_operand" "s,s"))
(match_operand 2 "" "g,g")))
(use (match_operand:SI 3 "immediate_operand" "O,n"))
- (use (reg:SI LR_REGNO))
(simple_return)]
"TARGET_64BIT && (INTVAL (operands[3]) & CALL_LONG) == 0"
"*
@@ -10492,7 +10764,6 @@
[(call (mem:SI (match_operand:P 0 "call_operand" "s,s,c,c"))
(match_operand 1 "" ""))
(use (match_operand 2 "immediate_operand" "O,n,O,n"))
- (use (reg:SI LR_REGNO))
(simple_return)]
"(DEFAULT_ABI == ABI_DARWIN
|| DEFAULT_ABI == ABI_V4)
@@ -10523,7 +10794,6 @@
(call (mem:SI (match_operand:P 1 "call_operand" "s,s,c,c"))
(match_operand 2 "" "")))
(use (match_operand:SI 3 "immediate_operand" "O,n,O,n"))
- (use (reg:SI LR_REGNO))
(simple_return)]
"(DEFAULT_ABI == ABI_DARWIN
|| DEFAULT_ABI == ABI_V4)
@@ -12473,13 +12743,6 @@
(set_attr "indexed" "yes")
(set_attr "cell_micro" "always")])
-(define_insn "*return_internal_<mode>"
- [(simple_return)
- (use (match_operand:P 0 "register_operand" "lc"))]
- ""
- "b%T0"
- [(set_attr "type" "jmpreg")])
-
; FIXME: This would probably be somewhat simpler if the Cygnus sibcall
; stuff was in GCC. Oh, and "any_parallel_operand" is a bit flexible...
@@ -12615,26 +12878,24 @@
(define_insn "*return_and_restore_fpregs_aix_<mode>_r11"
[(match_parallel 0 "any_parallel_operand"
[(return)
- (use (match_operand:P 1 "register_operand" "l"))
- (use (match_operand:P 2 "symbol_ref_operand" "s"))
+ (use (match_operand:P 1 "symbol_ref_operand" "s"))
(use (reg:P 11))
- (set (match_operand:DF 3 "gpc_reg_operand" "=d")
- (match_operand:DF 4 "memory_operand" "m"))])]
+ (set (match_operand:DF 2 "gpc_reg_operand" "=d")
+ (match_operand:DF 3 "memory_operand" "m"))])]
""
- "b %2"
+ "b %1"
[(set_attr "type" "branch")
(set_attr "length" "4")])
(define_insn "*return_and_restore_fpregs_aix_<mode>_r1"
[(match_parallel 0 "any_parallel_operand"
[(return)
- (use (match_operand:P 1 "register_operand" "l"))
- (use (match_operand:P 2 "symbol_ref_operand" "s"))
+ (use (match_operand:P 1 "symbol_ref_operand" "s"))
(use (reg:P 1))
- (set (match_operand:DF 3 "gpc_reg_operand" "=d")
- (match_operand:DF 4 "memory_operand" "m"))])]
+ (set (match_operand:DF 2 "gpc_reg_operand" "=d")
+ (match_operand:DF 3 "memory_operand" "m"))])]
""
- "b %2"
+ "b %1"
[(set_attr "type" "branch")
(set_attr "length" "4")])
@@ -12764,11 +13025,11 @@
;; Note that the conditions for expansion are in the FMA_F iterator.
(define_expand "fma<mode>4"
- [(set (match_operand:FMA_F 0 "register_operand" "")
+ [(set (match_operand:FMA_F 0 "gpc_reg_operand" "")
(fma:FMA_F
- (match_operand:FMA_F 1 "register_operand" "")
- (match_operand:FMA_F 2 "register_operand" "")
- (match_operand:FMA_F 3 "register_operand" "")))]
+ (match_operand:FMA_F 1 "gpc_reg_operand" "")
+ (match_operand:FMA_F 2 "gpc_reg_operand" "")
+ (match_operand:FMA_F 3 "gpc_reg_operand" "")))]
""
"")
@@ -12788,11 +13049,11 @@
; Altivec only has fma and nfms.
(define_expand "fms<mode>4"
- [(set (match_operand:FMA_F 0 "register_operand" "")
+ [(set (match_operand:FMA_F 0 "gpc_reg_operand" "")
(fma:FMA_F
- (match_operand:FMA_F 1 "register_operand" "")
- (match_operand:FMA_F 2 "register_operand" "")
- (neg:FMA_F (match_operand:FMA_F 3 "register_operand" ""))))]
+ (match_operand:FMA_F 1 "gpc_reg_operand" "")
+ (match_operand:FMA_F 2 "gpc_reg_operand" "")
+ (neg:FMA_F (match_operand:FMA_F 3 "gpc_reg_operand" ""))))]
"!VECTOR_UNIT_ALTIVEC_P (<MODE>mode)"
"")
@@ -12812,34 +13073,34 @@
;; If signed zeros are ignored, -(a * b - c) = -a * b + c.
(define_expand "fnma<mode>4"
- [(set (match_operand:FMA_F 0 "register_operand" "")
+ [(set (match_operand:FMA_F 0 "gpc_reg_operand" "")
(neg:FMA_F
(fma:FMA_F
- (match_operand:FMA_F 1 "register_operand" "")
- (match_operand:FMA_F 2 "register_operand" "")
- (neg:FMA_F (match_operand:FMA_F 3 "register_operand" "")))))]
+ (match_operand:FMA_F 1 "gpc_reg_operand" "")
+ (match_operand:FMA_F 2 "gpc_reg_operand" "")
+ (neg:FMA_F (match_operand:FMA_F 3 "gpc_reg_operand" "")))))]
"!HONOR_SIGNED_ZEROS (<MODE>mode)"
"")
;; If signed zeros are ignored, -(a * b + c) = -a * b - c.
(define_expand "fnms<mode>4"
- [(set (match_operand:FMA_F 0 "register_operand" "")
+ [(set (match_operand:FMA_F 0 "gpc_reg_operand" "")
(neg:FMA_F
(fma:FMA_F
- (match_operand:FMA_F 1 "register_operand" "")
- (match_operand:FMA_F 2 "register_operand" "")
- (match_operand:FMA_F 3 "register_operand" ""))))]
+ (match_operand:FMA_F 1 "gpc_reg_operand" "")
+ (match_operand:FMA_F 2 "gpc_reg_operand" "")
+ (match_operand:FMA_F 3 "gpc_reg_operand" ""))))]
"!HONOR_SIGNED_ZEROS (<MODE>mode) && !VECTOR_UNIT_ALTIVEC_P (<MODE>mode)"
"")
; Not an official optab name, but used from builtins.
(define_expand "nfma<mode>4"
- [(set (match_operand:FMA_F 0 "register_operand" "")
+ [(set (match_operand:FMA_F 0 "gpc_reg_operand" "")
(neg:FMA_F
(fma:FMA_F
- (match_operand:FMA_F 1 "register_operand" "")
- (match_operand:FMA_F 2 "register_operand" "")
- (match_operand:FMA_F 3 "register_operand" ""))))]
+ (match_operand:FMA_F 1 "gpc_reg_operand" "")
+ (match_operand:FMA_F 2 "gpc_reg_operand" "")
+ (match_operand:FMA_F 3 "gpc_reg_operand" ""))))]
"!VECTOR_UNIT_ALTIVEC_P (<MODE>mode)"
"")
@@ -12860,12 +13121,12 @@
; Not an official optab name, but used from builtins.
(define_expand "nfms<mode>4"
- [(set (match_operand:FMA_F 0 "register_operand" "")
+ [(set (match_operand:FMA_F 0 "gpc_reg_operand" "")
(neg:FMA_F
(fma:FMA_F
- (match_operand:FMA_F 1 "register_operand" "")
- (match_operand:FMA_F 2 "register_operand" "")
- (neg:FMA_F (match_operand:FMA_F 3 "register_operand" "")))))]
+ (match_operand:FMA_F 1 "gpc_reg_operand" "")
+ (match_operand:FMA_F 2 "gpc_reg_operand" "")
+ (neg:FMA_F (match_operand:FMA_F 3 "gpc_reg_operand" "")))))]
""
"")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 9a25a86640b..add6254d984 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -333,6 +333,14 @@ mblock-move-inline-limit=
Target Report Var(rs6000_block_move_inline_limit) Init(0) RejectNegative Joined UInteger Save
Specify how many bytes should be moved inline before calling out to memcpy/memmove.
+mblock-compare-inline-limit=
+Target Report Var(rs6000_block_compare_inline_limit) Init(5) RejectNegative Joined UInteger Save
+Specify the maximum number of pairs of load instructions that should be generated inline for the compare.  If the number needed exceeds the limit, a call to memcmp will be generated instead.
+
+mstring-compare-inline-limit=
+Target Report Var(rs6000_string_compare_inline_limit) Init(8) RejectNegative Joined UInteger Save
+Specify the maximum number of pairs of load instructions that should be generated inline for the compare.  If the number needed exceeds the limit, a call to strncmp will be generated instead.
+
misel
Target Report Mask(ISEL) Var(rs6000_isa_flags)
Generate isel instructions.
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index f3d28228ee8..7c7a170d934 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -230,7 +230,15 @@
(div:VEC_F (match_operand:VEC_F 1 "vfloat_operand" "")
(match_operand:VEC_F 2 "vfloat_operand" "")))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
- "")
+{
+ if (RS6000_RECIP_AUTO_RE_P (<MODE>mode)
+ && can_create_pseudo_p () && flag_finite_math_only
+ && !flag_trapping_math && flag_reciprocal_math)
+ {
+ rs6000_emit_swdiv (operands[0], operands[1], operands[2], true);
+ DONE;
+ }
+})
(define_expand "neg<mode>2"
[(set (match_operand:VEC_F 0 "vfloat_operand" "")
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index ca8a702a7b9..7108f794fa2 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -2873,3 +2873,152 @@
"TARGET_P9_VECTOR"
"vextsw2d %0,%1"
[(set_attr "type" "vecexts")])
+
+
+
+;; Operand numbers for the following peephole2
+(define_constants
+ [(SFBOOL_TMP_GPR 0) ;; GPR temporary
+ (SFBOOL_TMP_VSX 1) ;; vector temporary
+ (SFBOOL_MFVSR_D 2) ;; move to gpr dest
+ (SFBOOL_MFVSR_A 3) ;; move to gpr src
+ (SFBOOL_BOOL_D 4) ;; and/ior/xor dest
+ (SFBOOL_BOOL_A1 5) ;; and/ior/xor arg1
+   (SFBOOL_BOOL_A2	 6)		;; and/ior/xor arg2
+ (SFBOOL_SHL_D 7) ;; shift left dest
+ (SFBOOL_SHL_A 8) ;; shift left arg
+   (SFBOOL_MTVSR_D	 9)		;; move to vector dest
+ (SFBOOL_BOOL_A_DI 10) ;; SFBOOL_BOOL_A1/A2 as DImode
+ (SFBOOL_TMP_VSX_DF 11) ;; SFBOOL_TMP_VSX as DFmode
+ (SFBOOL_TMP_GPR_DF 12) ;; SFBOOL_TMP_GPR as DFmode
+   (SFBOOL_MTVSR_D_V4SF 13)])		;; SFBOOL_MTVSR_D as V4SFmode
+
+;; Attempt to optimize some common GLIBC operations using logical operations to
+;; pick apart SFmode operations. For example, there is code from e_powf.c
+;; after macro expansion that looks like:
+;;
+;; typedef union {
+;; float value;
+;; uint32_t word;
+;; } ieee_float_shape_type;
+;;
+;; float t1;
+;; int32_t is;
+;;
+;; do {
+;; ieee_float_shape_type gf_u;
+;; gf_u.value = (t1);
+;; (is) = gf_u.word;
+;; } while (0);
+;;
+;; do {
+;; ieee_float_shape_type sf_u;
+;; sf_u.word = (is & 0xfffff000);
+;; (t1) = sf_u.value;
+;; } while (0);
+;;
+;;
+;; This would result in two direct move operations (convert to memory format,
+;; direct move to GPR, do the AND operation, direct move to VSX, convert to
+;; scalar format). With this peephole, we eliminate the direct move to the
+;; GPR, and instead move the integer mask value to the vector register after a
+;; shift and do the VSX logical operation.
+
+;; The insns for dealing with SFmode in GPR registers look like:
+;; (set (reg:V4SF reg2) (unspec:V4SF [(reg:SF reg1)] UNSPEC_VSX_CVDPSPN))
+;;
+;; (set (reg:DI reg3) (unspec:DI [(reg:V4SF reg2)] UNSPEC_P8V_RELOAD_FROM_VSX))
+;;
+;; (set (reg:DI reg3) (lshiftrt:DI (reg:DI reg3) (const_int 32)))
+;;
+;; (set (reg:DI reg5) (and:DI (reg:DI reg3) (reg:DI reg4)))
+;;
+;; (set (reg:DI reg6) (ashift:DI (reg:DI reg5) (const_int 32)))
+;;
+;; (set (reg:SF reg7) (unspec:SF [(reg:DI reg6)] UNSPEC_P8V_MTVSRD))
+;;
+;; (set (reg:SF reg7) (unspec:SF [(reg:SF reg7)] UNSPEC_VSX_CVSPDPN))
+
+(define_peephole2
+ [(match_scratch:DI SFBOOL_TMP_GPR "r")
+ (match_scratch:V4SF SFBOOL_TMP_VSX "wa")
+
+ ;; MFVSRD
+ (set (match_operand:DI SFBOOL_MFVSR_D "int_reg_operand")
+ (unspec:DI [(match_operand:V4SF SFBOOL_MFVSR_A "vsx_register_operand")]
+ UNSPEC_P8V_RELOAD_FROM_VSX))
+
+ ;; SRDI
+ (set (match_dup SFBOOL_MFVSR_D)
+ (lshiftrt:DI (match_dup SFBOOL_MFVSR_D)
+ (const_int 32)))
+
+ ;; AND/IOR/XOR operation on int
+ (set (match_operand:SI SFBOOL_BOOL_D "int_reg_operand")
+ (and_ior_xor:SI (match_operand:SI SFBOOL_BOOL_A1 "int_reg_operand")
+ (match_operand:SI SFBOOL_BOOL_A2 "reg_or_cint_operand")))
+
+ ;; SLDI
+ (set (match_operand:DI SFBOOL_SHL_D "int_reg_operand")
+ (ashift:DI (match_operand:DI SFBOOL_SHL_A "int_reg_operand")
+ (const_int 32)))
+
+ ;; MTVSRD
+ (set (match_operand:SF SFBOOL_MTVSR_D "vsx_register_operand")
+ (unspec:SF [(match_dup SFBOOL_SHL_D)] UNSPEC_P8V_MTVSRD))]
+
+ "TARGET_POWERPC64 && TARGET_DIRECT_MOVE
+   /* The REG_P (xxx) tests prevent SUBREGs, which allows us to use REGNO
+      to compare registers even when their modes differ.  */
+ && REG_P (operands[SFBOOL_MFVSR_D]) && REG_P (operands[SFBOOL_BOOL_D])
+ && REG_P (operands[SFBOOL_BOOL_A1]) && REG_P (operands[SFBOOL_SHL_D])
+ && REG_P (operands[SFBOOL_SHL_A]) && REG_P (operands[SFBOOL_MTVSR_D])
+ && (REG_P (operands[SFBOOL_BOOL_A2])
+ || CONST_INT_P (operands[SFBOOL_BOOL_A2]))
+ && (REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_MFVSR_D])
+ || peep2_reg_dead_p (3, operands[SFBOOL_MFVSR_D]))
+ && (REGNO (operands[SFBOOL_MFVSR_D]) == REGNO (operands[SFBOOL_BOOL_A1])
+ || (REG_P (operands[SFBOOL_BOOL_A2])
+ && REGNO (operands[SFBOOL_MFVSR_D])
+ == REGNO (operands[SFBOOL_BOOL_A2])))
+ && REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_SHL_A])
+ && (REGNO (operands[SFBOOL_SHL_D]) == REGNO (operands[SFBOOL_BOOL_D])
+ || peep2_reg_dead_p (4, operands[SFBOOL_BOOL_D]))
+ && peep2_reg_dead_p (5, operands[SFBOOL_SHL_D])"
+ [(set (match_dup SFBOOL_TMP_GPR)
+ (ashift:DI (match_dup SFBOOL_BOOL_A_DI)
+ (const_int 32)))
+
+ (set (match_dup SFBOOL_TMP_VSX_DF)
+ (match_dup SFBOOL_TMP_GPR_DF))
+
+ (set (match_dup SFBOOL_MTVSR_D_V4SF)
+ (and_ior_xor:V4SF (match_dup SFBOOL_MFVSR_A)
+ (match_dup SFBOOL_TMP_VSX)))]
+{
+ rtx bool_a1 = operands[SFBOOL_BOOL_A1];
+ rtx bool_a2 = operands[SFBOOL_BOOL_A2];
+ int regno_mfvsr_d = REGNO (operands[SFBOOL_MFVSR_D]);
+ int regno_tmp_gpr = REGNO (operands[SFBOOL_TMP_GPR]);
+ int regno_tmp_vsx = REGNO (operands[SFBOOL_TMP_VSX]);
+ int regno_mtvsr_d = REGNO (operands[SFBOOL_MTVSR_D]);
+
+ if (CONST_INT_P (bool_a2))
+ {
+ rtx tmp_gpr = operands[SFBOOL_TMP_GPR];
+ emit_move_insn (tmp_gpr, bool_a2);
+ operands[SFBOOL_BOOL_A_DI] = tmp_gpr;
+ }
+ else
+ {
+ int regno_bool_a1 = REGNO (bool_a1);
+ int regno_bool_a2 = REGNO (bool_a2);
+ int regno_bool_a = (regno_mfvsr_d == regno_bool_a1
+ ? regno_bool_a2 : regno_bool_a1);
+ operands[SFBOOL_BOOL_A_DI] = gen_rtx_REG (DImode, regno_bool_a);
+ }
+
+ operands[SFBOOL_TMP_VSX_DF] = gen_rtx_REG (DFmode, regno_tmp_vsx);
+ operands[SFBOOL_TMP_GPR_DF] = gen_rtx_REG (DFmode, regno_tmp_gpr);
+ operands[SFBOOL_MTVSR_D_V4SF] = gen_rtx_REG (V4SFmode, regno_mtvsr_d);
+})
diff --git a/gcc/dce.c b/gcc/dce.c
index ea3fb00d433..d5102873f60 100644
--- a/gcc/dce.c
+++ b/gcc/dce.c
@@ -587,6 +587,15 @@ delete_unmarked_insns (void)
if (!dbg_cnt (dce))
continue;
+ if (crtl->shrink_wrapped_separate
+ && find_reg_note (insn, REG_CFA_RESTORE, NULL))
+ {
+ if (dump_file)
+ fprintf (dump_file, "DCE: NOT deleting insn %d, it's a "
+ "callee-save restore\n", INSN_UID (insn));
+ continue;
+ }
+
if (dump_file)
fprintf (dump_file, "DCE: Deleting insn %d\n", INSN_UID (insn));
diff --git a/gcc/df-scan.c b/gcc/df-scan.c
index 98de8440542..f9e98cedc77 100644
--- a/gcc/df-scan.c
+++ b/gcc/df-scan.c
@@ -3494,6 +3494,14 @@ df_get_entry_block_def_set (bitmap entry_block_defs)
bitmap_clear (entry_block_defs);
+ /* For separate shrink-wrapping we use LIVE to analyze which basic blocks
+ need a prologue for some component to be executed before that block,
+ and we do not care about any other registers. Hence, we do not want
+ any register for any component defined in the entry block, and we can
+ just leave all registers undefined. */
+ if (df_scan->local_flags & DF_SCAN_EMPTY_ENTRY_EXIT)
+ return;
+
for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
{
if (global_regs[i])
@@ -3653,6 +3661,14 @@ df_get_exit_block_use_set (bitmap exit_block_uses)
bitmap_clear (exit_block_uses);
+ /* For separate shrink-wrapping we use LIVE to analyze which basic blocks
+ need an epilogue for some component to be executed after that block,
+ and we do not care about any other registers. Hence, we do not want
+ any register for any component seen as used in the exit block, and we
+ can just say no registers at all are used. */
+ if (df_scan->local_flags & DF_SCAN_EMPTY_ENTRY_EXIT)
+ return;
+
/* Stack pointer is always live at the exit. */
bitmap_set_bit (exit_block_uses, STACK_POINTER_REGNUM);
diff --git a/gcc/df.h b/gcc/df.h
index 34de926bcb6..d14c78a94f2 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -447,6 +447,13 @@ enum df_chain_flags
DF_UD_CHAIN = 2 /* Build UD chains. */
};
+enum df_scan_flags
+{
+ /* Flags for the SCAN problem. */
+ DF_SCAN_EMPTY_ENTRY_EXIT = 1 /* Don't define any registers in the entry
+ block; don't use any in the exit block. */
+};
+
enum df_changeable_flags
{
/* Scanning flags. */
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b066f7bd419..15364002044 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -392,7 +392,8 @@ Objective-C and Objective-C++ Dialects}.
-fschedule-insns -fschedule-insns2 -fsection-anchors @gol
-fselective-scheduling -fselective-scheduling2 @gol
-fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
--fsemantic-interposition -fshrink-wrap -fsignaling-nans @gol
+-fsemantic-interposition -fshrink-wrap -fshrink-wrap-separate @gol
+-fsignaling-nans @gol
-fsingle-precision-constant -fsplit-ivs-in-unroller @gol
-fsplit-paths @gol
-fsplit-wide-types -fssa-backprop -fssa-phiopt @gol
@@ -6204,6 +6205,7 @@ compilation time.
-fmove-loop-invariants @gol
-freorder-blocks @gol
-fshrink-wrap @gol
+-fshrink-wrap-separate @gol
-fsplit-wide-types @gol
-fssa-backprop @gol
-fssa-phiopt @gol
@@ -7120,6 +7122,13 @@ Emit function prologues only before parts of the function that need it,
rather than at the top of the function. This flag is enabled by default at
@option{-O} and higher.
+@item -fshrink-wrap-separate
+@opindex fshrink-wrap-separate
+Shrink-wrap separate parts of the prologue and epilogue individually, so
+that those parts are executed only when needed.
+This option is on by default, but has no effect unless @option{-fshrink-wrap}
+is also turned on and the target supports this.
+
@item -fcaller-saves
@opindex fcaller-saves
Enable allocation of values to registers that are clobbered by
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 68e222e11bc..81d704eee81 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -2924,6 +2924,7 @@ This describes the stack layout and calling conventions.
* Function Entry::
* Profiling::
* Tail Calls::
+* Shrink-wrapping separate components::
* Stack Smashing Protection::
* Miscellaneous Register Hooks::
@end menu
@@ -4848,6 +4849,68 @@ This hook should add additional registers that are computed by the prologue to t
True if a function's return statements should be checked for matching the function's return type. This includes checking for falling off the end of a non-void function. Return false if no such check should be made.
@end deftypefn
+@node Shrink-wrapping separate components
+@subsection Shrink-wrapping separate components
+@cindex shrink-wrapping separate components
+
+The prologue may perform a variety of target dependent tasks such as
+saving callee-saved registers, saving the return address, aligning the
+stack, creating a stack frame, initializing the PIC register, setting
+up the static chain, etc.
+
+On some targets some of these tasks may be independent of others and
+thus may be shrink-wrapped separately. These independent tasks are
+referred to as components and are handled generically by the target
+independent parts of GCC.
+
+Using the following hooks those prologue or epilogue components can be
+shrink-wrapped separately, so that the initialization (and possibly
+teardown) those components do is not done as frequently on execution
+paths where this would be unnecessary.
+
+What exactly those components are is up to the target code; the generic
+code treats them abstractly, as a bit in an @code{sbitmap}. These
+@code{sbitmap}s are allocated by the @code{shrink_wrap.get_separate_components}
+and @code{shrink_wrap.components_for_bb} hooks, and deallocated by the
+generic code.
+
+@deftypefn {Target Hook} sbitmap TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS (void)
+This hook should return an @code{sbitmap} with the bits set for those
+components that can be separately shrink-wrapped in the current function.
+Return @code{NULL} if the current function should not get any separate
+shrink-wrapping.
+Don't define this hook if it would always return @code{NULL}.
+If it is defined, the other hooks in this group have to be defined as well.
+@end deftypefn
+
+@deftypefn {Target Hook} sbitmap TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB (basic_block)
+This hook should return an @code{sbitmap} with the bits set for those
+components where either the prologue component has to be executed before
+the @code{basic_block}, or the epilogue component after it, or both.
+@end deftypefn
+
+@deftypefn {Target Hook} void TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS (sbitmap @var{components}, edge @var{e}, sbitmap @var{edge_components}, bool @var{is_prologue})
+This hook should clear the bits in the @var{components} bitmap for those
+components in @var{edge_components} that the target cannot handle on edge
+@var{e}, where @var{is_prologue} says whether this is for a prologue or an
+epilogue.
+@end deftypefn
+
+@deftypefn {Target Hook} void TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS (sbitmap)
+Emit prologue insns for the components indicated by the parameter.
+@end deftypefn
+
+@deftypefn {Target Hook} void TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS (sbitmap)
+Emit epilogue insns for the components indicated by the parameter.
+@end deftypefn
+
+@deftypefn {Target Hook} void TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS (sbitmap)
+Mark the components in the parameter as handled, so that the
+@code{prologue} and @code{epilogue} named patterns know to ignore those
+components. The target code should not hang on to the @code{sbitmap}; it
+will be deleted after this call.
+@end deftypefn
+
@node Stack Smashing Protection
@subsection Stack smashing protection
@cindex stack smashing protection
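Taken together, a target wires these hooks up roughly as in the following
hypothetical sketch. The component numbering, the example_* names, and the
use of df live-in information are illustrative assumptions only; a real
target must also define the disqualify, emit, and set_handled hooks
documented above.

    /* Hypothetical sketch: one component per callee-saved GPR, with the
       component number equal to the hard register number.  */

    static sbitmap
    example_get_separate_components (void)
    {
      sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
      bitmap_clear (components);

      for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
        if (example_reg_saved_in_prologue_p (regno))  /* hypothetical query */
          bitmap_set_bit (components, regno);

      if (bitmap_empty_p (components))
        {
          sbitmap_free (components);
          return NULL;  /* no separate shrink-wrapping for this function */
        }
      return components;
    }

    static sbitmap
    example_components_for_bb (basic_block bb)
    {
      sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
      bitmap_clear (components);

      /* Say a block needs a register's component if that register is live
         on entry to the block (a simplification of the IN/GEN/KILL test
         the generic code computes DF_LIVE information for).  */
      for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
        if (example_reg_saved_in_prologue_p (regno)
            && bitmap_bit_p (df_get_live_in (bb), regno))
          bitmap_set_bit (components, regno);

      return components;
    }

    #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
    #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
      example_get_separate_components
    #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
    #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB example_components_for_bb

As noted above, the returned sbitmaps are deallocated by the generic code,
so the target must not keep pointers to them.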
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f31c763991c..2bad3a7db86 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -2530,6 +2530,7 @@ This describes the stack layout and calling conventions.
* Function Entry::
* Profiling::
* Tail Calls::
+* Shrink-wrapping separate components::
* Stack Smashing Protection::
* Miscellaneous Register Hooks::
@end menu
@@ -3789,6 +3790,43 @@ the function prologue. Normally, the profiling code comes after.
@hook TARGET_WARN_FUNC_RETURN
+@node Shrink-wrapping separate components
+@subsection Shrink-wrapping separate components
+@cindex shrink-wrapping separate components
+
+The prologue may perform a variety of target dependent tasks such as
+saving callee-saved registers, saving the return address, aligning the
+stack, creating a stack frame, initializing the PIC register, setting
+up the static chain, etc.
+
+On some targets some of these tasks may be independent of others and
+thus may be shrink-wrapped separately. These independent tasks are
+referred to as components and are handled generically by the target
+independent parts of GCC.
+
+Using the following hooks, those prologue or epilogue components can be
+shrink-wrapped separately, so that the initialization (and possibly
+teardown) those components do is not done as frequently on execution
+paths where this would be unnecessary.
+
+What exactly those components are is up to the target code; the generic
+code treats them abstractly, as a bit in an @code{sbitmap}. These
+@code{sbitmap}s are allocated by the @code{shrink_wrap.get_separate_components}
+and @code{shrink_wrap.components_for_bb} hooks, and deallocated by the
+generic code.
+
+@hook TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+
+@hook TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+
+@hook TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+
+@hook TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+
+@hook TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+
+@hook TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+
@node Stack Smashing Protection
@subsection Stack smashing protection
@cindex stack smashing protection
diff --git a/gcc/emit-rtl.h b/gcc/emit-rtl.h
index 39dfce946d4..001f775b245 100644
--- a/gcc/emit-rtl.h
+++ b/gcc/emit-rtl.h
@@ -254,6 +254,10 @@ struct GTY(()) rtl_data {
/* True if we performed shrink-wrapping for the current function. */
bool shrink_wrapped;
+ /* True if we performed shrink-wrapping for separate components for
+ the current function. */
+ bool shrink_wrapped_separate;
+
/* Nonzero if function being compiled doesn't modify the stack pointer
(ignoring the prologue and epilogue). This is only valid after
pass_stack_ptr_mod has run. */
diff --git a/gcc/function.c b/gcc/function.c
index 6942a504127..98034b8e67f 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -5747,6 +5747,18 @@ contains (const_rtx insn, hash_table<insn_cache_hasher> *hash)
}
int
+prologue_contains (const_rtx insn)
+{
+ return contains (insn, prologue_insn_hash);
+}
+
+int
+epilogue_contains (const_rtx insn)
+{
+ return contains (insn, epilogue_insn_hash);
+}
+
+int
prologue_epilogue_contains (const_rtx insn)
{
if (contains (insn, prologue_insn_hash))
@@ -5756,48 +5768,16 @@ prologue_epilogue_contains (const_rtx insn)
return 0;
}
-/* Insert use of return register before the end of BB. */
-
-static void
-emit_use_return_register_into_block (basic_block bb)
-{
- start_sequence ();
- use_return_register ();
- rtx_insn *seq = get_insns ();
- end_sequence ();
- rtx_insn *insn = BB_END (bb);
- if (HAVE_cc0 && reg_mentioned_p (cc0_rtx, PATTERN (insn)))
- insn = prev_cc0_setter (insn);
-
- emit_insn_before (seq, insn);
-}
-
-
-/* Create a return pattern, either simple_return or return, depending on
- simple_p. */
-
-static rtx_insn *
-gen_return_pattern (bool simple_p)
+void
+record_prologue_seq (rtx_insn *seq)
{
- return (simple_p
- ? targetm.gen_simple_return ()
- : targetm.gen_return ());
+ record_insns (seq, NULL, &prologue_insn_hash);
}
-/* Insert an appropriate return pattern at the end of block BB. This
- also means updating block_for_insn appropriately. SIMPLE_P is
- the same as in gen_return_pattern and passed to it. */
-
void
-emit_return_into_block (bool simple_p, basic_block bb)
+record_epilogue_seq (rtx_insn *seq)
{
- rtx_jump_insn *jump = emit_jump_insn_after (gen_return_pattern (simple_p),
- BB_END (bb));
- rtx pat = PATTERN (jump);
- if (GET_CODE (pat) == PARALLEL)
- pat = XVECEXP (pat, 0, 0);
- gcc_assert (ANY_RETURN_P (pat));
- JUMP_LABEL (jump) = pat;
+ record_insns (seq, NULL, &epilogue_insn_hash);
}
/* Set JUMP_LABEL for a return insn. */
@@ -5814,133 +5794,89 @@ set_return_jump_label (rtx_insn *returnjump)
JUMP_LABEL (returnjump) = ret_rtx;
}
-/* Return true if there are any active insns between HEAD and TAIL. */
-bool
-active_insn_between (rtx_insn *head, rtx_insn *tail)
-{
- while (tail)
- {
- if (active_insn_p (tail))
- return true;
- if (tail == head)
- return false;
- tail = PREV_INSN (tail);
- }
- return false;
-}
+/* Return a sequence to be used as the split prologue for the current
+ function, or NULL. */
-/* LAST_BB is a block that exits, and empty of active instructions.
- Examine its predecessors for jumps that can be converted to
- (conditional) returns. */
-vec<edge>
-convert_jumps_to_returns (basic_block last_bb, bool simple_p,
- vec<edge> unconverted ATTRIBUTE_UNUSED)
+static rtx_insn *
+make_split_prologue_seq (void)
{
- int i;
- basic_block bb;
- edge_iterator ei;
- edge e;
- auto_vec<basic_block> src_bbs (EDGE_COUNT (last_bb->preds));
+ if (!flag_split_stack
+ || lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl)))
+ return NULL;
- FOR_EACH_EDGE (e, ei, last_bb->preds)
- if (e->src != ENTRY_BLOCK_PTR_FOR_FN (cfun))
- src_bbs.quick_push (e->src);
+ start_sequence ();
+ emit_insn (targetm.gen_split_stack_prologue ());
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
- rtx_insn *label = BB_HEAD (last_bb);
+ record_insns (seq, NULL, &prologue_insn_hash);
+ set_insn_locations (seq, prologue_location);
- FOR_EACH_VEC_ELT (src_bbs, i, bb)
- {
- rtx_insn *jump = BB_END (bb);
+ return seq;
+}
- if (!JUMP_P (jump) || JUMP_LABEL (jump) != label)
- continue;
+/* Return a sequence to be used as the prologue for the current function,
+ or NULL. */
- e = find_edge (bb, last_bb);
+static rtx_insn *
+make_prologue_seq (void)
+{
+ if (!targetm.have_prologue ())
+ return NULL;
- /* If we have an unconditional jump, we can replace that
- with a simple return instruction. */
- if (simplejump_p (jump))
- {
- /* The use of the return register might be present in the exit
- fallthru block. Either:
- - removing the use is safe, and we should remove the use in
- the exit fallthru block, or
- - removing the use is not safe, and we should add it here.
- For now, we conservatively choose the latter. Either of the
- 2 helps in crossjumping. */
- emit_use_return_register_into_block (bb);
-
- emit_return_into_block (simple_p, bb);
- delete_insn (jump);
- }
+ start_sequence ();
+ rtx_insn *seq = targetm.gen_prologue ();
+ emit_insn (seq);
+
+ /* Insert an explicit USE for the frame pointer
+ if the profiling is on and the frame pointer is required. */
+ if (crtl->profile && frame_pointer_needed)
+ emit_use (hard_frame_pointer_rtx);
+
+ /* Retain a map of the prologue insns. */
+ record_insns (seq, NULL, &prologue_insn_hash);
+ emit_note (NOTE_INSN_PROLOGUE_END);
+
+ /* Ensure that instructions are not moved into the prologue when
+ profiling is on. The call to the profiling routine can be
+ emitted within the live range of a call-clobbered register. */
+ if (!targetm.profile_before_prologue () && crtl->profile)
+ emit_insn (gen_blockage ());
- /* If we have a conditional jump branching to the last
- block, we can try to replace that with a conditional
- return instruction. */
- else if (condjump_p (jump))
- {
- rtx dest;
+ seq = get_insns ();
+ end_sequence ();
+ set_insn_locations (seq, prologue_location);
- if (simple_p)
- dest = simple_return_rtx;
- else
- dest = ret_rtx;
- if (!redirect_jump (as_a <rtx_jump_insn *> (jump), dest, 0))
- {
- if (targetm.have_simple_return () && simple_p)
- {
- if (dump_file)
- fprintf (dump_file,
- "Failed to redirect bb %d branch.\n", bb->index);
- unconverted.safe_push (e);
- }
- continue;
- }
+ return seq;
+}
- /* See comment in simplejump_p case above. */
- emit_use_return_register_into_block (bb);
+/* Return a sequence to be used as the epilogue for the current function,
+ or NULL. */
- /* If this block has only one successor, it both jumps
- and falls through to the fallthru block, so we can't
- delete the edge. */
- if (single_succ_p (bb))
- continue;
- }
- else
- {
- if (targetm.have_simple_return () && simple_p)
- {
- if (dump_file)
- fprintf (dump_file,
- "Failed to redirect bb %d branch.\n", bb->index);
- unconverted.safe_push (e);
- }
- continue;
- }
+static rtx_insn *
+make_epilogue_seq (void)
+{
+ if (!targetm.have_epilogue ())
+ return NULL;
- /* Fix up the CFG for the successful change we just made. */
- redirect_edge_succ (e, EXIT_BLOCK_PTR_FOR_FN (cfun));
- e->flags &= ~EDGE_CROSSING;
- }
- src_bbs.release ();
- return unconverted;
-}
+ start_sequence ();
+ emit_note (NOTE_INSN_EPILOGUE_BEG);
+ rtx_insn *seq = targetm.gen_epilogue ();
+ if (seq)
+ emit_jump_insn (seq);
-/* Emit a return insn for the exit fallthru block. */
-basic_block
-emit_return_for_exit (edge exit_fallthru_edge, bool simple_p)
-{
- basic_block last_bb = exit_fallthru_edge->src;
+ /* Retain a map of the epilogue insns. */
+ record_insns (seq, NULL, &epilogue_insn_hash);
+ set_insn_locations (seq, epilogue_location);
- if (JUMP_P (BB_END (last_bb)))
- {
- last_bb = split_edge (exit_fallthru_edge);
- exit_fallthru_edge = single_succ_edge (last_bb);
- }
- emit_barrier_after (BB_END (last_bb));
- emit_return_into_block (simple_p, last_bb);
- exit_fallthru_edge->flags &= ~EDGE_FALLTHRU;
- return last_bb;
+ seq = get_insns ();
+ rtx_insn *returnjump = get_last_insn ();
+ end_sequence ();
+
+ if (JUMP_P (returnjump))
+ set_return_jump_label (returnjump);
+
+ return seq;
}
@@ -5995,143 +5931,44 @@ emit_return_for_exit (edge exit_fallthru_edge, bool simple_p)
void
thread_prologue_and_epilogue_insns (void)
{
- bool inserted;
- vec<edge> unconverted_simple_returns = vNULL;
- bitmap_head bb_flags;
- rtx_insn *returnjump;
- rtx_insn *epilogue_end ATTRIBUTE_UNUSED;
- rtx_insn *prologue_seq ATTRIBUTE_UNUSED, *split_prologue_seq ATTRIBUTE_UNUSED;
- edge e, entry_edge, orig_entry_edge, exit_fallthru_edge;
- edge_iterator ei;
-
df_analyze ();
- rtl_profile_for_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
-
- inserted = false;
- epilogue_end = NULL;
- returnjump = NULL;
-
/* Can't deal with multiple successors of the entry block at the
moment. Function should always have at least one entry
point. */
gcc_assert (single_succ_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
- entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- orig_entry_edge = entry_edge;
-
- split_prologue_seq = NULL;
- if (flag_split_stack
- && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
- == NULL))
- {
- start_sequence ();
- emit_insn (targetm.gen_split_stack_prologue ());
- split_prologue_seq = get_insns ();
- end_sequence ();
-
- record_insns (split_prologue_seq, NULL, &prologue_insn_hash);
- set_insn_locations (split_prologue_seq, prologue_location);
- }
-
- prologue_seq = NULL;
- if (targetm.have_prologue ())
- {
- start_sequence ();
- rtx_insn *seq = targetm.gen_prologue ();
- emit_insn (seq);
-
- /* Insert an explicit USE for the frame pointer
- if the profiling is on and the frame pointer is required. */
- if (crtl->profile && frame_pointer_needed)
- emit_use (hard_frame_pointer_rtx);
-
- /* Retain a map of the prologue insns. */
- record_insns (seq, NULL, &prologue_insn_hash);
- emit_note (NOTE_INSN_PROLOGUE_END);
- /* Ensure that instructions are not moved into the prologue when
- profiling is on. The call to the profiling routine can be
- emitted within the live range of a call-clobbered register. */
- if (!targetm.profile_before_prologue () && crtl->profile)
- emit_insn (gen_blockage ());
+ edge entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ edge orig_entry_edge = entry_edge;
- prologue_seq = get_insns ();
- end_sequence ();
- set_insn_locations (prologue_seq, prologue_location);
- }
-
- bitmap_initialize (&bb_flags, &bitmap_default_obstack);
+ rtx_insn *split_prologue_seq = make_split_prologue_seq ();
+ rtx_insn *prologue_seq = make_prologue_seq ();
+ rtx_insn *epilogue_seq = make_epilogue_seq ();
/* Try to perform a kind of shrink-wrapping, making sure the
prologue/epilogue is emitted only around those parts of the
function that require it. */
+ try_shrink_wrapping (&entry_edge, prologue_seq);
- try_shrink_wrapping (&entry_edge, &bb_flags, prologue_seq);
+ /* If the target can handle splitting the prologue/epilogue into separate
+ components, try to shrink-wrap these components separately. */
+ try_shrink_wrapping_separate (entry_edge->dest);
- rtx_insn *split_prologue_insn = split_prologue_seq;
- if (split_prologue_seq != NULL_RTX)
- {
- while (split_prologue_insn && !NONDEBUG_INSN_P (split_prologue_insn))
- split_prologue_insn = NEXT_INSN (split_prologue_insn);
- insert_insn_on_edge (split_prologue_seq, orig_entry_edge);
- inserted = true;
- }
- rtx_insn *prologue_insn = prologue_seq;
- if (prologue_seq != NULL_RTX)
+ /* If that did anything for any component, we now need to generate the
+ "main" prologue again. Because some targets require some of these
+ to be called in a specific order (i386 requires the split prologue
+ to be first, for example), we create all three sequences again here.
+ If this does not work for some target, that target should not enable
+ separate shrink-wrapping. */
+ if (crtl->shrink_wrapped_separate)
{
- while (prologue_insn && !NONDEBUG_INSN_P (prologue_insn))
- prologue_insn = NEXT_INSN (prologue_insn);
- insert_insn_on_edge (prologue_seq, entry_edge);
- inserted = true;
+ split_prologue_seq = make_split_prologue_seq ();
+ prologue_seq = make_prologue_seq ();
+ epilogue_seq = make_epilogue_seq ();
}
- /* If the exit block has no non-fake predecessors, we don't need
- an epilogue. */
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- if ((e->flags & EDGE_FAKE) == 0)
- break;
- if (e == NULL)
- goto epilogue_done;
-
rtl_profile_for_bb (EXIT_BLOCK_PTR_FOR_FN (cfun));
- exit_fallthru_edge = find_fallthru_edge (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds);
-
- if (targetm.have_simple_return () && entry_edge != orig_entry_edge)
- exit_fallthru_edge
- = get_unconverted_simple_return (exit_fallthru_edge, bb_flags,
- &unconverted_simple_returns,
- &returnjump);
- if (targetm.have_return ())
- {
- if (exit_fallthru_edge == NULL)
- goto epilogue_done;
-
- if (optimize)
- {
- basic_block last_bb = exit_fallthru_edge->src;
-
- if (LABEL_P (BB_HEAD (last_bb))
- && !active_insn_between (BB_HEAD (last_bb), BB_END (last_bb)))
- convert_jumps_to_returns (last_bb, false, vNULL);
-
- if (EDGE_COUNT (last_bb->preds) != 0
- && single_succ_p (last_bb))
- {
- last_bb = emit_return_for_exit (exit_fallthru_edge, false);
- epilogue_end = returnjump = BB_END (last_bb);
-
- /* Emitting the return may add a basic block.
- Fix bb_flags for the added block. */
- if (targetm.have_simple_return ()
- && last_bb != exit_fallthru_edge->src)
- bitmap_set_bit (&bb_flags, last_bb->index);
-
- goto epilogue_done;
- }
- }
- }
-
/* A small fib -- epilogue is not yet completed, but we wish to re-use
this marker for the splits of EH_RETURN patterns, and nothing else
uses the flag in the meantime. */
@@ -6142,6 +5979,8 @@ thread_prologue_and_epilogue_insns (void)
code. In order to be able to properly annotate these with unwind
info, try to split them now. If we get a valid split, drop an
EPILOGUE_BEG note and mark the insns as epilogue insns. */
+ edge e;
+ edge_iterator ei;
FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
{
rtx_insn *prev, *last, *trial;
@@ -6161,62 +6000,64 @@ thread_prologue_and_epilogue_insns (void)
emit_note_after (NOTE_INSN_EPILOGUE_BEG, prev);
}
- /* If nothing falls through into the exit block, we don't need an
- epilogue. */
+ edge exit_fallthru_edge = find_fallthru_edge (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds);
- if (exit_fallthru_edge == NULL)
- goto epilogue_done;
-
- if (targetm.have_epilogue ())
+ if (exit_fallthru_edge)
{
- start_sequence ();
- epilogue_end = emit_note (NOTE_INSN_EPILOGUE_BEG);
- rtx_insn *seq = targetm.gen_epilogue ();
- if (seq)
- emit_jump_insn (seq);
-
- /* Retain a map of the epilogue insns. */
- record_insns (seq, NULL, &epilogue_insn_hash);
- set_insn_locations (seq, epilogue_location);
-
- seq = get_insns ();
- returnjump = get_last_insn ();
- end_sequence ();
-
- insert_insn_on_edge (seq, exit_fallthru_edge);
- inserted = true;
-
- if (JUMP_P (returnjump))
- set_return_jump_label (returnjump);
- }
- else
- {
- basic_block cur_bb;
+ if (epilogue_seq)
+ {
+ insert_insn_on_edge (epilogue_seq, exit_fallthru_edge);
+ commit_edge_insertions ();
- if (! next_active_insn (BB_END (exit_fallthru_edge->src)))
- goto epilogue_done;
- /* We have a fall-through edge to the exit block, the source is not
- at the end of the function, and there will be an assembler epilogue
- at the end of the function.
- We can't use force_nonfallthru here, because that would try to
- use return. Inserting a jump 'by hand' is extremely messy, so
- we take advantage of cfg_layout_finalize using
- fixup_fallthru_exit_predecessor. */
- cfg_layout_initialize (0);
- FOR_EACH_BB_FN (cur_bb, cfun)
- if (cur_bb->index >= NUM_FIXED_BLOCKS
- && cur_bb->next_bb->index >= NUM_FIXED_BLOCKS)
- cur_bb->aux = cur_bb->next_bb;
- cfg_layout_finalize ();
+ /* The epilogue insns we inserted may cause the exit edge to no longer
+ be fallthru. */
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ {
+ if (((e->flags & EDGE_FALLTHRU) != 0)
+ && returnjump_p (BB_END (e->src)))
+ e->flags &= ~EDGE_FALLTHRU;
+ }
+ }
+ else if (next_active_insn (BB_END (exit_fallthru_edge->src)))
+ {
+ /* We have a fall-through edge to the exit block, the source is not
+ at the end of the function, and there will be an assembler epilogue
+ at the end of the function.
+ We can't use force_nonfallthru here, because that would try to
+ use return. Inserting a jump 'by hand' is extremely messy, so
+ we take advantage of cfg_layout_finalize using
+ fixup_fallthru_exit_predecessor. */
+ cfg_layout_initialize (0);
+ basic_block cur_bb;
+ FOR_EACH_BB_FN (cur_bb, cfun)
+ if (cur_bb->index >= NUM_FIXED_BLOCKS
+ && cur_bb->next_bb->index >= NUM_FIXED_BLOCKS)
+ cur_bb->aux = cur_bb->next_bb;
+ cfg_layout_finalize ();
+ }
}
-epilogue_done:
+ /* Insert the prologue. */
- default_rtl_profile ();
+ rtl_profile_for_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- if (inserted)
+ if (split_prologue_seq || prologue_seq)
{
- sbitmap blocks;
+ rtx_insn *split_prologue_insn = split_prologue_seq;
+ if (split_prologue_seq)
+ {
+ while (split_prologue_insn && !NONDEBUG_INSN_P (split_prologue_insn))
+ split_prologue_insn = NEXT_INSN (split_prologue_insn);
+ insert_insn_on_edge (split_prologue_seq, orig_entry_edge);
+ }
+
+ rtx_insn *prologue_insn = prologue_seq;
+ if (prologue_seq)
+ {
+ while (prologue_insn && !NONDEBUG_INSN_P (prologue_insn))
+ prologue_insn = NEXT_INSN (prologue_insn);
+ insert_insn_on_edge (prologue_seq, entry_edge);
+ }
commit_edge_insertions ();
@@ -6227,7 +6068,7 @@ epilogue_done:
if (prologue_insn
&& BLOCK_FOR_INSN (prologue_insn) == NULL)
prologue_insn = NULL;
- blocks = sbitmap_alloc (last_basic_block_for_fn (cfun));
+ sbitmap blocks = sbitmap_alloc (last_basic_block_for_fn (cfun));
bitmap_clear (blocks);
if (split_prologue_insn)
bitmap_set_bit (blocks,
@@ -6238,39 +6079,27 @@ epilogue_done:
bitmap_set_bit (blocks, orig_entry_edge->dest->index);
find_many_sub_basic_blocks (blocks);
sbitmap_free (blocks);
-
- /* The epilogue insns we inserted may cause the exit edge to no longer
- be fallthru. */
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- {
- if (((e->flags & EDGE_FALLTHRU) != 0)
- && returnjump_p (BB_END (e->src)))
- e->flags &= ~EDGE_FALLTHRU;
- }
}
- if (targetm.have_simple_return ())
- convert_to_simple_return (entry_edge, orig_entry_edge, bb_flags,
- returnjump, unconverted_simple_returns);
+ default_rtl_profile ();
/* Emit sibling epilogues before any sibling call sites. */
- for (ei = ei_start (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds); (e =
- ei_safe_edge (ei));
- )
- {
- basic_block bb = e->src;
- rtx_insn *insn = BB_END (bb);
-
- if (!CALL_P (insn)
- || ! SIBLING_CALL_P (insn)
- || (targetm.have_simple_return ()
- && entry_edge != orig_entry_edge
- && !bitmap_bit_p (&bb_flags, bb->index)))
+ for (ei = ei_start (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds);
+ (e = ei_safe_edge (ei));
+ ei_next (&ei))
+ {
+ /* Skip those already handled, the ones that run without prologue. */
+ if (e->flags & EDGE_IGNORE)
{
- ei_next (&ei);
+ e->flags &= ~EDGE_IGNORE;
continue;
}
+ rtx_insn *insn = BB_END (e->src);
+
+ if (!(CALL_P (insn) && SIBLING_CALL_P (insn)))
+ continue;
+
if (rtx_insn *ep_seq = targetm.gen_sibcall_epilogue ())
{
start_sequence ();
@@ -6287,10 +6116,9 @@ epilogue_done:
emit_insn_before (seq, insn);
}
- ei_next (&ei);
}
- if (epilogue_end)
+ if (epilogue_seq)
{
rtx_insn *insn, *next;
@@ -6299,17 +6127,15 @@ epilogue_done:
of such a note. Also possibly move
NOTE_INSN_FUNCTION_BEG notes, as those can be relevant for debug
info generation. */
- for (insn = epilogue_end; insn; insn = next)
+ for (insn = epilogue_seq; insn; insn = next)
{
next = NEXT_INSN (insn);
if (NOTE_P (insn)
&& (NOTE_KIND (insn) == NOTE_INSN_FUNCTION_BEG))
- reorder_insns (insn, insn, PREV_INSN (epilogue_end));
+ reorder_insns (insn, insn, PREV_INSN (epilogue_seq));
}
}
- bitmap_clear (&bb_flags);
-
/* Threading the prologue and epilogue changes the artificial refs
in the entry and exit blocks. */
epilogue_completed = 1;
@@ -6598,8 +6424,10 @@ make_pass_leaf_regs (gcc::context *ctxt)
static unsigned int
rest_of_handle_thread_prologue_and_epilogue (void)
{
+ /* prepare_shrink_wrap is sensitive to the block structure of the control
+ flow graph, so clean it up first. */
if (optimize)
- cleanup_cfg (CLEANUP_EXPENSIVE);
+ cleanup_cfg (0);
/* On some machines, the prologue and epilogue code, or parts thereof,
can be represented as RTL. Doing so lets us schedule insns between
@@ -6613,7 +6441,7 @@ rest_of_handle_thread_prologue_and_epilogue (void)
/* Shrink-wrapping can result in unreachable edges in the epilogue,
see PR57320. */
- cleanup_cfg (0);
+ cleanup_cfg (optimize ? CLEANUP_EXPENSIVE : 0);
/* The stack usage info is finalized during prologue expansion. */
if (flag_stack_usage_info)
diff --git a/gcc/function.h b/gcc/function.h
index 3badf9f4cc5..78f91384618 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -628,7 +628,11 @@ extern void clobber_return_register (void);
extern void expand_function_end (void);
extern rtx get_arg_pointer_save_area (void);
extern void maybe_copy_prologue_epilogue_insn (rtx, rtx);
+extern int prologue_contains (const_rtx);
+extern int epilogue_contains (const_rtx);
extern int prologue_epilogue_contains (const_rtx);
+extern void record_prologue_seq (rtx_insn *);
+extern void record_epilogue_seq (rtx_insn *);
extern void emit_return_into_block (bool simple_p, basic_block bb);
extern void set_return_jump_label (rtx_insn *);
extern bool active_insn_between (rtx_insn *head, rtx_insn *tail);
diff --git a/gcc/regrename.c b/gcc/regrename.c
index 9643f328ea3..a85703f75ce 100644
--- a/gcc/regrename.c
+++ b/gcc/regrename.c
@@ -1628,6 +1628,7 @@ build_def_use (basic_block bb)
(6) For any non-earlyclobber write we find in an operand, make
a new chain or mark the hard register as live.
(7) For any REG_UNUSED, close any chains we just opened.
+ (8) For any REG_CFA_RESTORE, kill any chain containing it.
We cannot deal with situations where we track a reg in one mode
and see a reference in another mode; these will cause the chain
@@ -1840,6 +1841,12 @@ build_def_use (basic_block bb)
scan_rtx (insn, &XEXP (note, 0), NO_REGS, terminate_dead,
OP_IN);
}
+
+ /* Step 8: Kill the chains involving register restores. Those
+ should restore _that_ register. */
+ for (note = REG_NOTES (insn); note; note = XEXP (note, 1))
+ if (REG_NOTE_KIND (note) == REG_CFA_RESTORE)
+ scan_rtx (insn, &XEXP (note, 0), NO_REGS, mark_all_read, OP_IN);
}
else if (DEBUG_INSN_P (insn)
&& !VAR_LOC_UNKNOWN_P (INSN_VAR_LOCATION_LOC (insn)))
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index dd1918d3b57..59dbaa11676 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -3501,6 +3501,31 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn)
if (!deps->readonly)
deps->last_args_size = insn;
}
+
+ /* We must not mix prologue and epilogue insns. See PR78029. */
+ if (prologue_contains (insn))
+ {
+ add_dependence_list (insn, deps->last_epilogue, true, REG_DEP_ANTI, true);
+ if (!deps->readonly)
+ {
+ if (deps->last_logue_was_epilogue)
+ free_INSN_LIST_list (&deps->last_prologue);
+ deps->last_prologue = alloc_INSN_LIST (insn, deps->last_prologue);
+ deps->last_logue_was_epilogue = false;
+ }
+ }
+
+ if (epilogue_contains (insn))
+ {
+ add_dependence_list (insn, deps->last_prologue, true, REG_DEP_ANTI, true);
+ if (!deps->readonly)
+ {
+ if (!deps->last_logue_was_epilogue)
+ free_INSN_LIST_list (&deps->last_epilogue);
+ deps->last_epilogue = alloc_INSN_LIST (insn, deps->last_epilogue);
+ deps->last_logue_was_epilogue = true;
+ }
+ }
}
/* Return TRUE if INSN might not always return normally (e.g. call exit,
@@ -3906,6 +3931,9 @@ init_deps (struct deps_desc *deps, bool lazy_reg_last)
deps->in_post_call_group_p = not_post_call;
deps->last_debug_insn = 0;
deps->last_args_size = 0;
+ deps->last_prologue = 0;
+ deps->last_epilogue = 0;
+ deps->last_logue_was_epilogue = false;
deps->last_reg_pending_barrier = NOT_A_BARRIER;
deps->readonly = 0;
}
diff --git a/gcc/sched-int.h b/gcc/sched-int.h
index de5d3269266..56be1d7f15b 100644
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@@ -537,6 +537,17 @@ struct deps_desc
/* The last insn bearing REG_ARGS_SIZE that we've seen. */
rtx_insn *last_args_size;
+ /* A list of all prologue insns we have seen without intervening epilogue
+ insns, and one of all epilogue insns we have seen without intervening
+ prologue insns. This is used to prevent mixing prologue and epilogue
+ insns. See PR78029. */
+ rtx_insn_list *last_prologue;
+ rtx_insn_list *last_epilogue;
+
+ /* Whether the last *logue insn was an epilogue insn or a prologue insn
+ instead. */
+ bool last_logue_was_epilogue;
+
/* The maximum register number for the following arrays. Before reload
this is max_reg_num; after reload it is FIRST_PSEUDO_REGISTER. */
int max_reg;
diff --git a/gcc/shrink-wrap.c b/gcc/shrink-wrap.c
index fe795196709..1b2bc929c21 100644
--- a/gcc/shrink-wrap.c
+++ b/gcc/shrink-wrap.c
@@ -30,10 +30,12 @@ along with GCC; see the file COPYING3. If not see
#include "df.h"
#include "tm_p.h"
#include "regs.h"
+#include "insn-config.h"
#include "emit-rtl.h"
#include "output.h"
#include "tree-pass.h"
#include "cfgrtl.h"
+#include "cfgbuild.h"
#include "params.h"
#include "bb-reorder.h"
#include "shrink-wrap.h"
@@ -529,30 +531,49 @@ can_dup_for_shrink_wrapping (basic_block bb, basic_block pro, unsigned max_size)
return true;
}
-/* If the source of edge E has more than one successor, the verifier for
- branch probabilities gets confused by the fake edges we make where
- simple_return statements will be inserted later (because those are not
- marked as fallthrough edges). Fix this by creating an extra block just
- for that fallthrough. */
+/* Do whatever needs to be done for exits that run without prologue.
+ Sibcalls need nothing done. Normal exits get a simple_return inserted. */
-static edge
-fix_fake_fallthrough_edge (edge e)
+static void
+handle_simple_exit (edge e)
{
- if (EDGE_COUNT (e->src->succs) <= 1)
- return e;
- basic_block old_bb = e->src;
- rtx_insn *end = BB_END (old_bb);
- rtx_note *note = emit_note_after (NOTE_INSN_DELETED, end);
- basic_block new_bb = create_basic_block (note, note, old_bb);
- BB_COPY_PARTITION (new_bb, old_bb);
- BB_END (old_bb) = end;
+ if (e->flags & EDGE_SIBCALL)
+ {
+ /* Tell function.c to take no further action on this edge. */
+ e->flags |= EDGE_IGNORE;
- redirect_edge_succ (e, new_bb);
- e->flags |= EDGE_FALLTHRU;
- e->flags &= ~EDGE_FAKE;
+ e->flags &= ~EDGE_FALLTHRU;
+ emit_barrier_after_bb (e->src);
+ return;
+ }
- return make_edge (new_bb, EXIT_BLOCK_PTR_FOR_FN (cfun), EDGE_FAKE);
+ /* If the basic block the edge comes from has multiple successors,
+ split the edge. */
+ if (EDGE_COUNT (e->src->succs) > 1)
+ {
+ basic_block old_bb = e->src;
+ rtx_insn *end = BB_END (old_bb);
+ rtx_note *note = emit_note_after (NOTE_INSN_DELETED, end);
+ basic_block new_bb = create_basic_block (note, note, old_bb);
+ BB_COPY_PARTITION (new_bb, old_bb);
+ BB_END (old_bb) = end;
+
+ redirect_edge_succ (e, new_bb);
+ e->flags |= EDGE_FALLTHRU;
+
+ e = make_edge (new_bb, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
+ }
+
+ e->flags &= ~EDGE_FALLTHRU;
+ rtx_jump_insn *ret = emit_jump_insn_after (targetm.gen_simple_return (),
+ BB_END (e->src));
+ JUMP_LABEL (ret) = simple_return_rtx;
+ emit_barrier_after_bb (e->src);
+
+ if (dump_file)
+ fprintf (dump_file, "Made simple_return with UID %d in bb %d\n",
+ INSN_UID (ret), e->src->index);
}
/* Try to perform a kind of shrink-wrapping, making sure the
@@ -610,13 +631,10 @@ fix_fake_fallthrough_edge (edge e)
(bb 4 is duplicated to 5; the prologue is inserted on the edge 5->3).
ENTRY_EDGE is the edge where the prologue will be placed, possibly
- changed by this function. BB_WITH is a bitmap that, if we do shrink-
- wrap, will on return contain the interesting blocks that run with
- prologue. PROLOGUE_SEQ is the prologue we will insert. */
+ changed by this function. PROLOGUE_SEQ is the prologue we will insert. */
void
-try_shrink_wrapping (edge *entry_edge, bitmap_head *bb_with,
- rtx_insn *prologue_seq)
+try_shrink_wrapping (edge *entry_edge, rtx_insn *prologue_seq)
{
/* If we cannot shrink-wrap, are told not to shrink-wrap, or it makes
no sense to shrink-wrap: then do not shrink-wrap! */
@@ -739,6 +757,7 @@ try_shrink_wrapping (edge *entry_edge, bitmap_head *bb_with,
reachable from PRO that we already found, and in VEC a stack of
those we still need to consider (to find successors). */
+ bitmap bb_with = BITMAP_ALLOC (NULL);
bitmap_set_bit (bb_with, pro->index);
vec<basic_block> vec;
@@ -851,6 +870,7 @@ try_shrink_wrapping (edge *entry_edge, bitmap_head *bb_with,
if (pro == entry)
{
+ BITMAP_FREE (bb_with);
free_dominance_info (CDI_DOMINATORS);
return;
}
@@ -946,22 +966,13 @@ try_shrink_wrapping (edge *entry_edge, bitmap_head *bb_with,
redirect_edge_and_branch_force (e, (basic_block) e->dest->aux);
}
- /* Change all the exits that should get a simple_return to FAKE.
- They will be converted later. */
+ /* Make a simple_return for those exits that run without prologue. */
- FOR_EACH_BB_FN (bb, cfun)
+ FOR_EACH_BB_REVERSE_FN (bb, cfun)
if (!bitmap_bit_p (bb_with, bb->index))
FOR_EACH_EDGE (e, ei, bb->succs)
if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
- {
- e = fix_fake_fallthrough_edge (e);
-
- e->flags &= ~EDGE_FALLTHRU;
- if (!(e->flags & EDGE_SIBCALL))
- e->flags |= EDGE_FAKE;
-
- emit_barrier_after_bb (e->src);
- }
+ handle_simple_exit (e);
/* Finally, we want a single edge to put the prologue on. Make a new
block before the PRO block; the edge between them is the edge we want.
@@ -994,158 +1005,867 @@ try_shrink_wrapping (edge *entry_edge, bitmap_head *bb_with,
*entry_edge = make_single_succ_edge (new_bb, pro, EDGE_FALLTHRU);
force_nonfallthru (*entry_edge);
+ BITMAP_FREE (bb_with);
free_dominance_info (CDI_DOMINATORS);
}
+
+/* Separate shrink-wrapping
+
+ Instead of putting all of the prologue and epilogue in one spot, we
+ can put parts of it in places where those components are executed less
+ frequently. The following code does this, for prologue and epilogue
+ components that can be put in more than one location, and where those
+ components can be executed more than once (the epilogue component will
+ always be executed before the prologue component is executed a second
+ time).
+
+ What exactly is a component is target-dependent. The more usual
+ components are simple saves/restores to/from the frame of callee-saved
+ registers. This code treats components abstractly (as an sbitmap),
+ letting the target handle all details.
+
+ Prologue components are placed in such a way that for every component
+ the prologue is executed as infrequently as possible. We do this by
+ walking the dominator tree, comparing the cost of placing a prologue
+ component before a block to the sum of costs determined for all subtrees
+ of that block.
+
+ From this placement, we then determine for each component all blocks
+ where at least one of this block's dominators (including itself) will
+ get a prologue inserted. That then is how the components are placed.
+ We could place the epilogue components a bit smarter (we can save a
+ bit of code size sometimes); this is a possible future improvement.
+
+ Prologues and epilogues are preferably placed into a block, either at
+ the beginning or end of it, if it is needed for all predecessor resp.
+ successor edges; or placed on the edge otherwise.
+
+ If the placement of any prologue/epilogue leads to a situation we cannot
+ handle (for example, an abnormal edge would need to be split, or some
+ targets want to use some specific registers that may not be available
+ where we want to put them), separate shrink-wrapping for the components
+ in that prologue/epilogue is aborted. */
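+
+/* (A hypothetical worked example of the cost rule above, with made-up
+   frequencies: if a block of execution frequency 100 dominates subtrees
+   whose chosen prologue placements for a component sum to frequency
+   70 + 60 = 130, the prologue is hoisted to the dominating block; had the
+   subtrees summed to only 80, each subtree would keep its own placement.)  */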
+
+
+/* Print the sbitmap COMPONENTS to the DUMP_FILE if not empty, with the
+ label LABEL. */
+static void
+dump_components (const char *label, sbitmap components)
+{
+ if (bitmap_empty_p (components))
+ return;
-/* If we're allowed to generate a simple return instruction, then by
- definition we don't need a full epilogue. If the last basic
- block before the exit block does not contain active instructions,
- examine its predecessors and try to emit (conditional) return
- instructions. */
+ fprintf (dump_file, " [%s", label);
-edge
-get_unconverted_simple_return (edge exit_fallthru_edge, bitmap_head bb_flags,
- vec<edge> *unconverted_simple_returns,
- rtx_insn **returnjump)
+ for (unsigned int j = 0; j < components->n_bits; j++)
+ if (bitmap_bit_p (components, j))
+ fprintf (dump_file, " %u", j);
+
+ fprintf (dump_file, "]");
+}
+
+/* The data we collect for each bb. */
+struct sw {
+ /* What components does this BB need? */
+ sbitmap needs_components;
+
+ /* What components does this BB have? This is the main decision this
+ pass makes. */
+ sbitmap has_components;
+
+ /* The components for which we placed code at the start of the BB (instead
+ of on all incoming edges). */
+ sbitmap head_components;
+
+ /* The components for which we placed code at the end of the BB (instead
+ of on all outgoing edges). */
+ sbitmap tail_components;
+
+ /* The frequency of executing the prologue for this BB, if a prologue is
+ placed on this BB. This is a pessimistic estimate (no prologue is
+ needed for edges from blocks that have the component under consideration
+ active already). */
+ gcov_type own_cost;
+
+ /* The frequency of executing the prologue for this BB and all BBs
+ dominated by it. */
+ gcov_type total_cost;
+};
+
+/* A helper function for accessing the pass-specific info. */
+static inline struct sw *
+SW (basic_block bb)
{
- if (optimize)
+ gcc_assert (bb->aux);
+ return (struct sw *) bb->aux;
+}
+
+/* Create the pass-specific data structures for separately shrink-wrapping
+ with components COMPONENTS. */
+static void
+init_separate_shrink_wrap (sbitmap components)
+{
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
{
- unsigned i, last;
+ bb->aux = xcalloc (1, sizeof (struct sw));
+
+ SW (bb)->needs_components = targetm.shrink_wrap.components_for_bb (bb);
+
+ /* Mark all basic blocks without successor as needing all components.
+ This avoids problems in at least cfgcleanup, sel-sched, and
+ regrename (largely to do with all paths to such a block still
+ needing the same dwarf CFI info). */
+ if (EDGE_COUNT (bb->succs) == 0)
+ bitmap_copy (SW (bb)->needs_components, components);
- /* convert_jumps_to_returns may add to preds of the exit block
- (but won't remove). Stop at end of current preds. */
- last = EDGE_COUNT (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds);
- for (i = 0; i < last; i++)
+ if (dump_file)
{
- edge e = EDGE_I (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds, i);
- if (LABEL_P (BB_HEAD (e->src))
- && !bitmap_bit_p (&bb_flags, e->src->index)
- && !active_insn_between (BB_HEAD (e->src), BB_END (e->src)))
- *unconverted_simple_returns
- = convert_jumps_to_returns (e->src, true,
- *unconverted_simple_returns);
+ fprintf (dump_file, "bb %d components:", bb->index);
+ dump_components ("has", SW (bb)->needs_components);
+ fprintf (dump_file, "\n");
}
+
+ SW (bb)->has_components = sbitmap_alloc (SBITMAP_SIZE (components));
+ SW (bb)->head_components = sbitmap_alloc (SBITMAP_SIZE (components));
+ SW (bb)->tail_components = sbitmap_alloc (SBITMAP_SIZE (components));
+ bitmap_clear (SW (bb)->has_components);
}
+}
+
+/* Destroy the pass-specific data. */
+static void
+fini_separate_shrink_wrap (void)
+{
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ if (bb->aux)
+ {
+ sbitmap_free (SW (bb)->needs_components);
+ sbitmap_free (SW (bb)->has_components);
+ sbitmap_free (SW (bb)->head_components);
+ sbitmap_free (SW (bb)->tail_components);
+ free (bb->aux);
+ bb->aux = 0;
+ }
+}
- if (exit_fallthru_edge != NULL
- && EDGE_COUNT (exit_fallthru_edge->src->preds) != 0
- && !bitmap_bit_p (&bb_flags, exit_fallthru_edge->src->index))
+/* Place the prologue for component WHICH, in the basic blocks dominated
+ by HEAD. Do a DFS over the dominator tree, and set bit WHICH in the
+ HAS_COMPONENTS of a block if either the block has that bit set in
+ NEEDS_COMPONENTS, or it is cheaper to place the prologue here than in all
+ dominator subtrees separately. */
+static void
+place_prologue_for_one_component (unsigned int which, basic_block head)
+{
+ /* The block we are currently dealing with. */
+ basic_block bb = head;
+ /* Is this the first time we visit this block, i.e. have we just gone
+ down the tree. */
+ bool first_visit = true;
+
+ /* Walk the dominator tree, visit one block per iteration of this loop.
+ Each basic block is visited twice: once before visiting any children
+ of the block, and once after visiting all of them (leaf nodes are
+ visited only once). As an optimization, we do not visit subtrees
+ that can no longer influence the prologue placement. */
+ for (;;)
{
- basic_block last_bb;
+ /* First visit of a block: set the (children) cost accumulator to zero;
+ if the block does not have the component itself, walk down. */
+ if (first_visit)
+ {
+ /* Initialize the cost. The cost is the block execution frequency
+ that does not come from backedges. Calculating this by simply
+ adding the cost of all edges that aren't backedges does not
+ work: this does not always add up to the block frequency at
+ all, and even if it does, rounding error makes for bad
+ decisions. */
+ SW (bb)->own_cost = bb->frequency;
+
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ if (dominated_by_p (CDI_DOMINATORS, e->src, bb))
+ {
+ if (SW (bb)->own_cost > EDGE_FREQUENCY (e))
+ SW (bb)->own_cost -= EDGE_FREQUENCY (e);
+ else
+ SW (bb)->own_cost = 0;
+ }
+
+ SW (bb)->total_cost = 0;
+
+ if (!bitmap_bit_p (SW (bb)->needs_components, which)
+ && first_dom_son (CDI_DOMINATORS, bb))
+ {
+ bb = first_dom_son (CDI_DOMINATORS, bb);
+ continue;
+ }
+ }
- last_bb = emit_return_for_exit (exit_fallthru_edge, true);
- *returnjump = BB_END (last_bb);
- exit_fallthru_edge = NULL;
+ /* If this block does need the component itself, or it is cheaper to
+ put the prologue here than in all the descendants that need it,
+ mark it so. If this block's immediate post-dominator is dominated
+ by this block, and that needs the prologue, we can put it on this
+ block as well (earlier is better). */
+ if (bitmap_bit_p (SW (bb)->needs_components, which)
+ || SW (bb)->total_cost > SW (bb)->own_cost)
+ {
+ SW (bb)->total_cost = SW (bb)->own_cost;
+ bitmap_set_bit (SW (bb)->has_components, which);
+ }
+ else
+ {
+ basic_block kid = get_immediate_dominator (CDI_POST_DOMINATORS, bb);
+ if (dominated_by_p (CDI_DOMINATORS, kid, bb)
+ && bitmap_bit_p (SW (kid)->has_components, which))
+ {
+ SW (bb)->total_cost = SW (bb)->own_cost;
+ bitmap_set_bit (SW (bb)->has_components, which);
+ }
+ }
+
+ /* We are back where we started, so we are done now. */
+ if (bb == head)
+ return;
+
+ /* We now know the cost of the subtree rooted at the current block.
+ Accumulate this cost in the parent. */
+ basic_block parent = get_immediate_dominator (CDI_DOMINATORS, bb);
+ SW (parent)->total_cost += SW (bb)->total_cost;
+
+ /* Don't walk the tree down unless necessary. */
+ if (next_dom_son (CDI_DOMINATORS, bb)
+ && SW (parent)->total_cost <= SW (parent)->own_cost)
+ {
+ bb = next_dom_son (CDI_DOMINATORS, bb);
+ first_visit = true;
+ }
+ else
+ {
+ bb = parent;
+ first_visit = false;
+ }
}
- return exit_fallthru_edge;
}
-/* If there were branches to an empty LAST_BB which we tried to
- convert to conditional simple_returns, but couldn't for some
- reason, create a block to hold a simple_return insn and redirect
- those remaining edges. */
-
-void
-convert_to_simple_return (edge entry_edge, edge orig_entry_edge,
- bitmap_head bb_flags, rtx_insn *returnjump,
- vec<edge> unconverted_simple_returns)
+/* Set HAS_COMPONENTS in every block to the maximum it can be set to without
+ setting it on any path from entry to exit where it was not already set
+ somewhere (or, for blocks that have no path to the exit, consider only
+ paths from the entry to the block itself). */
+static void
+spread_components (sbitmap components)
{
+ basic_block entry_block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+ basic_block exit_block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+
+ /* A stack of all blocks left to consider, and a bitmap of all blocks
+ on that stack. */
+ vec<basic_block> todo;
+ todo.create (n_basic_blocks_for_fn (cfun));
+ bitmap seen = BITMAP_ALLOC (NULL);
+
+ sbitmap old = sbitmap_alloc (SBITMAP_SIZE (components));
+
+ /* Find for every block the components that are *not* needed on some path
+ from the entry to that block. Do this with a flood fill from the entry
+ block. Every block can be visited at most as often as the number of
+ components (plus one), and usually much less often. */
+
+ if (dump_file)
+ fprintf (dump_file, "Spreading down...\n");
+
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ bitmap_clear (SW (bb)->head_components);
+
+ bitmap_copy (SW (entry_block)->head_components, components);
+
edge e;
edge_iterator ei;
- if (!unconverted_simple_returns.is_empty ())
+ todo.quick_push (single_succ (entry_block));
+ bitmap_set_bit (seen, single_succ (entry_block)->index);
+ while (!todo.is_empty ())
+ {
+ bb = todo.pop ();
+
+ bitmap_copy (old, SW (bb)->head_components);
+
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ bitmap_ior (SW (bb)->head_components, SW (bb)->head_components,
+ SW (e->src)->head_components);
+
+ bitmap_and_compl (SW (bb)->head_components, SW (bb)->head_components,
+ SW (bb)->has_components);
+
+ if (!bitmap_equal_p (old, SW (bb)->head_components))
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ if (bitmap_set_bit (seen, e->dest->index))
+ todo.quick_push (e->dest);
+
+ bitmap_clear_bit (seen, bb->index);
+ }
+
+ /* Find for every block the components that are *not* needed on some reverse
+ path from the exit to that block. */
+
+ if (dump_file)
+ fprintf (dump_file, "Spreading up...\n");
+
+ /* First, mark all blocks not reachable from the exit block as not needing
+ any component on any path to the exit. Mark everything, and then clear
+ again by a flood fill. */
+
+ FOR_ALL_BB_FN (bb, cfun)
+ bitmap_copy (SW (bb)->tail_components, components);
+
+ FOR_EACH_EDGE (e, ei, exit_block->preds)
+ {
+ todo.quick_push (e->src);
+ bitmap_set_bit (seen, e->src->index);
+ }
+
+ while (!todo.is_empty ())
+ {
+ bb = todo.pop ();
+
+ if (!bitmap_empty_p (SW (bb)->tail_components))
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ if (bitmap_set_bit (seen, e->src->index))
+ todo.quick_push (e->src);
+
+ bitmap_clear (SW (bb)->tail_components);
+
+ bitmap_clear_bit (seen, bb->index);
+ }
+
+ /* And then, flood fill backwards to find for every block the components
+ not needed on some path to the exit. */
+
+ bitmap_copy (SW (exit_block)->tail_components, components);
+
+ FOR_EACH_EDGE (e, ei, exit_block->preds)
{
- basic_block simple_return_block_hot = NULL;
- basic_block simple_return_block_cold = NULL;
- edge pending_edge_hot = NULL;
- edge pending_edge_cold = NULL;
- basic_block exit_pred;
- int i;
-
- gcc_assert (entry_edge != orig_entry_edge);
-
- /* See if we can reuse the last insn that was emitted for the
- epilogue. */
- if (returnjump != NULL_RTX
- && JUMP_LABEL (returnjump) == simple_return_rtx)
+ todo.quick_push (e->src);
+ bitmap_set_bit (seen, e->src->index);
+ }
+
+ while (!todo.is_empty ())
+ {
+ bb = todo.pop ();
+
+ bitmap_copy (old, SW (bb)->tail_components);
+
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ bitmap_ior (SW (bb)->tail_components, SW (bb)->tail_components,
+ SW (e->dest)->tail_components);
+
+ bitmap_and_compl (SW (bb)->tail_components, SW (bb)->tail_components,
+ SW (bb)->has_components);
+
+ if (!bitmap_equal_p (old, SW (bb)->tail_components))
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ if (bitmap_set_bit (seen, e->src->index))
+ todo.quick_push (e->src);
+
+ bitmap_clear_bit (seen, bb->index);
+ }
+
+ /* Finally, mark in each block everything except those components that
+ are not needed both forwards and backwards. */
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ bitmap_and (SW (bb)->head_components, SW (bb)->head_components,
+ SW (bb)->tail_components);
+ bitmap_and_compl (SW (bb)->has_components, components,
+ SW (bb)->head_components);
+ }
+
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ if (dump_file)
{
- e = split_block (BLOCK_FOR_INSN (returnjump), PREV_INSN (returnjump));
- if (BB_PARTITION (e->src) == BB_HOT_PARTITION)
- simple_return_block_hot = e->dest;
- else
- simple_return_block_cold = e->dest;
+ fprintf (dump_file, "bb %d components:", bb->index);
+ dump_components ("has", SW (bb)->has_components);
+ fprintf (dump_file, "\n");
}
+ }
- /* Also check returns we might need to add to tail blocks. */
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- if (EDGE_COUNT (e->src->preds) != 0
- && (e->flags & EDGE_FAKE) != 0
- && !bitmap_bit_p (&bb_flags, e->src->index))
- {
- if (BB_PARTITION (e->src) == BB_HOT_PARTITION)
- pending_edge_hot = e;
- else
- pending_edge_cold = e;
- }
+ sbitmap_free (old);
+ BITMAP_FREE (seen);
+}
- /* Save a pointer to the exit's predecessor BB for use in
- inserting new BBs at the end of the function. Do this
- after the call to split_block above which may split
- the original exit pred. */
- exit_pred = EXIT_BLOCK_PTR_FOR_FN (cfun)->prev_bb;
+/* If we cannot handle placing some component's prologues or epilogues where
+ we decided we should place them, unmark that component in COMPONENTS so
+ that it is not wrapped separately. */
+static void
+disqualify_problematic_components (sbitmap components)
+{
+ sbitmap pro = sbitmap_alloc (SBITMAP_SIZE (components));
+ sbitmap epi = sbitmap_alloc (SBITMAP_SIZE (components));
- FOR_EACH_VEC_ELT (unconverted_simple_returns, i, e)
+ basic_block bb;
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->succs)
{
- basic_block *pdest_bb;
- edge pending;
+ /* Find which components we want pro/epilogues for here. */
+ bitmap_and_compl (epi, SW (e->src)->has_components,
+ SW (e->dest)->has_components);
+ bitmap_and_compl (pro, SW (e->dest)->has_components,
+ SW (e->src)->has_components);
+
+ /* Ask the target what it thinks about things. */
+ if (!bitmap_empty_p (epi))
+ targetm.shrink_wrap.disqualify_components (components, e, epi,
+ false);
+ if (!bitmap_empty_p (pro))
+ targetm.shrink_wrap.disqualify_components (components, e, pro,
+ true);
+
+ /* If this edge doesn't need splitting, we're fine. */
+ if (single_pred_p (e->dest)
+ && e->dest != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ continue;
+
+ /* If the edge can be split, that is fine too. */
+ if ((e->flags & EDGE_ABNORMAL) == 0)
+ continue;
+
+ /* We also can handle sibcalls. */
+ if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
+ {
+ gcc_assert (e->flags & EDGE_SIBCALL);
+ continue;
+ }
- if (BB_PARTITION (e->src) == BB_HOT_PARTITION)
+ /* Remove from consideration those components we would need
+ pro/epilogues for on edges where we cannot insert them. */
+ bitmap_and_compl (components, components, epi);
+ bitmap_and_compl (components, components, pro);
+
+ if (dump_file && !bitmap_subset_p (epi, components))
{
- pdest_bb = &simple_return_block_hot;
- pending = pending_edge_hot;
+ fprintf (dump_file, " BAD epi %d->%d", e->src->index,
+ e->dest->index);
+ if (e->flags & EDGE_EH)
+ fprintf (dump_file, " for EH");
+ dump_components ("epi", epi);
+ fprintf (dump_file, "\n");
}
- else
+
+ if (dump_file && !bitmap_subset_p (pro, components))
+ {
+ fprintf (dump_file, " BAD pro %d->%d", e->src->index,
+ e->dest->index);
+ if (e->flags & EDGE_EH)
+ fprintf (dump_file, " for EH");
+ dump_components ("pro", pro);
+ fprintf (dump_file, "\n");
+ }
+ }
+ }
+
+ sbitmap_free (pro);
+ sbitmap_free (epi);
+}
+
+/* Place code for prologues and epilogues for COMPONENTS where we can put
+ that code at the start of basic blocks. */
+static void
+emit_common_heads_for_components (sbitmap components)
+{
+ sbitmap pro = sbitmap_alloc (SBITMAP_SIZE (components));
+ sbitmap epi = sbitmap_alloc (SBITMAP_SIZE (components));
+ sbitmap tmp = sbitmap_alloc (SBITMAP_SIZE (components));
+
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ bitmap_clear (SW (bb)->head_components);
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ /* Find which prologue resp. epilogue components are needed for all
+ predecessor edges to this block. */
+
+ /* First, select all possible components. */
+ bitmap_copy (epi, components);
+ bitmap_copy (pro, components);
+
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ {
+ if (e->flags & EDGE_ABNORMAL)
{
- pdest_bb = &simple_return_block_cold;
- pending = pending_edge_cold;
+ bitmap_clear (epi);
+ bitmap_clear (pro);
+ break;
}
- if (*pdest_bb == NULL && pending != NULL)
+ /* Deselect those epilogue components that should not be inserted
+ for this edge. */
+ bitmap_and_compl (tmp, SW (e->src)->has_components,
+ SW (e->dest)->has_components);
+ bitmap_and (epi, epi, tmp);
+
+ /* Similarly, for the prologue. */
+ bitmap_and_compl (tmp, SW (e->dest)->has_components,
+ SW (e->src)->has_components);
+ bitmap_and (pro, pro, tmp);
+ }
+
+ if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro)))
+ fprintf (dump_file, " bb %d", bb->index);
+
+ if (dump_file && !bitmap_empty_p (epi))
+ dump_components ("epi", epi);
+ if (dump_file && !bitmap_empty_p (pro))
+ dump_components ("pro", pro);
+
+ if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro)))
+ fprintf (dump_file, "\n");
+
+ /* Place code after the BB note. */
+ if (!bitmap_empty_p (pro))
+ {
+ start_sequence ();
+ targetm.shrink_wrap.emit_prologue_components (pro);
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ record_prologue_seq (seq);
+
+ emit_insn_after (seq, bb_note (bb));
+
+ bitmap_ior (SW (bb)->head_components, SW (bb)->head_components, pro);
+ }
+
+ if (!bitmap_empty_p (epi))
+ {
+ start_sequence ();
+ targetm.shrink_wrap.emit_epilogue_components (epi);
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ record_epilogue_seq (seq);
+
+ emit_insn_after (seq, bb_note (bb));
+
+ bitmap_ior (SW (bb)->head_components, SW (bb)->head_components, epi);
+ }
+ }
+
+ sbitmap_free (pro);
+ sbitmap_free (epi);
+ sbitmap_free (tmp);
+}
+
+/* Place code for prologues and epilogues for COMPONENTS where we can put
+ that code at the end of basic blocks. */
+static void
+emit_common_tails_for_components (sbitmap components)
+{
+ sbitmap pro = sbitmap_alloc (SBITMAP_SIZE (components));
+ sbitmap epi = sbitmap_alloc (SBITMAP_SIZE (components));
+ sbitmap tmp = sbitmap_alloc (SBITMAP_SIZE (components));
+
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ bitmap_clear (SW (bb)->tail_components);
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ /* Find which prologue resp. epilogue components are needed for all
+ successor edges from this block. */
+ if (EDGE_COUNT (bb->succs) == 0)
+ continue;
+
+ /* First, select all possible components. */
+ bitmap_copy (epi, components);
+ bitmap_copy (pro, components);
+
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ if (e->flags & EDGE_ABNORMAL)
{
- emit_return_into_block (true, pending->src);
- pending->flags &= ~(EDGE_FALLTHRU | EDGE_FAKE);
- *pdest_bb = pending->src;
+ bitmap_clear (epi);
+ bitmap_clear (pro);
+ break;
}
- else if (*pdest_bb == NULL)
+
+ /* Deselect those epilogue components that should not be inserted
+ for this edge, and also those that are already put at the head
+ of the successor block. */
+ bitmap_and_compl (tmp, SW (e->src)->has_components,
+ SW (e->dest)->has_components);
+ bitmap_and_compl (tmp, tmp, SW (e->dest)->head_components);
+ bitmap_and (epi, epi, tmp);
+
+ /* Similarly, for the prologue. */
+ bitmap_and_compl (tmp, SW (e->dest)->has_components,
+ SW (e->src)->has_components);
+ bitmap_and_compl (tmp, tmp, SW (e->dest)->head_components);
+ bitmap_and (pro, pro, tmp);
+ }
+
+ /* If the last insn of this block is a control flow insn we cannot
+ put anything after it. We can put our code before it instead,
+ but only if that jump insn is a simple jump. */
+ rtx_insn *last_insn = BB_END (bb);
+ if (control_flow_insn_p (last_insn) && !simplejump_p (last_insn))
+ {
+ bitmap_clear (epi);
+ bitmap_clear (pro);
+ }
+
+ if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro)))
+ fprintf (dump_file, " bb %d", bb->index);
+
+ if (dump_file && !bitmap_empty_p (epi))
+ dump_components ("epi", epi);
+ if (dump_file && !bitmap_empty_p (pro))
+ dump_components ("pro", pro);
+
+ if (dump_file && !(bitmap_empty_p (epi) && bitmap_empty_p (pro)))
+ fprintf (dump_file, "\n");
+
+ /* Put the code at the end of the BB, but before any final jump. */
+ if (!bitmap_empty_p (epi))
+ {
+ start_sequence ();
+ targetm.shrink_wrap.emit_epilogue_components (epi);
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ record_epilogue_seq (seq);
+
+ if (control_flow_insn_p (last_insn))
+ emit_insn_before (seq, last_insn);
+ else
+ emit_insn_after (seq, last_insn);
+
+ bitmap_ior (SW (bb)->tail_components, SW (bb)->tail_components, epi);
+ }
+
+ if (!bitmap_empty_p (pro))
+ {
+ start_sequence ();
+ targetm.shrink_wrap.emit_prologue_components (pro);
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ record_prologue_seq (seq);
+
+ if (control_flow_insn_p (last_insn))
+ emit_insn_before (seq, last_insn);
+ else
+ emit_insn_after (seq, last_insn);
+
+ bitmap_ior (SW (bb)->tail_components, SW (bb)->tail_components, pro);
+ }
+ }
+
+ sbitmap_free (pro);
+ sbitmap_free (epi);
+ sbitmap_free (tmp);
+}
+
+/* Place prologues and epilogues for COMPONENTS on edges, if we haven't already
+ placed them inside blocks directly. */
+static void
+insert_prologue_epilogue_for_components (sbitmap components)
+{
+ sbitmap pro = sbitmap_alloc (SBITMAP_SIZE (components));
+ sbitmap epi = sbitmap_alloc (SBITMAP_SIZE (components));
+
+ basic_block bb;
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ if (!bb->aux)
+ continue;
+
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ /* Find which pro/epilogue components are needed on this edge. */
+ bitmap_and_compl (epi, SW (e->src)->has_components,
+ SW (e->dest)->has_components);
+ bitmap_and_compl (pro, SW (e->dest)->has_components,
+ SW (e->src)->has_components);
+ bitmap_and (epi, epi, components);
+ bitmap_and (pro, pro, components);
+
+ /* Deselect those we already have put at the head or tail of the
+ edge's dest resp. src. */
+ bitmap_and_compl (epi, epi, SW (e->dest)->head_components);
+ bitmap_and_compl (pro, pro, SW (e->dest)->head_components);
+ bitmap_and_compl (epi, epi, SW (e->src)->tail_components);
+ bitmap_and_compl (pro, pro, SW (e->src)->tail_components);
+
+ if (!bitmap_empty_p (epi) || !bitmap_empty_p (pro))
{
- basic_block bb;
+ if (dump_file)
+ {
+ fprintf (dump_file, " %d->%d", e->src->index,
+ e->dest->index);
+ dump_components ("epi", epi);
+ dump_components ("pro", pro);
+ if (e->flags & EDGE_SIBCALL)
+ fprintf (dump_file, " (SIBCALL)");
+ else if (e->flags & EDGE_ABNORMAL)
+ fprintf (dump_file, " (ABNORMAL)");
+ fprintf (dump_file, "\n");
+ }
- bb = create_basic_block (NULL, NULL, exit_pred);
- BB_COPY_PARTITION (bb, e->src);
- rtx_insn *ret = targetm.gen_simple_return ();
- rtx_jump_insn *start = emit_jump_insn_after (ret, BB_END (bb));
- JUMP_LABEL (start) = simple_return_rtx;
- emit_barrier_after (start);
+ /* Put the epilogue components in place. */
+ start_sequence ();
+ targetm.shrink_wrap.emit_epilogue_components (epi);
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+ record_epilogue_seq (seq);
- *pdest_bb = bb;
- make_edge (bb, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
+ if (e->flags & EDGE_SIBCALL)
+ {
+ gcc_assert (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun));
+
+ rtx_insn *insn = BB_END (e->src);
+ gcc_assert (CALL_P (insn) && SIBLING_CALL_P (insn));
+ emit_insn_before (seq, insn);
+ }
+ else if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
+ {
+ gcc_assert (e->flags & EDGE_FALLTHRU);
+ basic_block new_bb = split_edge (e);
+ emit_insn_after (seq, BB_END (new_bb));
+ }
+ else
+ insert_insn_on_edge (seq, e);
+
+ /* Put the prologue components in place. */
+ start_sequence ();
+ targetm.shrink_wrap.emit_prologue_components (pro);
+ seq = get_insns ();
+ end_sequence ();
+ record_prologue_seq (seq);
+
+ insert_insn_on_edge (seq, e);
}
- redirect_edge_and_branch_force (e, *pdest_bb);
}
- unconverted_simple_returns.release ();
}
- if (entry_edge != orig_entry_edge)
+ sbitmap_free (pro);
+ sbitmap_free (epi);
+
+ commit_edge_insertions ();
+}
+
+/* The main entry point to this subpass. FIRST_BB is where the prologue
+ would be normally put. */
+void
+try_shrink_wrapping_separate (basic_block first_bb)
+{
+ if (HAVE_cc0)
+ return;
+
+ if (!(SHRINK_WRAPPING_ENABLED
+ && flag_shrink_wrap_separate
+ && optimize_function_for_speed_p (cfun)
+ && targetm.shrink_wrap.get_separate_components))
+ return;
+
+ /* We don't handle "strange" functions. */
+ if (cfun->calls_alloca
+ || cfun->calls_setjmp
+ || cfun->can_throw_non_call_exceptions
+ || crtl->calls_eh_return
+ || crtl->has_nonlocal_goto
+ || crtl->saves_all_registers)
+ return;
+
+ /* Ask the target what components there are. If it returns NULL, don't
+ do anything. */
+ sbitmap components = targetm.shrink_wrap.get_separate_components ();
+ if (!components)
+ return;
+
+ /* We need LIVE info, not defining anything in the entry block and not
+ using anything in the exit block. A block then needs a component if
+ the register for that component is in the IN or GEN or KILL set for
+ that block. */
+ df_scan->local_flags |= DF_SCAN_EMPTY_ENTRY_EXIT;
+ df_update_entry_exit_and_calls ();
+ df_live_add_problem ();
+ df_live_set_all_dirty ();
+ df_analyze ();
+
+ calculate_dominance_info (CDI_DOMINATORS);
+ calculate_dominance_info (CDI_POST_DOMINATORS);
+
+ init_separate_shrink_wrap (components);
+
+ sbitmap_iterator sbi;
+ unsigned int j;
+ EXECUTE_IF_SET_IN_BITMAP (components, 0, j, sbi)
+ place_prologue_for_one_component (j, first_bb);
+
+ spread_components (components);
+
+ disqualify_problematic_components (components);
+
+ /* Don't separately shrink-wrap anything where the "main" prologue will
+ go; the target code can often optimize things if it is presented with
+ all components together (say, if it generates store-multiple insns). */
+ bitmap_and_compl (components, components, SW (first_bb)->has_components);
+
+ if (bitmap_empty_p (components))
{
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- if (EDGE_COUNT (e->src->preds) != 0
- && (e->flags & EDGE_FAKE) != 0
- && !bitmap_bit_p (&bb_flags, e->src->index))
- {
- e = fix_fake_fallthrough_edge (e);
+ if (dump_file)
+ fprintf (dump_file, "Not wrapping anything separately.\n");
+ }
+ else
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "The components we wrap separately are");
+ dump_components ("sep", components);
+ fprintf (dump_file, "\n");
- emit_return_into_block (true, e->src);
- e->flags &= ~(EDGE_FALLTHRU | EDGE_FAKE);
- }
+ fprintf (dump_file, "... Inserting common heads...\n");
+ }
+
+ emit_common_heads_for_components (components);
+
+ if (dump_file)
+ fprintf (dump_file, "... Inserting common tails...\n");
+
+ emit_common_tails_for_components (components);
+
+ if (dump_file)
+ fprintf (dump_file, "... Inserting the more difficult ones...\n");
+
+ insert_prologue_epilogue_for_components (components);
+
+ if (dump_file)
+ fprintf (dump_file, "... Done.\n");
+
+ targetm.shrink_wrap.set_handled_components (components);
+
+ crtl->shrink_wrapped_separate = true;
}
+
+ fini_separate_shrink_wrap ();
+
+ sbitmap_free (components);
+ free_dominance_info (CDI_DOMINATORS);
+ free_dominance_info (CDI_POST_DOMINATORS);
+
+ /* All done. */
+ df_scan->local_flags &= ~DF_SCAN_EMPTY_ENTRY_EXIT;
+ df_update_entry_exit_and_calls ();
+ df_live_set_all_dirty ();
+ df_analyze ();
}
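
The new entry point above is meant to be driven from the generic prologue/epilogue pass in gcc/function.c.  As a rough sketch (not copied from this diff; the helper make_prologue_seq and the exact ordering are assumptions), thread_prologue_and_epilogue_insns would be expected to call it roughly like this:

  /* Sketch: assumed caller in thread_prologue_and_epilogue_insns.  */
  rtx_insn *prologue_seq = make_prologue_seq ();

  /* Whole-function shrink-wrapping first; this may move the prologue
     to a later block, updating ENTRY_EDGE.  */
  try_shrink_wrapping (&entry_edge, prologue_seq);

  /* Then wrap individual components separately.  The argument is the
     block where the main prologue will be placed.  */
  try_shrink_wrapping_separate (entry_edge->dest);

  /* If anything was wrapped separately, regenerate the main prologue
     so it no longer saves those components itself.  */
  if (crtl->shrink_wrapped_separate)
    prologue_seq = make_prologue_seq ();

Regenerating the prologue after the separate pass matters because crtl->shrink_wrapped_separate (set above) changes what the target's prologue expander still has to save.
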
diff --git a/gcc/shrink-wrap.h b/gcc/shrink-wrap.h
index 6e7a4f74864..05fcb41277a 100644
--- a/gcc/shrink-wrap.h
+++ b/gcc/shrink-wrap.h
@@ -24,14 +24,8 @@ along with GCC; see the file COPYING3. If not see
/* In shrink-wrap.c. */
extern bool requires_stack_frame_p (rtx_insn *, HARD_REG_SET, HARD_REG_SET);
-extern void try_shrink_wrapping (edge *entry_edge, bitmap_head *bb_flags,
- rtx_insn *prologue_seq);
-extern edge get_unconverted_simple_return (edge, bitmap_head,
- vec<edge> *, rtx_insn **);
-extern void convert_to_simple_return (edge entry_edge, edge orig_entry_edge,
- bitmap_head bb_flags,
- rtx_insn *returnjump,
- vec<edge> unconverted_simple_returns);
+extern void try_shrink_wrapping (edge *entry_edge, rtx_insn *prologue_seq);
+extern void try_shrink_wrapping_separate (basic_block first_bb);
#define SHRINK_WRAPPING_ENABLED \
(flag_shrink_wrap && targetm.have_simple_return ())
diff --git a/gcc/target.def b/gcc/target.def
index 7e76499e25b..6ebfe294aa2 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -5749,6 +5749,63 @@ DEFHOOK
bool, (tree),
hook_bool_tree_true)
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_SHRINK_WRAP_"
+HOOK_VECTOR (TARGET_SHRINK_WRAP_HOOKS, shrink_wrap)
+
+DEFHOOK
+(get_separate_components,
+ "This hook should return an @code{sbitmap} with the bits set for those\n\
+components that can be separately shrink-wrapped in the current function.\n\
+Return @code{NULL} if the current function should not get any separate\n\
+shrink-wrapping.\n\
+Don't define this hook if it would always return @code{NULL}.\n\
+If it is defined, the other hooks in this group have to be defined as well.",
+ sbitmap, (void),
+ NULL)
+
+DEFHOOK
+(components_for_bb,
+ "This hook should return an @code{sbitmap} with the bits set for those\n\
+components where either the prologue component has to be executed before\n\
+the @code{basic_block}, or the epilogue component after it, or both.",
+ sbitmap, (basic_block),
+ NULL)
+
+DEFHOOK
+(disqualify_components,
+ "This hook should clear the bits in the @var{components} bitmap for those\n\
+components in @var{edge_components} that the target cannot handle on edge\n\
+@var{e}, where @var{is_prologue} says whether this is for a prologue or an\n\
+epilogue.",
+ void, (sbitmap components, edge e, sbitmap edge_components, bool is_prologue),
+ NULL)
+
+DEFHOOK
+(emit_prologue_components,
+ "Emit prologue insns for the components indicated by the parameter.",
+ void, (sbitmap),
+ NULL)
+
+DEFHOOK
+(emit_epilogue_components,
+ "Emit epilogue insns for the components indicated by the parameter.",
+ void, (sbitmap),
+ NULL)
+
+DEFHOOK
+(set_handled_components,
+ "Mark the components in the parameter as handled, so that the\n\
+@code{prologue} and @code{epilogue} named patterns know to ignore those\n\
+components. The target code should not hang on to the @code{sbitmap}; it\n\
+will be deleted after this call.",
+ void, (sbitmap),
+ NULL)
+
+HOOK_VECTOR_END (shrink_wrap)
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_"
+
/* Determine the type of unwind info to emit for debugging. */
DEFHOOK
(debug_unwind_info,
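
The hook vector above only declares the interface, with NULL defaults; a port opting in supplies its own implementations.  The following is a minimal sketch of what such an implementation could look like, using one component per callee-saved general register; it is not the rs6000 code from this patch.  The flat regno * UNITS_PER_WORD frame offset, the reg_is_wrapped_separately field, and the omission of components_for_bb and disqualify_components (which a real port must also define) are simplifications made for illustration.

/* Minimal sketch of the target side: one component per callee-saved GPR.
   A real port must also define components_for_bb and
   disqualify_components, and attach proper CFA notes to the saves.  */

static sbitmap
example_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
  bitmap_clear (components);

  /* One component per callee-saved hard register this function uses.  */
  for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
    if (df_regs_ever_live_p (regno)
	&& !call_used_regs[regno]
	&& !fixed_regs[regno])
      bitmap_set_bit (components, regno);

  return components;
}

/* Placeholder frame layout; a real target computes this from its
   frame information.  */
static HOST_WIDE_INT
example_component_offset (unsigned regno)
{
  return regno * UNITS_PER_WORD;
}

static void
example_emit_prologue_components (sbitmap components)
{
  sbitmap_iterator sbi;
  unsigned regno;
  EXECUTE_IF_SET_IN_BITMAP (components, 0, regno, sbi)
    {
      rtx mem = gen_frame_mem (word_mode,
			       plus_constant (Pmode, stack_pointer_rtx,
					      example_component_offset (regno)));
      rtx_insn *insn = emit_move_insn (mem, gen_rtx_REG (word_mode, regno));
      RTX_FRAME_RELATED_P (insn) = 1;
    }
}

static void
example_emit_epilogue_components (sbitmap components)
{
  sbitmap_iterator sbi;
  unsigned regno;
  EXECUTE_IF_SET_IN_BITMAP (components, 0, regno, sbi)
    {
      rtx mem = gen_frame_mem (word_mode,
			       plus_constant (Pmode, stack_pointer_rtx,
					      example_component_offset (regno)));
      emit_move_insn (gen_rtx_REG (word_mode, regno), mem);
    }
}

static void
example_set_handled_components (sbitmap components)
{
  sbitmap_iterator sbi;
  unsigned regno;
  /* Record what the generic code handles, so the prologue/epilogue
     expanders skip those saves/restores; reg_is_wrapped_separately is a
     hypothetical flag array in the target's machine_function.  */
  EXECUTE_IF_SET_IN_BITMAP (components, 0, regno, sbi)
    cfun->machine->reg_is_wrapped_separately[regno] = true;
}

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  example_get_separate_components
#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  example_emit_prologue_components
#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  example_emit_epilogue_components
#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  example_set_handled_components

Using the hard register number as the component number keeps the mapping between components and DF liveness trivial, which is what the generic code's reliance on the IN/GEN/KILL sets (see the comment in try_shrink_wrapping_separate above) assumes.
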
diff --git a/gcc/testsuite/ChangeLog.ibm b/gcc/testsuite/ChangeLog.ibm
new file mode 100644
index 00000000000..cde942be92d
--- /dev/null
+++ b/gcc/testsuite/ChangeLog.ibm
@@ -0,0 +1,45 @@
+2017-07-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 250101.
+
+2017-03-14 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 246128.
+
+2017-01-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 244285.
+
+2017-01-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Back port from trunk
+ 2017-01-04 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ PR target/71977
+ PR target/70568
+ PR target/78823
+	* gcc.target/powerpc/pr71977-1.c: New tests to check whether, on
+	64-bit VSX systems with direct move, we optimize common code
+	sequences in the GLIBC math library for float math functions.
+ * gcc.target/powerpc/pr71977-2.c: Likewise.
+
+2016-10-07 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 240874.
+
+2016-08-04 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 239155.
+
+2016-07-12 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 238258.
+
+2016-06-13 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 237401.
+
+2016-04-28 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Clone branch subversion id 235611
+
diff --git a/gcc/testsuite/gcc.target/i386/pr81766.c b/gcc/testsuite/gcc.target/i386/pr81766.c
index 4bcae7f610d..e3263d7cd76 100644
--- a/gcc/testsuite/gcc.target/i386/pr81766.c
+++ b/gcc/testsuite/gcc.target/i386/pr81766.c
@@ -3,3 +3,8 @@
/* { dg-options "-O2 -fPIE -mcmodel=large" } */
int main() { return 0; }
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fPIE -mcmodel=large" } */
+
+int main() { return 0; }
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71977-1.c b/gcc/testsuite/gcc.target/powerpc/pr71977-1.c
new file mode 100644
index 00000000000..c4413b8747a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71977-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+#include <stdint.h>
+
+typedef union
+{
+ float value;
+ uint32_t word;
+} ieee_float_shape_type;
+
+float
+mask_and_float_var (float f, uint32_t mask)
+{
+ ieee_float_shape_type u;
+
+ u.value = f;
+ u.word &= mask;
+
+ return u.value;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]xxland " } } */
+/* { dg-final { scan-assembler-not "\[ \t\]and " } } */
+/* { dg-final { scan-assembler-not "\[ \t\]mfvsrd " } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stxv" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]lxv" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]srdi " } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71977-2.c b/gcc/testsuite/gcc.target/powerpc/pr71977-2.c
new file mode 100644
index 00000000000..8ec1b6126ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71977-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+#include <stdint.h>
+
+typedef union
+{
+ float value;
+ uint32_t word;
+} ieee_float_shape_type;
+
+float
+mask_and_float_sign (float f)
+{
+ ieee_float_shape_type u;
+
+ u.value = f;
+ u.word &= 0x80000000;
+
+ return u.value;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]xxland " } } */
+/* { dg-final { scan-assembler-not "\[ \t\]and " } } */
+/* { dg-final { scan-assembler-not "\[ \t\]mfvsrd " } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stxv" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]lxv" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]srdi " } } */
diff --git a/libgcc/ChangeLog.ibm b/libgcc/ChangeLog.ibm
new file mode 100644
index 00000000000..92d75722c6a
--- /dev/null
+++ b/libgcc/ChangeLog.ibm
@@ -0,0 +1,32 @@
+2017-07-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 250101.
+
+2017-03-14 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 246128.
+
+2017-01-10 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 244285.
+
+2016-10-07 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 240874.
+
+2016-08-04 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 239155.
+
+2016-07-12 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 238258.
+
+2016-06-13 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Merge up to 237401.
+
+2016-04-28 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ Clone branch subversion id 235611
+