aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <bonzini@gnu.org>2006-11-04 08:36:45 +0000
committerPaolo Bonzini <bonzini@gnu.org>2006-11-04 08:36:45 +0000
commit0beb16d2dfcb01910979375e7b846fe73471990d (patch)
treeba024c11cf4d0fba9de80471ac1eedc16e891dca
parent7f4985ca2f42779511223b115b65ca7af63c97cd (diff)
2006-11-03 Paolo Bonzini <bonzini@gnu.org>
Steven Bosscher <stevenb.gcc@gmail.com> * fwprop.c: New file. * Makefile.in: Add fwprop.o. * tree-pass.h (pass_rtl_fwprop, pass_rtl_fwprop_with_addr): New. * passes.c (init_optimization_passes): Schedule forward propagation. * rtlanal.c (loc_mentioned_in_p): Support NULL value of the second parameter. * timevar.def (TV_FWPROP): New. * common.opt (-fforward-propagate): New. * opts.c (decode_options): Enable forward propagation at -O2. * gcse.c (one_cprop_pass): Do not run local cprop unless touching jumps. * cse.c (fold_rtx_subreg, fold_rtx_mem, fold_rtx_mem_1, find_best_addr, canon_for_address, table_size): Remove. (new_basic_block, insert, remove_from_table): Remove references to table_size. (fold_rtx): Process SUBREGs and MEMs with equiv_constant, make simplification loop more straightforward by not calling fold_rtx recursively. (equiv_constant): Move here a small part of fold_rtx_subreg, do not call fold_rtx. Call avoid_constant_pool_reference to process MEMs. * recog.c (canonicalize_change_group): New. * recog.h (canonicalize_change_group): New. * doc/invoke.texi (Optimization Options): Document fwprop. * doc/passes.texi (RTL passes): Document fwprop. git-svn-id: https://gcc.gnu.org/svn/gcc/trunk@118475 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/ChangeLog30
-rw-r--r--gcc/Makefile.in5
-rw-r--r--gcc/common.opt4
-rw-r--r--gcc/cse.c969
-rw-r--r--gcc/doc/invoke.texi12
-rw-r--r--gcc/doc/passes.texi9
-rw-r--r--gcc/fwprop.c1034
-rw-r--r--gcc/gcse.c3
-rw-r--r--gcc/opts.c1
-rw-r--r--gcc/passes.c2
-rw-r--r--gcc/recog.c22
-rw-r--r--gcc/recog.h1
-rw-r--r--gcc/rtlanal.c9
-rw-r--r--gcc/timevar.def1
-rw-r--r--gcc/tree-pass.h2
15 files changed, 1212 insertions, 892 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 67734b04b2c..029b1c9d66a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,32 @@
+2006-11-03 Paolo Bonzini <bonzini@gnu.org>
+ Steven Bosscher <stevenb.gcc@gmail.com>
+
+ * fwprop.c: New file.
+ * Makefile.in: Add fwprop.o.
+ * tree-pass.h (pass_rtl_fwprop, pass_rtl_fwprop_with_addr): New.
+ * passes.c (init_optimization_passes): Schedule forward propagation.
+ * rtlanal.c (loc_mentioned_in_p): Support NULL value of the second
+ parameter.
+ * timevar.def (TV_FWPROP): New.
+ * common.opt (-fforward-propagate): New.
+ * opts.c (decode_options): Enable forward propagation at -O2.
+ * gcse.c (one_cprop_pass): Do not run local cprop unless touching jumps.
+ * cse.c (fold_rtx_subreg, fold_rtx_mem, fold_rtx_mem_1, find_best_addr,
+ canon_for_address, table_size): Remove.
+ (new_basic_block, insert, remove_from_table): Remove references to
+ table_size.
+ (fold_rtx): Process SUBREGs and MEMs with equiv_constant, make
+ simplification loop more straightforward by not calling fold_rtx
+ recursively.
+ (equiv_constant): Move here a small part of fold_rtx_subreg,
+ do not call fold_rtx. Call avoid_constant_pool_reference
+ to process MEMs.
+ * recog.c (canonicalize_change_group): New.
+ * recog.h (canonicalize_change_group): New.
+
+ * doc/invoke.texi (Optimization Options): Document fwprop.
+ * doc/passes.texi (RTL passes): Document fwprop.
+
2006-11-03 Geoffrey Keating <geoffk@apple.com>
* c-decl.c (WANT_C99_INLINE_SEMANTICS): New, set to 1.
@@ -23,7 +52,6 @@
2006-11-03 Paul Brook <paul@codesourcery.com>
- gcc/
* config/arm/arm.c (arm_file_start): New function.
(TARGET_ASM_FILE_START): Define.
(arm_default_cpu): New variable.
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 55682b5e0fd..59be2fe09c6 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -997,7 +997,7 @@ OBJS-common = \
debug.o df-core.o df-problems.o df-scan.o dfp.o diagnostic.o dojump.o \
dominance.o loop-doloop.o \
dwarf2asm.o dwarf2out.o emit-rtl.o except.o explow.o loop-iv.o \
- expmed.o expr.o final.o flow.o fold-const.o function.o gcse.o \
+ expmed.o expr.o final.o flow.o fold-const.o function.o fwprop.o gcse.o \
genrtl.o ggc-common.o global.o graph.o gtype-desc.o \
haifa-sched.o hooks.o ifcvt.o insn-attrtab.o insn-emit.o insn-modes.o \
insn-extract.o insn-opinit.o insn-output.o insn-peep.o insn-recog.o \
@@ -2336,6 +2336,9 @@ cse.o : cse.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(REGS_H) \
hard-reg-set.h $(FLAGS_H) insn-config.h $(RECOG_H) $(EXPR_H) toplev.h \
output.h $(FUNCTION_H) $(BASIC_BLOCK_H) $(GGC_H) $(TM_P_H) $(TIMEVAR_H) \
except.h $(TARGET_H) $(PARAMS_H) rtlhooks-def.h tree-pass.h $(REAL_H)
+fwprop.o : fwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) \
+ toplev.h insn-config.h $(RECOG_H) $(FLAGS_H) $(OBSTACK_H) $(BASIC_BLOCK_H) \
+ output.h $(DF_H) alloc-pool.h $(TIMEVAR_H) tree-pass.h
web.o : web.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) \
hard-reg-set.h $(FLAGS_H) $(BASIC_BLOCK_H) $(FUNCTION_H) output.h toplev.h \
$(DF_H) $(OBSTACK_H) $(TIMEVAR_H) tree-pass.h
diff --git a/gcc/common.opt b/gcc/common.opt
index 204560f5fba..4aaa4d20f18 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -444,6 +444,10 @@ fforce-mem
Common Report Var(flag_force_mem)
Copy memory operands into registers before use
+fforward-propagate
+Common Report Var(flag_forward_propagate)
+Perform a forward propagation pass on RTL
+
; Nonzero means don't put addresses of constant functions in registers.
; Used for compiling the Unix kernel, where strange substitutions are
; done on the assembly output.
diff --git a/gcc/cse.c b/gcc/cse.c
index 0304d6f763f..198837774ff 100644
--- a/gcc/cse.c
+++ b/gcc/cse.c
@@ -528,10 +528,6 @@ struct table_elt
static struct table_elt *table[HASH_SIZE];
-/* Number of elements in the hash table. */
-
-static unsigned int table_size;
-
/* Chain of `struct table_elt's made so far for this function
but currently removed from the table. */
@@ -604,7 +600,6 @@ static inline unsigned safe_hash (rtx, enum machine_mode);
static unsigned hash_rtx_string (const char *);
static rtx canon_reg (rtx, rtx);
-static void find_best_addr (rtx, rtx *, enum machine_mode);
static enum rtx_code find_comparison_args (enum rtx_code, rtx *, rtx *,
enum machine_mode *,
enum machine_mode *);
@@ -735,57 +730,6 @@ approx_reg_cost (rtx x)
return cost;
}
-/* Returns a canonical version of X for the address, from the point of view,
- that all multiplications are represented as MULT instead of the multiply
- by a power of 2 being represented as ASHIFT. */
-
-static rtx
-canon_for_address (rtx x)
-{
- enum rtx_code code;
- enum machine_mode mode;
- rtx new = 0;
- int i;
- const char *fmt;
-
- if (!x)
- return x;
-
- code = GET_CODE (x);
- mode = GET_MODE (x);
-
- switch (code)
- {
- case ASHIFT:
- if (GET_CODE (XEXP (x, 1)) == CONST_INT
- && INTVAL (XEXP (x, 1)) < GET_MODE_BITSIZE (mode)
- && INTVAL (XEXP (x, 1)) >= 0)
- {
- new = canon_for_address (XEXP (x, 0));
- new = gen_rtx_MULT (mode, new,
- gen_int_mode ((HOST_WIDE_INT) 1
- << INTVAL (XEXP (x, 1)),
- mode));
- }
- break;
- default:
- break;
-
- }
- if (new)
- return new;
-
- /* Now recursively process each operand of this operation. */
- fmt = GET_RTX_FORMAT (code);
- for (i = 0; i < GET_RTX_LENGTH (code); i++)
- if (fmt[i] == 'e')
- {
- new = canon_for_address (XEXP (x, i));
- XEXP (x, i) = new;
- }
- return x;
-}
-
/* Return a negative value if an rtx A, whose costs are given by COST_A
and REGCOST_A, is more desirable than an rtx B.
Return a positive value if A is less desirable, or 0 if the two are
@@ -965,8 +909,6 @@ new_basic_block (void)
}
}
- table_size = 0;
-
#ifdef HAVE_cc0
prev_insn = 0;
prev_insn_cc0 = 0;
@@ -1377,8 +1319,6 @@ remove_from_table (struct table_elt *elt, unsigned int hash)
/* Now add it to the free element chain. */
elt->next_same_hash = free_element_chain;
free_element_chain = elt;
-
- table_size--;
}
/* Look up X in the hash table and return its table element,
@@ -1656,8 +1596,6 @@ insert (rtx x, struct table_elt *classp, unsigned int hash, enum machine_mode mo
}
}
- table_size++;
-
return elt;
}
@@ -2824,231 +2762,6 @@ canon_reg (rtx x, rtx insn)
return x;
}
-/* LOC is a location within INSN that is an operand address (the contents of
- a MEM). Find the best equivalent address to use that is valid for this
- insn.
-
- On most CISC machines, complicated address modes are costly, and rtx_cost
- is a good approximation for that cost. However, most RISC machines have
- only a few (usually only one) memory reference formats. If an address is
- valid at all, it is often just as cheap as any other address. Hence, for
- RISC machines, we use `address_cost' to compare the costs of various
- addresses. For two addresses of equal cost, choose the one with the
- highest `rtx_cost' value as that has the potential of eliminating the
- most insns. For equal costs, we choose the first in the equivalence
- class. Note that we ignore the fact that pseudo registers are cheaper than
- hard registers here because we would also prefer the pseudo registers. */
-
-static void
-find_best_addr (rtx insn, rtx *loc, enum machine_mode mode)
-{
- struct table_elt *elt;
- rtx addr = *loc;
- struct table_elt *p;
- int found_better = 1;
- int save_do_not_record = do_not_record;
- int save_hash_arg_in_memory = hash_arg_in_memory;
- int addr_volatile;
- int regno;
- unsigned hash;
-
- /* Do not try to replace constant addresses or addresses of local and
- argument slots. These MEM expressions are made only once and inserted
- in many instructions, as well as being used to control symbol table
- output. It is not safe to clobber them.
-
- There are some uncommon cases where the address is already in a register
- for some reason, but we cannot take advantage of that because we have
- no easy way to unshare the MEM. In addition, looking up all stack
- addresses is costly. */
- if ((GET_CODE (addr) == PLUS
- && REG_P (XEXP (addr, 0))
- && GET_CODE (XEXP (addr, 1)) == CONST_INT
- && (regno = REGNO (XEXP (addr, 0)),
- regno == FRAME_POINTER_REGNUM || regno == HARD_FRAME_POINTER_REGNUM
- || regno == ARG_POINTER_REGNUM))
- || (REG_P (addr)
- && (regno = REGNO (addr), regno == FRAME_POINTER_REGNUM
- || regno == HARD_FRAME_POINTER_REGNUM
- || regno == ARG_POINTER_REGNUM))
- || CONSTANT_ADDRESS_P (addr))
- return;
-
- /* If this address is not simply a register, try to fold it. This will
- sometimes simplify the expression. Many simplifications
- will not be valid, but some, usually applying the associative rule, will
- be valid and produce better code. */
- if (!REG_P (addr))
- {
- rtx folded = canon_for_address (fold_rtx (addr, NULL_RTX));
-
- if (folded != addr)
- {
- int addr_folded_cost = address_cost (folded, mode);
- int addr_cost = address_cost (addr, mode);
-
- if ((addr_folded_cost < addr_cost
- || (addr_folded_cost == addr_cost
- /* ??? The rtx_cost comparison is left over from an older
- version of this code. It is probably no longer helpful.*/
- && (rtx_cost (folded, MEM) > rtx_cost (addr, MEM)
- || approx_reg_cost (folded) < approx_reg_cost (addr))))
- && validate_change (insn, loc, folded, 0))
- addr = folded;
- }
- }
-
- /* If this address is not in the hash table, we can't look for equivalences
- of the whole address. Also, ignore if volatile. */
-
- do_not_record = 0;
- hash = HASH (addr, Pmode);
- addr_volatile = do_not_record;
- do_not_record = save_do_not_record;
- hash_arg_in_memory = save_hash_arg_in_memory;
-
- if (addr_volatile)
- return;
-
- elt = lookup (addr, hash, Pmode);
-
- if (elt)
- {
- /* We need to find the best (under the criteria documented above) entry
- in the class that is valid. We use the `flag' field to indicate
- choices that were invalid and iterate until we can't find a better
- one that hasn't already been tried. */
-
- for (p = elt->first_same_value; p; p = p->next_same_value)
- p->flag = 0;
-
- while (found_better)
- {
- int best_addr_cost = address_cost (*loc, mode);
- int best_rtx_cost = (elt->cost + 1) >> 1;
- int exp_cost;
- struct table_elt *best_elt = elt;
-
- found_better = 0;
- for (p = elt->first_same_value; p; p = p->next_same_value)
- if (! p->flag)
- {
- if ((REG_P (p->exp)
- || exp_equiv_p (p->exp, p->exp, 1, false))
- && ((exp_cost = address_cost (p->exp, mode)) < best_addr_cost
- || (exp_cost == best_addr_cost
- && ((p->cost + 1) >> 1) > best_rtx_cost)))
- {
- found_better = 1;
- best_addr_cost = exp_cost;
- best_rtx_cost = (p->cost + 1) >> 1;
- best_elt = p;
- }
- }
-
- if (found_better)
- {
- if (validate_change (insn, loc,
- canon_reg (copy_rtx (best_elt->exp),
- NULL_RTX), 0))
- return;
- else
- best_elt->flag = 1;
- }
- }
- }
-
- /* If the address is a binary operation with the first operand a register
- and the second a constant, do the same as above, but looking for
- equivalences of the register. Then try to simplify before checking for
- the best address to use. This catches a few cases: First is when we
- have REG+const and the register is another REG+const. We can often merge
- the constants and eliminate one insn and one register. It may also be
- that a machine has a cheap REG+REG+const. Finally, this improves the
- code on the Alpha for unaligned byte stores. */
-
- if (flag_expensive_optimizations
- && ARITHMETIC_P (*loc)
- && REG_P (XEXP (*loc, 0)))
- {
- rtx op1 = XEXP (*loc, 1);
-
- do_not_record = 0;
- hash = HASH (XEXP (*loc, 0), Pmode);
- do_not_record = save_do_not_record;
- hash_arg_in_memory = save_hash_arg_in_memory;
-
- elt = lookup (XEXP (*loc, 0), hash, Pmode);
- if (elt == 0)
- return;
-
- /* We need to find the best (under the criteria documented above) entry
- in the class that is valid. We use the `flag' field to indicate
- choices that were invalid and iterate until we can't find a better
- one that hasn't already been tried. */
-
- for (p = elt->first_same_value; p; p = p->next_same_value)
- p->flag = 0;
-
- while (found_better)
- {
- int best_addr_cost = address_cost (*loc, mode);
- int best_rtx_cost = (COST (*loc) + 1) >> 1;
- struct table_elt *best_elt = elt;
- rtx best_rtx = *loc;
- int count;
-
- /* This is at worst case an O(n^2) algorithm, so limit our search
- to the first 32 elements on the list. This avoids trouble
- compiling code with very long basic blocks that can easily
- call simplify_gen_binary so many times that we run out of
- memory. */
-
- found_better = 0;
- for (p = elt->first_same_value, count = 0;
- p && count < 32;
- p = p->next_same_value, count++)
- if (! p->flag
- && (REG_P (p->exp)
- || (GET_CODE (p->exp) != EXPR_LIST
- && exp_equiv_p (p->exp, p->exp, 1, false))))
-
- {
- rtx new = simplify_gen_binary (GET_CODE (*loc), Pmode,
- p->exp, op1);
- int new_cost;
-
- /* Get the canonical version of the address so we can accept
- more. */
- new = canon_for_address (new);
-
- new_cost = address_cost (new, mode);
-
- if (new_cost < best_addr_cost
- || (new_cost == best_addr_cost
- && (COST (new) + 1) >> 1 > best_rtx_cost))
- {
- found_better = 1;
- best_addr_cost = new_cost;
- best_rtx_cost = (COST (new) + 1) >> 1;
- best_elt = p;
- best_rtx = new;
- }
- }
-
- if (found_better)
- {
- if (validate_change (insn, loc,
- canon_reg (copy_rtx (best_rtx),
- NULL_RTX), 0))
- return;
- else
- best_elt->flag = 1;
- }
- }
- }
-}
-
/* Given an operation (CODE, *PARG1, *PARG2), where code is a comparison
operation (EQ, NE, GT, etc.), follow it back through the hash table and
what values are being compared.
@@ -3243,425 +2956,14 @@ find_comparison_args (enum rtx_code code, rtx *parg1, rtx *parg2,
return code;
}
-/* Fold SUBREG. */
-
-static rtx
-fold_rtx_subreg (rtx x, rtx insn)
-{
- enum machine_mode mode = GET_MODE (x);
- rtx folded_arg0;
- rtx const_arg0;
- rtx new;
-
- /* See if we previously assigned a constant value to this SUBREG. */
- if ((new = lookup_as_function (x, CONST_INT)) != 0
- || (new = lookup_as_function (x, CONST_DOUBLE)) != 0)
- return new;
-
- /* If this is a paradoxical SUBREG, we have no idea what value the
- extra bits would have. However, if the operand is equivalent to
- a SUBREG whose operand is the same as our mode, and all the modes
- are within a word, we can just use the inner operand because
- these SUBREGs just say how to treat the register.
-
- Similarly if we find an integer constant. */
-
- if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (SUBREG_REG (x))))
- {
- enum machine_mode imode = GET_MODE (SUBREG_REG (x));
- struct table_elt *elt;
-
- if (GET_MODE_SIZE (mode) <= UNITS_PER_WORD
- && GET_MODE_SIZE (imode) <= UNITS_PER_WORD
- && (elt = lookup (SUBREG_REG (x), HASH (SUBREG_REG (x), imode),
- imode)) != 0)
- for (elt = elt->first_same_value; elt; elt = elt->next_same_value)
- {
- if (CONSTANT_P (elt->exp)
- && GET_MODE (elt->exp) == VOIDmode)
- return elt->exp;
-
- if (GET_CODE (elt->exp) == SUBREG
- && GET_MODE (SUBREG_REG (elt->exp)) == mode
- && exp_equiv_p (elt->exp, elt->exp, 1, false))
- return copy_rtx (SUBREG_REG (elt->exp));
- }
-
- return x;
- }
-
- /* Fold SUBREG_REG. If it changed, see if we can simplify the
- SUBREG. We might be able to if the SUBREG is extracting a single
- word in an integral mode or extracting the low part. */
-
- folded_arg0 = fold_rtx (SUBREG_REG (x), insn);
- const_arg0 = equiv_constant (folded_arg0);
- if (const_arg0)
- folded_arg0 = const_arg0;
-
- if (folded_arg0 != SUBREG_REG (x))
- {
- new = simplify_subreg (mode, folded_arg0,
- GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x));
- if (new)
- return new;
- }
-
- if (REG_P (folded_arg0)
- && GET_MODE_SIZE (mode) < GET_MODE_SIZE (GET_MODE (folded_arg0)))
- {
- struct table_elt *elt;
-
- elt = lookup (folded_arg0,
- HASH (folded_arg0, GET_MODE (folded_arg0)),
- GET_MODE (folded_arg0));
-
- if (elt)
- elt = elt->first_same_value;
-
- if (subreg_lowpart_p (x))
- /* If this is a narrowing SUBREG and our operand is a REG, see
- if we can find an equivalence for REG that is an arithmetic
- operation in a wider mode where both operands are
- paradoxical SUBREGs from objects of our result mode. In
- that case, we couldn-t report an equivalent value for that
- operation, since we don't know what the extra bits will be.
- But we can find an equivalence for this SUBREG by folding
- that operation in the narrow mode. This allows us to fold
- arithmetic in narrow modes when the machine only supports
- word-sized arithmetic.
-
- Also look for a case where we have a SUBREG whose operand
- is the same as our result. If both modes are smaller than
- a word, we are simply interpreting a register in different
- modes and we can use the inner value. */
-
- for (; elt; elt = elt->next_same_value)
- {
- enum rtx_code eltcode = GET_CODE (elt->exp);
-
- /* Just check for unary and binary operations. */
- if (UNARY_P (elt->exp)
- && eltcode != SIGN_EXTEND
- && eltcode != ZERO_EXTEND
- && GET_CODE (XEXP (elt->exp, 0)) == SUBREG
- && GET_MODE (SUBREG_REG (XEXP (elt->exp, 0))) == mode
- && (GET_MODE_CLASS (mode)
- == GET_MODE_CLASS (GET_MODE (XEXP (elt->exp, 0)))))
- {
- rtx op0 = SUBREG_REG (XEXP (elt->exp, 0));
-
- if (!REG_P (op0) && ! CONSTANT_P (op0))
- op0 = fold_rtx (op0, NULL_RTX);
-
- op0 = equiv_constant (op0);
- if (op0)
- new = simplify_unary_operation (GET_CODE (elt->exp), mode,
- op0, mode);
- }
- else if (ARITHMETIC_P (elt->exp)
- && eltcode != DIV && eltcode != MOD
- && eltcode != UDIV && eltcode != UMOD
- && eltcode != ASHIFTRT && eltcode != LSHIFTRT
- && eltcode != ROTATE && eltcode != ROTATERT
- && ((GET_CODE (XEXP (elt->exp, 0)) == SUBREG
- && (GET_MODE (SUBREG_REG (XEXP (elt->exp, 0)))
- == mode))
- || CONSTANT_P (XEXP (elt->exp, 0)))
- && ((GET_CODE (XEXP (elt->exp, 1)) == SUBREG
- && (GET_MODE (SUBREG_REG (XEXP (elt->exp, 1)))
- == mode))
- || CONSTANT_P (XEXP (elt->exp, 1))))
- {
- rtx op0 = gen_lowpart_common (mode, XEXP (elt->exp, 0));
- rtx op1 = gen_lowpart_common (mode, XEXP (elt->exp, 1));
-
- if (op0 && !REG_P (op0) && ! CONSTANT_P (op0))
- op0 = fold_rtx (op0, NULL_RTX);
-
- if (op0)
- op0 = equiv_constant (op0);
-
- if (op1 && !REG_P (op1) && ! CONSTANT_P (op1))
- op1 = fold_rtx (op1, NULL_RTX);
-
- if (op1)
- op1 = equiv_constant (op1);
-
- /* If we are looking for the low SImode part of
- (ashift:DI c (const_int 32)), it doesn't work to
- compute that in SImode, because a 32-bit shift in
- SImode is unpredictable. We know the value is
- 0. */
- if (op0 && op1
- && GET_CODE (elt->exp) == ASHIFT
- && GET_CODE (op1) == CONST_INT
- && INTVAL (op1) >= GET_MODE_BITSIZE (mode))
- {
- if (INTVAL (op1)
- < GET_MODE_BITSIZE (GET_MODE (elt->exp)))
- /* If the count fits in the inner mode's width,
- but exceeds the outer mode's width, the value
- will get truncated to 0 by the subreg. */
- new = CONST0_RTX (mode);
- else
- /* If the count exceeds even the inner mode's width,
- don't fold this expression. */
- new = 0;
- }
- else if (op0 && op1)
- new = simplify_binary_operation (GET_CODE (elt->exp),
- mode, op0, op1);
- }
-
- else if (GET_CODE (elt->exp) == SUBREG
- && GET_MODE (SUBREG_REG (elt->exp)) == mode
- && (GET_MODE_SIZE (GET_MODE (folded_arg0))
- <= UNITS_PER_WORD)
- && exp_equiv_p (elt->exp, elt->exp, 1, false))
- new = copy_rtx (SUBREG_REG (elt->exp));
-
- if (new)
- return new;
- }
- else
- /* A SUBREG resulting from a zero extension may fold to zero
- if it extracts higher bits than the ZERO_EXTEND's source
- bits. FIXME: if combine tried to, er, combine these
- instructions, this transformation may be moved to
- simplify_subreg. */
- for (; elt; elt = elt->next_same_value)
- {
- if (GET_CODE (elt->exp) == ZERO_EXTEND
- && subreg_lsb (x)
- >= GET_MODE_BITSIZE (GET_MODE (XEXP (elt->exp, 0))))
- return CONST0_RTX (mode);
- }
- }
-
- return x;
-}
-
-/* Fold MEM. Not to be called directly, see fold_rtx_mem instead. */
-
-static rtx
-fold_rtx_mem_1 (rtx x, rtx insn)
-{
- enum machine_mode mode = GET_MODE (x);
- rtx new;
-
- /* If we are not actually processing an insn, don't try to find the
- best address. Not only don't we care, but we could modify the
- MEM in an invalid way since we have no insn to validate
- against. */
- if (insn != 0)
- find_best_addr (insn, &XEXP (x, 0), mode);
-
- {
- /* Even if we don't fold in the insn itself, we can safely do so
- here, in hopes of getting a constant. */
- rtx addr = fold_rtx (XEXP (x, 0), NULL_RTX);
- rtx base = 0;
- HOST_WIDE_INT offset = 0;
-
- if (REG_P (addr)
- && REGNO_QTY_VALID_P (REGNO (addr)))
- {
- int addr_q = REG_QTY (REGNO (addr));
- struct qty_table_elem *addr_ent = &qty_table[addr_q];
-
- if (GET_MODE (addr) == addr_ent->mode
- && addr_ent->const_rtx != NULL_RTX)
- addr = addr_ent->const_rtx;
- }
-
- /* Call target hook to avoid the effects of -fpic etc.... */
- addr = targetm.delegitimize_address (addr);
-
- /* If address is constant, split it into a base and integer
- offset. */
- if (GET_CODE (addr) == SYMBOL_REF || GET_CODE (addr) == LABEL_REF)
- base = addr;
- else if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
- {
- base = XEXP (XEXP (addr, 0), 0);
- offset = INTVAL (XEXP (XEXP (addr, 0), 1));
- }
- else if (GET_CODE (addr) == LO_SUM
- && GET_CODE (XEXP (addr, 1)) == SYMBOL_REF)
- base = XEXP (addr, 1);
-
- /* If this is a constant pool reference, we can fold it into its
- constant to allow better value tracking. */
- if (base && GET_CODE (base) == SYMBOL_REF
- && CONSTANT_POOL_ADDRESS_P (base))
- {
- rtx constant = get_pool_constant (base);
- enum machine_mode const_mode = get_pool_mode (base);
- rtx new;
-
- if (CONSTANT_P (constant) && GET_CODE (constant) != CONST_INT)
- {
- constant_pool_entries_cost = COST (constant);
- constant_pool_entries_regcost = approx_reg_cost (constant);
- }
-
- /* If we are loading the full constant, we have an
- equivalence. */
- if (offset == 0 && mode == const_mode)
- return constant;
-
- /* If this actually isn't a constant (weird!), we can't do
- anything. Otherwise, handle the two most common cases:
- extracting a word from a multi-word constant, and
- extracting the low-order bits. Other cases don't seem
- common enough to worry about. */
- if (! CONSTANT_P (constant))
- return x;
-
- if (GET_MODE_CLASS (mode) == MODE_INT
- && GET_MODE_SIZE (mode) == UNITS_PER_WORD
- && offset % UNITS_PER_WORD == 0
- && (new = operand_subword (constant,
- offset / UNITS_PER_WORD,
- 0, const_mode)) != 0)
- return new;
-
- if (((BYTES_BIG_ENDIAN
- && offset == GET_MODE_SIZE (GET_MODE (constant)) - 1)
- || (! BYTES_BIG_ENDIAN && offset == 0))
- && (new = gen_lowpart (mode, constant)) != 0)
- return new;
- }
-
- /* If this is a reference to a label at a known position in a jump
- table, we also know its value. */
- if (base && GET_CODE (base) == LABEL_REF)
- {
- rtx label = XEXP (base, 0);
- rtx table_insn = NEXT_INSN (label);
-
- if (table_insn && JUMP_P (table_insn)
- && GET_CODE (PATTERN (table_insn)) == ADDR_VEC)
- {
- rtx table = PATTERN (table_insn);
-
- if (offset >= 0
- && (offset / GET_MODE_SIZE (GET_MODE (table))
- < XVECLEN (table, 0)))
- {
- rtx label = XVECEXP
- (table, 0, offset / GET_MODE_SIZE (GET_MODE (table)));
- rtx set;
-
- /* If we have an insn that loads the label from the
- jumptable into a reg, we don't want to set the reg
- to the label, because this may cause a reference to
- the label to remain after the label is removed in
- some very obscure cases (PR middle-end/18628). */
- if (!insn)
- return label;
-
- set = single_set (insn);
+/* If X is a nontrivial arithmetic operation on an argument for which
+ a constant value can be determined, return the result of operating
+ on that value, as a constant. Otherwise, return X, possibly with
+ one or more operands changed to a forward-propagated constant.
- if (! set || SET_SRC (set) != x)
- return x;
-
- /* If it's a jump, it's safe to reference the label. */
- if (SET_DEST (set) == pc_rtx)
- return label;
-
- return x;
- }
- }
- if (table_insn && JUMP_P (table_insn)
- && GET_CODE (PATTERN (table_insn)) == ADDR_DIFF_VEC)
- {
- rtx table = PATTERN (table_insn);
-
- if (offset >= 0
- && (offset / GET_MODE_SIZE (GET_MODE (table))
- < XVECLEN (table, 1)))
- {
- offset /= GET_MODE_SIZE (GET_MODE (table));
- new = gen_rtx_MINUS (Pmode, XVECEXP (table, 1, offset),
- XEXP (table, 0));
-
- if (GET_MODE (table) != Pmode)
- new = gen_rtx_TRUNCATE (GET_MODE (table), new);
-
- /* Indicate this is a constant. This isn't a valid
- form of CONST, but it will only be used to fold the
- next insns and then discarded, so it should be
- safe.
-
- Note this expression must be explicitly discarded,
- by cse_insn, else it may end up in a REG_EQUAL note
- and "escape" to cause problems elsewhere. */
- return gen_rtx_CONST (GET_MODE (new), new);
- }
- }
- }
-
- return x;
- }
-}
-
-/* Fold MEM. */
-
-static rtx
-fold_rtx_mem (rtx x, rtx insn)
-{
- /* To avoid infinite oscillations between fold_rtx and fold_rtx_mem,
- refuse to allow recursion of the latter past n levels. This can
- happen because fold_rtx_mem will try to fold the address of the
- memory reference it is passed, i.e. conceptually throwing away
- the MEM and reinjecting the bare address into fold_rtx. As a
- result, patterns like
-
- set (reg1)
- (plus (reg)
- (mem (plus (reg2) (const_int))))
-
- set (reg2)
- (plus (reg)
- (mem (plus (reg1) (const_int))))
-
- will defeat any "first-order" short-circuit put in either
- function to prevent these infinite oscillations.
-
- The heuristics for determining n is as follows: since each time
- it is invoked fold_rtx_mem throws away a MEM, and since MEMs
- are generically not nested, we assume that each invocation of
- fold_rtx_mem corresponds to a new "top-level" operand, i.e.
- the source or the destination of a SET. So fold_rtx_mem is
- bound to stop or cycle before n recursions, n being the number
- of expressions recorded in the hash table. We also leave some
- play to account for the initial steps. */
-
- static unsigned int depth;
- rtx ret;
-
- if (depth > 3 + table_size)
- return x;
-
- depth++;
- ret = fold_rtx_mem_1 (x, insn);
- depth--;
-
- return ret;
-}
-
-/* If X is a nontrivial arithmetic operation on an argument
- for which a constant value can be determined, return
- the result of operating on that value, as a constant.
- Otherwise, return X, possibly with one or more operands
- modified by recursive calls to this function.
-
- If X is a register whose contents are known, we do NOT
- return those contents here. equiv_constant is called to
- perform that task.
+ If X is a register whose contents are known, we do NOT return
+ those contents here; equiv_constant is called to perform that task.
+ For SUBREGs and MEMs, we do that both here and in equiv_constant.
INSN is the insn that we may be modifying. If it is 0, make a copy
of X before modifying it. */
@@ -3674,10 +2976,9 @@ fold_rtx (rtx x, rtx insn)
const char *fmt;
int i;
rtx new = 0;
- int copied = 0;
- int must_swap = 0;
+ int changed = 0;
- /* Folded equivalents of first two operands of X. */
+ /* Operands of X. */
rtx folded_arg0;
rtx folded_arg1;
@@ -3694,10 +2995,16 @@ fold_rtx (rtx x, rtx insn)
if (x == 0)
return x;
- mode = GET_MODE (x);
+ /* Try to perform some initial simplifications on X. */
code = GET_CODE (x);
switch (code)
{
+ case MEM:
+ case SUBREG:
+ if ((new = equiv_constant (x)) != NULL_RTX)
+ return new;
+ return x;
+
case CONST:
case CONST_INT:
case CONST_DOUBLE:
@@ -3717,28 +3024,6 @@ fold_rtx (rtx x, rtx insn)
return prev_insn_cc0;
#endif
- case SUBREG:
- return fold_rtx_subreg (x, insn);
-
- case NOT:
- case NEG:
- /* If we have (NOT Y), see if Y is known to be (NOT Z).
- If so, (NOT Y) simplifies to Z. Similarly for NEG. */
- new = lookup_as_function (XEXP (x, 0), code);
- if (new)
- return fold_rtx (copy_rtx (XEXP (new, 0)), insn);
- break;
-
- case MEM:
- return fold_rtx_mem (x, insn);
-
-#ifdef NO_FUNCTION_CSE
- case CALL:
- if (CONSTANT_P (XEXP (XEXP (x, 0), 0)))
- return x;
- break;
-#endif
-
case ASM_OPERANDS:
if (insn)
{
@@ -3746,12 +3031,21 @@ fold_rtx (rtx x, rtx insn)
validate_change (insn, &ASM_OPERANDS_INPUT (x, i),
fold_rtx (ASM_OPERANDS_INPUT (x, i), insn), 0);
}
+ return x;
+
+#ifdef NO_FUNCTION_CSE
+ case CALL:
+ if (CONSTANT_P (XEXP (XEXP (x, 0), 0)))
+ return x;
break;
+#endif
+ /* Anything else goes through the loop below. */
default:
break;
}
+ mode = GET_MODE (x);
const_arg0 = 0;
const_arg1 = 0;
const_arg2 = 0;
@@ -3764,55 +3058,13 @@ fold_rtx (rtx x, rtx insn)
for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
if (fmt[i] == 'e')
{
- rtx arg = XEXP (x, i);
- rtx folded_arg = arg, const_arg = 0;
- enum machine_mode mode_arg = GET_MODE (arg);
- rtx cheap_arg, expensive_arg;
- rtx replacements[2];
- int j;
- int old_cost = COST_IN (XEXP (x, i), code);
-
- /* Most arguments are cheap, so handle them specially. */
- switch (GET_CODE (arg))
- {
- case REG:
- /* This is the same as calling equiv_constant; it is duplicated
- here for speed. */
- if (REGNO_QTY_VALID_P (REGNO (arg)))
- {
- int arg_q = REG_QTY (REGNO (arg));
- struct qty_table_elem *arg_ent = &qty_table[arg_q];
-
- if (arg_ent->const_rtx != NULL_RTX
- && !REG_P (arg_ent->const_rtx)
- && GET_CODE (arg_ent->const_rtx) != PLUS)
- const_arg
- = gen_lowpart (GET_MODE (arg),
- arg_ent->const_rtx);
- }
- break;
-
- case CONST:
- case CONST_INT:
- case SYMBOL_REF:
- case LABEL_REF:
- case CONST_DOUBLE:
- case CONST_VECTOR:
- const_arg = arg;
- break;
-
+ rtx folded_arg = XEXP (x, i), const_arg;
+ enum machine_mode mode_arg = GET_MODE (folded_arg);
#ifdef HAVE_cc0
- case CC0:
- folded_arg = prev_insn_cc0;
- mode_arg = prev_insn_cc0_mode;
- const_arg = equiv_constant (folded_arg);
- break;
+ if (CC0_P (folded_arg))
+ folded_arg = prev_insn_cc0, mode_arg = prev_insn_cc0_mode;
#endif
-
- default:
- folded_arg = fold_rtx (arg, insn);
- const_arg = equiv_constant (folded_arg);
- }
+ const_arg = equiv_constant (folded_arg);
/* For the first three operands, see if the operand
is constant or equivalent to a constant. */
@@ -3832,120 +3084,50 @@ fold_rtx (rtx x, rtx insn)
break;
}
- /* Pick the least expensive of the folded argument and an
- equivalent constant argument. */
- if (const_arg == 0 || const_arg == folded_arg
- || COST_IN (const_arg, code) > COST_IN (folded_arg, code))
- cheap_arg = folded_arg, expensive_arg = const_arg;
- else
- cheap_arg = const_arg, expensive_arg = folded_arg;
-
- /* Try to replace the operand with the cheapest of the two
- possibilities. If it doesn't work and this is either of the first
- two operands of a commutative operation, try swapping them.
- If THAT fails, try the more expensive, provided it is cheaper
- than what is already there. */
-
- if (cheap_arg == XEXP (x, i))
- continue;
-
- if (insn == 0 && ! copied)
- {
- x = copy_rtx (x);
- copied = 1;
- }
-
- /* Order the replacements from cheapest to most expensive. */
- replacements[0] = cheap_arg;
- replacements[1] = expensive_arg;
-
- for (j = 0; j < 2 && replacements[j]; j++)
- {
- int new_cost = COST_IN (replacements[j], code);
-
- /* Stop if what existed before was cheaper. Prefer constants
- in the case of a tie. */
- if (new_cost > old_cost
- || (new_cost == old_cost && CONSTANT_P (XEXP (x, i))))
- break;
+ /* Pick the least expensive of the argument and an equivalent constant
+ argument. */
+ if (const_arg != 0
+ && const_arg != folded_arg
+ && COST_IN (const_arg, code) <= COST_IN (folded_arg, code)
/* It's not safe to substitute the operand of a conversion
operator with a constant, as the conversion's identity
depends upon the mode of its operand. This optimization
is handled by the call to simplify_unary_operation. */
- if (GET_RTX_CLASS (code) == RTX_UNARY
- && GET_MODE (replacements[j]) != mode_arg0
- && (code == ZERO_EXTEND
- || code == SIGN_EXTEND
- || code == TRUNCATE
- || code == FLOAT_TRUNCATE
- || code == FLOAT_EXTEND
- || code == FLOAT
- || code == FIX
- || code == UNSIGNED_FLOAT
- || code == UNSIGNED_FIX))
- continue;
-
- if (validate_change (insn, &XEXP (x, i), replacements[j], 0))
- break;
-
- if (GET_RTX_CLASS (code) == RTX_COMM_COMPARE
- || GET_RTX_CLASS (code) == RTX_COMM_ARITH)
- {
- validate_change (insn, &XEXP (x, i), XEXP (x, 1 - i), 1);
- validate_change (insn, &XEXP (x, 1 - i), replacements[j], 1);
-
- if (apply_change_group ())
- {
- /* Swap them back to be invalid so that this loop can
- continue and flag them to be swapped back later. */
- rtx tem;
-
- tem = XEXP (x, 0); XEXP (x, 0) = XEXP (x, 1);
- XEXP (x, 1) = tem;
- must_swap = 1;
- break;
- }
- }
- }
- }
+ && (GET_RTX_CLASS (code) != RTX_UNARY
+ || GET_MODE (const_arg) == mode_arg0
+ || (code != ZERO_EXTEND
+ && code != SIGN_EXTEND
+ && code != TRUNCATE
+ && code != FLOAT_TRUNCATE
+ && code != FLOAT_EXTEND
+ && code != FLOAT
+ && code != FIX
+ && code != UNSIGNED_FLOAT
+ && code != UNSIGNED_FIX)))
+ folded_arg = const_arg;
+
+ if (folded_arg == XEXP (x, i))
+ continue;
- else
- {
- if (fmt[i] == 'E')
- /* Don't try to fold inside of a vector of expressions.
- Doing nothing is harmless. */
- {;}
+ if (insn == NULL_RTX && !changed)
+ x = copy_rtx (x);
+ changed = 1;
+ validate_change (insn, &XEXP (x, i), folded_arg, 1);
}
- /* If a commutative operation, place a constant integer as the second
- operand unless the first operand is also a constant integer. Otherwise,
- place any constant second unless the first operand is also a constant. */
-
- if (COMMUTATIVE_P (x))
+ if (changed)
{
- if (must_swap
- || swap_commutative_operands_p (const_arg0 ? const_arg0
- : XEXP (x, 0),
- const_arg1 ? const_arg1
- : XEXP (x, 1)))
+ /* Canonicalize X if necessary, and keep const_argN and folded_argN
+ consistent with the order in X. */
+ if (canonicalize_change_group (insn, x))
{
- rtx tem = XEXP (x, 0);
-
- if (insn == 0 && ! copied)
- {
- x = copy_rtx (x);
- copied = 1;
- }
-
- validate_change (insn, &XEXP (x, 0), XEXP (x, 1), 1);
- validate_change (insn, &XEXP (x, 1), tem, 1);
- if (apply_change_group ())
- {
- tem = const_arg0, const_arg0 = const_arg1, const_arg1 = tem;
- tem = folded_arg0, folded_arg0 = folded_arg1, folded_arg1 = tem;
- }
+ rtx tem;
+ tem = const_arg0, const_arg0 = const_arg1, const_arg1 = tem;
+ tem = folded_arg0, folded_arg0 = folded_arg1, folded_arg1 = tem;
}
+
+ apply_change_group ();
}
/* If X is an arithmetic operation, see if we can simplify it. */
@@ -4477,16 +3659,31 @@ equiv_constant (rtx x)
if (x == 0 || CONSTANT_P (x))
return x;
- /* If X is a MEM, try to fold it outside the context of any insn to see if
- it might be equivalent to a constant. That handles the case where it
- is a constant-pool reference. Then try to look it up in the hash table
- in case it is something whose value we have seen before. */
+ if (GET_CODE (x) == SUBREG)
+ {
+ rtx new;
+
+ /* See if we previously assigned a constant value to this SUBREG. */
+ if ((new = lookup_as_function (x, CONST_INT)) != 0
+ || (new = lookup_as_function (x, CONST_DOUBLE)) != 0)
+ return new;
+
+ if (REG_P (SUBREG_REG (x))
+ && (new = equiv_constant (SUBREG_REG (x))) != 0)
+ return simplify_subreg (GET_MODE (x), SUBREG_REG (x),
+ GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x));
+
+ return 0;
+ }
+
+ /* If X is a MEM, see if it is a constant-pool reference, or look it up in
+ the hash table in case its value was seen before. */
if (MEM_P (x))
{
struct table_elt *elt;
- x = fold_rtx (x, NULL_RTX);
+ x = avoid_constant_pool_reference (x);
if (CONSTANT_P (x))
return x;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5bbbfc53664..2e0de418bc4 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -310,7 +310,7 @@ Objective-C and Objective-C++ Dialects}.
-fcse-skip-blocks -fcx-limited-range -fdata-sections @gol
-fdelayed-branch -fdelete-null-pointer-checks -fearly-inlining @gol
-fexpensive-optimizations -ffast-math -ffloat-store @gol
--fforce-addr -ffunction-sections @gol
+-fforce-addr -fforward-propagate -ffunction-sections @gol
-fgcse -fgcse-lm -fgcse-sm -fgcse-las -fgcse-after-reload @gol
-fcrossjumping -fif-conversion -fif-conversion2 @gol
-finline-functions -finline-functions-called-once @gol
@@ -4621,6 +4621,16 @@ register-load. This option is now a nop and will be removed in 4.2.
Force memory address constants to be copied into registers before
doing arithmetic on them.
+@item -fforward-propagate
+@opindex fforward-propagate
+Perform a forward propagation pass on RTL. The pass tries to combine two
+instructions and checks if the result can be simplified. If loop unrolling
+is active, two passes are performed and the second is scheduled after
+loop unrolling.
+
+This option is enabled by default at optimization levels @option{-O2},
+@option{-O3}, @option{-Os}.
+
@item -fomit-frame-pointer
@opindex fomit-frame-pointer
Don't keep the frame pointer in a register for functions that
diff --git a/gcc/doc/passes.texi b/gcc/doc/passes.texi
index 9da919152b7..fc6aa2696a4 100644
--- a/gcc/doc/passes.texi
+++ b/gcc/doc/passes.texi
@@ -685,6 +685,15 @@ optimization pass''. The bulk of the code for this pass is in
@file{cfgcleanup.c}, and there are support routines in @file{cfgrtl.c}
and @file{jump.c}.
+@item Forward propagation of single-def values
+
+This pass attempts to remove redundant computation by substituting
+variables that come from a single definition, and
+seeing if the result can be simplified. It performs copy propagation
+and addressing mode selection. The pass is run twice, with values
+being propagated into loops only on the second run. It is located in
+@file{fwprop.c}.
+
@item Common subexpression elimination
This pass removes redundant computation within basic blocks, and
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
new file mode 100644
index 00000000000..1e4f749eb12
--- /dev/null
+++ b/gcc/fwprop.c
@@ -0,0 +1,1034 @@
+/* RTL-based forward propagation pass for GNU compiler.
+ Copyright (C) 2005, 2006 Free Software Foundation, Inc.
+ Contributed by Paolo Bonzini and Steven Bosscher.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING. If not, write to the Free
+Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "toplev.h"
+
+#include "timevar.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "emit-rtl.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "flags.h"
+#include "obstack.h"
+#include "basic-block.h"
+#include "output.h"
+#include "df.h"
+#include "target.h"
+#include "cfgloop.h"
+#include "tree-pass.h"
+
+
+/* This pass does simple forward propagation and simplification when an
+ operand of an insn can only come from a single def. This pass uses
+ df.c, so it is global. However, we only do limited analysis of
+ available expressions.
+
+ 1) The pass tries to propagate the source of the def into the use,
+ and checks if the result is independent of the substituted value.
+ For example, the high word of a (zero_extend:DI (reg:SI M)) is always
+ zero, independent of the source register.
+
+ In particular, we propagate constants into the use site. Sometimes
+ RTL expansion did not put the constant in the same insn on purpose,
+ to satisfy a predicate, and the result will fail to be recognized;
+ but this happens rarely and in this case we can still create a
+ REG_EQUAL note. For multi-word operations, this
+
+ (set (subreg:SI (reg:DI 120) 0) (const_int 0))
+ (set (subreg:SI (reg:DI 120) 4) (const_int -1))
+ (set (subreg:SI (reg:DI 122) 0)
+ (ior:SI (subreg:SI (reg:DI 119) 0) (subreg:SI (reg:DI 120) 0)))
+ (set (subreg:SI (reg:DI 122) 4)
+ (ior:SI (subreg:SI (reg:DI 119) 4) (subreg:SI (reg:DI 120) 4)))
+
+ can be simplified to the much simpler
+
+ (set (subreg:SI (reg:DI 122) 0) (subreg:SI (reg:DI 119) 0))
+ (set (subreg:SI (reg:DI 122) 4) (const_int -1))
+
+ This particular propagation is also effective at putting together
+ complex addressing modes. We are more aggressive inside MEMs, in
+ that all definitions are propagated if the use is in a MEM; if the
+ result is a valid memory address we check address_cost to decide
+ whether the substitution is worthwhile.
+
+ 2) The pass propagates register copies. This is not as effective as
+ the copy propagation done by CSE's canon_reg, which works by walking
+ the instruction chain, it can help the other transformations.
+
+ We should consider removing this optimization, and instead reorder the
+ RTL passes, because GCSE does this transformation too. With some luck,
+ the CSE pass at the end of rest_of_handle_gcse could also go away.
+
+ 3) The pass looks for paradoxical subregs that are actually unnecessary.
+ Things like this:
+
+ (set (reg:QI 120) (subreg:QI (reg:SI 118) 0))
+ (set (reg:QI 121) (subreg:QI (reg:SI 119) 0))
+ (set (reg:SI 122) (plus:SI (subreg:SI (reg:QI 120) 0)
+ (subreg:SI (reg:QI 121) 0)))
+
+ are very common on machines that can only do word-sized operations.
+ For each use of a paradoxical subreg (subreg:WIDER (reg:NARROW N) 0),
+ if it has a single def and it is (subreg:NARROW (reg:WIDE M) 0),
+ we can replace the paradoxical subreg with simply (reg:WIDE M). The
+ above will simplify this to
+
+ (set (reg:QI 120) (subreg:QI (reg:SI 118) 0))
+ (set (reg:QI 121) (subreg:QI (reg:SI 119) 0))
+ (set (reg:SI 122) (plus:SI (reg:SI 118) (reg:SI 119)))
+
+ where the first two insns are now dead. */
+
+
+static struct loops loops;
+static struct df *df;
+static int num_changes;
+
+
+/* Do not try to replace constant addresses or addresses of local and
+ argument slots. These MEM expressions are made only once and inserted
+ in many instructions, as well as being used to control symbol table
+ output. It is not safe to clobber them.
+
+ There are some uncommon cases where the address is already in a register
+ for some reason, but we cannot take advantage of that because we have
+ no easy way to unshare the MEM. In addition, looking up all stack
+ addresses is costly. */
+
+static bool
+can_simplify_addr (rtx addr)
+{
+ rtx reg;
+
+ if (CONSTANT_ADDRESS_P (addr))
+ return false;
+
+ if (GET_CODE (addr) == PLUS)
+ reg = XEXP (addr, 0);
+ else
+ reg = addr;
+
+ return (!REG_P (reg)
+ || (REGNO (reg) != FRAME_POINTER_REGNUM
+ && REGNO (reg) != HARD_FRAME_POINTER_REGNUM
+ && REGNO (reg) != ARG_POINTER_REGNUM));
+}
+
+/* Returns a canonical version of X for the address, from the point of view
+ that all multiplications are represented as MULT instead of the multiply
+ by a power of 2 being represented as ASHIFT.
+
+ Every ASHIFT we find has been made by simplify_gen_binary and was not
+ there before, so it is not shared. So we can do this in place. */
+
+static void
+canonicalize_address (rtx x)
+{
+ for (;;)
+ switch (GET_CODE (x))
+ {
+ case ASHIFT:
+ if (GET_CODE (XEXP (x, 1)) == CONST_INT
+ && INTVAL (XEXP (x, 1)) < GET_MODE_BITSIZE (GET_MODE (x))
+ && INTVAL (XEXP (x, 1)) >= 0)
+ {
+ HOST_WIDE_INT shift = INTVAL (XEXP (x, 1));
+ PUT_CODE (x, MULT);
+ XEXP (x, 1) = gen_int_mode ((HOST_WIDE_INT) 1 << shift,
+ GET_MODE (x));
+ }
+
+ x = XEXP (x, 0);
+ break;
+
+ case PLUS:
+ if (GET_CODE (XEXP (x, 0)) == PLUS
+ || GET_CODE (XEXP (x, 0)) == ASHIFT
+ || GET_CODE (XEXP (x, 0)) == CONST)
+ canonicalize_address (XEXP (x, 0));
+
+ x = XEXP (x, 1);
+ break;
+
+ case CONST:
+ x = XEXP (x, 0);
+ break;
+
+ default:
+ return;
+ }
+}
+
+/* OLD is a memory address. Return whether it is good to use NEW instead,
+ for a memory access in the given MODE. */
+
+static bool
+should_replace_address (rtx old, rtx new, enum machine_mode mode)
+{
+ int gain;
+
+ if (rtx_equal_p (old, new) || !memory_address_p (mode, new))
+ return false;
+
+ /* Copy propagation is always ok. */
+ if (REG_P (old) && REG_P (new))
+ return true;
+
+ /* Prefer the new address if it is less expensive. */
+ gain = address_cost (old, mode) - address_cost (new, mode);
+
+ /* If the addresses have equivalent cost, prefer the new address
+ if it has the highest `rtx_cost'. That has the potential of
+ eliminating the most insns without additional costs, and it
+ is the same that cse.c used to do. */
+ if (gain == 0)
+ gain = rtx_cost (new, SET) - rtx_cost (old, SET);
+
+ return (gain > 0);
+}
+
+/* Replace all occurrences of OLD in *PX with NEW and try to simplify the
+ resulting expression. Replace *PX with a new RTL expression if an
+ occurrence of OLD was found.
+
+ If CAN_APPEAR is true, we always return true; if it is false, we
+ can return false if, for at least one occurrence of OLD, we failed to
+ collapse the result to a constant. For example, (mult:M (reg:M A)
+ (minus:M (reg:M B) (reg:M A))) may collapse to zero if replacing
+ (reg:M B) with (reg:M A).
+
+ CAN_APPEAR is disregarded inside MEMs: in that case, we always return
+ true if the simplification is a cheaper and valid memory address.
+
+ This is only a wrapper around simplify-rtx.c: do not add any pattern
+ matching code here. (The sole exception is the handling of LO_SUM, but
+ that is because there is no simplify_gen_* function for LO_SUM). */
+
+static bool
+propagate_rtx_1 (rtx *px, rtx old, rtx new, bool can_appear)
+{
+ rtx x = *px, tem = NULL_RTX, op0, op1, op2;
+ enum rtx_code code = GET_CODE (x);
+ enum machine_mode mode = GET_MODE (x);
+ enum machine_mode op_mode;
+ bool valid_ops = true;
+
+ /* If X is OLD, return NEW. Otherwise, if this is an expression,
+ try to build a new expression from recursive substitution. */
+
+ if (x == old)
+ {
+ *px = new;
+ return can_appear;
+ }
+
+ switch (GET_RTX_CLASS (code))
+ {
+ case RTX_UNARY:
+ op0 = XEXP (x, 0);
+ op_mode = GET_MODE (op0);
+ valid_ops &= propagate_rtx_1 (&op0, old, new, can_appear);
+ if (op0 == XEXP (x, 0))
+ return true;
+ tem = simplify_gen_unary (code, mode, op0, op_mode);
+ break;
+
+ case RTX_BIN_ARITH:
+ case RTX_COMM_ARITH:
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+ valid_ops &= propagate_rtx_1 (&op0, old, new, can_appear);
+ valid_ops &= propagate_rtx_1 (&op1, old, new, can_appear);
+ if (op0 == XEXP (x, 0) && op1 == XEXP (x, 1))
+ return true;
+ tem = simplify_gen_binary (code, mode, op0, op1);
+ break;
+
+ case RTX_COMPARE:
+ case RTX_COMM_COMPARE:
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+ op_mode = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
+ valid_ops &= propagate_rtx_1 (&op0, old, new, can_appear);
+ valid_ops &= propagate_rtx_1 (&op1, old, new, can_appear);
+ if (op0 == XEXP (x, 0) && op1 == XEXP (x, 1))
+ return true;
+ tem = simplify_gen_relational (code, mode, op_mode, op0, op1);
+ break;
+
+ case RTX_TERNARY:
+ case RTX_BITFIELD_OPS:
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+ op2 = XEXP (x, 2);
+ op_mode = GET_MODE (op0);
+ valid_ops &= propagate_rtx_1 (&op0, old, new, can_appear);
+ valid_ops &= propagate_rtx_1 (&op1, old, new, can_appear);
+ valid_ops &= propagate_rtx_1 (&op2, old, new, can_appear);
+ if (op0 == XEXP (x, 0) && op1 == XEXP (x, 1) && op2 == XEXP (x, 2))
+ return true;
+ if (op_mode == VOIDmode)
+ op_mode = GET_MODE (op0);
+ tem = simplify_gen_ternary (code, mode, op_mode, op0, op1, op2);
+ break;
+
+ case RTX_EXTRA:
+ /* The only case we try to handle is a SUBREG. */
+ if (code == SUBREG)
+ {
+ op0 = XEXP (x, 0);
+ valid_ops &= propagate_rtx_1 (&op0, old, new, can_appear);
+ if (op0 == XEXP (x, 0))
+ return true;
+ tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)),
+ SUBREG_BYTE (x));
+ }
+ break;
+
+ case RTX_OBJ:
+ if (code == MEM && x != new)
+ {
+ rtx new_op0;
+ op0 = XEXP (x, 0);
+
+ /* There are some addresses that we cannot work on. */
+ if (!can_simplify_addr (op0))
+ return true;
+
+ op0 = new_op0 = targetm.delegitimize_address (op0);
+ valid_ops &= propagate_rtx_1 (&new_op0, old, new, true);
+
+ /* Dismiss transformation that we do not want to carry on. */
+ if (!valid_ops
+ || new_op0 == op0
+ || GET_MODE (new_op0) != GET_MODE (op0))
+ return true;
+
+ canonicalize_address (new_op0);
+
+ /* Copy propagations are always ok. Otherwise check the costs. */
+ if (!(REG_P (old) && REG_P (new))
+ && !should_replace_address (op0, new_op0, GET_MODE (x)))
+ return true;
+
+ tem = replace_equiv_address_nv (x, new_op0);
+ }
+
+ else if (code == LO_SUM)
+ {
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+
+ /* The only simplification we do attempts to remove references to op0
+ or make it constant -- in both cases, op0's invalidity will not
+ make the result invalid. */
+ propagate_rtx_1 (&op0, old, new, true);
+ valid_ops &= propagate_rtx_1 (&op1, old, new, can_appear);
+ if (op0 == XEXP (x, 0) && op1 == XEXP (x, 1))
+ return true;
+
+ /* (lo_sum (high x) x) -> x */
+ if (GET_CODE (op0) == HIGH && rtx_equal_p (XEXP (op0, 0), op1))
+ tem = op1;
+ else
+ tem = gen_rtx_LO_SUM (mode, op0, op1);
+
+ /* OP1 is likely not a legitimate address, otherwise there would have
+ been no LO_SUM. We want it to disappear if it is invalid, return
+ false in that case. */
+ return memory_address_p (mode, tem);
+ }
+
+ else if (code == REG)
+ {
+ if (rtx_equal_p (x, old))
+ {
+ *px = new;
+ return can_appear;
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ /* No change, no trouble. */
+ if (tem == NULL_RTX)
+ return true;
+
+ *px = tem;
+
+ /* The replacement we made so far is valid, if all of the recursive
+ replacements were valid, or we could simplify everything to
+ a constant. */
+ return valid_ops || can_appear || CONSTANT_P (tem);
+}
+
+/* Replace all occurrences of OLD in X with NEW and try to simplify the
+ resulting expression (in mode MODE). Return a new expression if it is
+ a constant, otherwise X.
+
+ Simplifications where occurrences of NEW collapse to a constant are always
+ accepted. All simplifications are accepted if NEW is a pseudo too.
+ Otherwise, we accept simplifications that have a lower or equal cost. */
+
+static rtx
+propagate_rtx (rtx x, enum machine_mode mode, rtx old, rtx new)
+{
+ rtx tem;
+ bool collapsed;
+
+ if (REG_P (new) && REGNO (new) < FIRST_PSEUDO_REGISTER)
+ return NULL_RTX;
+
+ new = copy_rtx (new);
+
+ tem = x;
+ collapsed = propagate_rtx_1 (&tem, old, new, REG_P (new) || CONSTANT_P (new));
+ if (tem == x || !collapsed)
+ return NULL_RTX;
+
+ /* gen_lowpart_common will not be able to process VOIDmode entities other
+ than CONST_INTs. */
+ if (GET_MODE (tem) == VOIDmode && GET_CODE (tem) != CONST_INT)
+ return NULL_RTX;
+
+ if (GET_MODE (tem) == VOIDmode)
+ tem = rtl_hooks.gen_lowpart_no_emit (mode, tem);
+ else
+ gcc_assert (GET_MODE (tem) == mode);
+
+ return tem;
+}
+
+
+
+
+/* Return true if the register from reference REF is killed
+ between FROM to (but not including) TO. */
+
+static bool
+local_ref_killed_between_p (struct df_ref * ref, rtx from, rtx to)
+{
+ rtx insn;
+ struct df_ref *def;
+
+ for (insn = from; insn != to; insn = NEXT_INSN (insn))
+ {
+ if (!INSN_P (insn))
+ continue;
+
+ def = DF_INSN_DEFS (df, insn);
+ while (def)
+ {
+ if (DF_REF_REGNO (ref) == DF_REF_REGNO (def))
+ return true;
+ def = def->next_ref;
+ }
+ }
+ return false;
+}
+
+
+/* Check if the given DEF is available in INSN. This would require full
+ computation of available expressions; we check only restricted conditions:
+ - if DEF is the sole definition of its register, go ahead;
+ - in the same basic block, we check for no definitions killing the
+ definition of DEF_INSN;
+ - if USE's basic block has DEF's basic block as the sole predecessor,
+ we check if the definition is killed after DEF_INSN or before
+ TARGET_INSN insn, in their respective basic blocks. */
+static bool
+use_killed_between (struct df_ref *use, rtx def_insn, rtx target_insn)
+{
+ basic_block def_bb, target_bb;
+ int regno;
+ struct df_ref * def;
+
+ /* Check if the reg in USE has only one definition. We already
+ know that this definition reaches use, or we wouldn't be here. */
+ regno = DF_REF_REGNO (use);
+ def = DF_REG_DEF_GET (df, regno)->reg_chain;
+ if (def && (def->next_reg == NULL))
+ return false;
+
+ /* Check if we are in the same basic block. */
+ def_bb = BLOCK_FOR_INSN (def_insn);
+ target_bb = BLOCK_FOR_INSN (target_insn);
+ if (def_bb == target_bb)
+ {
+ /* In some obscure situations we can have a def reaching a use
+ that is _before_ the def. In other words the def does not
+ dominate the use even though the use and def are in the same
+ basic block. This can happen when a register may be used
+ uninitialized in a loop. In such cases, we must assume that
+ DEF is not available. */
+ if (DF_INSN_LUID (df, def_insn) >= DF_INSN_LUID (df, target_insn))
+ return true;
+
+ return local_ref_killed_between_p (use, def_insn, target_insn);
+ }
+
+ /* Finally, if DEF_BB is the sole predecessor of TARGET_BB. */
+ if (single_pred_p (target_bb)
+ && single_pred (target_bb) == def_bb)
+ {
+ struct df_ref *x;
+
+ /* See if USE is killed between DEF_INSN and the last insn in the
+ basic block containing DEF_INSN. */
+ x = df_bb_regno_last_def_find (df, def_bb, regno);
+ if (x && DF_INSN_LUID (df, x->insn) >= DF_INSN_LUID (df, def_insn))
+ return true;
+
+ /* See if USE is killed between TARGET_INSN and the first insn in the
+ basic block containing TARGET_INSN. */
+ x = df_bb_regno_first_def_find (df, target_bb, regno);
+ if (x && DF_INSN_LUID (df, x->insn) < DF_INSN_LUID (df, target_insn))
+ return true;
+
+ return false;
+ }
+
+ /* Otherwise assume the worst case. */
+ return true;
+}
+
+
+/* for_each_rtx traversal function that returns 1 if BODY points to
+ a non-constant mem. */
+
+static int
+varying_mem_p (rtx *body, void *data ATTRIBUTE_UNUSED)
+{
+ rtx x = *body;
+ return MEM_P (x) && !MEM_READONLY_P (x);
+}
+
+/* Check if all uses in DEF_INSN can be used in TARGET_INSN. This
+ would require full computation of available expressions;
+ we check only restricted conditions, see use_killed_between. */
+static bool
+all_uses_available_at (rtx def_insn, rtx target_insn)
+{
+ struct df_ref * use;
+ rtx def_set = single_set (def_insn);
+
+ gcc_assert (def_set);
+
+ /* If target_insn comes right after def_insn, which is very common
+ for addresses, we can use a quicker test. */
+ if (NEXT_INSN (def_insn) == target_insn
+ && REG_P (SET_DEST (def_set)))
+ {
+ rtx def_reg = SET_DEST (def_set);
+
+ /* If the insn uses the reg that it defines, the substitution is
+ invalid. */
+ for (use = DF_INSN_USES (df, def_insn); use; use = use->next_ref)
+ if (rtx_equal_p (use->reg, def_reg))
+ return false;
+ }
+ else
+ {
+ /* Look at all the uses of DEF_INSN, and see if they are not
+ killed between DEF_INSN and TARGET_INSN. */
+ for (use = DF_INSN_USES (df, def_insn); use; use = use->next_ref)
+ if (use_killed_between (use, def_insn, target_insn))
+ return false;
+ }
+
+ /* We don't do any analysis of memories or aliasing. Reject any
+ instruction that involves references to non-constant memory. */
+ return !for_each_rtx (&SET_SRC (def_set), varying_mem_p, NULL);
+}
+
+
+struct find_occurrence_data
+{
+ rtx find;
+ rtx *retval;
+};
+
+/* Callback for for_each_rtx, used in find_occurrence.
+ See if PX is the rtx we have to find. Return 1 to stop for_each_rtx
+ if successful, or 0 to continue traversing otherwise. */
+
+static int
+find_occurrence_callback (rtx *px, void *data)
+{
+ struct find_occurrence_data *fod = (struct find_occurrence_data *) data;
+ rtx x = *px;
+ rtx find = fod->find;
+
+ if (x == find)
+ {
+ fod->retval = px;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Return a pointer to one of the occurrences of register FIND in *PX. */
+
+static rtx *
+find_occurrence (rtx *px, rtx find)
+{
+ struct find_occurrence_data data;
+
+ gcc_assert (REG_P (find)
+ || (GET_CODE (find) == SUBREG
+ && REG_P (SUBREG_REG (find))));
+
+ data.find = find;
+ data.retval = NULL;
+ for_each_rtx (px, find_occurrence_callback, &data);
+ return data.retval;
+}
+
+
+/* Inside INSN, the expression rooted at *LOC has been changed, moving some
+ uses from ORIG_USES. Find those that are present, and create new items
+ in the data flow object of the pass. Mark any new uses as having the
+ given TYPE. */
+static void
+update_df (rtx insn, rtx *loc, struct df_ref *orig_uses, enum df_ref_type type,
+ int new_flags)
+{
+ struct df_ref *use;
+
+ /* Add a use for the registers that were propagated. */
+ for (use = orig_uses; use; use = use->next_ref)
+ {
+ struct df_ref *orig_use = use, *new_use;
+ rtx *new_loc = find_occurrence (loc, DF_REF_REG (orig_use));
+
+ if (!new_loc)
+ continue;
+
+ /* Add a new insn use. Use the original type, because it says if the
+ use was within a MEM. */
+ new_use = df_ref_create (df, DF_REF_REG (orig_use), new_loc,
+ insn, BLOCK_FOR_INSN (insn),
+ type, DF_REF_FLAGS (orig_use) | new_flags);
+
+ /* Set up the use-def chain. */
+ df_chain_copy (df->problems_by_index[DF_CHAIN],
+ new_use, DF_REF_CHAIN (orig_use));
+ }
+}
+
+
+/* Try substituting NEW into LOC, which originated from forward propagation
+ of USE's value from DEF_INSN. SET_REG_EQUAL says whether we are
+ substituting the whole SET_SRC, so we can set a REG_EQUAL note if the
+ new insn is not recognized. Return whether the substitution was
+ performed. */
+
+static bool
+try_fwprop_subst (struct df_ref *use, rtx *loc, rtx new, rtx def_insn, bool set_reg_equal)
+{
+ rtx insn = DF_REF_INSN (use);
+ enum df_ref_type type = DF_REF_TYPE (use);
+ int flags = DF_REF_FLAGS (use);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nIn insn %d, replacing\n ", INSN_UID (insn));
+ print_inline_rtx (dump_file, *loc, 2);
+ fprintf (dump_file, "\n with ");
+ print_inline_rtx (dump_file, new, 2);
+ fprintf (dump_file, "\n");
+ }
+
+ if (validate_change (insn, loc, new, false))
+ {
+ num_changes++;
+ if (dump_file)
+ fprintf (dump_file, "Changed insn %d\n", INSN_UID (insn));
+
+ /* Unlink the use that we changed. */
+ df_ref_remove (df, use);
+ if (!CONSTANT_P (new))
+ update_df (insn, loc, DF_INSN_USES (df, def_insn), type, flags);
+
+ return true;
+ }
+ else
+ {
+ if (dump_file)
+ fprintf (dump_file, "Changes to insn %d not recognized\n",
+ INSN_UID (insn));
+
+ /* Can also record a simplified value in a REG_EQUAL note, making a
+ new one if one does not already exist. */
+ if (set_reg_equal)
+ {
+ if (dump_file)
+ fprintf (dump_file, " Setting REG_EQUAL note\n");
+
+ REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_EQUAL, copy_rtx (new),
+ REG_NOTES (insn));
+
+ if (!CONSTANT_P (new))
+ update_df (insn, loc, DF_INSN_USES (df, def_insn),
+ type, DF_REF_IN_NOTE);
+ }
+
+ return false;
+ }
+}
+
+
+/* If USE is a paradoxical subreg, see if it can be replaced by a pseudo. */
+
+static bool
+forward_propagate_subreg (struct df_ref *use, rtx def_insn, rtx def_set)
+{
+ rtx use_reg = DF_REF_REG (use);
+ rtx use_insn, src;
+
+ /* Only consider paradoxical subregs... */
+ enum machine_mode use_mode = GET_MODE (use_reg);
+ if (GET_CODE (use_reg) != SUBREG
+ || !REG_P (SET_DEST (def_set))
+ || GET_MODE_SIZE (use_mode)
+ <= GET_MODE_SIZE (GET_MODE (SUBREG_REG (use_reg))))
+ return false;
+
+ /* If this is a paradoxical SUBREG, we have no idea what value the
+ extra bits would have. However, if the operand is equivalent to
+ a SUBREG whose operand is the same as our mode, and all the modes
+ are within a word, we can just use the inner operand because
+ these SUBREGs just say how to treat the register. */
+ use_insn = DF_REF_INSN (use);
+ src = SET_SRC (def_set);
+ if (GET_CODE (src) == SUBREG
+ && REG_P (SUBREG_REG (src))
+ && GET_MODE (SUBREG_REG (src)) == use_mode
+ && subreg_lowpart_p (src)
+ && all_uses_available_at (def_insn, use_insn))
+ return try_fwprop_subst (use, DF_REF_LOC (use), SUBREG_REG (src),
+ def_insn, false);
+ else
+ return false;
+}
+
+/* Try to replace USE with SRC (defined in DEF_INSN) and simplify the
+ result. */
+
+static bool
+forward_propagate_and_simplify (struct df_ref *use, rtx def_insn, rtx def_set)
+{
+ rtx use_insn = DF_REF_INSN (use);
+ rtx use_set = single_set (use_insn);
+ rtx src, reg, new, *loc;
+ bool set_reg_equal;
+ enum machine_mode mode;
+
+ if (!use_set)
+ return false;
+
+ /* Do not propagate into PC, CC0, etc. */
+ if (GET_MODE (SET_DEST (use_set)) == VOIDmode)
+ return false;
+
+ /* If def and use are subreg, check if they match. */
+ reg = DF_REF_REG (use);
+ if (GET_CODE (reg) == SUBREG
+ && GET_CODE (SET_DEST (def_set)) == SUBREG
+ && (SUBREG_BYTE (SET_DEST (def_set)) != SUBREG_BYTE (reg)
+ || GET_MODE (SET_DEST (def_set)) != GET_MODE (reg)))
+ return false;
+
+ /* Check if the def had a subreg, but the use has the whole reg. */
+ if (REG_P (reg) && GET_CODE (SET_DEST (def_set)) == SUBREG)
+ return false;
+
+ /* Check if the use has a subreg, but the def had the whole reg. Unlike the
+ previous case, the optimization is possible and often useful indeed. */
+ if (GET_CODE (reg) == SUBREG && REG_P (SET_DEST (def_set)))
+ reg = SUBREG_REG (reg);
+
+ /* Check if the substitution is valid (last, because it's the most
+ expensive check!). */
+ src = SET_SRC (def_set);
+ if (!CONSTANT_P (src) && !all_uses_available_at (def_insn, use_insn))
+ return false;
+
+ /* Check if the def is loading something from the constant pool; in this
+ case we would undo optimization such as compress_float_constant.
+ Still, we can set a REG_EQUAL note. */
+ if (MEM_P (src) && MEM_READONLY_P (src))
+ {
+ rtx x = avoid_constant_pool_reference (src);
+ if (x != src)
+ {
+ rtx note = find_reg_note (use_insn, REG_EQUAL, NULL_RTX);
+ rtx old = note ? XEXP (note, 0) : SET_SRC (use_set);
+ rtx new = simplify_replace_rtx (old, src, x);
+ if (old != new)
+ set_unique_reg_note (use_insn, REG_EQUAL, copy_rtx (new));
+ }
+ return false;
+ }
+
+ /* Else try simplifying. */
+
+ if (DF_REF_TYPE (use) == DF_REF_REG_MEM_STORE)
+ {
+ loc = &SET_DEST (use_set);
+ set_reg_equal = false;
+ }
+ else
+ {
+ rtx note = find_reg_note (use_insn, REG_EQUAL, NULL_RTX);
+ if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
+ loc = &XEXP (note, 0);
+ else
+ loc = &SET_SRC (use_set);
+
+ /* Do not replace an existing REG_EQUAL note if the insn is not
+ recognized. Either we're already replacing in the note, or
+ we'll separately try plugging the definition in the note and
+ simplifying. */
+ set_reg_equal = (note == NULL_RTX);
+ }
+
+ if (GET_MODE (*loc) == VOIDmode)
+ mode = GET_MODE (SET_DEST (use_set));
+ else
+ mode = GET_MODE (*loc);
+
+ new = propagate_rtx (*loc, mode, reg, src);
+
+ if (!new)
+ return false;
+
+ return try_fwprop_subst (use, loc, new, def_insn, set_reg_equal);
+}
+
+
+/* Given a use USE of an insn, if it has a single reaching
+   definition, try to forward propagate it into that insn.  */
+
+static void
+forward_propagate_into (struct df_ref *use)
+{
+  struct df_link *defs;
+  struct df_ref *def;
+  rtx def_insn, def_set, use_insn;
+  rtx parent;
+
+  /* A use that is also written (DF_REF_READ_WRITE) cannot simply be
+     replaced by the definition's source, so leave it alone.  */
+  if (DF_REF_FLAGS (use) & DF_REF_READ_WRITE)
+    return;
+
+  /* Only consider uses that have a single definition.  */
+  defs = DF_REF_CHAIN (use);
+  if (!defs || defs->next)
+    return;
+
+  /* Likewise bail out if the single reaching definition is itself a
+     read-write reference.  */
+  def = defs->ref;
+  if (DF_REF_FLAGS (def) & DF_REF_READ_WRITE)
+    return;
+
+  /* Do not propagate loop invariant definitions inside the loop if
+     we are going to unroll.  (loops.num is nonzero only when
+     fwprop_init computed loop information.)  */
+  if (loops.num > 0
+      && DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father)
+    return;
+
+  /* Check if the use is still present in the insn!  A previous
+     propagation in this run may already have rewritten the location.  */
+  use_insn = DF_REF_INSN (use);
+  if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE)
+    parent = find_reg_note (use_insn, REG_EQUAL, NULL_RTX);
+  else
+    parent = PATTERN (use_insn);
+
+  if (!loc_mentioned_in_p (DF_REF_LOC (use), parent))
+    return;
+
+  /* The definition must be a single set for its SET_SRC to be
+     meaningful as the value to propagate.  */
+  def_insn = DF_REF_INSN (def);
+  def_set = single_set (def_insn);
+  if (!def_set)
+    return;
+
+  /* Only try one kind of propagation.  If two are possible, we'll
+     do it on the following iterations.  */
+  if (!forward_propagate_and_simplify (use, def_insn, def_set))
+    forward_propagate_subreg (use, def_insn, def_set);
+}
+
+
+/* Shared setup for both fwprop passes: reset the propagation counter,
+   compute loop information when the gate conditions require it, and
+   solve the use-def chain dataflow problem.  */
+static void
+fwprop_init (void)
+{
+  num_changes = 0;
+
+  /* We do not always want to propagate into loops, so we have to find
+     loops and be careful about them.  But we have to call flow_loops_find
+     before df_analyze, because flow_loops_find may introduce new jump
+     insns (sadly) if we are not working in cfglayout mode.  */
+  if (flag_rerun_cse_after_loop && (flag_unroll_loops || flag_peel_loops))
+    {
+      calculate_dominance_info (CDI_DOMINATORS);
+      flow_loops_find (&loops);
+    }
+
+  /* Now set up the dataflow problem (we only want use-def chains) and
+     put the dataflow solver to work.  */
+  df = df_init (DF_SUBREGS | DF_EQUIV_NOTES);
+  df_chain_add_problem (df, DF_UD_CHAIN);
+  df_analyze (df);
+  df_dump (df, dump_file);
+}
+
+/* Shared teardown for both fwprop passes: release the dataflow and loop
+   structures allocated by fwprop_init, clean up after the propagations,
+   and report the number of successful ones.  */
+static void
+fwprop_done (void)
+{
+  df_finish (df);
+
+  /* Mirror the condition under which fwprop_init computed loop info.  */
+  if (flag_rerun_cse_after_loop && (flag_unroll_loops || flag_peel_loops))
+    {
+      flow_loops_free (&loops);
+      free_dominance_info (CDI_DOMINATORS);
+      /* Reset the count so later reads of loops.num see "no loop
+	 information available".  */
+      loops.num = 0;
+    }
+
+  /* Delete insns made trivially dead by successful propagations.  */
+  cleanup_cfg (0);
+  delete_trivially_dead_insns (get_insns (), max_reg_num ());
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "\nNumber of successful forward propagations: %d\n\n",
+	     num_changes);
+}
+
+
+
+/* Main entry point. */
+
+/* Gate for the first fwprop pass: run only when optimizing and
+   -fforward-propagate is enabled.  */
+static bool
+gate_fwprop (void)
+{
+  return optimize > 0 && flag_forward_propagate;
+}
+
+/* Execute function of the first forward-propagation pass.  */
+static unsigned int
+fwprop (void)
+{
+  unsigned i;
+
+  fwprop_init ();
+
+  /* Go through all the uses.  update_df will create new ones at the
+     end, and we'll go through them as well.
+
+     Do not forward propagate addresses into loops until after unrolling.
+     CSE did so because it was able to fix its own mess, but we are not.  */
+
+  df_reorganize_refs (&df->use_info);
+  for (i = 0; i < DF_USES_SIZE (df); i++)
+    {
+      struct df_ref *use = DF_USES_GET (df, i);
+      if (use)
+	/* Handle the use now only if loop information is absent, it is
+	   a plain register use, or it lies outside any loop; the
+	   remaining (in-loop address) uses are fwprop_addr's job.  */
+	if (loops.num == 0
+	    || DF_REF_TYPE (use) == DF_REF_REG_USE
+	    || DF_REF_BB (use)->loop_father == NULL)
+	  forward_propagate_into (use);
+    }
+
+  fwprop_done ();
+
+  return 0;
+}
+
+/* Pass descriptor for the first forward-propagation pass, scheduled
+   right after CSE (see the passes.c hunk in this patch).  */
+struct tree_opt_pass pass_rtl_fwprop =
+{
+  "fwprop1",                            /* name */
+  gate_fwprop,                          /* gate */
+  fwprop,				/* execute */
+  NULL,                                 /* sub */
+  NULL,                                 /* next */
+  0,                                    /* static_pass_number */
+  TV_FWPROP,                            /* tv_id */
+  0,                                    /* properties_required */
+  0,                                    /* properties_provided */
+  0,                                    /* properties_destroyed */
+  0,                                    /* todo_flags_start */
+  TODO_dump_func,                       /* todo_flags_finish */
+  0                                     /* letter */
+};
+
+/* Gate for the second (address) fwprop pass: in addition to the
+   gate_fwprop conditions, require the flags under which fwprop_init
+   computes loop information, since this pass only looks at uses
+   inside loops.  */
+static bool
+gate_fwprop_addr (void)
+{
+  return optimize > 0 && flag_forward_propagate && flag_rerun_cse_after_loop
+	 && (flag_unroll_loops || flag_peel_loops);
+}
+
+/* Execute function of the second forward-propagation pass, run after
+   loop optimization.  */
+static unsigned int
+fwprop_addr (void)
+{
+  unsigned i;
+  fwprop_init ();
+
+  /* Go through all the uses.  update_df will create new ones at the
+     end, and we'll go through them as well.  */
+  df_reorganize_refs (&df->use_info);
+  for (i = 0; i < DF_USES_SIZE (df); i++)
+    {
+      struct df_ref *use = DF_USES_GET (df, i);
+      if (use)
+	/* Complement of the filter in fwprop: only non-register
+	   (address) uses that lie inside some loop.  */
+	if (DF_REF_TYPE (use) != DF_REF_REG_USE
+	    && DF_REF_BB (use)->loop_father != NULL)
+	  forward_propagate_into (use);
+    }
+
+  fwprop_done ();
+
+  return 0;
+}
+
+/* Pass descriptor for the second forward-propagation pass, scheduled
+   after the second CSE pass (see the passes.c hunk in this patch).  */
+struct tree_opt_pass pass_rtl_fwprop_addr =
+{
+  "fwprop2",                            /* name */
+  gate_fwprop_addr,                     /* gate */
+  fwprop_addr,				/* execute */
+  NULL,                                 /* sub */
+  NULL,                                 /* next */
+  0,                                    /* static_pass_number */
+  TV_FWPROP,                            /* tv_id */
+  0,                                    /* properties_required */
+  0,                                    /* properties_provided */
+  0,                                    /* properties_destroyed */
+  0,                                    /* todo_flags_start */
+  TODO_dump_func,                       /* todo_flags_finish */
+  0                                     /* letter */
+};
diff --git a/gcc/gcse.c b/gcc/gcse.c
index b28273d48ce..f1214952b24 100644
--- a/gcc/gcse.c
+++ b/gcc/gcse.c
@@ -3396,7 +3396,8 @@ one_cprop_pass (int pass, bool cprop_jumps, bool bypass_jumps)
global_const_prop_count = local_const_prop_count = 0;
global_copy_prop_count = local_copy_prop_count = 0;
- local_cprop_pass (cprop_jumps);
+ if (cprop_jumps)
+ local_cprop_pass (cprop_jumps);
/* Determine implicit sets. */
implicit_sets = XCNEWVEC (rtx, last_basic_block);
diff --git a/gcc/opts.c b/gcc/opts.c
index bbce07498b9..da16f7bf341 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -474,6 +474,7 @@ decode_options (unsigned int argc, const char **argv)
flag_thread_jumps = 1;
flag_crossjumping = 1;
flag_optimize_sibling_calls = 1;
+ flag_forward_propagate = 1;
flag_cse_follow_jumps = 1;
flag_gcse = 1;
flag_expensive_optimizations = 1;
diff --git a/gcc/passes.c b/gcc/passes.c
index 46e4756e84d..9d585d3bf6a 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -635,6 +635,7 @@ init_optimization_passes (void)
NEXT_PASS (pass_instantiate_virtual_regs);
NEXT_PASS (pass_jump2);
NEXT_PASS (pass_cse);
+ NEXT_PASS (pass_rtl_fwprop);
NEXT_PASS (pass_gcse);
NEXT_PASS (pass_jump_bypass);
NEXT_PASS (pass_rtl_ifcvt);
@@ -645,6 +646,7 @@ init_optimization_passes (void)
NEXT_PASS (pass_loop2);
NEXT_PASS (pass_web);
NEXT_PASS (pass_cse2);
+ NEXT_PASS (pass_rtl_fwprop_addr);
NEXT_PASS (pass_life);
NEXT_PASS (pass_combine);
NEXT_PASS (pass_if_after_combine);
diff --git a/gcc/recog.c b/gcc/recog.c
index a3948a729e0..18ad72f2d4e 100644
--- a/gcc/recog.c
+++ b/gcc/recog.c
@@ -238,6 +238,28 @@ validate_change (rtx object, rtx *loc, rtx new, int in_group)
return apply_change_group ();
}
+/* Keep X canonicalized if some changes have made it non-canonical; only
+   modifies the operands of X, not (for example) its code.  Simplifications
+   are not the job of this routine.
+
+   Return true if anything was changed.  */
+bool
+canonicalize_change_group (rtx insn, rtx x)
+{
+  if (COMMUTATIVE_P (x)
+      && swap_commutative_operands_p (XEXP (x, 0), XEXP (x, 1)))
+    {
+      /* Oops, the caller has made X no longer canonical.
+	 Let's redo the changes in the correct order.  */
+      rtx tem = XEXP (x, 0);
+      /* Both swaps are queued in the pending change group (last
+	 argument 1), to be validated together by the caller's
+	 apply_change_group.  */
+      validate_change (insn, &XEXP (x, 0), XEXP (x, 1), 1);
+      validate_change (insn, &XEXP (x, 1), tem, 1);
+      return true;
+    }
+  else
+    return false;
+}
+
/* This subroutine of apply_change_group verifies whether the changes to INSN
were valid; i.e. whether INSN can still be recognized. */
diff --git a/gcc/recog.h b/gcc/recog.h
index 8281a9e55e7..b921b8074dc 100644
--- a/gcc/recog.h
+++ b/gcc/recog.h
@@ -74,6 +74,7 @@ extern void init_recog_no_volatile (void);
extern int check_asm_operands (rtx);
extern int asm_operand_ok (rtx, const char *);
extern int validate_change (rtx, rtx *, rtx, int);
+extern bool canonicalize_change_group (rtx insn, rtx x);
extern int insn_invalid_p (rtx);
extern int verify_changes (int);
extern void confirm_change_group (void);
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c
index b0a816106d4..8a7c914022c 100644
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -2837,10 +2837,15 @@ auto_inc_p (rtx x)
int
loc_mentioned_in_p (rtx *loc, rtx in)
{
- enum rtx_code code = GET_CODE (in);
- const char *fmt = GET_RTX_FORMAT (code);
+ enum rtx_code code;
+ const char *fmt;
int i, j;
+ if (!in)
+ return 0;
+
+ code = GET_CODE (in);
+ fmt = GET_RTX_FORMAT (code);
for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
{
if (loc == &in->u.fld[i].rt_rtx)
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 28b0b766eba..bdfe9ae201c 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -128,6 +128,7 @@ DEFTIMEVAR (TV_TEMPLATE_INSTANTIATION, "template instantiation")
DEFTIMEVAR (TV_EXPAND , "expand")
DEFTIMEVAR (TV_VARCONST , "varconst")
DEFTIMEVAR (TV_JUMP , "jump")
+DEFTIMEVAR (TV_FWPROP , "forward prop")
DEFTIMEVAR (TV_CSE , "CSE")
DEFTIMEVAR (TV_LOOP , "loop analysis")
DEFTIMEVAR (TV_GCSE , "global CSE")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 8a553021706..3ab0ef730b7 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -330,6 +330,8 @@ extern struct tree_opt_pass pass_rtl_eh;
extern struct tree_opt_pass pass_initial_value_sets;
extern struct tree_opt_pass pass_unshare_all_rtl;
extern struct tree_opt_pass pass_instantiate_virtual_regs;
+extern struct tree_opt_pass pass_rtl_fwprop;
+extern struct tree_opt_pass pass_rtl_fwprop_addr;
extern struct tree_opt_pass pass_jump2;
extern struct tree_opt_pass pass_cse;
extern struct tree_opt_pass pass_gcse;