author     Diego Novillo <dnovillo@redhat.com>   2006-01-19 21:40:06 +0000
committer  Diego Novillo <dnovillo@redhat.com>   2006-01-19 21:40:06 +0000
commit     e42b66ccd847810cc6212d564496bd23d5d52104 (patch)
tree       39e2a10a73fe63839fc3cb345a5e79464caf20fd
parent     90545ddfafabdf07974234507025cc3d5c65ad8f (diff)
Mainline merge 2006-01-19 (@109981)
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/gomp-20050608-branch@109983 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--  contrib/regression/ChangeLog  6
-rwxr-xr-x  contrib/regression/btest-gcc.sh  8
-rw-r--r--  fixincludes/ChangeLog  8
-rw-r--r--  fixincludes/fixincl.x  73
-rw-r--r--  fixincludes/inclhack.def  20
-rw-r--r--  gcc/ChangeLog  421
-rw-r--r--  gcc/DATESTAMP  2
-rw-r--r--  gcc/Makefile.in  7
-rw-r--r--  gcc/c-decl.c  30
-rw-r--r--  gcc/cgraph.c  26
-rw-r--r--  gcc/cgraph.h  4
-rw-r--r--  gcc/cgraphunit.c  44
-rw-r--r--  gcc/combine.c  154
-rw-r--r--  gcc/config.gcc  36
-rw-r--r--  gcc/config/i386/athlon.md  195
-rw-r--r--  gcc/config/i386/i386-protos.h  4
-rw-r--r--  gcc/config/i386/i386.c  343
-rw-r--r--  gcc/config/i386/i386.h  43
-rw-r--r--  gcc/config/i386/i386.md  194
-rw-r--r--  gcc/config/i386/ppro.md  172
-rw-r--r--  gcc/config/i386/predicates.md  18
-rw-r--r--  gcc/config/i386/sse.md  44
-rw-r--r--  gcc/config/rs6000/altivec.md  73
-rw-r--r--  gcc/config/rs6000/x-darwin  2
-rw-r--r--  gcc/config/s390/s390.md  650
-rw-r--r--  gcc/cp/ChangeLog  32
-rw-r--r--  gcc/cp/class.c  54
-rw-r--r--  gcc/cp/cp-tree.h  2
-rw-r--r--  gcc/cp/decl.c  4
-rw-r--r--  gcc/cp/decl2.c  2
-rw-r--r--  gcc/cp/name-lookup.c  10
-rw-r--r--  gcc/cp/parser.c  3
-rw-r--r--  gcc/cp/pt.c  8
-rw-r--r--  gcc/diagnostic.h  1
-rw-r--r--  gcc/doc/invoke.texi  22
-rw-r--r--  gcc/doc/md.texi  21
-rw-r--r--  gcc/expr.c  25
-rw-r--r--  gcc/fortran/ChangeLog  11
-rw-r--r--  gcc/fortran/gfortranspec.c  4
-rw-r--r--  gcc/fortran/trans-array.c  3
-rw-r--r--  gcc/fortran/trans-array.h  2
-rw-r--r--  gcc/fortran/trans-decl.c  3
-rw-r--r--  gcc/fortran/trans-stmt.c  3
-rw-r--r--  gcc/fortran/trans-stmt.h  2
-rw-r--r--  gcc/fortran/trans-types.c  3
-rw-r--r--  gcc/fortran/trans.c  3
-rw-r--r--  gcc/genopinit.c  6
-rw-r--r--  gcc/gimple-low.c  60
-rw-r--r--  gcc/gimplify.c  113
-rw-r--r--  gcc/ipa-reference.c  14
-rw-r--r--  gcc/omp-low.c  2512
-rw-r--r--  gcc/optabs.c  159
-rw-r--r--  gcc/optabs.h  19
-rw-r--r--  gcc/passes.c  1
-rw-r--r--  gcc/testsuite/ChangeLog  78
-rw-r--r--  gcc/testsuite/g++.dg/eh/table.C  33
-rw-r--r--  gcc/testsuite/g++.dg/other/default2.C  9
-rw-r--r--  gcc/testsuite/g++.dg/other/default3.C  109
-rw-r--r--  gcc/testsuite/g++.dg/template/init6.C  31
-rw-r--r--  gcc/testsuite/g++.dg/template/spec28.C  6
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/critical-1.c  12
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/critical-3.c  4
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/empty.c  6
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-10.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-13.c  6
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-18.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-4.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-5.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-6.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-7.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-8.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/for-9.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/master-3.c  6
-rw-r--r--  gcc/testsuite/gcc.dg/gomp/ordered-1.c  8
-rw-r--r--  gcc/testsuite/gcc.dg/pr25805.c  20
-rw-r--r--  gcc/testsuite/gcc.dg/tree-ssa/pr24287.c  25
-rw-r--r--  gcc/testsuite/gcc.dg/tree-ssa/vrp25.c  52
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c  70
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c  111
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c  77
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c  101
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c  60
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c  67
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect.exp  10
-rw-r--r--  gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c  108
-rwxr-xr-x  gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c  59
-rw-r--r--  gcc/testsuite/lib/target-supports.exp  106
-rw-r--r--  gcc/testsuite/objc.dg/gnu-encoding/struct-layout-encoding-1_generate.c  6
-rw-r--r--  gcc/tree-cfg.c  399
-rw-r--r--  gcc/tree-dfa.c  38
-rw-r--r--  gcc/tree-flow-inline.h  20
-rw-r--r--  gcc/tree-flow.h  33
-rw-r--r--  gcc/tree-gimple.c  1
-rw-r--r--  gcc/tree-gimple.h  18
-rw-r--r--  gcc/tree-inline.c  46
-rw-r--r--  gcc/tree-iterator.c  4
-rw-r--r--  gcc/tree-outof-ssa.c  2
-rw-r--r--  gcc/tree-pass.h  1
-rw-r--r--  gcc/tree-pretty-print.c  44
-rw-r--r--  gcc/tree-ssa-alias.c  376
-rw-r--r--  gcc/tree-ssa-operands.c  242
-rw-r--r--  gcc/tree-ssa-operands.h  3
-rw-r--r--  gcc/tree-ssa-structalias.c  13
-rw-r--r--  gcc/tree-ssa-structalias.h  2
-rw-r--r--  gcc/tree-vect-analyze.c  75
-rw-r--r--  gcc/tree-vect-generic.c  7
-rw-r--r--  gcc/tree-vect-patterns.c  637
-rw-r--r--  gcc/tree-vect-transform.c  422
-rw-r--r--  gcc/tree-vectorizer.c  4
-rw-r--r--  gcc/tree-vectorizer.h  35
-rw-r--r--  gcc/tree-vrp.c  90
-rw-r--r--  gcc/tree.def  53
-rw-r--r--  gcc/tree.h  16
-rw-r--r--  gcc/version.c  2
-rw-r--r--  libjava/ChangeLog  5
-rw-r--r--  libjava/jni.cc  12
116 files changed, 7350 insertions, 2173 deletions
diff --git a/contrib/regression/ChangeLog b/contrib/regression/ChangeLog
index 62aa9ba1766..32a81dd7042 100644
--- a/contrib/regression/ChangeLog
+++ b/contrib/regression/ChangeLog
@@ -1,3 +1,9 @@
+2006-01-18 Andrew Pinski <pinskia@physics.uc.edu>
+
+ * btest-gcc.sh: gcc.sum has moved to gcc/testsuite/gcc/gcc.sum.
+ g++.sum has moved to gcc/testsuite/g++/g++.sum.
+ objc.sum has moved to gcc/testsuite/objc/objc.sum.
+
2005-12-20 Geoffrey Keating <geoffk@apple.com>
* btest-gcc.sh: Support -j option.
diff --git a/contrib/regression/btest-gcc.sh b/contrib/regression/btest-gcc.sh
index bff069ac686..db8664cfd9d 100755
--- a/contrib/regression/btest-gcc.sh
+++ b/contrib/regression/btest-gcc.sh
@@ -1,7 +1,7 @@
#!/bin/sh
# Test GCC.
-# Copyright (C) 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
+# Copyright (C) 1999, 2000, 2001, 2002, 2005, 2006 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -114,9 +114,9 @@ H_REAL_TARGET=`$SOURCE/config.sub $H_TARGET || exit 1`
# TESTLOGS is the list of dejagnu .sum files that the tester should
# look at.
-TESTLOGS="gcc/testsuite/gcc.sum
-gcc/testsuite/g++.sum
-gcc/testsuite/objc.sum"
+TESTLOGS="gcc/testsuite/gcc/gcc.sum
+gcc/testsuite/g++/g++.sum
+gcc/testsuite/objc/objc.sum"
# Build.
echo build > $RESULT
diff --git a/fixincludes/ChangeLog b/fixincludes/ChangeLog
index 0121729026d..ede9e0b8be5 100644
--- a/fixincludes/ChangeLog
+++ b/fixincludes/ChangeLog
@@ -1,3 +1,11 @@
+2006-01-19 Andrew Pinski <pinskia@physics.uc.edu>
+
+ PR target/15642
+ * inclhack.def (AAB_darwin7_9_long_double_funcs [replace]): Define
+	__APPLE_CC__ as 1345.
+ (broken_nan): New.
+ * fixincl.x: Regenerate.
+
2005-11-24 Bruce Korb <bkorb@gnu.org>
* fixincl.c(write_replacement) "here strings" in AutoGen often/generally
diff --git a/fixincludes/fixincl.x b/fixincludes/fixincl.x
index ead2c54cf6d..01071730a0a 100644
--- a/fixincludes/fixincl.x
+++ b/fixincludes/fixincl.x
@@ -2,11 +2,11 @@
*
* DO NOT EDIT THIS FILE (fixincl.x)
*
- * It has been AutoGen-ed Thursday November 24, 2005 at 09:46:46 PM PST
+ * It has been AutoGen-ed Thursday January 19, 2006 at 12:17:28 PM EST
* From the definitions inclhack.def
* and the template file fixincl
*/
-/* DO NOT CVS-MERGE THIS FILE, EITHER Thu Nov 24 21:46:46 PST 2005
+/* DO NOT CVS-MERGE THIS FILE, EITHER Thu Jan 19 12:17:28 EST 2006
*
* You must regenerate it. Use the ./genfixes script.
*
@@ -15,7 +15,7 @@
* certain ANSI-incompatible system header files which are fixed to work
* correctly with ANSI C and placed in a directory that GNU C will search.
*
- * This file contains 193 fixup descriptions.
+ * This file contains 194 fixup descriptions.
*
* See README for more information.
*
@@ -26,8 +26,7 @@
*
* You may redistribute it and/or modify it under the terms of the
* GNU General Public License, as published by the Free Software
- * Foundation; either version 2 of the License, or (at your option)
- * any later version.
+ * Foundation; either version 2, or (at your option) any later version.
*
* inclhack is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -35,10 +34,10 @@
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with inclhack. If not, write to:
- * The Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02110-1301, USA.
+ * along with inclhack. See the file "COPYING". If not,
+ * write to: The Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
*/
/* * * * * * * * * * * * * * * * * * * * * * * * * *
@@ -77,7 +76,11 @@ static const char* apzAab_Darwin7_9_Long_Double_FuncsPatch[] = {
"/* This file prototypes the long double functions available on Mac OS\n\
10.3.9. */\n\
#ifndef __MATH__\n\
+# undef __APPLE_CC__\n\
+# define __APPLE_CC__ 1345\n\
# include_next <architecture/ppc/math.h>\n\
+# undef __APPLE_CC__\n\
+# define __APPLE_CC__ 1\n\
# ifndef __LIBMLDBL_COMPAT\n\
# ifdef __LONG_DOUBLE_128__\n\
# define __LIBMLDBL_COMPAT(sym) __asm(\"_\" #sym \"$LDBL128\")\n\
@@ -1677,6 +1680,48 @@ static const char* apzBroken_CabsPatch[] = {
/* * * * * * * * * * * * * * * * * * * * * * * * * *
*
+ * Description of Broken_Nan fix
+ */
+tSCC zBroken_NanName[] =
+ "broken_nan";
+
+/*
+ * File name selection pattern
+ */
+tSCC zBroken_NanList[] =
+ "|architecture/ppc/math.h|architecture/i386/math.h|";
+/*
+ * Machine/OS name selection pattern
+ */
+#define apzBroken_NanMachs (const char**)NULL
+
+/*
+ * content selection pattern - do fix if pattern found
+ */
+tSCC zBroken_NanSelect0[] =
+ "#if defined(__APPLE_CC__) && (__APPLE_CC__ >= 1345)";
+
+/*
+ * content bypass pattern - skip fix if pattern found
+ */
+tSCC zBroken_NanBypass0[] =
+ "powl";
+
+#define BROKEN_NAN_TEST_CT 2
+static tTestDesc aBroken_NanTests[] = {
+ { TT_NEGREP, zBroken_NanBypass0, (regex_t*)NULL },
+ { TT_EGREP, zBroken_NanSelect0, (regex_t*)NULL }, };
+
+/*
+ * Fix Command Arguments for Broken_Nan
+ */
+static const char* apzBroken_NanPatch[] = {
+ "format",
+ "#if 1",
+ (char*)NULL };
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * *
+ *
* Description of Bsd_Stdio_Attrs_Conflict fix
*/
tSCC zBsd_Stdio_Attrs_ConflictName[] =
@@ -7849,9 +7894,9 @@ static const char* apzX11_SprintfPatch[] = {
*
* List of all fixes
*/
-#define REGEX_COUNT 235
+#define REGEX_COUNT 237
#define MACH_LIST_SIZE_LIMIT 261
-#define FIX_COUNT 193
+#define FIX_COUNT 194
/*
* Enumerate the fixes
@@ -7894,6 +7939,7 @@ typedef enum {
BROKEN_ASSERT_STDIO_FIXIDX,
BROKEN_ASSERT_STDLIB_FIXIDX,
BROKEN_CABS_FIXIDX,
+ BROKEN_NAN_FIXIDX,
BSD_STDIO_ATTRS_CONFLICT_FIXIDX,
CTRL_QUOTES_DEF_FIXIDX,
CTRL_QUOTES_USE_FIXIDX,
@@ -8238,6 +8284,11 @@ tFixDesc fixDescList[ FIX_COUNT ] = {
BROKEN_CABS_TEST_CT, FD_MACH_ONLY | FD_SUBROUTINE,
aBroken_CabsTests, apzBroken_CabsPatch, 0 },
+ { zBroken_NanName, zBroken_NanList,
+ apzBroken_NanMachs,
+ BROKEN_NAN_TEST_CT, FD_MACH_ONLY | FD_SUBROUTINE,
+ aBroken_NanTests, apzBroken_NanPatch, 0 },
+
{ zBsd_Stdio_Attrs_ConflictName, zBsd_Stdio_Attrs_ConflictList,
apzBsd_Stdio_Attrs_ConflictMachs,
BSD_STDIO_ATTRS_CONFLICT_TEST_CT, FD_MACH_ONLY | FD_SUBROUTINE,
diff --git a/fixincludes/inclhack.def b/fixincludes/inclhack.def
index 3cfb3e099e3..e1721421bff 100644
--- a/fixincludes/inclhack.def
+++ b/fixincludes/inclhack.def
@@ -20,6 +20,7 @@ autogen definitions fixincl;
FIXINC_DEBUG = yes;
#endif
+
/*
* On Mac OS 10.3.9, the 'long double' functions are available in
* libSystem, but are not prototyped in math.h.
@@ -33,7 +34,11 @@ fix = {
/* This file prototypes the long double functions available on Mac OS
10.3.9. */
#ifndef __MATH__
+# undef __APPLE_CC__
+# define __APPLE_CC__ 1345
# include_next <architecture/ppc/math.h>
+# undef __APPLE_CC__
+# define __APPLE_CC__ 1
# ifndef __LIBMLDBL_COMPAT
# ifdef __LONG_DOUBLE_128__
# define __LIBMLDBL_COMPAT(sym) __asm("_" #sym "$LDBL128")
@@ -1042,6 +1047,21 @@ fix = {
"extern double cabs(struct __cabs_s);";
};
+/*
+ * Fixup Darwin's broken check for __builtin_nanf.
+ */
+
+fix = {
+ hackname = broken_nan;
+ files = "architecture/ppc/math.h";
+ files = "architecture/i386/math.h";
+ select = "#if defined(__APPLE_CC__) && (__APPLE_CC__ >= 1345)";
+ bypass = "powl";
+ c_fix = format;
+ c_fix_arg = "#if 1";
+ test_text = "#if defined(__APPLE_CC__) && (__APPLE_CC__ >= 1345)";
+};
+
/*
* Various systems derived from BSD4.4 contain a macro definition
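Note: a rough sketch of the effect of the new broken_nan fix on Darwin's
<architecture/*/math.h>, assuming the header guards a __builtin_nanf-based
NAN definition with the __APPLE_CC__ check matched by the select pattern
above (the header excerpt below is an assumption, not copied from Darwin):

    /* Before fixincludes (hypothetical excerpt): */
    #if defined(__APPLE_CC__) && (__APPLE_CC__ >= 1345)
    #define NAN __builtin_nanf("0x7fc00000")   /* constant expression */
    #else
    #define NAN __nan()                        /* not usable in initializers */
    #endif

    /* After the fix rewrites the matched line to "#if 1", the
       __builtin_nanf definition is always taken, independently of the
       __APPLE_CC__ value that FSF GCC happens to define.  */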
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 349f8dcffa2..927251a579b 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,424 @@
+2006-01-19 Andrew Pinski <pinskia@physics.uc.edu>
+
+ PR target/22099
+ * config/i386/i386.md (builtin_setjmp_receiver): Don't emit the label.
+ * config/i386/i386.c (output_set_got): Output the label if we have one
+ for the TARGET_DEEP_BRANCH_PREDICTION case.
+
+2006-01-19 Jan Hubicka <jh@suse.cz>
+ H.J. Lu <hongjiu.lu@intel.com>
+ Evandro Menezes <evandro.menezes@amd.com>
+
+	* invoke.texi (generic): Document.
+	(i686): Update.
+	* config.gcc: Make x86_64-* and i686-* default to generic tuning.
+ * i386.h (TARGET_GENERIC32, TARGET_GENERIC64, TARGET_GENERIC,
+ TARGET_USE_INCDEC, TARGET_PAD_RETURNS): New macros.
+ (x86_use_incdec, x86_pad_returns): New variables
+ (TARGET_CPU_DEFAULT_generic): New constant
+ (TARGET_CPU_DEFAULT_NAMES): Add generic.
+ (enum processor_type): Add generic32 and generic64.
+ * i386.md (cpu attribute): Add generic32/generic64
+ (movhi splitter): Behave sanely when both partial_reg_dependency and
+ partial_reg_stall are set.
+ (K8 splitters): Enable for generic as well.
+ * predicates.md (incdec_operand): Use TARGET_INCDEC
+ (aligned_operand): Avoid memory mismatch stalls.
+ * athlon.md: Enable for generic64, new patterns for 128bit moves.
+ * ppro.md: Enable for generic32
+ * i386.c (generic64_cost, generic32_cost): New.
+ (m_GENERIC32, m_GENERIC64, m_GENERIC): New macros.
+ (x86_use_leave): Enable for generic64. (x86_use_sahf,
+ x86_ext_80387_constants): Enable for generic32. (x86_push_memory,
+ x86_movx, x86_unroll_strlen, x86_deep_branch, x86_use_simode_fiop,
+ x86_use_cltd, x86_promote_QImode, x86_sub_esp_4, x86_sub_esp_8,
+ x86_add_esp_4, x86_add_esp_8, x86_integer_DFmode_moves,
+ x86_partial_reg_dependency, x86_memory_mismatch_stall,
+ x86_accumulate_outgoing_args, x86_prologue_using_move,
+ x86_epilogue_using_move, x86_arch_always_fancy_math_387,
+ x86_sse_partial_reg_dependency, x86_four_jump_limit, x86_schedule):
+ Enable for generic.
+ (x86_use_incdec, x86_pad_returns): New.
+ (override_options): Add generic32 and generic64, translate "generic"
+ to generic32/generic64 and "i686" to "generic32", refuse
+ "generic32"/"generic64" as arch target.
+ (ix86_issue_rate, ix86_adjust_cost): Handle generic as athlon.
+ (ix86_reorg): Honor PAD_RETURNS.
+
+2006-01-19 Diego Novillo <dnovillo@redhat.com>
+
+ * tree-pretty-print.c (dump_generic_node): Handle
+ OMP_PARALLEL_FN, OMP_PARALLEL_DATA_ARG and OMP_RETURN_EXPR.
+ * cgraph.c (cgraph_expand_queue): Rename from
+ cgraph_analyze_queue. Update all users.
+ * cgraphunit.c (cgraph_assemble_pending_functions): Process
+ cgraph_expand_queue.
+ (cgraph_expand_all_functions): Likewise.
+ (cgraph_finalize_pending_functions): Remove. Update callers.
+
+ * tree.h (OMP_DIRECTIVE_P): Define.
+ (OMP_PARALLEL_FN): Define.
+ (OMP_PARALLEL_DATA_ARG): Define.
+ (OMP_SECTIONS_SECTIONS): Define.
+ * tree-pass.h (pass_expand_omp): Declare.
+ * omp-low.c (struct omp_region): Declare.
+ (struct omp_context): Remove fields 'parallel_type',
+ 'parallel_start_ix' and 'parallel_start_additional_args'.
+ Update all users.
+ (struct omp_for_data): Rename from struct expand_omp_for_data.
+ (omp_regions): New static variable.
+ (root_omp_region): New static variable.
+ (find_omp_clause): Make static.
+ (is_in_combined_parallel_ctx): Remove.
+ (is_combined_parallel): New.
+ (extract_omp_for_data): Move earlier in the file.
+ (workshare_safe_to_combine_p): New.
+ (get_ws_args_for): New.
+ (determine_parallel_type): Move earlier in the file.
+ (omp_copy_decl_2): Do not set DECL_CONTEXT of new local to the
+ child function.
+ (omp_copy_decl): Likewise.
+ (create_omp_child_function): Likewise.
+ (lookup_omp_region): New.
+ (dump_omp_region): New.
+ (debug_omp_region): New.
+ (debug_all_omp_regions): New.
+ (new_omp_region): New.
+ (scan_omp_parallel): If parallel_nesting_level > 1, the
+ directive is nested within another parallel directive.
+ Set OMP_PARALLEL_FN.
+ (scan_omp_for): Do not try to handle combined parallel+for
+ cases.
+ Remove FIXME comment.
+ (scan_omp_nested): Remove.
+ (scan_omp_1): Do not call scan_omp_nested when
+ parallel_nesting_level is > 1.
+ Do not change the DECL_CONTEXT of local variables found.
+ (lookup_decl_in_outer_ctx): New.
+ (lower_rec_input_clauses): Rename from expand_rec_input_clauses.
+ (lower_lastprivate_clauses): Rename from expand_lastprivate_clauses.
+ (lower_reduction_clauses): Rename from expand_reduction_clauses.
+ (lower_copyprivate_clauses): Rename from expand_copyprivate_clauses.
+ If CTX is nested, lookup VAR in the outer context when
+ building copy assignment.
+ (lower_send_clauses): Rename from expand_send_clauses.
+ If CTX is nested, lookup VAR in the outer context when
+ building copy assignments.
+ (lower_send_shared_vars): Rename from expand_send_shared_vars.
+ If CTX is nested, lookup VAR in the outer context when
+ building copy assignments.
+ (expand_parallel_call): Rename from build_parallel_call.
+ Handle combined parallel+workshare cases.
+ Re-implement to emit code into the CFG.
+ (list2chain): New.
+ (expand_omp_parallel): Re-implement to emit code into the CFG.
+ Call move_sese_region_to_fn to outline the sub-graph
+ containing the parallel region.
+ (expand_omp_for_1): Remove.
+ (expand_omp_for_generic): Re-implement to emit code into the
+ CFG.
+ (expand_omp_for_static_nochunk): Likewise.
+ (expand_omp_for_static_chunk): Likewise.
+ (expand_omp_for): Likewise.
+ (expand_omp_sections): Likewise.
+ (remove_exit_barriers): New.
+ (expand_omp_synch): New.
+ (expand_omp): New.
+ (build_omp_regions_1): New.
+ (build_omp_regions): New.
+ (execute_expand_omp): New.
+ (gate_expand_omp): New.
+ (pass_expand_omp): Define.
+ (lower_omp_sections): Rename from expand_omp_sections.
+ Set OMP_SECTIONS_SECTIONS.
+ (lower_omp_single_simple): Rename from expand_omp_single_simple.
+ (lower_omp_single_copy): Rename from expand_omp_single_copy.
+ (lower_omp_single): Rename from expand_omp_simple.
+ (lower_omp_master): Rename from expand_omp_master.
+ (lower_omp_ordered): Rename from expand_omp_ordered.
+ (lower_omp_critical): Rename from expand_omp_critical.
+ (lower_omp_for_lastprivate): Rename from expand_omp_for_lastprivate.
+ (lower_omp_for): Re-implement.
+ (lower_omp_parallel): Re-implement.
+ (lower_regimplify): Rename from expand_regimplify.
+ (lower_omp_1): Rename from expand_omp_1.
+ If there are syntax errors in the program, replace every
+ OpenMP directive with NOP.
+ Call lower_omp_* instead of expand_omp_*.
+ (lower_omp): Rename from expand_omp.
+
+ * tree-gimple.c (is_gimple_stmt): Handle OMP_RETURN_EXPR.
+ * tree-gimple.h (enum omp_parallel_type): Remove.
+ (gimple_boolify): Declare extern.
+ (find_omp_clause, determine_parallel_type): Remove.
+
+ * gimple-low.c (lower_omp_directive): New.
+ (lower_stmt): Call it.
+ (record_vars_into): Move from ...
+ (record_vars): ... here.
+ Call record_vars_into with current_function_decl.
+
+ * gimplify.c (struct gimplify_ctx): Remove fields
+ combined_pre_p and combined_ctxp. Update users.
+ (get_formal_tmp_var): Add documentation.
+ (gimple_boolify): Make extern.
+ (gimplify_expr_in_ctx): Remove. Update callers.
+ (gimplify_omp_parallel): Do not assume that OMP_PARALLEL_BODY
+ will always be a BIND_EXPR.
+ (gimplify_expr): Handle OMP_RETURN_EXPR.
+ * tree.def (BLOCK): Remove documentation about BLOCK_TYPE_TAGS.
+ (OMP_PARALLEL): Add 3 operands.
+ (OMP_SECTIONS): Add 1 operand.
+ (OMP_RETURN_EXPR): Define.
+
+ * tree-inline.c (estimate_num_insns_1): Handle OpenMP directives.
+ (copy_tree_r): Restore TREE_CHAIN in OMP_CLAUSE_*.
+ * tree-iterator.c (alloc_stmt_list): Assert that we are not
+ creating a circular free list.
+ (free_stmt_list): Assert that we are not freeing stmt_list_cache.
+
+ * tree-flow.h (move_sese_region_to_fn): Declare.
+ (record_vars_into): Declare.
+ * tree-cfg.c (make_omp_sections_edges): New.
+ (make_exit_edges): Handle OMP_PARALLEL, OMP_FOR, OMP_SINGLE,
+ OMP_MASTER, OMP_ORDERED, OMP_CRITICAL, OMP_RETURN_EXPR,
+ OMP_SECTIONS and OMP_SECTION.
+ (is_ctrl_altering_stmt): Return true for OMP_DIRECTIVE_P.
+ (set_bb_for_stmt): Undo change to check currently_expanding_to_rtl.
+ (verify_stmt): Do not handle OMP_DIRECTIVE_P.
+ (gather_blocks_in_sese_region): New.
+ (struct move_stmt_d): Declare.
+ (move_stmt_r): New.
+ (move_block_to_fn): New.
+ (move_sese_region_to_fn): New.
+
+ * passes.c (init_optimization_passes): Schedule
+ pass_expand_omp after pass_init_datastructures.
+
+ * tree-ssa-operands.c (get_expr_operands): Handle
+ OMP_PARALLEL, OMP_SECTIONS, OMP_FOR, OMP_RETURN_EXPR,
+ OMP_SINGLE, OMP_MASTER, OMP_ORDERED, OMP_CRITICAL.
+
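As a rough illustration of the expansion scheme described in the entry above
(a minimal sketch; the outlined-function and data-structure names are
illustrative, not the exact names GCC generates):

    /* Source:
         #pragma omp parallel shared(x)
           x++;                                                        */

    extern void GOMP_parallel_start (void (*) (void *), void *, unsigned);
    extern void GOMP_parallel_end (void);

    struct omp_data_s { int *x; };

    /* Child function created by create_omp_child_function; the body of the
       parallel region is moved into it by move_sese_region_to_fn.  */
    static void
    foo_omp_fn_0 (void *arg)
    {
      struct omp_data_s *data = arg;
      (*data->x)++;
    }

    int
    foo (int x)
    {
      struct omp_data_s data;
      data.x = &x;                                    /* cf. OMP_PARALLEL_DATA_ARG */
      GOMP_parallel_start (foo_omp_fn_0, &data, 0);   /* start the team */
      foo_omp_fn_0 (&data);                           /* master thread runs the body */
      GOMP_parallel_end ();
      return x;
    }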
+2006-01-19 Jeff Law <law@redhat.com>
+
+ * tree-vrp.c (extract_range_from_assert): Refine the result range
+ if the variable referenced by the ASSERT_EXPR has a range and
+ either the tentative result range or the previous range is a
+ VR_ANTI_RANGE.
+
+2006-01-19 Richard Sandiford <richard@codesourcery.com>
+
+ * c-decl.c (add_flexible_array_elts_to_size): Simplify vector accesses.
+
+2006-01-19 Jan Hubicka <jh@suse.cz>
+
+ * tree-vect-patterns.c (vect_pattern_recog_1): Prevent parse error.
+
+2006-01-19 Paolo Bonzini <bonzini@gnu.org>
+
+ * combine.c (try_combine): Do not worry about MEMs wrapped by USEs.
+ (expand_compound_operation, expand_field_assignment): Fail if the
+ bit lengths of an extract operation are out of range.
+ (make_extraction): Compute wanted_inner_mode based on the position
+ and length of the extraction. Make it extraction_mode for non-constant
+ positions, and do not modify offset in that case. When generating a
+ new MEM, use a mode that can hold the extraction while keeping correct
+ alignment. Remove code that supported MEMs wrapped by USEs.
+ (simplify_shift_const_1, force_to_mode) <case USE>: Remove.
+
+2006-01-19 Andreas Krebbel <krebbel1@de.ibm.com>
+
+ * config/s390/s390.md ("Y", "y"): New mode attribute.
+ ("*cmpdi_ccs", "*cmpsi_ccs", "*adddi3_imm_cc", "*adddi3_carry1_cc",
+ "*adddi3_carry1_cconly", "*adddi3_carry2_cc", "*adddi3_carry2_cconly",
+ "*adddi3_cc", "*adddi3_cconly", "*adddi3_cconly2", "*adddi3_64",
+ "*addsi3_imm_cc", "*addsi3_carry1_cc",
+ "*addsi3_carry1_cconly", "*addsi3_carry2_cc", "*addsi3_carr2_cconly",
+ "*addsi3_cc", "*addsi3_cconly", "*addsi3_cconly2",
+ "*subdi3_borrow_cc", "*subdi3_borrow_cconly", "*subdi3_cc",
+ "*subdi3_cc2", "*subdi3_cconly", "*subdi3_cconly2", "*subdi3_64",
+ "*subsi3_borrow_cc", "*subsi3_borrow_cconly", "*subsi3_cc",
+ "*subsi3_cc2", "*subsi3_cconly", "*subsi3_cconly", "*subsi3_cconly2",
+ "addsi3", "subsi3"): Insn patterns removed.
+ ("*add<mode>3_carry1_cc", "*add<mode>3_carry1_cconly",
+ "*add<mode>3_carry2_cc", "*add<mode>3_carry2_cconly", "*add<mode>3_cc",
+ "*add<mode>3_cconly", "*add<mode>3_cconly2", "*add<mode>3_imm_cc",
+ "*sub<mode>3_borrow_cc", "*sub<mode>3_borrow_cconly",
+ "*sub<mode>3_cc", "*sub<mode>3_cc2", "*sub<mode>3_cconly",
+ "*sub<mode>3_cconly2"): New insn patterns.
+ ("addsi3", "subsi3"): New expanders.
+
+2006-01-19 Dorit Nuzman <dorit@il.ibm.com>
+
+ * Makefile.in (tree-vect-patterns.o): Add rule for new file.
+ * tree-vect-analyze.c (vect_determine_vectorization_factor): Use
+ existing STMT_VINFO_VECTYPE if available.
+ (vect_mark_relevant): Add special handling for stmts that are
+ marked as STMT_VINFO_IN_PATTERN_P.
+ (vect_analyze_loop): Call vect_pattern_recog.
+ * tree-vectorizer.c (new_stmt_vec_info): Initialize new fields.
+ * tree-vectorizer.h (in_pattern_p, related_stmt): New fields in
+ stmt_info.
+ (STMT_VINFO_IN_PATTERN_P, STMT_VINFO_RELATED_STMT): New macros.
+ (vect_recog_func_ptr): New function-pointer type.
+ * tree-vect-patterns.c: New file.
+ (vect_recog_widen_sum_pattern, vect_recog_widen_mult_pattern):
+ (vect_recog_dot_prod_pattern, vect_pattern_recog):
+ (vect_pattern_recog_1): New functions.
+ (vect_pattern_recog_funcs): New array of function pointers.
+
+ * tree-vectorizer.h (ternary_op): New enum value.
+ * tree-vect-transform.c (vect_create_epilog_for_reduction): Added
+ declaration. Revised documentation. Removed redundant dump prints.
+ Removed redundant argument. Added support for reduction patterns.
+ (vectorizable_reduction): Added support for reduction patterns.
+ (vect_transform_stmt): Added support for patterns.
+
+ * expr.c (expand_expr_real_1): Added case for DOT_PROD_EXPR.
+ * genopinit.c (udot_prod_optab, sdot_prod_optab): Initialize.
+ * optabs.c (optab_for_tree_code): Added case for DOT_PROD_EXPR.
+ (expand_widen_pattern_expr): New function.
+ (init_optabs): Initialize new optabs udot_prod_optab,
+ sdot_prod_optab.
+ * optabs.h (OTI_sdot_prod, OTI_udot_prod): New.
+ (sdot_prod_optab, udot_prod_optab): Define new optabs.
+ (expand_widen_pattern_expr): New function declaration.
+ * tree.def (DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR): New
+ tree-codes.
+ * tree-inline.c (estimate_num_insns_1): Added cases for new
+ tree-codes DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR.
+ * tree-pretty-print.c (dump_generic_node): Likewise.
+ (op_prio): Likewise.
+ (op_symbol): Added cases for WIDEN_SUM_EXPR, WIDEN_MULT_EXPR.
+ * tree-ssa-operands.c (get_expr_operands): Added case for
+ DOT_PROD_EXPR.
+ * tree-vect-patterns.c (widened_name_p): New function.
+ (vect_recog_dot_prod_pattern): Added function implementation.
+ * tree-vect-transform.c (get_initial_def_for_reduction): Added
+ cases for DOT_PROD_EXPR, WIDEN_SUM_EXPR.
+ * config/rs6000/altivec.md (udot_prod<mode>, sdot_prodv8hi): New.
+ * config/i386/sse.md (sdot_prodv8hi, udot_prodv4si): New.
+
+ * expr.c (expand_expr_real_1): Added case for WIDEN_SUM_EXPR.
+ * genopinit.c (widen_ssum_optab, widen_usum_optab): Initialize.
+ * optabs.c (optab_for_tree_code): Added case for WIDEN_SUM_EXPR.
+ (init_optabs): Initialize new optabs widen_ssum_optab,
+ widen_usum_optab.
+ * optabs.h (OTI_widen_ssum, OTI_widen_usum): New.
+ (widen_ssum_optab, widen_usum_optab): Define new optabs.
+ * tree-vect-generic.c: (expand_vector_operations_1): Check type of
+ use instead of type of def.
+ * tree-vect-patterns.c (vect_recog_widen_sum_pattern): Added
+ function implementation.
+ * config/rs6000/altivec.md (widen_usum<mode>, widen_ssumv16qi,
+ widen_ssumv8hi): New.
+
+	* doc/md.texi (ssum_widen, usum_widen, sdot_prod, udot_prod): New
+ patterns.
+
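For reference, a minimal example of the loop shape the new dot-product
pattern is meant to recognize, modeled on the vect-reduc-dot-* tests added
in this merge (array names and size are illustrative):

    #define N 64
    signed char a[N], b[N];

    int
    dot_prod (void)
    {
      int i, sum = 0;
      for (i = 0; i < N; i++)
        sum += a[i] * b[i];   /* s8 * s8 product widened and summed into an
                                 int accumulator: the candidate for
                                 DOT_PROD_EXPR / WIDEN_SUM_EXPR recognition */
      return sum;
    }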
+2006-01-19 Richard Sandiford <richard@codesourcery.com>
+
+ PR c/25805
+ * c-decl.c (add_flexible_array_elts_to_size): New function.
+ (finish_decl): Use it.
+
+2006-01-18 Andrew Pinski <pinskia@physics.uc.edu>
+
+ * diagnostic.h: Include options.h.
+ * config/rs6000/x-darwin (host-ppc-darwin.o): Correct the
+	dependencies for diagnostic.h.
+ * Makefile.in (DIAGNOSTIC_H): Add options.h.
+
+2006-01-16 Daniel Berlin <dberlin@dberlin.org>
+
+ * tree-ssa-operands.h (ssa_call_clobbered_cache_valid): Remove.
+ (ssa_ro_call_cache_valid): Ditto.
+ * tree-ssa-alias.c (sort_tags_by_id): New function.
+ (init_transitive_clobber_worklist): Ditto.
+ (add_to_worklist): Ditto.
+ (mark_aliases_call_clobbered): Ditto.
+ (compute_tag_properties): Ditto.
+ (set_initial_properties): Ditto.
+ (compute_call_clobbered): Ditto.
+ (compute_may_aliases): Call compute_call_clobbered and grouping.
+ (compute_flow_sensitive_aliasing): Remove clobbering related code.
+ (compute_flow_insensitive_aliasing): Grouping now happens in our
+ caller.
+ (setup_pointers_and_addressables): Remove clobbering related code.
+ (add_may_alias): Ditto.
+ (replace_may_alias): Ditto.
+ (get_nmt_for): Ditto.
+ (create_global_var):
+ (is_escape_site): Return an escape_type enumeration.
+ * tree-flow-inline.h (is_call_clobbered): Global var does not
+ imply call clobbered.
+ (mark_call_clobbered): Take a reason for marking this. Remove
+ marking of globalness, and cache invalidation.
+ (clear_call_clobbered): Remove cache invalidation code.
+ * tree-dfa.c (dump_variable): If details is on, dump the reason
+ for escaping.
+ * tree-outof-ssa.c (create_temp): Copy escape mask from original
+ variable.
+ * tree-flow.h (struct ptr_info_def): Add escape mask member.
+ (struct var_ann_d): Ditto.
+ (enum escape_type): New.
+ (mark_call_clobbered): Adjust prototype.
+ * tree-ssa-structalias.c (update_alias_info): Unmodifiable vars
+ are never call clobbered.
+ Record reasons for escaping.
+ * tree-ssa-structalias.h (is_escape_site): Update prototype.
+ * tree-ssa-operands.c (ssa_call_clobbered_cache_valid): Remove.
+ (ssa_ro_call_cache_valid): Ditto.
+ (clobbered_v_may_defs): Ditto.
+ (clobbered_vuses): Ditto.
+ (ro_call_vuses): Ditto.
+ (clobber_stats): New.
+ (init_ssa_operands): Zero out clobber stats.
+ (fini_ssa_operands): Print out clobber stats.
+ (get_call_expr_operands): Pass callee fndecl to
+	add_call_read_ops.
+ (add_call_clobber_ops): Remove use of cache.
+ Add use of PURE_CONST information.
+ (add_call_read_ops): Remove use of cache.
+ Add use of static not_read information.
+
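A small example of the kind of escape site the reworked call-clobbering code
tracks (illustrative code, not taken from the patch):

    void use (int *p);

    int
    f (void)
    {
      int x = 1;
      use (&x);   /* is_escape_site: &x escapes through a call argument,
                     so x must be marked call clobbered, with the reason
                     recorded in the new escape mask */
      return x;   /* this load cannot simply be replaced by the constant 1 */
    }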
+2006-01-18 Alexandre Oliva <aoliva@redhat.com>
+
+ Introduce TLS descriptors for i386 and x86_64.
+ * config/i386/i386.h (TARGET_GNU2_TLS): New macro.
+ (TARGET_ANY_GNU_TLS): New macro.
+ (enum tls_dialect): Added TLS_DIALECT_GNU2.
+ (struct machine_function): Add tls_descriptor_call_expanded_p.
+ (ix86_tls_descriptor_calls_expande_in_cfun): New macro.
+ (ix86_current_function_calls_tls_descriptor): Likewise.
+ * config/i386/i386.c (ix86_tls_dialect): Fix typo in comment.
+ (override_options): Introduce gnu2 tls dialect.
+ (ix86_frame_pointer_required): Functions containing TLSCALLs are
+ not leaves.
+ (ix86_select_alt_pic_regnum, ix86_compute_frame_layout):
+ Likewise.
+ (legitimize_tls_address): Adjust logic for GNU2 TLS.
+ (ix86_init_machine_status): Initialize new field.
+ (ix86_tls_get_addr): Use TARGET_ANY_GNU_TLS.
+ (ix86_tls_module_base): New.
+ * config/i386/i386-protos.h (ix86_tls_module_base): Declare it.
+ * config/i386/i386.md (UNSPEC_TLSDESC): New constant.
+ (tls_global_dynamic_32, tls_global_dynamic_64): Handle GNU2 TLS.
+ (tls_local_dynamic_base_32, tls_local_dynamic_base_64): Likewise.
+ (tls_dynamic_gnu2_32, *tls_dynamic_lea_32): New patterns.
+ (*tls_dynamic_call_32, *tls_dynamic_gnu2_combine_32): Likewise.
+ (tls_dynamic_gnu2_64, *tls_dynamic_lea_64): Likewise.
+ (*tls_dynamic_call_64, *tls_dynamic_gnu2_combine_64): Likewise.
+ * config/i386/predicates.md (tls_modbase_operand): New.
+ (tp_or_register_operand): New.
+
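A minimal example of the code affected by the new dialect; assuming the
gnu2 dialect is selected with -mtls-dialect=gnu2, an access like the one
below is expanded through the tls_dynamic_gnu2_* patterns listed above
instead of the classic __tls_get_addr call sequence (variable and function
names are illustrative):

    __thread int counter;     /* thread-local variable */

    int
    next (void)
    {
      return ++counter;       /* dynamic TLS access; under gnu2 the address
                                 is resolved through a TLS descriptor call */
    }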
+2006-01-18 Daniel Berlin <dberlin@dberlin.org>
+
+ * ipa-reference.c (check_operand): Allow FUNCTION_DECL.
+ (look_for_address_of): Ditto.
+ (ipa_init): Walk the function decls.
+ (static_execute): Don't set readonly on FUNCTION_DECL's.
+
2006-01-18 Roger Sayle <roger@eyesopen.com>
	* config/i386/i386.md (shift splitter): Fix a typo in the splitter
diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP
index a471c73f514..f7f833b2bdc 100644
--- a/gcc/DATESTAMP
+++ b/gcc/DATESTAMP
@@ -1 +1 @@
-20060118
+20060119
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index dd2c8452926..e1c6a1d84e3 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -787,7 +787,7 @@ TREE_FLOW_H = tree-flow.h tree-flow-inline.h tree-ssa-operands.h \
$(HASHTAB_H) $(CGRAPH_H) $(IPA_REFERENCE_H)
TREE_SSA_LIVE_H = tree-ssa-live.h $(PARTITION_H)
PRETTY_PRINT_H = pretty-print.h input.h $(OBSTACK_H)
-DIAGNOSTIC_H = diagnostic.h diagnostic.def $(PRETTY_PRINT_H)
+DIAGNOSTIC_H = diagnostic.h diagnostic.def $(PRETTY_PRINT_H) options.h
C_PRETTY_PRINT_H = c-pretty-print.h $(PRETTY_PRINT_H) $(C_COMMON_H) $(TREE_H)
SCEV_H = tree-scalar-evolution.h $(GGC_H) tree-chrec.h $(PARAMS_H)
LAMBDA_H = lambda.h $(TREE_H) vec.h $(GGC_H)
@@ -967,6 +967,7 @@ OBJS-common = \
tree-vect-generic.o tree-ssa-loop.o tree-ssa-loop-niter.o \
tree-ssa-loop-manip.o tree-ssa-threadupdate.o \
tree-vectorizer.o tree-vect-analyze.o tree-vect-transform.o \
+ tree-vect-patterns.o \
tree-ssa-loop-ivcanon.o tree-ssa-propagate.o tree-ssa-address.o \
tree-ssa-math-opts.o \
tree-ssa-loop-ivopts.o tree-if-conv.o tree-ssa-loop-unswitch.o \
@@ -2065,6 +2066,10 @@ tree-vect-analyze.o: tree-vect-analyze.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
$(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(BASIC_BLOCK_H) \
$(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
tree-vectorizer.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) tree-chrec.h
+tree-vect-patterns.o: tree-vect-patterns.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+ $(TM_H) errors.h $(GGC_H) $(OPTABS_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) \
+ diagnostic.h $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) cfgloop.h \
+ tree-vectorizer.h tree-data-ref.h $(EXPR_H)
tree-vect-transform.o: tree-vect-transform.c $(CONFIG_H) $(SYSTEM_H) \
coretypes.h $(TM_H) $(GGC_H) $(OPTABS_H) $(RECOG_H) $(TREE_H) $(RTL_H) \
$(BASIC_BLOCK_H) $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) \
diff --git a/gcc/c-decl.c b/gcc/c-decl.c
index a3b56c0b634..d3bf14290c8 100644
--- a/gcc/c-decl.c
+++ b/gcc/c-decl.c
@@ -3068,6 +3068,33 @@ set_array_declarator_inner (struct c_declarator *decl,
error ("static or type qualifiers in abstract declarator");
return decl;
}
+
+/* INIT is a constructor that forms DECL's initializer. If the final
+ element initializes a flexible array field, add the size of that
+ initializer to DECL's size. */
+
+static void
+add_flexible_array_elts_to_size (tree decl, tree init)
+{
+ tree elt, type;
+
+ if (VEC_empty (constructor_elt, CONSTRUCTOR_ELTS (init)))
+ return;
+
+ elt = VEC_last (constructor_elt, CONSTRUCTOR_ELTS (init))->value;
+ type = TREE_TYPE (elt);
+ if (TREE_CODE (type) == ARRAY_TYPE
+ && TYPE_SIZE (type) == NULL_TREE
+ && TYPE_DOMAIN (type) != NULL_TREE
+ && TYPE_MAX_VALUE (TYPE_DOMAIN (type)) == NULL_TREE)
+ {
+ complete_array_type (&type, elt, false);
+ DECL_SIZE (decl)
+ = size_binop (PLUS_EXPR, DECL_SIZE (decl), TYPE_SIZE (type));
+ DECL_SIZE_UNIT (decl)
+ = size_binop (PLUS_EXPR, DECL_SIZE_UNIT (decl), TYPE_SIZE_UNIT (type));
+ }
+}
/* Decode a "typename", such as "int **", returning a ..._TYPE node. */
@@ -3367,6 +3394,9 @@ finish_decl (tree decl, tree init, tree asmspec_tree)
if (TREE_CODE (decl) == VAR_DECL)
{
+ if (init && TREE_CODE (init) == CONSTRUCTOR)
+ add_flexible_array_elts_to_size (decl, init);
+
if (DECL_SIZE (decl) == 0 && TREE_TYPE (decl) != error_mark_node
&& COMPLETE_TYPE_P (TREE_TYPE (decl)))
layout_decl (decl, 0);
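For example (cf. PR c/25805 and the new gcc.dg/pr25805.c test; the
identifiers below are illustrative), a static initializer can supply
elements for a flexible array member, and their size must now be counted
into the object's size:

    struct s { int n; int data[]; };    /* data is a flexible array member */

    struct s obj = { 2, { 10, 20 } };   /* GNU extension: the initializer
                                           provides two ints for data[] */

    /* add_flexible_array_elts_to_size adds the size of those two ints to
       DECL_SIZE/DECL_SIZE_UNIT of obj, so objects laid out after obj can
       no longer overlap the initialized elements.  */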
diff --git a/gcc/cgraph.c b/gcc/cgraph.c
index 4e4add8dd04..a05f76dffd3 100644
--- a/gcc/cgraph.c
+++ b/gcc/cgraph.c
@@ -113,8 +113,10 @@ struct cgraph_node *cgraph_nodes;
/* Queue of cgraph nodes scheduled to be lowered. */
struct cgraph_node *cgraph_nodes_queue;
-/* Queue of cgraph nodes scheduled to be analyzed. */
-struct cgraph_node *cgraph_analyze_queue;
+/* Queue of cgraph nodes scheduled to be expanded. This is a
+   secondary queue used during optimization to accommodate passes that
+ may generate new functions that need to be optimized and expanded. */
+struct cgraph_node *cgraph_expand_queue;
/* Number of nodes in existence. */
int cgraph_n_nodes;
@@ -1095,19 +1097,23 @@ cgraph_variable_initializer_availability (struct cgraph_varpool_node *node)
}
-/* Add the function FNDECL to the call graph. This assumes that the
- body of FNDECL is in GENERIC form and ready to be processed by
- cgraph_finalize_function. */
+/* Add the function FNDECL to the call graph. FNDECL is assumed to be
+ in low GIMPLE form and ready to be processed by cgraph_finalize_function.
+
+ When operating in unit-at-a-time, a new callgraph node is added to
+ CGRAPH_EXPAND_QUEUE, which is processed after all the original
+   functions in the call graph.
+
+ When not in unit-at-a-time, the new callgraph node is added to
+ CGRAPH_NODES_QUEUE for cgraph_assemble_pending_functions to
+ process. */
void
cgraph_add_new_function (tree fndecl)
{
- /* We're called while lowering another function. We can't do anything
- at this time without recursing. Which would cause a GC at an
- inappropriate time. */
struct cgraph_node *n = cgraph_node (fndecl);
- n->next_needed = cgraph_analyze_queue;
- cgraph_analyze_queue = n;
+ n->next_needed = cgraph_expand_queue;
+ cgraph_expand_queue = n;
}
#include "gt-cgraph.h"
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index 600b00e2193..6e60f8c205e 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -152,7 +152,7 @@ struct cgraph_node GTY((chain_next ("%h.next"), chain_prev ("%h.previous")))
/* Set when function is reachable by call from other function
that is either reachable or needed. */
bool reachable;
- /* Set once the function is lowered (ie it's CFG is built). */
+ /* Set once the function is lowered (i.e. its CFG is built). */
bool lowered;
/* Set once the function has been instantiated and its callee
lists created. */
@@ -239,7 +239,7 @@ extern GTY(()) int cgraph_max_uid;
extern bool cgraph_global_info_ready;
extern bool cgraph_function_flags_ready;
extern GTY(()) struct cgraph_node *cgraph_nodes_queue;
-extern GTY(()) struct cgraph_node *cgraph_analyze_queue;
+extern GTY(()) struct cgraph_node *cgraph_expand_queue;
extern GTY(()) struct cgraph_varpool_node *cgraph_varpool_first_unanalyzed_node;
extern GTY(()) struct cgraph_varpool_node *cgraph_varpool_nodes_queue;
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 995bcb9c5af..2b7942970bf 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -353,8 +353,22 @@ cgraph_assemble_pending_functions (void)
}
}
+  /* Process CGRAPH_EXPAND_QUEUE; these are functions created during
+     the expansion process.  Note that this queue may grow as it is
+     being processed, as the new functions may generate new ones.  */
+ while (cgraph_expand_queue)
+ {
+ struct cgraph_node *n = cgraph_expand_queue;
+ cgraph_expand_queue = cgraph_expand_queue->next_needed;
+ n->next_needed = NULL;
+ cgraph_finalize_function (n->decl, false);
+ output = true;
+ }
+
return output;
}
+
+
/* As a GCC extension we allow redefinition of the function.  The
semantics when both copies of bodies differ is not well defined.
We replace the old body with new body so in unit at a time mode
@@ -418,20 +432,6 @@ cgraph_lower_function (struct cgraph_node *node)
node->lowered = true;
}
-static void
-cgraph_finalize_pending_functions (void)
-{
- struct cgraph_node *next, *node = cgraph_analyze_queue;
-
- cgraph_analyze_queue = NULL;
- for (; node ; node = next)
- {
- next = node->next_needed;
- node->next_needed = NULL;
- cgraph_finalize_function (node->decl, true);
- }
-}
-
/* DECL has been parsed. Take it, queue it, compile it at the whim of the
logic in effect. If NESTED is true, then our caller cannot stand to have
the garbage collector run at the moment. We would need to either create
@@ -458,7 +458,6 @@ cgraph_finalize_function (tree decl, bool nested)
if (!flag_unit_at_a_time)
{
cgraph_analyze_function (node);
- cgraph_finalize_pending_functions ();
cgraph_decide_inlining_incrementally (node, false);
}
@@ -982,7 +981,6 @@ cgraph_finalize_compilation_unit (void)
gcc_assert (DECL_SAVED_TREE (decl));
cgraph_analyze_function (node);
- cgraph_finalize_pending_functions ();
for (edge = node->callees; edge; edge = edge->next_callee)
if (!edge->callee->reachable)
@@ -1166,7 +1164,21 @@ cgraph_expand_all_functions (void)
cgraph_expand_function (node);
}
}
+
free (order);
+
+  /* Process CGRAPH_EXPAND_QUEUE; these are functions created during
+     the expansion process.  Note that this queue may grow as it is
+     being processed, as the new functions may generate new ones.  */
+ while (cgraph_expand_queue)
+ {
+ node = cgraph_expand_queue;
+ cgraph_expand_queue = cgraph_expand_queue->next_needed;
+ node->next_needed = NULL;
+ node->output = 0;
+ node->lowered = DECL_STRUCT_FUNCTION (node->decl)->cfg != NULL;
+ cgraph_expand_function (node);
+ }
}
/* This is used to sort the node types by the cgraph order number. */
diff --git a/gcc/combine.c b/gcc/combine.c
index 69b9c1022d8..55f5723a3d7 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -2896,9 +2896,6 @@ try_combine (rtx i3, rtx i2, rtx i1, int *new_direct_jump_p)
&& GET_CODE (SET_DEST (XVECEXP (newpat, 0, 1))) != STRICT_LOW_PART
&& ! use_crosses_set_p (SET_SRC (XVECEXP (newpat, 0, 1)),
INSN_CUID (i2))
- /* Don't pass sets with (USE (MEM ...)) dests to the following. */
- && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 1))) != USE
- && GET_CODE (SET_DEST (XVECEXP (newpat, 0, 0))) != USE
&& ! reg_referenced_p (SET_DEST (XVECEXP (newpat, 0, 1)),
XVECEXP (newpat, 0, 0))
&& ! reg_referenced_p (SET_DEST (XVECEXP (newpat, 0, 0)),
@@ -5643,11 +5640,8 @@ expand_compound_operation (rtx x)
len = INTVAL (XEXP (x, 1));
pos = INTVAL (XEXP (x, 2));
- /* If this goes outside the object being extracted, replace the object
- with a (use (mem ...)) construct that only combine understands
- and is used only for this purpose. */
- if (len + pos > GET_MODE_BITSIZE (GET_MODE (XEXP (x, 0))))
- SUBST (XEXP (x, 0), gen_rtx_USE (GET_MODE (x), XEXP (x, 0)));
+  /* This should stay within the object being extracted; otherwise, fail.  */
+ gcc_assert (len + pos <= GET_MODE_BITSIZE (GET_MODE (XEXP (x, 0))));
if (BITS_BIG_ENDIAN)
pos = GET_MODE_BITSIZE (GET_MODE (XEXP (x, 0))) - len - pos;
@@ -5805,11 +5799,10 @@ expand_field_assignment (rtx x)
len = INTVAL (XEXP (SET_DEST (x), 1));
pos = XEXP (SET_DEST (x), 2);
- /* If the position is constant and spans the width of INNER,
- surround INNER with a USE to indicate this. */
- if (GET_CODE (pos) == CONST_INT
- && INTVAL (pos) + len > GET_MODE_BITSIZE (GET_MODE (inner)))
- inner = gen_rtx_USE (GET_MODE (SET_DEST (x)), inner);
+ /* A constant position should stay within the width of INNER. */
+ if (GET_CODE (pos) == CONST_INT)
+ gcc_assert (INTVAL (pos) + len
+ <= GET_MODE_BITSIZE (GET_MODE (inner)));
if (BITS_BIG_ENDIAN)
{
@@ -5907,13 +5900,6 @@ expand_field_assignment (rtx x)
it is an RTX that represents a variable starting position; otherwise,
POS is the (constant) starting bit position (counted from the LSB).
- INNER may be a USE. This will occur when we started with a bitfield
- that went outside the boundary of the object in memory, which is
- allowed on most machines. To isolate this case, we produce a USE
- whose mode is wide enough and surround the MEM with it. The only
- code that understands the USE is this routine. If it is not removed,
- it will cause the resulting insn not to match.
-
UNSIGNEDP is nonzero for an unsigned reference and zero for a
signed reference.
@@ -5940,23 +5926,16 @@ make_extraction (enum machine_mode mode, rtx inner, HOST_WIDE_INT pos,
ignore the POS lowest bits, etc. */
enum machine_mode is_mode = GET_MODE (inner);
enum machine_mode inner_mode;
- enum machine_mode wanted_inner_mode = byte_mode;
+ enum machine_mode wanted_inner_mode;
enum machine_mode wanted_inner_reg_mode = word_mode;
enum machine_mode pos_mode = word_mode;
enum machine_mode extraction_mode = word_mode;
enum machine_mode tmode = mode_for_size (len, MODE_INT, 1);
- int spans_byte = 0;
rtx new = 0;
rtx orig_pos_rtx = pos_rtx;
HOST_WIDE_INT orig_pos;
- /* Get some information about INNER and get the innermost object. */
- if (GET_CODE (inner) == USE)
- /* (use:SI (mem:QI foo)) stands for (mem:SI foo). */
- /* We don't need to adjust the position because we set up the USE
- to pretend that it was a full-word object. */
- spans_byte = 1, inner = XEXP (inner, 0);
- else if (GET_CODE (inner) == SUBREG && subreg_lowpart_p (inner))
+ if (GET_CODE (inner) == SUBREG && subreg_lowpart_p (inner))
{
/* If going from (subreg:SI (mem:QI ...)) to (mem:QI ...),
consider just the QI as the memory to extract from.
@@ -5995,14 +5974,9 @@ make_extraction (enum machine_mode mode, rtx inner, HOST_WIDE_INT pos,
appropriate STRICT_LOW_PART operation available.
For MEM, we can avoid an extract if the field starts on an appropriate
- boundary and we can change the mode of the memory reference. However,
- we cannot directly access the MEM if we have a USE and the underlying
- MEM is not TMODE. This combination means that MEM was being used in a
- context where bits outside its mode were being referenced; that is only
- valid in bit-field insns. */
+ boundary and we can change the mode of the memory reference. */
if (tmode != BLKmode
- && ! (spans_byte && inner_mode != tmode)
&& ((pos_rtx == 0 && (pos % BITS_PER_WORD) == 0
&& !MEM_P (inner)
&& (inner_mode == tmode
@@ -6133,15 +6107,14 @@ make_extraction (enum machine_mode mode, rtx inner, HOST_WIDE_INT pos,
don't do anything with zero-extending field extracts starting at
the low-order bit since they are simple AND operations. */
if (pos_rtx == 0 && pos == 0 && ! in_dest
- && ! in_compare && ! spans_byte && unsignedp)
+ && ! in_compare && unsignedp)
return 0;
- /* Unless we are allowed to span bytes or INNER is not MEM, reject this if
- we would be spanning bytes or if the position is not a constant and the
- length is not 1. In all other cases, we would only be going outside
- our object in cases when an original shift would have been
- undefined. */
- if (! spans_byte && MEM_P (inner)
+  /* When INNER is a MEM, reject this if we would be spanning bytes or
+ if the position is not a constant and the length is not 1. In all
+ other cases, we would only be going outside our object in cases when
+ an original shift would have been undefined. */
+ if (MEM_P (inner)
&& ((pos_rtx == 0 && pos + len > GET_MODE_BITSIZE (is_mode))
|| (pos_rtx != 0 && len != 1)))
return 0;
@@ -6181,15 +6154,31 @@ make_extraction (enum machine_mode mode, rtx inner, HOST_WIDE_INT pos,
&& GET_MODE_SIZE (pos_mode) < GET_MODE_SIZE (GET_MODE (pos_rtx)))
pos_mode = GET_MODE (pos_rtx);
- /* If this is not from memory, the desired mode is wanted_inner_reg_mode;
- if we have to change the mode of memory and cannot, the desired mode is
- EXTRACTION_MODE. */
+ /* If this is not from memory, the desired mode is the preferred mode
+ for an extraction pattern's first input operand, or word_mode if there
+ is none. */
if (!MEM_P (inner))
wanted_inner_mode = wanted_inner_reg_mode;
- else if (inner_mode != wanted_inner_mode
- && (mode_dependent_address_p (XEXP (inner, 0))
- || MEM_VOLATILE_P (inner)))
- wanted_inner_mode = extraction_mode;
+ else
+ {
+ /* Be careful not to go beyond the extracted object and maintain the
+ natural alignment of the memory. */
+ wanted_inner_mode = smallest_mode_for_size (len, MODE_INT);
+ while (pos % GET_MODE_BITSIZE (wanted_inner_mode) + len
+ > GET_MODE_BITSIZE (wanted_inner_mode))
+ {
+ wanted_inner_mode = GET_MODE_WIDER_MODE (wanted_inner_mode);
+ gcc_assert (wanted_inner_mode != VOIDmode);
+ }
+
+ /* If we have to change the mode of memory and cannot, the desired mode
+ is EXTRACTION_MODE. */
+ if (inner_mode != wanted_inner_mode
+ && (mode_dependent_address_p (XEXP (inner, 0))
+ || MEM_VOLATILE_P (inner)
+ || pos_rtx))
+ wanted_inner_mode = extraction_mode;
+ }
orig_pos = pos;
@@ -6215,15 +6204,16 @@ make_extraction (enum machine_mode mode, rtx inner, HOST_WIDE_INT pos,
Note that it can only be less than 0 if !MEM_P (inner). */
}
- /* If INNER has a wider mode, make it smaller. If this is a constant
- extract, try to adjust the byte to point to the byte containing
+ /* If INNER has a wider mode, and this is a constant extraction, try to
+ make it smaller and adjust the byte to point to the byte containing
the value. */
if (wanted_inner_mode != VOIDmode
+ && inner_mode != wanted_inner_mode
+ && ! pos_rtx
&& GET_MODE_SIZE (wanted_inner_mode) < GET_MODE_SIZE (is_mode)
- && ((MEM_P (inner)
- && (inner_mode == wanted_inner_mode
- || (! mode_dependent_address_p (XEXP (inner, 0))
- && ! MEM_VOLATILE_P (inner))))))
+ && MEM_P (inner)
+ && ! mode_dependent_address_p (XEXP (inner, 0))
+ && ! MEM_VOLATILE_P (inner))
{
int offset = 0;
@@ -6234,28 +6224,20 @@ make_extraction (enum machine_mode mode, rtx inner, HOST_WIDE_INT pos,
/* If bytes are big endian and we had a paradoxical SUBREG, we must
adjust OFFSET to compensate. */
if (BYTES_BIG_ENDIAN
- && ! spans_byte
&& GET_MODE_SIZE (inner_mode) < GET_MODE_SIZE (is_mode))
offset -= GET_MODE_SIZE (is_mode) - GET_MODE_SIZE (inner_mode);
- /* If this is a constant position, we can move to the desired byte.
- Be careful not to go beyond the original object and maintain the
- natural alignment of the memory. */
- if (pos_rtx == 0)
- {
- enum machine_mode bfmode = smallest_mode_for_size (len, MODE_INT);
- offset += (pos / GET_MODE_BITSIZE (bfmode)) * GET_MODE_SIZE (bfmode);
- pos %= GET_MODE_BITSIZE (bfmode);
- }
+ /* We can now move to the desired byte. */
+ offset += (pos / GET_MODE_BITSIZE (wanted_inner_mode))
+ * GET_MODE_SIZE (wanted_inner_mode);
+ pos %= GET_MODE_BITSIZE (wanted_inner_mode);
if (BYTES_BIG_ENDIAN != BITS_BIG_ENDIAN
- && ! spans_byte
&& is_mode != wanted_inner_mode)
offset = (GET_MODE_SIZE (is_mode)
- GET_MODE_SIZE (wanted_inner_mode) - offset);
- if (offset != 0 || inner_mode != wanted_inner_mode)
- inner = adjust_address_nv (inner, wanted_inner_mode, offset);
+ inner = adjust_address_nv (inner, wanted_inner_mode, offset);
}
/* If INNER is not memory, we can always get it into the proper mode. If we
@@ -6886,15 +6868,6 @@ force_to_mode (rtx x, enum machine_mode mode, unsigned HOST_WIDE_INT mask,
generating something that won't match. */
return x;
- case USE:
- /* X is a (use (mem ..)) that was made from a bit-field extraction that
- spanned the boundary of the MEM. If we are now masking so it is
- within that boundary, we don't need the USE any more. */
- if (! BITS_BIG_ENDIAN
- && (mask & ~GET_MODE_MASK (GET_MODE (XEXP (x, 0)))) == 0)
- return force_to_mode (XEXP (x, 0), mode, mask, next_select);
- break;
-
case SIGN_EXTEND:
case ZERO_EXTEND:
case ZERO_EXTRACT:
@@ -8688,33 +8661,6 @@ simplify_shift_const_1 (enum rtx_code code, enum machine_mode result_mode,
}
break;
- case USE:
- /* Similar to the case above, except that we can only do this if
- the resulting mode is the same as that of the underlying
- MEM and adjust the address depending on the *bits* endianness
- because of the way that bit-field extract insns are defined. */
- if ((code == ASHIFTRT || code == LSHIFTRT)
- && (tmode = mode_for_size (GET_MODE_BITSIZE (mode) - count,
- MODE_INT, 1)) != BLKmode
- && tmode == GET_MODE (XEXP (varop, 0)))
- {
- if (BITS_BIG_ENDIAN)
- new = XEXP (varop, 0);
- else
- {
- new = copy_rtx (XEXP (varop, 0));
- SUBST (XEXP (new, 0),
- plus_constant (XEXP (new, 0),
- count / BITS_PER_UNIT));
- }
-
- varop = gen_rtx_fmt_e (code == ASHIFTRT ? SIGN_EXTEND
- : ZERO_EXTEND, mode, new);
- count = 0;
- continue;
- }
- break;
-
case SUBREG:
/* If VAROP is a SUBREG, strip it as long as the inner operand has
the same number of words as what we've seen so far. Then store
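The new mode-selection logic in make_extraction can be summarized by this
standalone sketch (a hypothetical helper, not GCC code): start from the
narrowest integer mode that can hold LEN bits and widen until the bit-field
no longer crosses a boundary of that mode, which keeps the resulting MEM
naturally aligned:

    /* Width in bits of the memory access chosen for a bit-field of LEN bits
       starting at constant bit position POS (mirrors the loop over
       smallest_mode_for_size / GET_MODE_WIDER_MODE above).  */
    static unsigned
    wanted_inner_bits (unsigned pos, unsigned len)
    {
      unsigned bits = 8;                  /* QImode */
      while (bits < len)
        bits *= 2;                        /* smallest mode holding LEN bits */
      while (pos % bits + len > bits)
        bits *= 2;                        /* widen while the field would cross
                                             a BITS-sized boundary */
      return bits;                        /* e.g. pos = 30, len = 4 gives 64 */
    }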
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 076e71541f4..252f10c80ad 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -2366,6 +2366,9 @@ if test x$with_cpu = x ; then
# A Cirrus ARM variant.
with_cpu="ep9312"
;;
+ i386-*-*)
+ with_cpu=i386
+ ;;
i486-*-*)
with_cpu=i486
;;
@@ -2417,13 +2420,26 @@ if test x$with_cpu = x ; then
pentium_m-*)
with_cpu=pentium-m
;;
- *)
+ pentiumpro-*)
with_cpu=pentiumpro
;;
+ *)
+ with_cpu=generic
+ ;;
esac
;;
x86_64-*-*)
- with_cpu=k8
+ case ${target_noncanonical} in
+ k8-*|opteron-*|athlon_64-*)
+ with_cpu=k8
+ ;;
+ nocona-*)
+ with_cpu=nocona
+ ;;
+ *)
+ with_cpu=generic
+ ;;
+ esac
;;
alphaev6[78]*-*-*)
with_cpu=ev67
@@ -2629,13 +2645,21 @@ case "${target}" in
for which in arch cpu tune; do
eval "val=\$with_$which"
case ${val} in
- "" | i386 | i486 \
+ i386 | i486 \
| i586 | pentium | pentium-mmx | winchip-c6 | winchip2 \
| c3 | c3-2 | i686 | pentiumpro | pentium2 | pentium3 \
| pentium4 | k6 | k6-2 | k6-3 | athlon | athlon-tbird \
- | athlon-4 | athlon-xp | athlon-mp | k8 | opteron \
- | athlon64 | athlon-fx | prescott | pentium-m \
- | pentium4m | pentium3m| nocona)
+ | athlon-4 | athlon-xp | athlon-mp \
+ | prescott | pentium-m | pentium4m | pentium3m)
+ case "${target}" in
+ x86_64-*-*)
+ echo "CPU given in --with-$which=$val doesn't support 64bit mode." 1>&2
+ exit 1
+ ;;
+ esac
+ # OK
+ ;;
+ "" | k8 | opteron | athlon64 | athlon-fx | nocona | generic)
# OK
;;
*)
diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md
index 1029a818196..86130b77808 100644
--- a/gcc/config/i386/athlon.md
+++ b/gcc/config/i386/athlon.md
@@ -123,7 +123,7 @@
(define_cpu_unit "athlon-fmul" "athlon_fp")
(define_cpu_unit "athlon-fstore" "athlon_fp")
(define_reservation "athlon-fany" "(athlon-fstore | athlon-fmul | athlon-fadd)")
-(define_reservation "athlon-faddmul" "(athlon-fmul | athlon-fadd)")
+(define_reservation "athlon-faddmul" "(athlon-fadd | athlon-fmul)")
;; Vector operations usually consume many of the pipes.
(define_reservation "athlon-fvector" "(athlon-fadd + athlon-fmul + athlon-fstore)")
@@ -131,26 +131,26 @@
;; Jump instructions are executed in the branch unit completely transparent to us
(define_insn_reservation "athlon_branch" 0
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "ibr"))
"athlon-direct,athlon-ieu")
(define_insn_reservation "athlon_call" 0
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "call,callv"))
"athlon-vector,athlon-ieu")
;; Latency of push operation is 3 cycles, but ESP value is available
;; earlier
(define_insn_reservation "athlon_push" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "push"))
"athlon-direct,athlon-agu,athlon-store")
(define_insn_reservation "athlon_pop" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "pop"))
"athlon-vector,athlon-load,athlon-ieu")
(define_insn_reservation "athlon_pop_k8" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "pop"))
"athlon-double,(athlon-ieu+athlon-load)")
(define_insn_reservation "athlon_leave" 3
@@ -158,13 +158,13 @@
(eq_attr "type" "leave"))
"athlon-vector,(athlon-ieu+athlon-load)")
(define_insn_reservation "athlon_leave_k8" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "leave"))
"athlon-double,(athlon-ieu+athlon-load)")
;; Lea executes in AGU unit with 2 cycles latency.
(define_insn_reservation "athlon_lea" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "lea"))
"athlon-direct,athlon-agu,nothing")
@@ -176,13 +176,13 @@
"athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0")
;; ??? Widening multiply is vector or double.
(define_insn_reservation "athlon_imul_k8_DI" 4
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "imul")
(and (eq_attr "mode" "DI")
(eq_attr "memory" "none,unknown"))))
"athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
(define_insn_reservation "athlon_imul_k8" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "imul")
(eq_attr "memory" "none,unknown")))
"athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0")
@@ -192,13 +192,13 @@
(eq_attr "memory" "load,both")))
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu")
(define_insn_reservation "athlon_imul_mem_k8_DI" 7
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "imul")
(and (eq_attr "mode" "DI")
(eq_attr "memory" "load,both"))))
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu")
(define_insn_reservation "athlon_imul_mem_k8" 6
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "imul")
(eq_attr "memory" "load,both")))
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu")
@@ -211,59 +211,59 @@
;; of the other code
(define_insn_reservation "athlon_idiv" 6
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "idiv")
(eq_attr "memory" "none,unknown")))
"athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))")
(define_insn_reservation "athlon_idiv_mem" 9
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "idiv")
(eq_attr "memory" "load,both")))
"athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))")
;; The parallelism of string instructions is not documented.  Model it the same way
;; as idiv to create smaller automata. This probably does not matter much.
(define_insn_reservation "athlon_str" 6
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "str")
(eq_attr "memory" "load,both,store")))
"athlon-vector,athlon-load,athlon-ieu0*6")
(define_insn_reservation "athlon_idirect" 1
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "direct")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "none,unknown"))))
"athlon-direct,athlon-ieu")
(define_insn_reservation "athlon_ivector" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "none,unknown"))))
"athlon-vector,athlon-ieu,athlon-ieu")
(define_insn_reservation "athlon_idirect_loadmov" 3
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "imov")
(eq_attr "memory" "load")))
"athlon-direct,athlon-load")
(define_insn_reservation "athlon_idirect_load" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "direct")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "load"))))
"athlon-direct,athlon-load,athlon-ieu")
(define_insn_reservation "athlon_ivector_load" 6
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "load"))))
"athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
(define_insn_reservation "athlon_idirect_movstore" 1
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "imov")
(eq_attr "memory" "store")))
"athlon-direct,athlon-agu,athlon-store")
(define_insn_reservation "athlon_idirect_both" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "direct")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "both"))))
@@ -271,7 +271,7 @@
athlon-ieu,athlon-store,
athlon-store")
(define_insn_reservation "athlon_ivector_both" 6
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "both"))))
@@ -280,14 +280,14 @@
athlon-ieu,
athlon-store")
(define_insn_reservation "athlon_idirect_store" 1
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "direct")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "store"))))
"athlon-direct,(athlon-ieu+athlon-agu),
athlon-store")
(define_insn_reservation "athlon_ivector_store" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "unit" "integer,unknown")
(eq_attr "memory" "store"))))
@@ -302,7 +302,7 @@
(eq_attr "mode" "XF"))))
"athlon-vector,athlon-fpload2,athlon-fvector*9")
(define_insn_reservation "athlon_fldxf_k8" 13
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fmov")
(and (eq_attr "memory" "load")
(eq_attr "mode" "XF"))))
@@ -314,7 +314,7 @@
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fany")
(define_insn_reservation "athlon_fld_k8" 2
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fmov")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fploadk8,athlon-fstore")
@@ -326,7 +326,7 @@
(eq_attr "mode" "XF"))))
"athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))")
(define_insn_reservation "athlon_fstxf_k8" 8
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fmov")
(and (eq_attr "memory" "store,both")
(eq_attr "mode" "XF"))))
@@ -337,16 +337,16 @@
(eq_attr "memory" "store,both")))
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
(define_insn_reservation "athlon_fst_k8" 2
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fmov")
(eq_attr "memory" "store,both")))
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
(define_insn_reservation "athlon_fist" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fistp"))
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
(define_insn_reservation "athlon_fmov" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fmov"))
"athlon-direct,athlon-fpsched,athlon-faddmul")
(define_insn_reservation "athlon_fadd_load" 4
@@ -355,12 +355,12 @@
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fadd")
(define_insn_reservation "athlon_fadd_load_k8" 6
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fop")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fploadk8,athlon-fadd")
(define_insn_reservation "athlon_fadd" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fop"))
"athlon-direct,athlon-fpsched,athlon-fadd")
(define_insn_reservation "athlon_fmul_load" 4
@@ -369,16 +369,16 @@
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fmul")
(define_insn_reservation "athlon_fmul_load_k8" 6
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fmul")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fploadk8,athlon-fmul")
(define_insn_reservation "athlon_fmul" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fmul"))
"athlon-direct,athlon-fpsched,athlon-fmul")
(define_insn_reservation "athlon_fsgn" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fsgn"))
"athlon-direct,athlon-fpsched,athlon-fmul")
(define_insn_reservation "athlon_fdiv_load" 24
@@ -387,7 +387,7 @@
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fmul")
(define_insn_reservation "athlon_fdiv_load_k8" 13
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fdiv")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fploadk8,athlon-fmul")
@@ -396,16 +396,16 @@
(eq_attr "type" "fdiv"))
"athlon-direct,athlon-fpsched,athlon-fmul")
(define_insn_reservation "athlon_fdiv_k8" 11
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "fdiv"))
"athlon-direct,athlon-fpsched,athlon-fmul")
(define_insn_reservation "athlon_fpspc_load" 103
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "fpspc")
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload,athlon-fvector")
(define_insn_reservation "athlon_fpspc" 100
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fpspc"))
"athlon-vector,athlon-fpsched,athlon-fvector")
(define_insn_reservation "athlon_fcmov_load" 7
@@ -418,12 +418,12 @@
(eq_attr "type" "fcmov"))
"athlon-vector,athlon-fpsched,athlon-fvector")
(define_insn_reservation "athlon_fcmov_load_k8" 17
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fcmov")
(eq_attr "memory" "load")))
"athlon-vector,athlon-fploadk8,athlon-fvector")
(define_insn_reservation "athlon_fcmov_k8" 15
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "fcmov"))
"athlon-vector,athlon-fpsched,athlon-fvector")
;; fcomi is vector decoded but uses only one pipe.
@@ -434,13 +434,13 @@
(eq_attr "memory" "load"))))
"athlon-vector,athlon-fpload,athlon-fadd")
(define_insn_reservation "athlon_fcomi_load_k8" 5
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fcmp")
(and (eq_attr "athlon_decode" "vector")
(eq_attr "memory" "load"))))
"athlon-vector,athlon-fploadk8,athlon-fadd")
(define_insn_reservation "athlon_fcomi" 3
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "athlon_decode" "vector")
(eq_attr "type" "fcmp")))
"athlon-vector,athlon-fpsched,athlon-fadd")
@@ -450,18 +450,18 @@
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fadd")
(define_insn_reservation "athlon_fcom_load_k8" 4
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "fcmp")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fploadk8,athlon-fadd")
(define_insn_reservation "athlon_fcom" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "fcmp"))
"athlon-direct,athlon-fpsched,athlon-fadd")
;; Never seen by the scheduler because we still don't do post reg-stack
;; scheduling.
;(define_insn_reservation "athlon_fxch" 2
-; (and (eq_attr "cpu" "athlon,k8")
+; (and (eq_attr "cpu" "athlon,k8,generic64")
; (eq_attr "type" "fxch"))
; "athlon-direct,athlon-fpsched,athlon-fany")
@@ -477,8 +477,13 @@
(and (eq_attr "type" "ssemov")
(match_operand:DF 1 "memory_operand" "")))
"athlon-direct,athlon-fploadk8,athlon-fstore")
+(define_insn_reservation "athlon_movsd_load_generic64" 2
+ (and (eq_attr "cpu" "generic64")
+ (and (eq_attr "type" "ssemov")
+ (match_operand:DF 1 "memory_operand" "")))
+ "athlon-double,athlon-fploadk8,(athlon-fstore+athlon-fmul)")
(define_insn_reservation "athlon_movaps_load_k8" 2
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssemov")
(and (eq_attr "mode" "V4SF,V2DF,TI")
(eq_attr "memory" "load"))))
@@ -496,7 +501,7 @@
(eq_attr "memory" "load"))))
"athlon-vector,athlon-fpload,(athlon-fany*2)")
(define_insn_reservation "athlon_movss_load_k8" 1
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssemov")
(and (eq_attr "mode" "SF,DI")
(eq_attr "memory" "load"))))
@@ -507,57 +512,57 @@
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fany")
(define_insn_reservation "athlon_mmxsseld_k8" 2
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "mmxmov,ssemov")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fploadk8,athlon-fstore")
(define_insn_reservation "athlon_mmxssest" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "mmxmov,ssemov")
(and (eq_attr "mode" "V4SF,V2DF,TI")
(eq_attr "memory" "store,both"))))
"athlon-vector,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store2)*2)")
(define_insn_reservation "athlon_mmxssest_k8" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "mmxmov,ssemov")
(and (eq_attr "mode" "V4SF,V2DF,TI")
(eq_attr "memory" "store,both"))))
"athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store2)*2)")
(define_insn_reservation "athlon_mmxssest_short" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "mmxmov,ssemov")
(eq_attr "memory" "store,both")))
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
-(define_insn_reservation "athlon_movaps" 2
- (and (eq_attr "cpu" "k8")
+(define_insn_reservation "athlon_movaps_k8" 2
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssemov")
(eq_attr "mode" "V4SF,V2DF,TI")))
- "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-faddmul)")
-(define_insn_reservation "athlon_movaps_k8" 2
+ "athlon-double,athlon-fpsched,((athlon-faddmul+athlon-faddmul) | (athlon-faddmul, athlon-faddmul))")
+(define_insn_reservation "athlon_movaps" 2
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "ssemov")
(eq_attr "mode" "V4SF,V2DF,TI")))
"athlon-vector,athlon-fpsched,(athlon-faddmul+athlon-faddmul)")
(define_insn_reservation "athlon_mmxssemov" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "mmxmov,ssemov"))
"athlon-direct,athlon-fpsched,athlon-faddmul")
(define_insn_reservation "athlon_mmxmul_load" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "mmxmul")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-fmul")
(define_insn_reservation "athlon_mmxmul" 3
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "mmxmul"))
"athlon-direct,athlon-fpsched,athlon-fmul")
(define_insn_reservation "athlon_mmx_load" 3
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "unit" "mmx")
(eq_attr "memory" "load")))
"athlon-direct,athlon-fpload,athlon-faddmul")
(define_insn_reservation "athlon_mmx" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "unit" "mmx"))
"athlon-direct,athlon-fpsched,athlon-faddmul")
;; SSE operations are handled by the i387 unit as well. The latency
@@ -569,7 +574,7 @@
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload2,(athlon-fmul*2)")
(define_insn_reservation "athlon_sselog_load_k8" 5
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "sselog,sselog1")
(eq_attr "memory" "load")))
"athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
@@ -578,7 +583,7 @@
(eq_attr "type" "sselog,sselog1"))
"athlon-vector,athlon-fpsched,athlon-fmul*2")
(define_insn_reservation "athlon_sselog_k8" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "sselog,sselog1"))
"athlon-double,athlon-fpsched,athlon-fmul")
;; ??? pcmp executes in addmul, probably not worthwhile to bother about that.
@@ -589,13 +594,13 @@
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fpload,athlon-fadd")
(define_insn_reservation "athlon_ssecmp_load_k8" 4
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssecmp")
(and (eq_attr "mode" "SF,DF,DI,TI")
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fploadk8,athlon-fadd")
(define_insn_reservation "athlon_ssecmp" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssecmp")
(eq_attr "mode" "SF,DF,DI,TI")))
"athlon-direct,athlon-fpsched,athlon-fadd")
@@ -605,7 +610,7 @@
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload2,(athlon-fadd*2)")
(define_insn_reservation "athlon_ssecmpvector_load_k8" 5
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssecmp")
(eq_attr "memory" "load")))
"athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
@@ -614,7 +619,7 @@
(eq_attr "type" "ssecmp"))
"athlon-vector,athlon-fpsched,(athlon-fadd*2)")
(define_insn_reservation "athlon_ssecmpvector_k8" 3
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "ssecmp"))
"athlon-double,athlon-fpsched,(athlon-fadd*2)")
(define_insn_reservation "athlon_ssecomi_load" 4
@@ -623,12 +628,12 @@
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload,athlon-fadd")
(define_insn_reservation "athlon_ssecomi_load_k8" 6
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssecomi")
(eq_attr "memory" "load")))
"athlon-vector,athlon-fploadk8,athlon-fadd")
(define_insn_reservation "athlon_ssecomi" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(eq_attr "type" "ssecmp"))
"athlon-vector,athlon-fpsched,athlon-fadd")
(define_insn_reservation "athlon_sseadd_load" 4
@@ -638,13 +643,13 @@
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fpload,athlon-fadd")
(define_insn_reservation "athlon_sseadd_load_k8" 6
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "sseadd")
(and (eq_attr "mode" "SF,DF,DI")
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fploadk8,athlon-fadd")
(define_insn_reservation "athlon_sseadd" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "sseadd")
(eq_attr "mode" "SF,DF,DI")))
"athlon-direct,athlon-fpsched,athlon-fadd")
@@ -654,7 +659,7 @@
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload2,(athlon-fadd*2)")
(define_insn_reservation "athlon_sseaddvector_load_k8" 7
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "sseadd")
(eq_attr "memory" "load")))
"athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
@@ -663,7 +668,7 @@
(eq_attr "type" "sseadd"))
"athlon-vector,athlon-fpsched,(athlon-fadd*2)")
(define_insn_reservation "athlon_sseaddvector_k8" 5
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "sseadd"))
"athlon-double,athlon-fpsched,(athlon-fadd*2)")
@@ -673,28 +678,28 @@
;; cvtss2sd
(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_k8" 4
- (and (eq_attr "cpu" "k8,athlon")
+ (and (eq_attr "cpu" "k8,athlon,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "direct")
(and (eq_attr "mode" "DF")
(eq_attr "memory" "load")))))
"athlon-direct,athlon-fploadk8,athlon-fstore")
(define_insn_reservation "athlon_ssecvt_cvtss2sd" 2
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "direct")
(eq_attr "mode" "DF"))))
"athlon-direct,athlon-fpsched,athlon-fstore")
;; cvtps2pd. Model it the same way as the other double decoded FP conversions.
(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5
- (and (eq_attr "cpu" "k8,athlon")
+ (and (eq_attr "cpu" "k8,athlon,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "double")
(and (eq_attr "mode" "V2DF,V4SF,TI")
(eq_attr "memory" "load")))))
"athlon-double,athlon-fpload2k8,(athlon-fstore*2)")
(define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3
- (and (eq_attr "cpu" "k8,athlon")
+ (and (eq_attr "cpu" "k8,athlon,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "double")
(eq_attr "mode" "V2DF,V4SF,TI"))))
@@ -717,7 +722,7 @@
(eq_attr "memory" "load")))))
"athlon-vector,athlon-fpload,(athlon-fstore*2)")
(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_k8" 9
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "sseicvt")
(and (eq_attr "athlon_decode" "double")
(and (eq_attr "mode" "SF,DF")
@@ -725,7 +730,7 @@
"athlon-double,athlon-fploadk8,(athlon-fstore*2)")
;; cvtsi2sd reg,reg is double decoded (vector on Athlon)
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11
- (and (eq_attr "cpu" "k8,athlon")
+ (and (eq_attr "cpu" "k8,athlon,generic64")
(and (eq_attr "type" "sseicvt")
(and (eq_attr "athlon_decode" "double")
(and (eq_attr "mode" "SF,DF")
@@ -733,7 +738,7 @@
"athlon-double,athlon-fploadk8,athlon-fstore")
;; cvtsi2ss reg, reg is doublepath
(define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "sseicvt")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "mode" "SF,DF")
@@ -741,7 +746,7 @@
"athlon-vector,athlon-fploadk8,(athlon-fvector*2)")
;; cvtsd2ss mem,reg is doublepath, throughput unknown, latency 9
(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9
- (and (eq_attr "cpu" "k8,athlon")
+ (and (eq_attr "cpu" "k8,athlon,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "double")
(and (eq_attr "mode" "SF")
@@ -749,14 +754,14 @@
"athlon-double,athlon-fploadk8,(athlon-fstore*3)")
;; cvtsd2ss reg,reg is vectorpath, throughput unknown, latency 12
(define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "mode" "SF")
(eq_attr "memory" "none")))))
"athlon-vector,athlon-fpsched,(athlon-fvector*3)")
(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "mode" "V4SF,V2DF,TI")
@@ -765,7 +770,7 @@
;; cvtpd2ps mem,reg is vectorpath, throughput unknown, latency 10
;; ??? Why is it faster than cvtsd2ss?
(define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssecvt")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "mode" "V4SF,V2DF,TI")
@@ -773,7 +778,7 @@
"athlon-vector,athlon-fpsched,athlon-fvector*2")
;; cvtsd2si mem,reg is doublepath, throughput 1, latency 9
(define_insn_reservation "athlon_secvt_cvtsX2si_load" 9
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "sseicvt")
(and (eq_attr "athlon_decode" "vector")
(and (eq_attr "mode" "SI,DI")
@@ -788,7 +793,7 @@
(eq_attr "memory" "none")))))
"athlon-vector,athlon-fpsched,athlon-fvector")
(define_insn_reservation "athlon_ssecvt_cvtsX2si_k8" 9
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "sseicvt")
(and (eq_attr "athlon_decode" "double")
(and (eq_attr "mode" "SI,DI")
@@ -803,13 +808,13 @@
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fpload,athlon-fmul")
(define_insn_reservation "athlon_ssemul_load_k8" 6
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssemul")
(and (eq_attr "mode" "SF,DF")
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fploadk8,athlon-fmul")
(define_insn_reservation "athlon_ssemul" 4
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssemul")
(eq_attr "mode" "SF,DF")))
"athlon-direct,athlon-fpsched,athlon-fmul")
@@ -819,7 +824,7 @@
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload2,(athlon-fmul*2)")
(define_insn_reservation "athlon_ssemulvector_load_k8" 7
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssemul")
(eq_attr "memory" "load")))
"athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
@@ -828,7 +833,7 @@
(eq_attr "type" "ssemul"))
"athlon-vector,athlon-fpsched,(athlon-fmul*2)")
(define_insn_reservation "athlon_ssemulvector_k8" 5
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "ssemul"))
"athlon-double,athlon-fpsched,(athlon-fmul*2)")
;; divsd timings. divss is faster
@@ -839,13 +844,13 @@
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fpload,athlon-fmul*17")
(define_insn_reservation "athlon_ssediv_load_k8" 22
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssediv")
(and (eq_attr "mode" "SF,DF")
(eq_attr "memory" "load"))))
"athlon-direct,athlon-fploadk8,athlon-fmul*17")
(define_insn_reservation "athlon_ssediv" 20
- (and (eq_attr "cpu" "athlon,k8")
+ (and (eq_attr "cpu" "athlon,k8,generic64")
(and (eq_attr "type" "ssediv")
(eq_attr "mode" "SF,DF")))
"athlon-direct,athlon-fpsched,athlon-fmul*17")
@@ -855,7 +860,7 @@
(eq_attr "memory" "load")))
"athlon-vector,athlon-fpload2,athlon-fmul*34")
(define_insn_reservation "athlon_ssedivvector_load_k8" 35
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(and (eq_attr "type" "ssediv")
(eq_attr "memory" "load")))
"athlon-double,athlon-fpload2k8,athlon-fmul*34")
@@ -864,6 +869,6 @@
(eq_attr "type" "ssediv"))
"athlon-vector,athlon-fmul*34")
(define_insn_reservation "athlon_ssedivvector_k8" 39
- (and (eq_attr "cpu" "k8")
+ (and (eq_attr "cpu" "k8,generic64")
(eq_attr "type" "ssediv"))
"athlon-double,athlon-fmul*34")
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 188c9677b77..ed9d4f3d62e 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -1,6 +1,7 @@
/* Definitions of target machine for GCC for IA-32.
Copyright (C) 1988, 1992, 1994, 1995, 1996, 1996, 1997, 1998, 1999,
- 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
This file is part of GCC.
@@ -179,6 +180,7 @@ extern int x86_field_alignment (tree, int);
#endif
extern rtx ix86_tls_get_addr (void);
+extern rtx ix86_tls_module_base (void);
extern void ix86_expand_vector_init (bool, rtx, rtx);
extern void ix86_expand_vector_set (bool, rtx, rtx, int);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f4838de8d4b..0b5277b3628 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1,6 +1,6 @@
/* Subroutines used for code generation on IA-32.
Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -587,6 +587,118 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
};
+/* Generic64 should produce code tuned for Nocona and K8. */
+static const
+struct processor_costs generic64_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ /* On all chips taken into consideration lea is 2 cycles and more. With
+     this cost, however, our current implementation of synth_mult results in
+     the use of unnecessary temporary registers, causing regressions on several
+ SPECfp benchmarks. */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of loading integer registers */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this value
+     is increased to the perhaps more appropriate value of 5.  */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+};
+
+/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
+static const
+struct processor_costs generic32_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of loading integer registers */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+};
+
const struct processor_costs *ix86_cost = &pentium_cost;
/* Processor feature/optimization bitmasks. */
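
The two cost tables added above are consulted on the same COSTS_N_INSNS scale as the existing ones. A minimal, self-contained sketch of that idea follows; the struct layout, field names and the factor of 4 used for COSTS_N_INSNS are assumptions made for illustration, not GCC's exact definitions:

/* Illustrative sketch: costs are expressed in abstract units so that
   whole-instruction costs (COSTS_N_INSNS) and finer-grained costs can be
   compared on one scale.  The scale factor and field names below are
   assumed for the example.  */
#include <stdio.h>

#define COSTS_N_INSNS(n) ((n) * 4)   /* assumption: 4 units per insn */

struct example_costs
{
  int fmul;   /* cost of an FMUL-class insn */
  int fdiv;   /* cost of an FDIV-class insn */
};

/* Values mirroring the generic64_cost entries above.  */
static const struct example_costs example_generic64 =
{
  COSTS_N_INSNS (8),    /* FMUL */
  COSTS_N_INSNS (20),   /* FDIV */
};

int main (void)
{
  /* A typical cost-based decision: replacing one divide with three
     multiplies only pays off when 3 * fmul < fdiv.  */
  int three_muls = 3 * example_generic64.fmul;
  printf ("fdiv=%d  3*fmul=%d  -> %s\n",
          example_generic64.fdiv, three_muls,
          three_muls < example_generic64.fdiv ? "use multiplies" : "keep the divide");
  return 0;
}
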
@@ -600,52 +712,81 @@ const struct processor_costs *ix86_cost = &pentium_cost;
#define m_K8 (1<<PROCESSOR_K8)
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_NOCONA (1<<PROCESSOR_NOCONA)
-
-const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8;
-const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
+#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
+#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
+#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
+
+/* Generic instruction choice should be a common subset of supported CPUs
+ (PPro/PENT4/NOCONA/Athlon/K8). */
+
+/* Leave does not affect Nocona SPEC2000 results negatively, so enabling it for
+   Generic64 seems like a good code size tradeoff.  We can't enable it for 32bit
+   generic because it does not work well with PPro based chips.  */
+const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8 | m_GENERIC64;
+const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC;
const int x86_zero_extend_with_and = m_486 | m_PENT;
-const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA /* m_386 | m_K6 */;
+const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_GENERIC /* m_386 | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6;
-const int x86_cmove = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
+const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6 | m_GENERIC;
+const int x86_cmove = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
const int x86_fisttp = m_NOCONA;
const int x86_3dnow_a = m_ATHLON_K8;
-const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
+const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC;
/* Branch hints were put in P4 based on simulation result. But
after P4 was made, no performance benefit was observed with
branch hints. It also increases the code size. As the result,
icc never generates branch hints. */
const int x86_branch_hints = 0;
-const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4 | m_NOCONA;
+const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4 | m_NOCONA | m_GENERIC32; /*m_GENERIC | m_ATHLON_K8 ? */
+/* We probably ought to watch for partial register stalls on Generic32
+   compilation setting as well.  However, in the current implementation the
+   partial register stalls are not eliminated very well - they can
+   be introduced via subregs synthesized by combine and can happen
+ in caller/callee saving sequences.
+ Because this option pays back little on PPro based chips and is in conflict
+ with partial reg. dependencies used by Athlon/P4 based chips, it is better
+ to leave it off for generic32 for now. */
const int x86_partial_reg_stall = m_PPRO;
const int x86_use_himode_fiop = m_386 | m_486 | m_K6;
-const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT);
+const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT | m_GENERIC);
const int x86_use_mov0 = m_K6;
-const int x86_use_cltd = ~(m_PENT | m_K6);
+const int x86_use_cltd = ~(m_PENT | m_K6 | m_GENERIC);
const int x86_read_modify_write = ~m_PENT;
const int x86_read_modify = ~(m_PENT | m_PPRO);
const int x86_split_long_moves = m_PPRO;
-const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON_K8;
+const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON_K8 | m_GENERIC; /* m_PENT4 ? */
const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
const int x86_qimode_math = ~(0);
const int x86_promote_qi_regs = 0;
+/* On PPro this flag is meant to avoid partial register stalls. Just like
+   the x86_partial_reg_stall option, this one might be considered for Generic32
+   if our scheme for avoiding partial stalls were more effective. */
const int x86_himode_math = ~(m_PPRO);
const int x86_promote_hi_regs = m_PPRO;
-const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA;
-const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4 | m_NOCONA;
-const int x86_add_esp_4 = m_ATHLON_K8 | m_K6 | m_PENT4 | m_NOCONA;
-const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4 | m_NOCONA;
-const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO);
-const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4 | m_NOCONA;
-const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4 | m_NOCONA;
-const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO;
-const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO;
-const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO;
+const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_add_esp_4 = m_ATHLON_K8 | m_K6 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC);
+const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC;
+const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_GENERIC;
+const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_GENERIC;
const int x86_shift1 = ~m_486;
-const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
-const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO;
+const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC;
+/* In the Generic model we have a conflict here between PPro/Pentium4 based chips
+   that treat 128bit SSE registers as single units and K8 based chips that
+   divide SSE registers into two 64bit halves.
+   x86_sse_partial_reg_dependency promotes all store destinations to be 128bit
+   to allow register renaming on 128bit SSE units, but usually results in one
+   extra microop on 64bit SSE units.  Experimental results show that disabling
+ this option on P4 brings over 20% SPECfp regression, while enabling it on
+ K8 brings roughly 2.4% regression that can be partly masked by careful scheduling
+ of moves. */
+const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC;
/* Set for machines where the type and dependencies are resolved on SSE
register parts instead of whole registers, so we may maintain just
lower part of scalar values in proper format leaving the upper part
@@ -655,16 +796,17 @@ const int x86_sse_typeless_stores = m_ATHLON_K8;
const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
const int x86_use_ffreep = m_ATHLON_K8;
const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6;
+const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_GENERIC);
/* ??? Allowing interunit moves makes it all too easy for the compiler to put
integer data in xmm registers. Which results in pretty abysmal code. */
const int x86_inter_unit_moves = 0 /* ~(m_ATHLON_K8) */;
-const int x86_ext_80387_constants = m_K6 | m_ATHLON | m_PENT4 | m_NOCONA | m_PPRO;
+const int x86_ext_80387_constants = m_K6 | m_ATHLON | m_PENT4 | m_NOCONA | m_PPRO | m_GENERIC32;
/* Some CPU cores are not able to predict more than 4 branch instructions in
the 16 byte window. */
-const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
-const int x86_schedule = m_PPRO | m_ATHLON_K8 | m_K6 | m_PENT;
+const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_GENERIC;
+const int x86_schedule = m_PPRO | m_ATHLON_K8 | m_K6 | m_PENT | m_GENERIC;
const int x86_use_bt = m_ATHLON_K8;
/* Compare and exchange was added for 80486. */
const int x86_cmpxchg = ~m_386;
@@ -674,6 +816,7 @@ const int x86_cmpxchg8b = ~(m_386 | m_486);
const int x86_cmpxchg16b = m_NOCONA;
/* Exchange and add was added for 80486. */
const int x86_xadd = ~m_386;
+const int x86_pad_returns = m_ATHLON_K8 | m_GENERIC;
/* In case the average insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
@@ -876,7 +1019,7 @@ struct ix86_frame
enum cmodel ix86_cmodel;
/* Asm dialect. */
enum asm_dialect ix86_asm_dialect = ASM_ATT;
-/* TLS dialext. */
+/* TLS dialects. */
enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
/* Which unit we are generating floating point math for. */
@@ -1291,7 +1434,9 @@ override_options (void)
{&athlon_cost, 0, 0, 16, 7, 16, 7, 16},
{&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},
{&k8_cost, 0, 0, 16, 7, 16, 7, 16},
- {&nocona_cost, 0, 0, 0, 0, 0, 0, 0}
+ {&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
+ {&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
+ {&generic64_cost, 0, 0, 16, 7, 16, 7, 16}
};
static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
@@ -1359,6 +1504,8 @@ override_options (void)
| PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
{"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
| PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
+ {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
+ {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
};
int const pta_size = ARRAY_SIZE (processor_alias_table);
@@ -1388,15 +1535,52 @@ override_options (void)
flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
}
- if (!ix86_tune_string && ix86_arch_string)
- ix86_tune_string = ix86_arch_string;
- if (!ix86_tune_string)
+ /* Need to check -mtune=generic first. */
+ if (ix86_tune_string)
{
- ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
- ix86_tune_defaulted = 1;
+ if (!strcmp (ix86_tune_string, "generic")
+ || !strcmp (ix86_tune_string, "i686"))
+ {
+ if (TARGET_64BIT)
+ ix86_tune_string = "generic64";
+ else
+ ix86_tune_string = "generic32";
+ }
+ else if (!strncmp (ix86_tune_string, "generic", 7))
+ error ("bad value (%s) for -mtune= switch", ix86_tune_string);
+ }
+ else
+ {
+ if (ix86_arch_string)
+ ix86_tune_string = ix86_arch_string;
+ if (!ix86_tune_string)
+ {
+ ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
+ ix86_tune_defaulted = 1;
+ }
+
+ /* ix86_tune_string is set to ix86_arch_string or defaulted. We
+ need to use a sensible tune option. */
+ if (!strcmp (ix86_tune_string, "generic")
+ || !strcmp (ix86_tune_string, "x86-64")
+ || !strcmp (ix86_tune_string, "i686"))
+ {
+ if (TARGET_64BIT)
+ ix86_tune_string = "generic64";
+ else
+ ix86_tune_string = "generic32";
+ }
}
+ if (!strcmp (ix86_tune_string, "x86-64"))
+ warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
+ "-mtune=generic instead as appropriate.");
+
if (!ix86_arch_string)
ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
+ if (!strcmp (ix86_arch_string, "generic"))
+ error ("generic CPU can be used only for -mtune= switch");
+ if (!strncmp (ix86_arch_string, "generic", 7))
+ error ("bad value (%s) for -march= switch", ix86_arch_string);
if (ix86_cmodel_string != 0)
{
@@ -1626,6 +1810,8 @@ override_options (void)
{
if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
ix86_tls_dialect = TLS_DIALECT_GNU;
+ else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
+ ix86_tls_dialect = TLS_DIALECT_GNU2;
else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
ix86_tls_dialect = TLS_DIALECT_SUN;
else
@@ -4415,7 +4601,8 @@ ix86_frame_pointer_required (void)
the frame pointer by default. Turn it back on now if we've not
got a leaf function. */
if (TARGET_OMIT_LEAF_FRAME_POINTER
- && (!current_function_is_leaf))
+ && (!current_function_is_leaf
+ || ix86_current_function_calls_tls_descriptor))
return 1;
if (current_function_profile)
@@ -4565,6 +4752,9 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
#if TARGET_MACHO
if (!label)
ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
+ else
+ targetm.asm_out.internal_label (asm_out_file, "L",
+ CODE_LABEL_NUMBER (label));
#endif
}
@@ -4597,7 +4787,8 @@ gen_push (rtx arg)
static unsigned int
ix86_select_alt_pic_regnum (void)
{
- if (current_function_is_leaf && !current_function_profile)
+ if (current_function_is_leaf && !current_function_profile
+ && !ix86_current_function_calls_tls_descriptor)
{
int i;
for (i = 2; i >= 0; --i)
@@ -4788,7 +4979,8 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
expander assumes that last current_function_outgoing_args_size
of stack frame are unused. */
if (ACCUMULATE_OUTGOING_ARGS
- && (!current_function_is_leaf || current_function_calls_alloca))
+ && (!current_function_is_leaf || current_function_calls_alloca
+ || ix86_current_function_calls_tls_descriptor))
{
offset += current_function_outgoing_args_size;
frame->outgoing_arguments_size = current_function_outgoing_args_size;
@@ -4798,7 +4990,8 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
/* Align stack boundary. Only needed if we're calling another function
or using alloca. */
- if (!current_function_is_leaf || current_function_calls_alloca)
+ if (!current_function_is_leaf || current_function_calls_alloca
+ || ix86_current_function_calls_tls_descriptor)
frame->padding2 = ((offset + preferred_alignment - 1)
& -preferred_alignment) - offset;
else
@@ -4819,7 +5012,8 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
frame->save_regs_using_mov = false;
if (TARGET_RED_ZONE && current_function_sp_is_unchanging
- && current_function_is_leaf)
+ && current_function_is_leaf
+ && !ix86_current_function_calls_tls_descriptor)
{
frame->red_zone_size = frame->to_allocate;
if (frame->save_regs_using_mov)
@@ -6351,14 +6545,16 @@ get_thread_pointer (int to_reg)
static rtx
legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
{
- rtx dest, base, off, pic;
+ rtx dest, base, off, pic, tp;
int type;
switch (model)
{
case TLS_MODEL_GLOBAL_DYNAMIC:
dest = gen_reg_rtx (Pmode);
- if (TARGET_64BIT)
+ tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
+
+ if (TARGET_64BIT && ! TARGET_GNU2_TLS)
{
rtx rax = gen_rtx_REG (Pmode, 0), insns;
@@ -6369,13 +6565,24 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
emit_libcall_block (insns, dest, rax, x);
}
+ else if (TARGET_64BIT && TARGET_GNU2_TLS)
+ emit_insn (gen_tls_global_dynamic_64 (dest, x));
else
emit_insn (gen_tls_global_dynamic_32 (dest, x));
+
+ if (TARGET_GNU2_TLS)
+ {
+ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
+
+ set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
+ }
break;
case TLS_MODEL_LOCAL_DYNAMIC:
base = gen_reg_rtx (Pmode);
- if (TARGET_64BIT)
+ tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
+
+ if (TARGET_64BIT && ! TARGET_GNU2_TLS)
{
rtx rax = gen_rtx_REG (Pmode, 0), insns, note;
@@ -6388,13 +6595,25 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
note = gen_rtx_EXPR_LIST (VOIDmode, ix86_tls_get_addr (), note);
emit_libcall_block (insns, base, rax, note);
}
+ else if (TARGET_64BIT && TARGET_GNU2_TLS)
+ emit_insn (gen_tls_local_dynamic_base_64 (base));
else
emit_insn (gen_tls_local_dynamic_base_32 (base));
+ if (TARGET_GNU2_TLS)
+ {
+ rtx x = ix86_tls_module_base ();
+
+ base = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, base));
+
+ set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
+ }
+
off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
off = gen_rtx_CONST (Pmode, off);
- return gen_rtx_PLUS (Pmode, base, off);
+ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
+ break;
case TLS_MODEL_INITIAL_EXEC:
if (TARGET_64BIT)
@@ -6407,9 +6626,9 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
if (reload_in_progress)
regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
pic = pic_offset_table_rtx;
- type = TARGET_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
+ type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
}
- else if (!TARGET_GNU_TLS)
+ else if (!TARGET_ANY_GNU_TLS)
{
pic = gen_reg_rtx (Pmode);
emit_insn (gen_set_got (pic));
@@ -6428,7 +6647,7 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
off = gen_const_mem (Pmode, off);
set_mem_alias_set (off, ix86_GOT_alias_set ());
- if (TARGET_64BIT || TARGET_GNU_TLS)
+ if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
{
base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
off = force_reg (Pmode, off);
@@ -6444,11 +6663,11 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
case TLS_MODEL_LOCAL_EXEC:
off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
- (TARGET_64BIT || TARGET_GNU_TLS)
+ (TARGET_64BIT || TARGET_ANY_GNU_TLS)
? UNSPEC_NTPOFF : UNSPEC_TPOFF);
off = gen_rtx_CONST (Pmode, off);
- if (TARGET_64BIT || TARGET_GNU_TLS)
+ if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
{
base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
return gen_rtx_PLUS (Pmode, base, off);
@@ -12900,6 +13119,7 @@ ix86_init_machine_status (void)
f = ggc_alloc_cleared (sizeof (struct machine_function));
f->use_fast_prologue_epilogue_nregs = -1;
+ f->tls_descriptor_call_expanded_p = 0;
return f;
}
@@ -12942,13 +13162,32 @@ ix86_tls_get_addr (void)
if (!ix86_tls_symbol)
{
ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode,
- (TARGET_GNU_TLS && !TARGET_64BIT)
+ (TARGET_ANY_GNU_TLS
+ && !TARGET_64BIT)
? "___tls_get_addr"
: "__tls_get_addr");
}
return ix86_tls_symbol;
}
+
+/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
+
+static GTY(()) rtx ix86_tls_module_base_symbol;
+rtx
+ix86_tls_module_base (void)
+{
+
+ if (!ix86_tls_module_base_symbol)
+ {
+ ix86_tls_module_base_symbol = gen_rtx_SYMBOL_REF (Pmode,
+ "_TLS_MODULE_BASE_");
+ SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
+ |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
+ }
+
+ return ix86_tls_module_base_symbol;
+}
/* Calculate the length of the memory address in the instruction
encoding. Does not include the one-byte modrm, opcode, or prefix. */
@@ -13112,6 +13351,8 @@ ix86_issue_rate (void)
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
case PROCESSOR_NOCONA:
+ case PROCESSOR_GENERIC32:
+ case PROCESSOR_GENERIC64:
return 3;
default:
@@ -13304,6 +13545,8 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
+ case PROCESSOR_GENERIC32:
+ case PROCESSOR_GENERIC64:
memory = get_attr_memory (insn);
/* Show ability of reorder buffer to hide latency of load by executing
@@ -17274,7 +17517,7 @@ ix86_pad_returns (void)
static void
ix86_reorg (void)
{
- if (TARGET_ATHLON_K8 && optimize && !optimize_size)
+ if (TARGET_PAD_RETURNS && optimize && !optimize_size)
ix86_pad_returns ();
if (TARGET_FOUR_JUMP_LIMIT && optimize && !optimize_size)
ix86_avoid_jump_misspredicts ();
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 88398d9ec2d..628a5dd65a3 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1,6 +1,6 @@
/* Definitions of target machine for GCC for IA-32.
Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
- 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -93,11 +93,7 @@ extern const struct processor_costs *ix86_cost;
/* configure can arrange to make this 2, to force a 486. */
#ifndef TARGET_CPU_DEFAULT
-#ifdef TARGET_64BIT_DEFAULT
-#define TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT_k8
-#else
-#define TARGET_CPU_DEFAULT 0
-#endif
+#define TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT_generic
#endif
#ifndef TARGET_FPMATH_DEFAULT
@@ -140,6 +136,9 @@ extern const struct processor_costs *ix86_cost;
#define TARGET_K8 (ix86_tune == PROCESSOR_K8)
#define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON)
#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
+#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
+#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
+#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
#define TUNEMASK (1 << ix86_tune)
extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
@@ -163,6 +162,8 @@ extern const int x86_use_ffreep;
extern const int x86_inter_unit_moves, x86_schedule;
extern const int x86_use_bt;
extern const int x86_cmpxchg, x86_cmpxchg8b, x86_cmpxchg16b, x86_xadd;
+extern const int x86_use_incdec;
+extern const int x86_pad_returns;
extern int x86_prefetch_sse;
#define TARGET_USE_LEAVE (x86_use_leave & TUNEMASK)
@@ -217,6 +218,8 @@ extern int x86_prefetch_sse;
#define TARGET_FOUR_JUMP_LIMIT (x86_four_jump_limit & TUNEMASK)
#define TARGET_SCHEDULE (x86_schedule & TUNEMASK)
#define TARGET_USE_BT (x86_use_bt & TUNEMASK)
+#define TARGET_USE_INCDEC (x86_use_incdec & TUNEMASK)
+#define TARGET_PAD_RETURNS (x86_pad_returns & TUNEMASK)
#define ASSEMBLER_DIALECT (ix86_asm_dialect)
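
All of the TARGET_* tuning macros above, including the two just added, follow one pattern: an x86_* constant is a bitmask indexed by processor_type and is tested against TUNEMASK, i.e. (1 << ix86_tune). A minimal standalone sketch of that mechanism follows (the enum and masks below are a reduced, illustrative subset, not the full GCC list):

/* Illustrative subset of the tuning-bitmask mechanism.  */
#include <stdio.h>

enum processor_type
{
  PROCESSOR_K8,
  PROCESSOR_NOCONA,
  PROCESSOR_GENERIC32,
  PROCESSOR_GENERIC64
};

#define m_K8        (1 << PROCESSOR_K8)
#define m_NOCONA    (1 << PROCESSOR_NOCONA)
#define m_GENERIC32 (1 << PROCESSOR_GENERIC32)
#define m_GENERIC64 (1 << PROCESSOR_GENERIC64)
#define m_GENERIC   (m_GENERIC32 | m_GENERIC64)

/* Mirrors the spirit of the new x86_pad_returns setting (the real value
   also includes the Athlon bit).  */
static const int x86_pad_returns = m_K8 | m_GENERIC;

/* e.g. -mtune=generic on a 64bit target.  */
static enum processor_type ix86_tune = PROCESSOR_GENERIC64;

#define TUNEMASK           (1 << ix86_tune)
#define TARGET_PAD_RETURNS (x86_pad_returns & TUNEMASK)

int main (void)
{
  printf ("pad returns for the selected tuning: %s\n",
          TARGET_PAD_RETURNS ? "yes" : "no");
  return 0;
}
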
@@ -225,6 +228,8 @@ extern int x86_prefetch_sse;
&& (ix86_fpmath & FPMATH_387))
#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
+#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
+#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
#define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
#define TARGET_CMPXCHG (x86_cmpxchg & (1 << ix86_arch))
@@ -462,12 +467,14 @@ extern int x86_prefetch_sse;
#define TARGET_CPU_DEFAULT_pentium_m 14
#define TARGET_CPU_DEFAULT_prescott 15
#define TARGET_CPU_DEFAULT_nocona 16
+#define TARGET_CPU_DEFAULT_generic 17
#define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\
"pentiumpro", "pentium2", "pentium3", \
"pentium4", "k6", "k6-2", "k6-3",\
"athlon", "athlon-4", "k8", \
- "pentium-m", "prescott", "nocona"}
+ "pentium-m", "prescott", "nocona", \
+ "generic"}
#ifndef CC1_SPEC
#define CC1_SPEC "%(cc1_cpu) "
@@ -2117,6 +2124,8 @@ enum processor_type
PROCESSOR_PENTIUM4,
PROCESSOR_K8,
PROCESSOR_NOCONA,
+ PROCESSOR_GENERIC32,
+ PROCESSOR_GENERIC64,
PROCESSOR_max
};
@@ -2134,6 +2143,7 @@ extern enum fpmath_unit ix86_fpmath;
enum tls_dialect
{
TLS_DIALECT_GNU,
+ TLS_DIALECT_GNU2,
TLS_DIALECT_SUN
};
@@ -2275,11 +2285,30 @@ struct machine_function GTY(())
/* Number of saved registers USE_FAST_PROLOGUE_EPILOGUE has been computed
for. */
int use_fast_prologue_epilogue_nregs;
+ /* If true, the current function needs the default PIC register, not
+ an alternate register (on x86) and must not use the red zone (on
+ x86_64), even if it's a leaf function. We don't want the
+ function to be regarded as non-leaf because TLS calls need not
+ affect register allocation. This flag is set when a TLS call
+ instruction is expanded within a function, and never reset, even
+ if all such instructions are optimized away. Use the
+ ix86_current_function_calls_tls_descriptor macro for a better
+ approximation. */
+ int tls_descriptor_call_expanded_p;
};
#define ix86_stack_locals (cfun->machine->stack_locals)
#define ix86_save_varrargs_registers (cfun->machine->save_varrargs_registers)
#define ix86_optimize_mode_switching (cfun->machine->optimize_mode_switching)
+#define ix86_tls_descriptor_calls_expanded_in_cfun \
+ (cfun->machine->tls_descriptor_call_expanded_p)
+/* Since tls_descriptor_call_expanded is not cleared, even if all TLS
+   calls are optimized away, we try to detect cases in which they were
+   optimized away.  Since such instructions include a (use (reg REG_SP)),
+   we can verify whether any of them is still live by testing that
+   REG_SP is live. */
+#define ix86_current_function_calls_tls_descriptor \
+ (ix86_tls_descriptor_calls_expanded_in_cfun && regs_ever_live[SP_REG])
/* Control behavior of x86_file_start. */
#define X86_FILE_START_VERSION_DIRECTIVE false
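
The two macros just above combine a sticky per-function flag with a liveness check on the stack pointer. A hedged sketch of that check (types and the regs_ever_live representation are simplified stand-ins, not GCC's actual data structures):

/* Sketch of the "set-once flag cross-checked against SP liveness" idea
   behind ix86_current_function_calls_tls_descriptor.  Names below are
   simplified stand-ins.  */
#include <stdbool.h>

struct machine_function_sketch
{
  bool tls_descriptor_call_expanded_p;  /* set when a TLS descriptor call is expanded */
};

static bool
current_function_calls_tls_descriptor (const struct machine_function_sketch *m,
                                       const bool regs_ever_live[], int sp_regno)
{
  /* The flag is never cleared, so additionally require that the stack
     pointer is still referenced: per the rationale in the comment above,
     the expanded call carries a (use (reg SP)), so if every such call was
     removed the SP liveness test fails and we answer "no".  */
  return m->tls_descriptor_call_expanded_p && regs_ever_live[sp_regno];
}
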
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 146ed096c6a..0b1d153af38 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1,6 +1,6 @@
;; GCC machine description for IA-32 and x86-64.
;; Copyright (C) 1988, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-;; 2001, 2002, 2003, 2004, 2005
+;; 2001, 2002, 2003, 2004, 2005, 2006
;; Free Software Foundation, Inc.
;; Mostly by William Schelter.
;; x86_64 support added by Jan Hubicka
@@ -73,6 +73,7 @@
(UNSPEC_TP 16)
(UNSPEC_TLS_GD 17)
(UNSPEC_TLS_LD_BASE 18)
+ (UNSPEC_TLSDESC 19)
; Other random patterns
(UNSPEC_SCAS 20)
@@ -186,7 +187,7 @@
;; Processor type. This attribute must exactly match the processor_type
;; enumeration in i386.h.
-(define_attr "cpu" "i386,i486,pentium,pentiumpro,k6,athlon,pentium4,k8,nocona"
+(define_attr "cpu" "i386,i486,pentium,pentiumpro,k6,athlon,pentium4,k8,nocona,generic32,generic64"
(const (symbol_ref "ix86_tune")))
;; A basic instruction type. Refinements due to arguments to be
@@ -1510,8 +1511,12 @@
(const_string "SI")
(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "0,1")
- (ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY")
- (const_int 0))))
+ (and (ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY")
+ (const_int 0))
+ (and (eq (symbol_ref "optimize_size")
+ (const_int 0))
+ (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
+ (const_int 0))))))
(const_string "SI")
;; Avoid partial register stalls when not using QImode arithmetic
(and (eq_attr "type" "imov")
@@ -4144,7 +4149,7 @@
[(match_scratch:DF 2 "Y")
(set (match_operand:SSEMODEI24 0 "register_operand" "")
(fix:SSEMODEI24 (match_operand:DF 1 "memory_operand" "")))]
- "TARGET_K8 && !optimize_size"
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (fix:SSEMODEI24 (match_dup 2)))]
"")
@@ -4153,7 +4158,7 @@
[(match_scratch:SF 2 "x")
(set (match_operand:SSEMODEI24 0 "register_operand" "")
(fix:SSEMODEI24 (match_operand:SF 1 "memory_operand" "")))]
- "TARGET_K8 && !optimize_size"
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (fix:SSEMODEI24 (match_dup 2)))]
"")
@@ -14161,6 +14166,12 @@
operands[2] = gen_reg_rtx (Pmode);
emit_insn (gen_set_got (operands[2]));
}
+ if (TARGET_GNU2_TLS)
+ {
+ emit_insn (gen_tls_dynamic_gnu2_32
+ (operands[0], operands[1], operands[2]));
+ DONE;
+ }
operands[3] = ix86_tls_get_addr ();
})
@@ -14182,6 +14193,12 @@
UNSPEC_TLS_GD)])]
""
{
+ if (TARGET_GNU2_TLS)
+ {
+ emit_insn (gen_tls_dynamic_gnu2_64
+ (operands[0], operands[1]));
+ DONE;
+ }
operands[2] = ix86_tls_get_addr ();
})
@@ -14228,6 +14245,12 @@
operands[1] = gen_reg_rtx (Pmode);
emit_insn (gen_set_got (operands[1]));
}
+ if (TARGET_GNU2_TLS)
+ {
+ emit_insn (gen_tls_dynamic_gnu2_32
+ (operands[0], ix86_tls_module_base (), operands[1]));
+ DONE;
+ }
operands[2] = ix86_tls_get_addr ();
})
@@ -14247,6 +14270,12 @@
(unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)])]
""
{
+ if (TARGET_GNU2_TLS)
+ {
+ emit_insn (gen_tls_dynamic_gnu2_64
+ (operands[0], ix86_tls_module_base ()));
+ DONE;
+ }
operands[1] = ix86_tls_get_addr ();
})
@@ -14324,6 +14353,146 @@
(set_attr "length" "7")
(set_attr "memory" "load")
(set_attr "imm_disp" "false")])
+
+;; GNU2 TLS patterns can be split.
+
+(define_expand "tls_dynamic_gnu2_32"
+ [(set (match_dup 3)
+ (plus:SI (match_operand:SI 2 "register_operand" "")
+ (const:SI
+ (unspec:SI [(match_operand:SI 1 "tls_symbolic_operand" "")]
+ UNSPEC_TLSDESC))))
+ (parallel
+ [(set (match_operand:SI 0 "register_operand" "")
+ (unspec:SI [(match_dup 1) (match_dup 3)
+ (match_dup 2) (reg:SI SP_REG)]
+ UNSPEC_TLSDESC))
+ (clobber (reg:CC FLAGS_REG))])]
+ "!TARGET_64BIT && TARGET_GNU2_TLS"
+{
+ operands[3] = no_new_pseudos ? operands[0] : gen_reg_rtx (Pmode);
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
+
+(define_insn "*tls_dynamic_lea_32"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (plus:SI (match_operand:SI 1 "register_operand" "b")
+ (const:SI
+ (unspec:SI [(match_operand:SI 2 "tls_symbolic_operand" "")]
+ UNSPEC_TLSDESC))))]
+ "!TARGET_64BIT && TARGET_GNU2_TLS"
+ "lea{l}\t{%a2@TLSDESC(%1), %0|%0, %a2@TLSDESC[%1]}"
+ [(set_attr "type" "lea")
+ (set_attr "mode" "SI")
+ (set_attr "length" "6")
+ (set_attr "length_address" "4")])
+
+(define_insn "*tls_dynamic_call_32"
+ [(set (match_operand:SI 0 "register_operand" "=a")
+ (unspec:SI [(match_operand:SI 1 "tls_symbolic_operand" "")
+ (match_operand:SI 2 "register_operand" "0")
+ ;; we have to make sure %ebx still points to the GOT
+ (match_operand:SI 3 "register_operand" "b")
+ (reg:SI SP_REG)]
+ UNSPEC_TLSDESC))
+ (clobber (reg:CC FLAGS_REG))]
+ "!TARGET_64BIT && TARGET_GNU2_TLS"
+ "call\t{*%a1@TLSCALL(%2)|[DWORD PTR [%2+%a1@TLSCALL]]}"
+ [(set_attr "type" "call")
+ (set_attr "length" "2")
+ (set_attr "length_address" "0")])
+
+(define_insn_and_split "*tls_dynamic_gnu2_combine_32"
+ [(set (match_operand:SI 0 "register_operand" "=&a")
+ (plus:SI
+ (plus:SI (match_operand:SI 3 "tp_or_register_operand" "ir")
+ (unspec:SI [(match_operand:SI 4 "tls_modbase_operand" "")
+ (match_operand:SI 5 "" "")
+ (match_operand:SI 2 "register_operand" "b")
+ (reg:SI SP_REG)]
+ UNSPEC_TLSDESC))
+ (const:SI (unspec:SI
+ [(match_operand:SI 1 "tls_symbolic_operand" "")]
+ UNSPEC_DTPOFF))))
+ (clobber (reg:CC FLAGS_REG))]
+ "!TARGET_64BIT && TARGET_GNU2_TLS"
+ "#"
+ ""
+ [(parallel
+ [(set (match_dup 0)
+ (plus:SI (match_dup 3)
+ (match_dup 5)))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[5] = no_new_pseudos ? operands[0] : gen_reg_rtx (Pmode);
+ emit_insn (gen_tls_dynamic_gnu2_32 (operands[5], operands[1], operands[2]));
+})
+
+(define_expand "tls_dynamic_gnu2_64"
+ [(set (match_dup 2)
+ (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")]
+ UNSPEC_TLSDESC))
+ (parallel
+ [(set (match_operand:DI 0 "register_operand" "")
+ (unspec:DI [(match_dup 1) (match_dup 2) (reg:DI SP_REG)]
+ UNSPEC_TLSDESC))
+ (clobber (reg:CC FLAGS_REG))])]
+ "TARGET_64BIT && TARGET_GNU2_TLS"
+{
+ operands[2] = no_new_pseudos ? operands[0] : gen_reg_rtx (Pmode);
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
+
+(define_insn "*tls_dynamic_lea_64"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")]
+ UNSPEC_TLSDESC))]
+ "TARGET_64BIT && TARGET_GNU2_TLS"
+ "lea{q}\t{%a1@TLSDESC(%%rip), %0|%0, %a1@TLSDESC[%%rip]}"
+ [(set_attr "type" "lea")
+ (set_attr "mode" "DI")
+ (set_attr "length" "7")
+ (set_attr "length_address" "4")])
+
+(define_insn "*tls_dynamic_call_64"
+ [(set (match_operand:DI 0 "register_operand" "=a")
+ (unspec:DI [(match_operand:DI 1 "tls_symbolic_operand" "")
+ (match_operand:DI 2 "register_operand" "0")
+ (reg:DI SP_REG)]
+ UNSPEC_TLSDESC))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT && TARGET_GNU2_TLS"
+ "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}"
+ [(set_attr "type" "call")
+ (set_attr "length" "2")
+ (set_attr "length_address" "0")])
+
+(define_insn_and_split "*tls_dynamic_gnu2_combine_64"
+ [(set (match_operand:DI 0 "register_operand" "=&a")
+ (plus:DI
+ (plus:DI (match_operand:DI 2 "tp_or_register_operand" "ir")
+ (unspec:DI [(match_operand:DI 3 "tls_modbase_operand" "")
+ (match_operand:DI 4 "" "")
+ (reg:DI SP_REG)]
+ UNSPEC_TLSDESC))
+ (const:DI (unspec:DI
+ [(match_operand:DI 1 "tls_symbolic_operand" "")]
+ UNSPEC_DTPOFF))))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT && TARGET_GNU2_TLS"
+ "#"
+ ""
+ [(parallel
+ [(set (match_dup 0)
+ (plus:DI (match_dup 2)
+ (match_dup 4)))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+ operands[4] = no_new_pseudos ? operands[0] : gen_reg_rtx (Pmode);
+ emit_insn (gen_tls_dynamic_gnu2_64 (operands[4], operands[1]));
+})
+
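The block above adds the TLS-descriptor ("GNU2") lowering: when TARGET_GNU2_TLS is set, the general-dynamic and local-dynamic expanders emit a lea of the @TLSDESC address followed by an indirect @TLSCALL call instead of calling the usual __tls_get_addr helper, and the combine patterns let the final DTPOFF addition be folded in. A hedged C sketch of an access that goes through these expanders (the variable, the function name, and the option spelling are illustrative assumptions):

/* Sketch only: a general-dynamic TLS access.  Built as position-independent
   code for a shared object with the gnu2 TLS dialect selected, the address
   below is resolved through a TLS descriptor call rather than a call to
   __tls_get_addr.  */
__thread int counter;

int *
counter_address (void)
{
  return &counter;
}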
+;;
;; These patterns match the binary 387 instructions for addM3, subM3,
;; mulM3 and divM3. There are three patterns for each of DFmode and
@@ -18818,7 +18987,6 @@
rtx picreg = gen_rtx_REG (Pmode, PIC_OFFSET_TABLE_REGNUM);
rtx label_rtx = gen_label_rtx ();
emit_insn (gen_set_got_labelled (pic_offset_table_rtx, label_rtx));
- emit_label (label_rtx);
xops[0] = xops[1] = picreg;
xops[2] = gen_rtx_CONST (SImode,
gen_rtx_MINUS (SImode,
@@ -19731,7 +19899,7 @@
(mult:DI (match_operand:DI 1 "memory_operand" "")
(match_operand:DI 2 "immediate_operand" "")))
(clobber (reg:CC FLAGS_REG))])]
- "TARGET_K8 && !optimize_size
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
&& (GET_CODE (operands[2]) != CONST_INT
|| !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))"
[(set (match_dup 3) (match_dup 1))
@@ -19745,7 +19913,7 @@
(mult:SI (match_operand:SI 1 "memory_operand" "")
(match_operand:SI 2 "immediate_operand" "")))
(clobber (reg:CC FLAGS_REG))])]
- "TARGET_K8 && !optimize_size
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
&& (GET_CODE (operands[2]) != CONST_INT
|| !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))"
[(set (match_dup 3) (match_dup 1))
@@ -19760,7 +19928,7 @@
(mult:SI (match_operand:SI 1 "memory_operand" "")
(match_operand:SI 2 "immediate_operand" ""))))
(clobber (reg:CC FLAGS_REG))])]
- "TARGET_K8 && !optimize_size
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
&& (GET_CODE (operands[2]) != CONST_INT
|| !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))"
[(set (match_dup 3) (match_dup 1))
@@ -19778,7 +19946,7 @@
(match_operand:DI 2 "const_int_operand" "")))
(clobber (reg:CC FLAGS_REG))])
(match_scratch:DI 3 "r")]
- "TARGET_K8 && !optimize_size
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
&& CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')"
[(set (match_dup 3) (match_dup 2))
(parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3)))
@@ -19794,7 +19962,7 @@
(match_operand:SI 2 "const_int_operand" "")))
(clobber (reg:CC FLAGS_REG))])
(match_scratch:SI 3 "r")]
- "TARGET_K8 && !optimize_size
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
&& CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')"
[(set (match_dup 3) (match_dup 2))
(parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3)))
@@ -19810,7 +19978,7 @@
(match_operand:HI 2 "immediate_operand" "")))
(clobber (reg:CC FLAGS_REG))])
(match_scratch:HI 3 "r")]
- "TARGET_K8 && !optimize_size"
+ "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size"
[(set (match_dup 3) (match_dup 2))
(parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3)))
(clobber (reg:CC FLAGS_REG))])]
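Like the conversion splitters earlier, the integer-multiply splitters above are widened from K8-only to K8-or-generic64 tuning: an imul with a memory source, or with an immediate that does not satisfy the small 'K' range, is broken into a separate load followed by a register multiply. A hedged C sketch of the affected shape (identifiers and the exact constant are illustrative):

/* Sketch only: a multiply of a memory operand by a large constant.  With
   K8/generic64 tuning and optimization enabled, the memory operand is
   loaded into a register first and the multiply is performed
   register-to-register.  */
long
scale (const long *p)
{
  return *p * 1000003;   /* does not fit the 'K' (8-bit immediate) range */
}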
diff --git a/gcc/config/i386/ppro.md b/gcc/config/i386/ppro.md
index 08010daaf7f..9f069891484 100644
--- a/gcc/config/i386/ppro.md
+++ b/gcc/config/i386/ppro.md
@@ -137,25 +137,25 @@
;; on decoder 0, and say that it takes a little while before the result
;; is available.
(define_insn_reservation "ppro_complex_insn" 6
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(eq_attr "type" "other,multi,call,callv,str"))
"decoder0")
;; imov with memory operands does not use the integer units.
(define_insn_reservation "ppro_imov" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "imov")))
"decodern,(p0|p1)")
(define_insn_reservation "ppro_imov_load" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(eq_attr "type" "imov")))
"decodern,p2")
(define_insn_reservation "ppro_imov_store" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(eq_attr "type" "imov")))
"decoder0,p4+p3")
@@ -163,20 +163,20 @@
;; imovx always decodes to one uop, and also doesn't use the integer
;; units if it has memory operands.
(define_insn_reservation "ppro_imovx" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "imovx")))
"decodern,(p0|p1)")
(define_insn_reservation "ppro_imovx_load" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(eq_attr "type" "imovx")))
"decodern,p2")
;; lea executes on port 0 with latency one and throughput 1.
(define_insn_reservation "ppro_lea" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "lea")))
"decodern,p0")
@@ -185,19 +185,19 @@
;; The load and store units need to be reserved when memory operands
;; are involved.
(define_insn_reservation "ppro_shift_rotate" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "ishift,ishift1,rotate,rotate1")))
"decodern,p0")
(define_insn_reservation "ppro_shift_rotate_mem" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "!none")
(eq_attr "type" "ishift,ishift1,rotate,rotate1")))
"decoder0,p2+p0,p4+p3")
(define_insn_reservation "ppro_cld" 2
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(eq_attr "type" "cld"))
"decoder0,(p0+p1)*2")
@@ -219,32 +219,32 @@
;; results because we can assume these instructions can decode on all
;; decoders.
(define_insn_reservation "ppro_branch" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "ibr")))
"decodern,p1")
;; ??? Indirect branches probably have worse latency than this.
(define_insn_reservation "ppro_indirect_branch" 6
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "!none")
(eq_attr "type" "ibr")))
"decoder0,p2+p1")
(define_insn_reservation "ppro_leave" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(eq_attr "type" "leave"))
"decoder0,p2+(p0|p1),(p0|p1)")
;; imul has throughput one, but latency 4, and can only execute on port 0.
(define_insn_reservation "ppro_imul" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "imul")))
"decodern,p0")
(define_insn_reservation "ppro_imul_mem" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "!none")
(eq_attr "type" "imul")))
"decoder0,p2+p0")
@@ -253,42 +253,42 @@
;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
;; These issue latencies are modelled via the ppro_div automaton.
(define_insn_reservation "ppro_idiv_QI" 19
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "QI")
(eq_attr "type" "idiv"))))
"decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
(define_insn_reservation "ppro_idiv_QI_load" 19
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "QI")
(eq_attr "type" "idiv"))))
"decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
(define_insn_reservation "ppro_idiv_HI" 23
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "HI")
(eq_attr "type" "idiv"))))
"decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
(define_insn_reservation "ppro_idiv_HI_load" 23
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "HI")
(eq_attr "type" "idiv"))))
"decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
(define_insn_reservation "ppro_idiv_SI" 39
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SI")
(eq_attr "type" "idiv"))))
"decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
(define_insn_reservation "ppro_idiv_SI_load" 39
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SI")
(eq_attr "type" "idiv"))))
@@ -299,85 +299,85 @@
;; has throughput "1/cycle (align with FADD)". What do they
;; mean and how can we model that?
(define_insn_reservation "ppro_fop" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none,unknown")
(eq_attr "type" "fop")))
"decodern,p0")
(define_insn_reservation "ppro_fop_load" 5
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(eq_attr "type" "fop")))
"decoder0,p2+p0,p0")
(define_insn_reservation "ppro_fop_store" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(eq_attr "type" "fop")))
"decoder0,p0,p0,p0+p4+p3")
(define_insn_reservation "ppro_fop_both" 5
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "both")
(eq_attr "type" "fop")))
"decoder0,p2+p0,p0+p4+p3")
(define_insn_reservation "ppro_fsgn" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(eq_attr "type" "fsgn"))
"decodern,p0")
(define_insn_reservation "ppro_fistp" 5
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(eq_attr "type" "fistp"))
"decoder0,p0*2,p4+p3")
(define_insn_reservation "ppro_fcmov" 2
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(eq_attr "type" "fcmov"))
"decoder0,p0*2")
(define_insn_reservation "ppro_fcmp" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "fcmp")))
"decodern,p0")
(define_insn_reservation "ppro_fcmp_load" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(eq_attr "type" "fcmp")))
"decoder0,p2+p0")
(define_insn_reservation "ppro_fmov" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "fmov")))
"decodern,p0")
(define_insn_reservation "ppro_fmov_load" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "!XF")
(eq_attr "type" "fmov"))))
"decodern,p2")
(define_insn_reservation "ppro_fmov_XF_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "XF")
(eq_attr "type" "fmov"))))
"decoder0,(p2+p0)*2")
(define_insn_reservation "ppro_fmov_store" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(and (eq_attr "mode" "!XF")
(eq_attr "type" "fmov"))))
"decodern,p0")
(define_insn_reservation "ppro_fmov_XF_store" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(and (eq_attr "mode" "XF")
(eq_attr "type" "fmov"))))
@@ -386,13 +386,13 @@
;; fmul executes on port 0 with latency 5. It has issue latency 2,
;; but we don't model this.
(define_insn_reservation "ppro_fmul" 5
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "fmul")))
"decoder0,p0*2")
(define_insn_reservation "ppro_fmul_load" 6
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(eq_attr "type" "fmul")))
"decoder0,p2+p0,p0")
@@ -403,42 +403,42 @@
;; that. Throughput is equal to latency - 1, which we model using the
;; ppro_div automaton.
(define_insn_reservation "ppro_fdiv_SF" 18
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "fdiv,fpspc"))))
"decodern,p0+fdiv,fdiv*16")
(define_insn_reservation "ppro_fdiv_SF_load" 19
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SF")
(eq_attr "type" "fdiv,fpspc"))))
"decoder0,p2+p0+fdiv,fdiv*16")
(define_insn_reservation "ppro_fdiv_DF" 32
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "DF")
(eq_attr "type" "fdiv,fpspc"))))
"decodern,p0+fdiv,fdiv*30")
(define_insn_reservation "ppro_fdiv_DF_load" 33
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "DF")
(eq_attr "type" "fdiv,fpspc"))))
"decoder0,p2+p0+fdiv,fdiv*30")
(define_insn_reservation "ppro_fdiv_XF" 38
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "XF")
(eq_attr "type" "fdiv,fpspc"))))
"decodern,p0+fdiv,fdiv*36")
(define_insn_reservation "ppro_fdiv_XF_load" 39
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "XF")
(eq_attr "type" "fdiv,fpspc"))))
@@ -456,31 +456,31 @@
;; so they behave as "simple" instructions that need no special modelling.
;; We only have to model mmxshft and mmxmul.
(define_insn_reservation "ppro_mmx_shft" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "mmxshft")))
"decodern,p1")
(define_insn_reservation "ppro_mmx_shft_load" 2
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "mmxshft")))
"decoder0,p2+p1")
(define_insn_reservation "ppro_mmx_mul" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "mmxmul")))
"decodern,p0")
(define_insn_reservation "ppro_mmx_mul_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(eq_attr "type" "mmxmul")))
"decoder0,p2+p0")
(define_insn_reservation "ppro_sse_mmxcvt" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "mode" "DI")
(eq_attr "type" "mmxcvt")))
"decodern,p1")
@@ -488,7 +488,7 @@
;; FIXME: These are Pentium III only, but we cannot tell here if
;; we're generating code for PentiumPro/Pentium II or Pentium III
;; (define_insn_reservation "ppro_sse_mmxshft" 2
-;; (and (eq_attr "cpu" "pentiumpro")
+;; (and (eq_attr "cpu" "pentiumpro,generic32")
;; (and (eq_attr "mode" "DI")
;; (eq_attr "type" "mmxshft")))
;; "decodern,p0")
@@ -499,69 +499,69 @@
;; The sfence instruction.
(define_insn_reservation "ppro_sse_sfence" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "unknown")
(eq_attr "type" "sse")))
"decoder0,p4+p3")
;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
(define_insn_reservation "ppro_sse_SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "mode" "SF")
(eq_attr "type" "sse")))
"decodern,p0")
(define_insn_reservation "ppro_sse_add_SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "sseadd"))))
"decodern,p1")
(define_insn_reservation "ppro_sse_add_SF_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SF")
(eq_attr "type" "sseadd"))))
"decoder0,p2+p1")
(define_insn_reservation "ppro_sse_cmp_SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssecmp"))))
"decoder0,p1")
(define_insn_reservation "ppro_sse_cmp_SF_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssecmp"))))
"decoder0,p2+p1")
(define_insn_reservation "ppro_sse_comi_SF" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssecomi"))))
"decodern,p0")
(define_insn_reservation "ppro_sse_comi_SF_load" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssecomi"))))
"decoder0,p2+p0")
(define_insn_reservation "ppro_sse_mul_SF" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssemul"))))
"decodern,p0")
(define_insn_reservation "ppro_sse_mul_SF_load" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssemul"))))
@@ -569,109 +569,109 @@
;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
(define_insn_reservation "ppro_sse_div_SF" 18
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssediv"))))
"decoder0,p0*17")
(define_insn_reservation "ppro_sse_div_SF_load" 18
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssediv"))))
"decoder0,(p2+p0),p0*16")
(define_insn_reservation "ppro_sse_icvt_SF" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "mode" "SF")
(eq_attr "type" "sseicvt")))
"decoder0,(p2+p1)*2")
(define_insn_reservation "ppro_sse_icvt_SI" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "mode" "SI")
(eq_attr "type" "sseicvt")))
"decoder0,(p2+p1)")
(define_insn_reservation "ppro_sse_mov_SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssemov"))))
"decoder0,(p0|p1)")
(define_insn_reservation "ppro_sse_mov_SF_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssemov"))))
"decoder0,p2+(p0|p1)")
(define_insn_reservation "ppro_sse_mov_SF_store" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(and (eq_attr "mode" "SF")
(eq_attr "type" "ssemov"))))
"decoder0,p4+p3")
(define_insn_reservation "ppro_sse_V4SF" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "sse")))
"decoder0,p1*2")
(define_insn_reservation "ppro_sse_add_V4SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "sseadd"))))
"decoder0,p1*2")
(define_insn_reservation "ppro_sse_add_V4SF_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "sseadd"))))
"decoder0,(p2+p1)*2")
(define_insn_reservation "ppro_sse_cmp_V4SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssecmp"))))
"decoder0,p1*2")
(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssecmp"))))
"decoder0,(p2+p1)*2")
(define_insn_reservation "ppro_sse_cvt_V4SF" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none,unknown")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssecvt"))))
"decoder0,p1*2")
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "!none,unknown")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssecmp"))))
"decoder0,p1,p4+p3")
(define_insn_reservation "ppro_sse_mul_V4SF" 5
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssemul"))))
"decoder0,p0*2")
(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssemul"))))
@@ -679,49 +679,49 @@
;; FIXME: p0 really closed this long???
(define_insn_reservation "ppro_sse_div_V4SF" 48
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssediv"))))
"decoder0,p0*34")
(define_insn_reservation "ppro_sse_div_V4SF_load" 48
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssediv"))))
"decoder0,(p2+p0)*2,p0*32")
(define_insn_reservation "ppro_sse_log_V4SF" 2
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "sselog,sselog1"))))
"decodern,p1")
(define_insn_reservation "ppro_sse_log_V4SF_load" 2
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "sselog,sselog1"))))
"decoder0,(p2+p1)")
(define_insn_reservation "ppro_sse_mov_V4SF" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssemov"))))
"decoder0,(p0|p1)*2")
(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssemov"))))
"decoder0,p2*2")
(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(and (eq_attr "mode" "V4SF")
(eq_attr "type" "ssemov"))))
@@ -735,7 +735,7 @@
;; reg-reg instructions produce 1 uop so they can be decoded on any of
;; the three decoders.
(define_insn_reservation "ppro_insn" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "none,unknown")
(eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
"decodern,(p0|p1)")
@@ -743,13 +743,13 @@
;; read-modify and register-memory instructions have two or three uops,
;; so they have to be decoded on decoder0.
(define_insn_reservation "ppro_insn_load" 3
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "load")
(eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
"decoder0,p2+(p0|p1)")
(define_insn_reservation "ppro_insn_store" 1
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "store")
(eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
"decoder0,(p0|p1),p4+p3")
@@ -757,7 +757,7 @@
;; read-modify-store instructions produce 4 uops so they have to be
;; decoded on decoder0 as well.
(define_insn_reservation "ppro_insn_both" 4
- (and (eq_attr "cpu" "pentiumpro")
+ (and (eq_attr "cpu" "pentiumpro,generic32")
(and (eq_attr "memory" "both")
(eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
"decoder0,p2+(p0|p1),p4+p3")
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index bc16628439b..2d423d4053a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1,5 +1,5 @@
;; Predicate definitions for IA-32 and x86-64.
-;; Copyright (C) 2004, 2005 Free Software Foundation, Inc.
+;; Copyright (C) 2004, 2005, 2006 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
@@ -467,6 +467,15 @@
(and (match_code "symbol_ref")
(match_test "SYMBOL_REF_TLS_MODEL (op) != 0")))
+(define_predicate "tls_modbase_operand"
+ (and (match_code "symbol_ref")
+ (match_test "op == ix86_tls_module_base ()")))
+
+(define_predicate "tp_or_register_operand"
+ (ior (match_operand 0 "register_operand")
+ (and (match_code "unspec")
+ (match_test "XINT (op, 1) == UNSPEC_TP"))))
+
;; Test for a pc-relative call operand
(define_predicate "constant_call_address_operand"
(ior (match_code "symbol_ref")
@@ -619,7 +628,7 @@
{
/* On Pentium4, the inc and dec operations cause an extra dependency on the
flags register, since the carry flag is not set. */
- if ((TARGET_PENTIUM4 || TARGET_NOCONA) && !optimize_size)
+ if (!TARGET_USE_INCDEC && !optimize_size)
return 0;
return op == const1_rtx || op == constm1_rtx;
})
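The predicate change above replaces the explicit Pentium 4/Nocona check with the new TARGET_USE_INCDEC tuning flag, so any tuning that marks inc/dec as creating a false flags dependency now rejects the const1/constm1 alternatives. A small hedged C illustration (the function name is illustrative):

/* Sketch only: when TARGET_USE_INCDEC is clear and we are not optimizing
   for size, the increment below is emitted as an add-immediate rather
   than an inc, avoiding the partial flags dependency.  */
void
bump (int *p)
{
  (*p)++;
}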
@@ -698,6 +707,11 @@
if (GET_CODE (op) != MEM)
return 1;
+ /* All patterns using aligned_operand on memory operands end up promoting
+ the memory operand to 64 bits, causing a memory mismatch stall. */
+ if (TARGET_MEMORY_MISMATCH_STALL && !optimize_size)
+ return 0;
+
/* Don't even try to do any aligned optimizations with volatiles. */
if (MEM_VOLATILE_P (op))
return 0;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bd943f7665d..88c7adf081d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1,5 +1,5 @@
;; GCC machine description for SSE instructions
-;; Copyright (C) 2005
+;; Copyright (C) 2005, 2006
;; Free Software Foundation, Inc.
;;
;; This file is part of GCC.
@@ -2700,6 +2700,48 @@
DONE;
})
+(define_expand "sdot_prodv8hi"
+ [(match_operand:V4SI 0 "register_operand" "")
+ (match_operand:V8HI 1 "nonimmediate_operand" "")
+ (match_operand:V8HI 2 "nonimmediate_operand" "")
+ (match_operand:V4SI 3 "register_operand" "")]
+ "TARGET_SSE2"
+{
+ rtx t = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2]));
+ emit_insn (gen_addv4si3 (operands[0], operands[3], t));
+ DONE;
+})
+
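The new sdot_prodv8hi expander gives the vectorizer an SSE2 implementation of the signed dot-product reduction: pmaddwd multiplies the 16-bit elements and adds adjacent pairs, and the V4SI result is then added into the accumulator. A hedged C sketch of the loop shape this targets (identifiers are illustrative):

/* Sketch only: a signed 16-bit dot product accumulated in 32 bits; the
   vectorizer's dot-product pattern maps the widening multiply-add onto
   pmaddwd plus a vector add.  */
int
sdot (const short *a, const short *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];
  return sum;
}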
+(define_expand "udot_prodv4si"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")
+ (match_operand:V2DI 3 "register_operand" "")]
+ "TARGET_SSE2"
+{
+ rtx t1, t2, t3, t4;
+
+ t1 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2]));
+ emit_insn (gen_addv2di3 (t1, t1, operands[3]));
+
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
+ gen_lowpart (TImode, operands[1]),
+ GEN_INT (32)));
+ emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
+ gen_lowpart (TImode, operands[2]),
+ GEN_INT (32)));
+
+ t4 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3));
+
+ emit_insn (gen_addv2di3 (operands[0], t1, t4));
+ DONE;
+})
+
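udot_prodv4si has no single-instruction counterpart, so the expander pieces it together: pmuludq multiplies the even 32-bit lanes, a 32-bit right shift of the full 128-bit registers brings the odd lanes down for a second pmuludq, and two V2DI additions fold in the accumulator. The scalar equivalent, as a hedged sketch (identifiers are illustrative):

/* Sketch only: an unsigned 32x32->64-bit dot product accumulated in a
   64-bit sum, the operation the expander above builds from pmuludq,
   shifts and 64-bit vector adds.  */
unsigned long long
udot (const unsigned int *a, const unsigned int *b, int n)
{
  unsigned long long sum = 0;
  for (int i = 0; i < n; i++)
    sum += (unsigned long long) a[i] * b[i];
  return sum;
}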
(define_insn "ashr<mode>3"
[(set (match_operand:SSEMODE24 0 "register_operand" "=x")
(ashiftrt:SSEMODE24
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 26ec2be5cae..d4bf08e228c 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -1,5 +1,5 @@
;; AltiVec patterns.
-;; Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+;; Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
;; Contributed by Aldy Hernandez (aldy@quesejoda.com)
;; This file is part of GCC.
@@ -2150,6 +2150,77 @@
DONE;
}")
+(define_expand "udot_prod<mode>"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 3 "register_operand" "v")
+ (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v")
+ (match_operand:VIshort 2 "register_operand" "v")]
+ UNSPEC_VMSUMU)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ emit_insn (gen_altivec_vmsumu<VI_char>m (operands[0], operands[1], operands[2], operands[3]));
+ DONE;
+}")
+
+(define_expand "sdot_prodv8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 3 "register_operand" "v")
+ (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")
+ (match_operand:V8HI 2 "register_operand" "v")]
+ UNSPEC_VMSUMSHM)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ emit_insn (gen_altivec_vmsumshm (operands[0], operands[1], operands[2], operands[3]));
+ DONE;
+}")
+
+(define_expand "widen_usum<mode>3"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 2 "register_operand" "v")
+ (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v")]
+ UNSPEC_VMSUMU)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ rtx vones = gen_reg_rtx (GET_MODE (operands[1]));
+
+ emit_insn (gen_altivec_vspltis<VI_char> (vones, const1_rtx));
+ emit_insn (gen_altivec_vmsumu<VI_char>m (operands[0], operands[1], vones, operands[2]));
+ DONE;
+}")
+
+(define_expand "widen_ssumv16qi3"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 2 "register_operand" "v")
+ (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")]
+ UNSPEC_VMSUMM)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ rtx vones = gen_reg_rtx (V16QImode);
+
+ emit_insn (gen_altivec_vspltisb (vones, const1_rtx));
+ emit_insn (gen_altivec_vmsummbm (operands[0], operands[1], vones, operands[2]));
+ DONE;
+}")
+
+(define_expand "widen_ssumv8hi3"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 2 "register_operand" "v")
+ (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")]
+ UNSPEC_VMSUMSHM)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ rtx vones = gen_reg_rtx (V8HImode);
+
+ emit_insn (gen_altivec_vspltish (vones, const1_rtx));
+ emit_insn (gen_altivec_vmsumshm (operands[0], operands[1], vones, operands[2]));
+ DONE;
+}")
+
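On the AltiVec side the same reductions are expressed with the vmsum multiply-sum instructions: the dot-product expanders feed both vector inputs to vmsum, while the widening-sum expanders splat a vector of ones for the second input so the multiply-sum degenerates into a plain widening sum. A hedged C sketch of a loop the widening-sum expanders cover (identifiers are illustrative):

/* Sketch only: narrow elements summed into a wider accumulator, which the
   widen_usum/widen_ssum expanders implement as a multiply-sum against a
   splatted vector of ones.  */
int
widen_sum (const signed char *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i];
  return sum;
}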
(define_expand "negv4sf2"
[(use (match_operand:V4SF 0 "register_operand" ""))
(use (match_operand:V4SF 1 "register_operand" ""))]
diff --git a/gcc/config/rs6000/x-darwin b/gcc/config/rs6000/x-darwin
index bcf1c9ecb1f..033ab6bf54c 100644
--- a/gcc/config/rs6000/x-darwin
+++ b/gcc/config/rs6000/x-darwin
@@ -1,4 +1,4 @@
host-ppc-darwin.o : $(srcdir)/config/rs6000/host-darwin.c \
$(CONFIG_H) $(SYSTEM_H) coretypes.h hosthooks.h $(HOSTHOOKS_DEF_H) toplev.h \
- diagnostic.h config/host-darwin.h
+ config/host-darwin.h $(DIAGNOSTIC_H)
$(CC) -c $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< -o $@
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index d4e515c50e8..40a32d4508f 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -321,6 +321,10 @@
;; in "RRE" for DImode and "RR" for SImode.
(define_mode_attr E [(DI "E") (SI "")])
+;; This attribute handles differences in the instruction 'type' and makes RX<Y>
+;; result in "RXY" for DImode and "RX" for SImode.
+(define_mode_attr Y [(DI "Y") (SI "")])
+
;; This attribute handles differences in the instruction 'type' and will result
;; in "RSE" for TImode and "RS" for DImode.
(define_mode_attr TE [(TI "E") (DI "")])
@@ -329,6 +333,12 @@
;; and "lcr" in SImode.
(define_mode_attr g [(DI "g") (SI "")])
+;; In GPR templates, a string like "sl<y>" will expand to "slg" in DImode
+;; and "sly" in SImode. This is useful because on 64-bit targets the ..g
+;; instructions were enhanced with long displacements, whereas the 31-bit
+;; instructions got a ..y variant for long displacements.
+(define_mode_attr y [(DI "g") (SI "y")])
+
;; In DP templates, a string like "cds<g>" will expand to "cdsg" in TImode
;; and "cds" in DImode.
(define_mode_attr tg [(TI "g") (DI "")])
@@ -626,18 +636,6 @@
cgf\t%0,%1"
[(set_attr "op_type" "RRE,RXY")])
-(define_insn "*cmpdi_ccs"
- [(set (reg CC_REGNUM)
- (compare (match_operand:DI 0 "register_operand" "d,d,d,d")
- (match_operand:DI 1 "general_operand" "d,K,Os,m")))]
- "s390_match_ccmode(insn, CCSmode) && TARGET_64BIT"
- "@
- cgr\t%0,%1
- cghi\t%0,%h1
- cgfi\t%0,%1
- cg\t%0,%1"
- [(set_attr "op_type" "RRE,RI,RIL,RXY")])
-
(define_insn "*cmpsi_ccs_sign"
[(set (reg CC_REGNUM)
(compare (sign_extend:SI (match_operand:HI 1 "memory_operand" "R,T"))
@@ -648,18 +646,18 @@
chy\t%0,%1"
[(set_attr "op_type" "RX,RXY")])
-(define_insn "*cmpsi_ccs"
+(define_insn "*cmp<mode>_ccs"
[(set (reg CC_REGNUM)
- (compare (match_operand:SI 0 "register_operand" "d,d,d,d,d")
- (match_operand:SI 1 "general_operand" "d,K,Os,R,T")))]
+ (compare (match_operand:GPR 0 "register_operand" "d,d,d,d,d")
+ (match_operand:GPR 1 "general_operand" "d,K,Os,R,T")))]
"s390_match_ccmode(insn, CCSmode)"
"@
- cr\t%0,%1
- chi\t%0,%h1
- cfi\t%0,%1
- c\t%0,%1
- cy\t%0,%1"
- [(set_attr "op_type" "RR,RI,RIL,RX,RXY")])
+ c<g>r\t%0,%1
+ c<g>hi\t%0,%h1
+ c<g>fi\t%0,%1
+ c<g>\t%0,%1
+ c<y>\t%0,%1"
+ [(set_attr "op_type" "RR<E>,RI,RIL,RX<Y>,RXY")])
; Compare (unsigned) instructions
@@ -2746,11 +2744,6 @@
})
;
-; extendqihi2 instruction pattern(s).
-;
-
-
-;
; zero_extendsidi2 instruction pattern(s).
;
@@ -3264,6 +3257,15 @@
; adddi3 instruction pattern(s).
;
+(define_expand "adddi3"
+ [(parallel
+ [(set (match_operand:DI 0 "register_operand" "")
+ (plus:DI (match_operand:DI 1 "nonimmediate_operand" "")
+ (match_operand:DI 2 "general_operand" "")))
+ (clobber (reg:CC CC_REGNUM))])]
+ ""
+ "")
+
(define_insn "*adddi3_sign"
[(set (match_operand:DI 0 "register_operand" "=d,d")
(plus:DI (sign_extend:DI (match_operand:SI 2 "general_operand" "d,m"))
@@ -3311,128 +3313,6 @@
algf\t%0,%2"
[(set_attr "op_type" "RRE,RXY")])
-(define_insn "*adddi3_imm_cc"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "0,0")
- (match_operand:DI 2 "const_int_operand" "K,Os"))
- (const_int 0)))
- (set (match_operand:DI 0 "register_operand" "=d,d")
- (plus:DI (match_dup 1) (match_dup 2)))]
- "TARGET_64BIT
- && s390_match_ccmode (insn, CCAmode)
- && (CONST_OK_FOR_CONSTRAINT_P (INTVAL (operands[2]), 'K', \"K\")
- || CONST_OK_FOR_CONSTRAINT_P (INTVAL (operands[2]), 'O', \"Os\"))"
- "@
- aghi\t%0,%h2
- agfi\t%0,%2"
- [(set_attr "op_type" "RI,RIL")])
-
-(define_insn "*adddi3_carry1_cc"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,0")
- (match_operand:DI 2 "general_operand" "d,Op,On,m"))
- (match_dup 1)))
- (set (match_operand:DI 0 "register_operand" "=d,d,d,d")
- (plus:DI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCL1mode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- algfi\t%0,%2
- slgfi\t%0,%n2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RIL,RIL,RXY")])
-
-(define_insn "*adddi3_carry1_cconly"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (match_dup 1)))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode (insn, CCL1mode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*adddi3_carry2_cc"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,0")
- (match_operand:DI 2 "general_operand" "d,Op,On,m"))
- (match_dup 2)))
- (set (match_operand:DI 0 "register_operand" "=d,d,d,d")
- (plus:DI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCL1mode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- algfi\t%0,%2
- slgfi\t%0,%n2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RIL,RIL,RXY")])
-
-(define_insn "*adddi3_carry2_cconly"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (match_dup 2)))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode (insn, CCL1mode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*adddi3_cc"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,0")
- (match_operand:DI 2 "general_operand" "d,Op,On,m"))
- (const_int 0)))
- (set (match_operand:DI 0 "register_operand" "=d,d,d,d")
- (plus:DI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCLmode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- algfi\t%0,%2
- slgfi\t%0,%n2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RIL,RIL,RXY")])
-
-(define_insn "*adddi3_cconly"
- [(set (reg CC_REGNUM)
- (compare (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (const_int 0)))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode (insn, CCLmode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*adddi3_cconly2"
- [(set (reg CC_REGNUM)
- (compare (match_operand:DI 1 "nonimmediate_operand" "%0,0")
- (neg:SI (match_operand:DI 2 "general_operand" "d,m"))))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode(insn, CCLmode) && TARGET_64BIT"
- "@
- algr\t%0,%2
- alg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*adddi3_64"
- [(set (match_operand:DI 0 "register_operand" "=d,d,d,d,d")
- (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,0,0")
- (match_operand:DI 2 "general_operand" "d,K,Op,On,m") ) )
- (clobber (reg:CC CC_REGNUM))]
- "TARGET_64BIT"
- "@
- agr\t%0,%2
- aghi\t%0,%h2
- algfi\t%0,%2
- slgfi\t%0,%n2
- ag\t%0,%2"
- [(set_attr "op_type" "RRE,RI,RIL,RIL,RXY")])
-
(define_insn_and_split "*adddi3_31z"
[(set (match_operand:DI 0 "register_operand" "=&d")
(plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0")
@@ -3489,155 +3369,163 @@
operands[8] = operand_subword (operands[2], 1, 0, DImode);
operands[9] = gen_label_rtx ();")
-(define_expand "adddi3"
+;
+; addsi3 instruction pattern(s).
+;
+
+(define_expand "addsi3"
[(parallel
- [(set (match_operand:DI 0 "register_operand" "")
- (plus:DI (match_operand:DI 1 "nonimmediate_operand" "")
- (match_operand:DI 2 "general_operand" "")))
+ [(set (match_operand:SI 0 "register_operand" "")
+ (plus:SI (match_operand:SI 1 "nonimmediate_operand" "")
+ (match_operand:SI 2 "general_operand" "")))
(clobber (reg:CC CC_REGNUM))])]
""
"")
+(define_insn "*addsi3_sign"
+ [(set (match_operand:SI 0 "register_operand" "=d,d")
+ (plus:SI (sign_extend:SI (match_operand:HI 2 "memory_operand" "R,T"))
+ (match_operand:SI 1 "register_operand" "0,0")))
+ (clobber (reg:CC CC_REGNUM))]
+ ""
+ "@
+ ah\t%0,%2
+ ahy\t%0,%2"
+ [(set_attr "op_type" "RX,RXY")])
+
;
-; addsi3 instruction pattern(s).
+; add(di|si)3 instruction pattern(s).
;
-(define_insn "*addsi3_imm_cc"
- [(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
- (match_operand:SI 2 "const_int_operand" "K,Os"))
- (const_int 0)))
- (set (match_operand:SI 0 "register_operand" "=d,d")
- (plus:SI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCAmode)
- && (CONST_OK_FOR_CONSTRAINT_P (INTVAL (operands[2]), 'K', \"K\")
- || CONST_OK_FOR_CONSTRAINT_P (INTVAL (operands[2]), 'O', \"Os\"))
- && INTVAL (operands[2]) != -((HOST_WIDE_INT)1 << 31)"
+(define_insn "*add<mode>3"
+ [(set (match_operand:GPR 0 "register_operand" "=d,d,d,d,d,d")
+ (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0,0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,K,Op,On,R,T") ) )
+ (clobber (reg:CC CC_REGNUM))]
+ ""
"@
- ahi\t%0,%h2
- afi\t%0,%2"
- [(set_attr "op_type" "RI,RIL")])
+ a<g>r\t%0,%2
+ a<g>hi\t%0,%h2
+ al<g>fi\t%0,%2
+ sl<g>fi\t%0,%n2
+ a<g>\t%0,%2
+ a<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RI,RIL,RIL,RX<Y>,RXY")])
-(define_insn "*addsi3_carry1_cc"
+(define_insn "*add<mode>3_carry1_cc"
[(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0,0")
- (match_operand:SI 2 "general_operand" "d,Os,R,T"))
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,Op,On,R,T"))
(match_dup 1)))
- (set (match_operand:SI 0 "register_operand" "=d,d,d,d")
- (plus:SI (match_dup 1) (match_dup 2)))]
+ (set (match_operand:GPR 0 "register_operand" "=d,d,d,d,d")
+ (plus:GPR (match_dup 1) (match_dup 2)))]
"s390_match_ccmode (insn, CCL1mode)"
"@
- alr\t%0,%2
- alfi\t%0,%o2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RIL,RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>fi\t%0,%2
+ sl<g>fi\t%0,%n2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RIL,RIL,RX<Y>,RXY")])
-(define_insn "*addsi3_carry1_cconly"
+(define_insn "*add<mode>3_carry1_cconly"
[(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(match_dup 1)))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
"s390_match_ccmode (insn, CCL1mode)"
"@
- alr\t%0,%2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*addsi3_carry2_cc"
+(define_insn "*add<mode>3_carry2_cc"
[(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0,0")
- (match_operand:SI 2 "general_operand" "d,Os,R,T"))
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,Op,On,R,T"))
(match_dup 2)))
- (set (match_operand:SI 0 "register_operand" "=d,d,d,d")
- (plus:SI (match_dup 1) (match_dup 2)))]
+ (set (match_operand:GPR 0 "register_operand" "=d,d,d,d,d")
+ (plus:GPR (match_dup 1) (match_dup 2)))]
"s390_match_ccmode (insn, CCL1mode)"
"@
- alr\t%0,%2
- alfi\t%0,%o2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RIL,RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>fi\t%0,%2
+ sl<g>fi\t%0,%n2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RIL,RIL,RX<Y>,RXY")])
-(define_insn "*addsi3_carry2_cconly"
+(define_insn "*add<mode>3_carry2_cconly"
[(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(match_dup 2)))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
"s390_match_ccmode (insn, CCL1mode)"
"@
- alr\t%0,%2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*addsi3_cc"
+(define_insn "*add<mode>3_cc"
[(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0,0")
- (match_operand:SI 2 "general_operand" "d,Os,R,T"))
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,Op,On,R,T"))
(const_int 0)))
- (set (match_operand:SI 0 "register_operand" "=d,d,d,d")
- (plus:SI (match_dup 1) (match_dup 2)))]
+ (set (match_operand:GPR 0 "register_operand" "=d,d,d,d,d")
+ (plus:GPR (match_dup 1) (match_dup 2)))]
"s390_match_ccmode (insn, CCLmode)"
"@
- alr\t%0,%2
- alfi\t%0,%o2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RIL,RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>fi\t%0,%2
+ sl<g>fi\t%0,%n2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RIL,RIL,RX<Y>,RXY")])
-(define_insn "*addsi3_cconly"
+(define_insn "*add<mode>3_cconly"
[(set (reg CC_REGNUM)
- (compare (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(const_int 0)))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
"s390_match_ccmode (insn, CCLmode)"
"@
- alr\t%0,%2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*addsi3_cconly2"
+(define_insn "*add<mode>3_cconly2"
[(set (reg CC_REGNUM)
- (compare (match_operand:SI 1 "nonimmediate_operand" "%0,0,0")
- (neg:SI (match_operand:SI 2 "general_operand" "d,R,T"))))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
- "s390_match_ccmode (insn, CCLmode)"
- "@
- alr\t%0,%2
- al\t%0,%2
- aly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
-
-(define_insn "*addsi3_sign"
- [(set (match_operand:SI 0 "register_operand" "=d,d")
- (plus:SI (sign_extend:SI (match_operand:HI 2 "memory_operand" "R,T"))
- (match_operand:SI 1 "register_operand" "0,0")))
- (clobber (reg:CC CC_REGNUM))]
- ""
+ (compare (match_operand:GPR 1 "nonimmediate_operand" "%0,0,0")
+ (neg:GPR (match_operand:GPR 2 "general_operand" "d,R,T"))))
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
+ "s390_match_ccmode(insn, CCLmode)"
"@
- ah\t%0,%2
- ahy\t%0,%2"
- [(set_attr "op_type" "RX,RXY")])
+ al<g>r\t%0,%2
+ al<g>\t%0,%2
+ al<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "addsi3"
- [(set (match_operand:SI 0 "register_operand" "=d,d,d,d,d")
- (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,0,0,0")
- (match_operand:SI 2 "general_operand" "d,K,Os,R,T")))
- (clobber (reg:CC CC_REGNUM))]
- ""
+(define_insn "*add<mode>3_imm_cc"
+ [(set (reg CC_REGNUM)
+ (compare (plus:GPR (match_operand:GPR 1 "nonimmediate_operand" "0,0")
+ (match_operand:GPR 2 "const_int_operand" "K,Os"))
+ (const_int 0)))
+ (set (match_operand:GPR 0 "register_operand" "=d,d")
+ (plus:GPR (match_dup 1) (match_dup 2)))]
+ "s390_match_ccmode (insn, CCAmode)
+ && (CONST_OK_FOR_CONSTRAINT_P (INTVAL (operands[2]), 'K', \"K\")
+ || CONST_OK_FOR_CONSTRAINT_P (INTVAL (operands[2]), 'O', \"Os\"))
+ && INTVAL (operands[2]) != -((HOST_WIDE_INT)1 << (GET_MODE_BITSIZE(<MODE>mode) - 1))"
"@
- ar\t%0,%2
- ahi\t%0,%h2
- afi\t%0,%2
- a\t%0,%2
- ay\t%0,%2"
- [(set_attr "op_type" "RR,RI,RIL,RX,RXY")])
+ a<g>hi\t%0,%h2
+ a<g>fi\t%0,%2"
+ [(set_attr "op_type" "RI,RIL")])
;
; add(df|sf)3 instruction pattern(s).
@@ -3740,6 +3628,15 @@
; subdi3 instruction pattern(s).
;
+(define_expand "subdi3"
+ [(parallel
+ [(set (match_operand:DI 0 "register_operand" "")
+ (minus:DI (match_operand:DI 1 "register_operand" "")
+ (match_operand:DI 2 "general_operand" "")))
+ (clobber (reg:CC CC_REGNUM))])]
+ ""
+ "")
+
(define_insn "*subdi3_sign"
[(set (match_operand:DI 0 "register_operand" "=d,d")
(minus:DI (match_operand:DI 1 "register_operand" "0,0")
@@ -3787,90 +3684,6 @@
slgf\t%0,%2"
[(set_attr "op_type" "RRE,RXY")])
-(define_insn "*subdi3_borrow_cc"
- [(set (reg CC_REGNUM)
- (compare (minus:DI (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (match_dup 1)))
- (set (match_operand:DI 0 "register_operand" "=d,d")
- (minus:DI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCL2mode) && TARGET_64BIT"
- "@
- slgr\t%0,%2
- slg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*subdi3_borrow_cconly"
- [(set (reg CC_REGNUM)
- (compare (minus:DI (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (match_dup 1)))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode (insn, CCL2mode) && TARGET_64BIT"
- "@
- slgr\t%0,%2
- slg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*subdi3_cc"
- [(set (reg CC_REGNUM)
- (compare (minus:DI (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (const_int 0)))
- (set (match_operand:DI 0 "register_operand" "=d,d")
- (minus:DI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCLmode) && TARGET_64BIT"
- "@
- slgr\t%0,%2
- slg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*subdi3_cc2"
- [(set (reg CC_REGNUM)
- (compare (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m")))
- (set (match_operand:DI 0 "register_operand" "=d,d")
- (minus:DI (match_dup 1) (match_dup 2)))]
- "s390_match_ccmode (insn, CCL3mode) && TARGET_64BIT"
- "@
- slgr\t%0,%2
- slg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*subdi3_cconly"
- [(set (reg CC_REGNUM)
- (compare (minus:DI (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m"))
- (const_int 0)))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode (insn, CCLmode) && TARGET_64BIT"
- "@
- slgr\t%0,%2
- slg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*subdi3_cconly2"
- [(set (reg CC_REGNUM)
- (compare (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m")))
- (clobber (match_scratch:DI 0 "=d,d"))]
- "s390_match_ccmode (insn, CCL3mode) && TARGET_64BIT"
- "@
- slgr\t%0,%2
- slg\t%0,%2"
- [(set_attr "op_type" "RRE,RXY")])
-
-(define_insn "*subdi3_64"
- [(set (match_operand:DI 0 "register_operand" "=d,d")
- (minus:DI (match_operand:DI 1 "register_operand" "0,0")
- (match_operand:DI 2 "general_operand" "d,m") ) )
- (clobber (reg:CC CC_REGNUM))]
- "TARGET_64BIT"
- "@
- sgr\t%0,%2
- sg\t%0,%2"
- [(set_attr "op_type" "RRE,RRE")])
-
(define_insn_and_split "*subdi3_31z"
[(set (match_operand:DI 0 "register_operand" "=&d")
(minus:DI (match_operand:DI 1 "register_operand" "0")
@@ -3927,121 +3740,124 @@
operands[8] = operand_subword (operands[2], 1, 0, DImode);
operands[9] = gen_label_rtx ();")
-(define_expand "subdi3"
+;
+; subsi3 instruction pattern(s).
+;
+
+(define_expand "subsi3"
[(parallel
- [(set (match_operand:DI 0 "register_operand" "")
- (minus:DI (match_operand:DI 1 "register_operand" "")
- (match_operand:DI 2 "general_operand" "")))
+ [(set (match_operand:SI 0 "register_operand" "")
+ (minus:SI (match_operand:SI 1 "register_operand" "")
+ (match_operand:SI 2 "general_operand" "")))
(clobber (reg:CC CC_REGNUM))])]
""
"")
+(define_insn "*subsi3_sign"
+ [(set (match_operand:SI 0 "register_operand" "=d,d")
+ (minus:SI (match_operand:SI 1 "register_operand" "0,0")
+ (sign_extend:SI (match_operand:HI 2 "memory_operand" "R,T"))))
+ (clobber (reg:CC CC_REGNUM))]
+ ""
+ "@
+ sh\t%0,%2
+ shy\t%0,%2"
+ [(set_attr "op_type" "RX,RXY")])
+
;
-; subsi3 instruction pattern(s).
+; sub(di|si)3 instruction pattern(s).
;
-(define_insn "*subsi3_borrow_cc"
+(define_insn "*sub<mode>3"
+ [(set (match_operand:GPR 0 "register_operand" "=d,d,d")
+ (minus:GPR (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T") ) )
+ (clobber (reg:CC CC_REGNUM))]
+ ""
+ "@
+ s<g>r\t%0,%2
+ s<g>\t%0,%2
+ s<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
+
+(define_insn "*sub<mode>3_borrow_cc"
[(set (reg CC_REGNUM)
- (compare (minus:SI (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (minus:GPR (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(match_dup 1)))
- (set (match_operand:SI 0 "register_operand" "=d,d,d")
- (minus:SI (match_dup 1) (match_dup 2)))]
+ (set (match_operand:GPR 0 "register_operand" "=d,d,d")
+ (minus:GPR (match_dup 1) (match_dup 2)))]
"s390_match_ccmode (insn, CCL2mode)"
"@
- slr\t%0,%2
- sl\t%0,%2
- sly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ sl<g>r\t%0,%2
+ sl<g>\t%0,%2
+ sl<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*subsi3_borrow_cconly"
+(define_insn "*sub<mode>3_borrow_cconly"
[(set (reg CC_REGNUM)
- (compare (minus:SI (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (minus:GPR (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(match_dup 1)))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
"s390_match_ccmode (insn, CCL2mode)"
"@
- slr\t%0,%2
- sl\t%0,%2
- sly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ sl<g>r\t%0,%2
+ sl<g>\t%0,%2
+ sl<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*subsi3_cc"
+(define_insn "*sub<mode>3_cc"
[(set (reg CC_REGNUM)
- (compare (minus:SI (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (minus:GPR (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(const_int 0)))
- (set (match_operand:SI 0 "register_operand" "=d,d,d")
- (minus:SI (match_dup 1) (match_dup 2)))]
+ (set (match_operand:GPR 0 "register_operand" "=d,d,d")
+ (minus:GPR (match_dup 1) (match_dup 2)))]
"s390_match_ccmode (insn, CCLmode)"
"@
- slr\t%0,%2
- sl\t%0,%2
- sly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ sl<g>r\t%0,%2
+ sl<g>\t%0,%2
+ sl<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*subsi3_cc2"
+(define_insn "*sub<mode>3_cc2"
[(set (reg CC_REGNUM)
- (compare (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T")))
- (set (match_operand:SI 0 "register_operand" "=d,d,d")
- (minus:SI (match_dup 1) (match_dup 2)))]
+ (compare (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T")))
+ (set (match_operand:GPR 0 "register_operand" "=d,d,d")
+ (minus:GPR (match_dup 1) (match_dup 2)))]
"s390_match_ccmode (insn, CCL3mode)"
"@
- slr\t%0,%2
- sl\t%0,%2
- sly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ sl<g>r\t%0,%2
+ sl<g>\t%0,%2
+ sl<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*subsi3_cconly"
+(define_insn "*sub<mode>3_cconly"
[(set (reg CC_REGNUM)
- (compare (minus:SI (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T"))
+ (compare (minus:GPR (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T"))
(const_int 0)))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
"s390_match_ccmode (insn, CCLmode)"
"@
- slr\t%0,%2
- sl\t%0,%2
- sly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
+ sl<g>r\t%0,%2
+ sl<g>\t%0,%2
+ sl<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
-(define_insn "*subsi3_cconly2"
+(define_insn "*sub<mode>3_cconly2"
[(set (reg CC_REGNUM)
- (compare (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T")))
- (clobber (match_scratch:SI 0 "=d,d,d"))]
+ (compare (match_operand:GPR 1 "register_operand" "0,0,0")
+ (match_operand:GPR 2 "general_operand" "d,R,T")))
+ (clobber (match_scratch:GPR 0 "=d,d,d"))]
"s390_match_ccmode (insn, CCL3mode)"
"@
- slr\t%0,%2
- sl\t%0,%2
- sly\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
-
-(define_insn "*subsi3_sign"
- [(set (match_operand:SI 0 "register_operand" "=d,d")
- (minus:SI (match_operand:SI 1 "register_operand" "0,0")
- (sign_extend:SI (match_operand:HI 2 "memory_operand" "R,T"))))
- (clobber (reg:CC CC_REGNUM))]
- ""
- "@
- sh\t%0,%2
- shy\t%0,%2"
- [(set_attr "op_type" "RX,RXY")])
-
-(define_insn "subsi3"
- [(set (match_operand:SI 0 "register_operand" "=d,d,d")
- (minus:SI (match_operand:SI 1 "register_operand" "0,0,0")
- (match_operand:SI 2 "general_operand" "d,R,T")))
- (clobber (reg:CC CC_REGNUM))]
- ""
- "@
- sr\t%0,%2
- s\t%0,%2
- sy\t%0,%2"
- [(set_attr "op_type" "RR,RX,RXY")])
-
+ sl<g>r\t%0,%2
+ sl<g>\t%0,%2
+ sl<y>\t%0,%2"
+ [(set_attr "op_type" "RR<E>,RX<Y>,RXY")])
;
; sub(df|sf)3 instruction pattern(s).
diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog
index 1ed12d615e8..53b2953320a 100644
--- a/gcc/cp/ChangeLog
+++ b/gcc/cp/ChangeLog
@@ -1,3 +1,35 @@
+2006-01-19 Volker Reichelt <reichelt@igpm.rwth-aachen.de>
+
+ PR c++/25854
+ * pt.c (maybe_process_partial_specialization): Return early on
+ error_mark_node.
+
+2006-01-19 Volker Reichelt <reichelt@igpm.rwth-aachen.de>
+
+ PR c++/16829
+ * decl.c (start_preparsed_function): Check default arguments
+ unconditionally.
+ * name-lookup.c (pushdecl_maybe_friend): Check default arguments
+ of all functions and function templates.
+ * parser.c (cp_parser_late_parsing_default_args): Check default
+ arguments.
+ * decl2.c (check_default_args): Set missing default arguments to
+ error_mark_node.
+
+2006-01-18 Mark Mitchell <mark@codesourcery.com>
+
+ PR c++/25836
+ * cp-tree.h (push_class_stack): New function.
+ (pop_class_stack): Likewise.
+ * class.c (class_stack_node): Add hidden field.
+ (pushclass): Clear it.
+ (push_class_stack): New function.
+ (pop_class_stack): Likewise.
+ (currently_open_class): Ignore hidden classes.
+ (currently_open_derived_class): Likewise.
+ * name-lookup.c (push_to_top_level): Call push_class_stack.
+ (pop_from_top_level): Call pop_class_stack.
+
2006-01-18 Kazu Hirata <kazu@codesourcery.com>
* tree.c (find_tree_t, find_tree): Remove.
diff --git a/gcc/cp/class.c b/gcc/cp/class.c
index 86dbcca14e6..bd89b558abd 100644
--- a/gcc/cp/class.c
+++ b/gcc/cp/class.c
@@ -60,6 +60,10 @@ typedef struct class_stack_node {
  /* If we're defining TYPE, the names used in this class. */
splay_tree names_used;
+
+ /* Nonzero if this class is no longer open, because of a call to
+ push_to_top_level. */
+ size_t hidden;
}* class_stack_node_t;
typedef struct vtbl_init_data_s
@@ -5387,6 +5391,8 @@ restore_class_cache (void)
void
pushclass (tree type)
{
+ class_stack_node_t csn;
+
type = TYPE_MAIN_VARIANT (type);
/* Make sure there is enough room for the new entry on the stack. */
@@ -5399,10 +5405,12 @@ pushclass (tree type)
}
/* Insert a new entry on the class stack. */
- current_class_stack[current_class_depth].name = current_class_name;
- current_class_stack[current_class_depth].type = current_class_type;
- current_class_stack[current_class_depth].access = current_access_specifier;
- current_class_stack[current_class_depth].names_used = 0;
+ csn = current_class_stack + current_class_depth;
+ csn->name = current_class_name;
+ csn->type = current_class_type;
+ csn->access = current_access_specifier;
+ csn->names_used = 0;
+ csn->hidden = 0;
current_class_depth++;
/* Now set up the new type. */
@@ -5459,6 +5467,24 @@ popclass (void)
splay_tree_delete (current_class_stack[current_class_depth].names_used);
}
+/* Mark the top of the class stack as hidden. */
+
+void
+push_class_stack (void)
+{
+ if (current_class_depth)
+ ++current_class_stack[current_class_depth - 1].hidden;
+}
+
+/* Mark the top of the class stack as un-hidden. */
+
+void
+pop_class_stack (void)
+{
+ if (current_class_depth)
+ --current_class_stack[current_class_depth - 1].hidden;
+}
+
/* Returns 1 if current_class_type is either T or a nested type of T.
We start looking from 1 because entry 0 is from global scope, and has
no type. */
@@ -5469,10 +5495,14 @@ currently_open_class (tree t)
int i;
if (current_class_type && same_type_p (t, current_class_type))
return 1;
- for (i = 1; i < current_class_depth; ++i)
- if (current_class_stack[i].type
- && same_type_p (current_class_stack [i].type, t))
- return 1;
+ for (i = current_class_depth - 1; i > 0; --i)
+ {
+ if (current_class_stack[i].hidden)
+ break;
+ if (current_class_stack[i].type
+ && same_type_p (current_class_stack [i].type, t))
+ return 1;
+ }
return 0;
}
@@ -5496,8 +5526,12 @@ currently_open_derived_class (tree t)
return current_class_type;
for (i = current_class_depth - 1; i > 0; --i)
- if (DERIVED_FROM_P (t, current_class_stack[i].type))
- return current_class_stack[i].type;
+ {
+ if (current_class_stack[i].hidden)
+ break;
+ if (DERIVED_FROM_P (t, current_class_stack[i].type))
+ return current_class_stack[i].type;
+ }
return NULL_TREE;
}
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index f857bff4ffb..8c7a905b491 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -3775,6 +3775,8 @@ extern tree cp_fold_obj_type_ref (tree, tree);
extern void set_linkage_according_to_type (tree, tree);
extern void determine_key_method (tree);
extern void check_for_override (tree, tree);
+extern void push_class_stack (void);
+extern void pop_class_stack (void);
/* in cvt.c */
extern tree convert_to_reference (tree, tree, int, int, tree);
diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index e063e71d14a..cfb53fda373 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -10347,6 +10347,8 @@ start_preparsed_function (tree decl1, tree attrs, int flags)
you declare a function, these types can be incomplete, but they
must be complete when you define the function. */
check_function_type (decl1, current_function_parms);
+ /* Make sure no default arg is missing. */
+ check_default_args (decl1);
/* Build the return declaration for the function. */
restype = TREE_TYPE (fntype);
@@ -10413,8 +10415,6 @@ start_preparsed_function (tree decl1, tree attrs, int flags)
/* We need to set the DECL_CONTEXT. */
if (!DECL_CONTEXT (decl1) && DECL_TEMPLATE_INFO (decl1))
DECL_CONTEXT (decl1) = DECL_CONTEXT (DECL_TI_TEMPLATE (decl1));
- /* And make sure we have enough default args. */
- check_default_args (decl1);
}
fntype = TREE_TYPE (decl1);
}
diff --git a/gcc/cp/decl2.c b/gcc/cp/decl2.c
index 9408c8acc88..637ac8c9de7 100644
--- a/gcc/cp/decl2.c
+++ b/gcc/cp/decl2.c
@@ -3228,7 +3228,7 @@ check_default_args (tree x)
else if (saw_def)
{
error ("default argument missing for parameter %P of %q+#D", i, x);
- break;
+ TREE_PURPOSE (arg) = error_mark_node;
}
}
}
diff --git a/gcc/cp/name-lookup.c b/gcc/cp/name-lookup.c
index 1f8aad71a74..1992487459a 100644
--- a/gcc/cp/name-lookup.c
+++ b/gcc/cp/name-lookup.c
@@ -602,6 +602,9 @@ pushdecl_maybe_friend (tree x, bool is_friend)
{
int different_binding_level = 0;
+ if (TREE_CODE (x) == FUNCTION_DECL || DECL_FUNCTION_TEMPLATE_P (x))
+ check_default_args (x);
+
if (TREE_CODE (name) == TEMPLATE_ID_EXPR)
name = TREE_OPERAND (name, 0);
@@ -710,8 +713,6 @@ pushdecl_maybe_friend (tree x, bool is_friend)
{
if (TREE_CODE (t) == TYPE_DECL)
SET_IDENTIFIER_TYPE_VALUE (name, TREE_TYPE (t));
- else if (TREE_CODE (t) == FUNCTION_DECL)
- check_default_args (t);
POP_TIMEVAR_AND_RETURN (TV_NAME_LOOKUP, t);
}
@@ -994,9 +995,6 @@ pushdecl_maybe_friend (tree x, bool is_friend)
}
}
- if (TREE_CODE (x) == FUNCTION_DECL)
- check_default_args (x);
-
if (TREE_CODE (x) == VAR_DECL)
maybe_register_incomplete_var (x);
}
@@ -4939,6 +4937,7 @@ push_to_top_level (void)
current_lang_base = VEC_alloc (tree, gc, 10);
current_lang_name = lang_name_cplusplus;
current_namespace = global_namespace;
+ push_class_stack ();
skip_evaluation = 0;
timevar_pop (TV_NAME_LOOKUP);
}
@@ -4954,6 +4953,7 @@ pop_from_top_level (void)
/* Clear out class-level bindings cache. */
if (previous_class_level)
invalidate_class_lookup_cache ();
+ pop_class_stack ();
current_lang_base = 0;
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 1374afbd8af..fa3ccbc05d5 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -15870,6 +15870,9 @@ cp_parser_late_parsing_default_args (cp_parser *parser, tree fn)
cp_parser_pop_lexer (parser);
}
+ /* Make sure no default arg is missing. */
+ check_default_args (fn);
+
/* Restore the state of local_variables_forbidden_p. */
parser->local_variables_forbidden_p = saved_local_variables_forbidden_p;
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 0fda8dc9b79..487e9badd74 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -678,8 +678,12 @@ check_explicit_instantiation_namespace (tree spec)
void
maybe_process_partial_specialization (tree type)
{
- /* TYPE maybe an ERROR_MARK_NODE. */
- tree context = TYPE_P (type) ? TYPE_CONTEXT (type) : NULL_TREE;
+ tree context;
+
+ if (type == error_mark_node)
+ return;
+
+ context = TYPE_CONTEXT (type);
if (CLASS_TYPE_P (type) && CLASSTYPE_USE_TEMPLATE (type))
{
diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h
index 51493ea8bab..df520266a88 100644
--- a/gcc/diagnostic.h
+++ b/gcc/diagnostic.h
@@ -24,6 +24,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
#define GCC_DIAGNOSTIC_H
#include "pretty-print.h"
+#include "options.h"
/* Constants used to discriminate diagnostics. */
typedef enum
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 924c121502e..3a19fb9c36b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -9062,6 +9062,23 @@ Tune to @var{cpu-type} everything applicable about the generated code, except
for the ABI and the set of available instructions. The choices for
@var{cpu-type} are:
@table @emph
+@item generic
+Produce code optimized for the most common IA32/AMD64/EM64T processors.
+If you know the CPU on which your code will run, then you should use
+the corresponding @option{-mtune} option instead of
+@option{-mtune=generic}. But, if you do not know exactly what CPU users
+of your application will have, then you should use this option.
+
+As new processors are deployed in the marketplace, the behavior of this
+option will change. Therefore, if you upgrade to a newer version of
+GCC, the code generated with this option will change to reflect the
+processors that were most common when that version of GCC was released.
+
+There is no @option{-march=generic} option because @option{-march}
+indicates the instruction set the compiler can use, and there is no
+generic instruction set applicable to all processors. In contrast,
+@option{-mtune} indicates the processor (or, in this case, collection of
+processors) for which the code is optimized.
@item i386
Original Intel's i386 CPU@.
@item i486
@@ -9070,8 +9087,11 @@ Intel's i486 CPU@. (No scheduling is implemented for this chip.)
Intel Pentium CPU with no MMX support.
@item pentium-mmx
Intel PentiumMMX CPU based on Pentium core with MMX instruction set support.
-@item i686, pentiumpro
+@item pentiumpro
Intel PentiumPro CPU@.
+@item i686
+Same as @code{generic}, but when used as the @option{-march} option, the
+PentiumPro instruction set will be used, so the code will run on all i686 family chips.
@item pentium2
Intel Pentium2 CPU based on PentiumPro core with MMX instruction set support.
@item pentium3, pentium3m
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 90efcc34a15..b6dd8382f8e 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -1,5 +1,5 @@
@c Copyright (C) 1988, 1989, 1992, 1993, 1994, 1996, 1998, 1999, 2000, 2001,
-@c 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+@c 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
@c This is part of the GCC manual.
@c For copying conditions, see the file gcc.texi.
@@ -3099,6 +3099,25 @@ Compute the sum of the unsigned elements of a vector. The vector is operand 1,
and the scalar result is stored in the least significant bits of operand 0
(also a vector). The output and input vector should have the same modes.
+@cindex @code{sdot_prod@var{m}} instruction pattern
+@item @samp{sdot_prod@var{m}}
+@cindex @code{udot_prod@var{m}} instruction pattern
+@item @samp{udot_prod@var{m}}
+Compute the sum of the products of two signed/unsigned elements.
+Operand 1 and operand 2 are of the same mode. Their product, which is of a
+wider mode, is computed and added to operand 3. Operand 3 is of a mode equal
+to or wider than the mode of the product. The result is placed in operand 0,
+which is of the same mode as operand 3.
+
+@cindex @code{ssum_widen@var{m3}} instruction pattern
+@item @samp{ssum_widen@var{m3}}
+@cindex @code{usum_widen@var{m3}} instruction pattern
+@item @samp{usum_widen@var{m3}}
+Operands 0 and 2 are of the same mode, which is wider than the mode of
+operand 1. Add operand 1 to operand 2 and place the widened result in
+operand 0. (This is used to express accumulation of elements into an accumulator
+of a wider mode.)
+
@cindex @code{vec_shl_@var{m}} instruction pattern
@cindex @code{vec_shr_@var{m}} instruction pattern
@item @samp{vec_shl_@var{m}}, @samp{vec_shr_@var{m}}
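As a rough illustration of the operations these new named patterns describe, the scalar loops below (plain C; the function names and element types are illustrative, not part of the patch) compute the same values that a vectorized sdot_prod / ssum_widen sequence would:

  /* sdot_prod<m>: multiply pairs of narrow elements, widen the
     products, and accumulate them into the wider operand 3, which is
     also the mode of result operand 0.  */
  long
  sdot_prod_s16 (const short *a, const short *b, int n)
  {
    long acc = 0;
    int i;
    for (i = 0; i < n; i++)
      acc += (long) a[i] * b[i];
    return acc;
  }

  /* ssum_widen<m3>: add a narrow element (operand 1) into a wider
     accumulator (operands 2 and 0).  */
  long
  ssum_widen_s16 (const short *a, int n)
  {
    long acc = 0;
    int i;
    for (i = 0; i < n; i++)
      acc += a[i];
    return acc;
  }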
diff --git a/gcc/expr.c b/gcc/expr.c
index 92048ff7304..b15b43cb7d9 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -8553,6 +8553,31 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode,
return temp;
}
+ case DOT_PROD_EXPR:
+ {
+ tree oprnd0 = TREE_OPERAND (exp, 0);
+ tree oprnd1 = TREE_OPERAND (exp, 1);
+ tree oprnd2 = TREE_OPERAND (exp, 2);
+ rtx op2;
+
+ expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, 0);
+ op2 = expand_expr (oprnd2, NULL_RTX, VOIDmode, 0);
+ target = expand_widen_pattern_expr (exp, op0, op1, op2,
+ target, unsignedp);
+ return target;
+ }
+
+ case WIDEN_SUM_EXPR:
+ {
+ tree oprnd0 = TREE_OPERAND (exp, 0);
+ tree oprnd1 = TREE_OPERAND (exp, 1);
+
+ expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, 0);
+ target = expand_widen_pattern_expr (exp, op0, NULL_RTX, op1,
+ target, unsignedp);
+ return target;
+ }
+
case REDUC_MAX_EXPR:
case REDUC_MIN_EXPR:
case REDUC_PLUS_EXPR:
diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog
index 618e4a6b3b6..e982bc4a03e 100644
--- a/gcc/fortran/ChangeLog
+++ b/gcc/fortran/ChangeLog
@@ -1,3 +1,14 @@
+2006-01-19 Tobias Schlüter <tobias.schlueter@physik.uni-muenchen.de>
+
+ * gfortranspec.c: Update copyright years.
+ * trans.c: Likewise.
+ * trans-array.c: Likewise.
+ * trans-array.h: Likewise.
+ * trans-decl.c: Likewise.
+ * trans-stmt.c: Likewise.
+ * trans-stmt.h: Likewise.
+ * trans-types.c: Likewise.
+
2006-01-18 Tobias Schlüter <tobias.schlueter@physik.uni-muenchen.de>
PR fortran/18540
diff --git a/gcc/fortran/gfortranspec.c b/gcc/fortran/gfortranspec.c
index dfb84667bc2..7bbf372061c 100644
--- a/gcc/fortran/gfortranspec.c
+++ b/gcc/fortran/gfortranspec.c
@@ -1,6 +1,6 @@
/* Specific flags and argument handling of the Fortran front-end.
- Copyright (C) 1997, 1999, 2000, 2001, 2002, 2003, 2004, 2005 Free
- Software Foundation, Inc.
+ Copyright (C) 1997, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
This file is part of GCC.
diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
index 6103fab490b..1edc7b79f6f 100644
--- a/gcc/fortran/trans-array.c
+++ b/gcc/fortran/trans-array.c
@@ -1,5 +1,6 @@
/* Array translation routines
- Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+ Inc.
Contributed by Paul Brook <paul@nowt.org>
and Steven Bosscher <s.bosscher@student.tudelft.nl>
diff --git a/gcc/fortran/trans-array.h b/gcc/fortran/trans-array.h
index 564e6490a26..8c03ab1e313 100644
--- a/gcc/fortran/trans-array.h
+++ b/gcc/fortran/trans-array.h
@@ -1,5 +1,5 @@
/* Header for array handling functions
- Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2006 Free Software Foundation, Inc.
Contributed by Paul Brook
This file is part of GCC.
diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
index 9439b06f747..fbecd578d2e 100644
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@@ -1,5 +1,6 @@
/* Backend function setup
- Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+ Inc.
Contributed by Paul Brook
This file is part of GCC.
diff --git a/gcc/fortran/trans-stmt.c b/gcc/fortran/trans-stmt.c
index b12ea58dcd9..f9ec24f0704 100644
--- a/gcc/fortran/trans-stmt.c
+++ b/gcc/fortran/trans-stmt.c
@@ -1,5 +1,6 @@
/* Statement translation -- generate GCC trees from gfc_code.
- Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+ Inc.
Contributed by Paul Brook <paul@nowt.org>
and Steven Bosscher <s.bosscher@student.tudelft.nl>
diff --git a/gcc/fortran/trans-stmt.h b/gcc/fortran/trans-stmt.h
index 56b192fdc59..a71c8bfbede 100644
--- a/gcc/fortran/trans-stmt.h
+++ b/gcc/fortran/trans-stmt.h
@@ -1,5 +1,5 @@
/* Header for statement translation functions
- Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2006 Free Software Foundation, Inc.
Contributed by Paul Brook
This file is part of GCC.
diff --git a/gcc/fortran/trans-types.c b/gcc/fortran/trans-types.c
index fcdef6a0c0e..3b5c1a884a3 100644
--- a/gcc/fortran/trans-types.c
+++ b/gcc/fortran/trans-types.c
@@ -1,5 +1,6 @@
/* Backend support for Fortran 95 basic types and derived types.
- Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+ Inc.
Contributed by Paul Brook <paul@nowt.org>
and Steven Bosscher <s.bosscher@student.tudelft.nl>
diff --git a/gcc/fortran/trans.c b/gcc/fortran/trans.c
index 510a9f649db..a586932c9d6 100644
--- a/gcc/fortran/trans.c
+++ b/gcc/fortran/trans.c
@@ -1,5 +1,6 @@
/* Code translation -- generate GCC trees from gfc_code.
- Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+ Inc.
Contributed by Paul Brook
This file is part of GCC.
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index ec8076b8206..d9582202b3b 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -1,6 +1,6 @@
/* Generate code to initialize optabs from machine description.
Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
- 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -203,6 +203,10 @@ static const char * const optabs[] =
"vec_realign_load_optab->handlers[$A].insn_code = CODE_FOR_$(vec_realign_load_$a$)",
"vcond_gen_code[$A] = CODE_FOR_$(vcond$a$)",
"vcondu_gen_code[$A] = CODE_FOR_$(vcondu$a$)",
+ "ssum_widen_optab->handlers[$A].insn_code = CODE_FOR_$(widen_ssum$I$a3$)",
+ "usum_widen_optab->handlers[$A].insn_code = CODE_FOR_$(widen_usum$I$a3$)",
+ "udot_prod_optab->handlers[$A].insn_code = CODE_FOR_$(udot_prod$I$a$)",
+ "sdot_prod_optab->handlers[$A].insn_code = CODE_FOR_$(sdot_prod$I$a$)",
"reduc_smax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smax_$a$)",
"reduc_umax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umax_$a$)",
"reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)",
diff --git a/gcc/gimple-low.c b/gcc/gimple-low.c
index 2a49eedfd6b..8b2581bbc85 100644
--- a/gcc/gimple-low.c
+++ b/gcc/gimple-low.c
@@ -151,6 +151,32 @@ lower_stmt_body (tree expr, struct lower_data *data)
lower_stmt (&tsi, data);
}
+
+/* Lower the OpenMP directive statement pointed to by TSI. DATA is
+ passed through the recursion. */
+
+static void
+lower_omp_directive (tree_stmt_iterator *tsi, struct lower_data *data)
+{
+ tree clause, stmt;
+
+ stmt = tsi_stmt (*tsi);
+
+ clause = (TREE_CODE (stmt) >= OMP_PARALLEL && TREE_CODE (stmt) <= OMP_SINGLE)
+ ? OMP_CLAUSES (stmt)
+ : NULL_TREE;
+
+ for (; clause; clause = OMP_CLAUSE_CHAIN (clause))
+ TREE_BLOCK (clause) = TREE_BLOCK (stmt);
+
+ lower_stmt_body (OMP_BODY (stmt), data);
+ tsi_link_before (tsi, stmt, TSI_SAME_STMT);
+ tsi_link_before (tsi, OMP_BODY (stmt), TSI_SAME_STMT);
+ OMP_BODY (stmt) = NULL_TREE;
+ tsi_delink (tsi);
+}
+
+
/* Lowers statement TSI. DATA is passed through the recursion. */
static void
@@ -192,8 +218,20 @@ lower_stmt (tree_stmt_iterator *tsi, struct lower_data *data)
case GOTO_EXPR:
case LABEL_EXPR:
case SWITCH_EXPR:
+ case OMP_RETURN_EXPR:
break;
+ case OMP_PARALLEL:
+ case OMP_FOR:
+ case OMP_SECTIONS:
+ case OMP_SECTION:
+ case OMP_SINGLE:
+ case OMP_MASTER:
+ case OMP_ORDERED:
+ case OMP_CRITICAL:
+ lower_omp_directive (tsi, data);
+ return;
+
default:
gcc_unreachable ();
}
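Together with the new lower_omp_directive above, this dispatch makes gimple lowering flatten OpenMP directive bodies: the OMP_* node is kept as a marker, its body is spliced inline right after it, and the OMP_RETURN_EXPR marker (see the omp-low.c comments later in this patch) closes the region. A minimal input that exercises this path, assuming it is compiled with -fopenmp, is:

  /* After lowering, the loop statements follow the OMP_PARALLEL and
     OMP_FOR markers in one flat statement list instead of hanging
     off OMP_BODY.  */
  void
  fill (int *a, int n)
  {
    int i;
  #pragma omp parallel for
    for (i = 0; i < n; i++)
      a[i] = i;
  }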
@@ -503,11 +541,16 @@ lower_return_expr (tree_stmt_iterator *tsi, struct lower_data *data)
}
-/* Record the variables in VARS. */
+/* Record the variables in VARS into function FN. */
void
-record_vars (tree vars)
+record_vars_into (tree vars, tree fn)
{
+ struct function *saved_cfun = cfun;
+
+ if (fn != current_function_decl)
+ cfun = DECL_STRUCT_FUNCTION (fn);
+
for (; vars; vars = TREE_CHAIN (vars))
{
tree var = vars;
@@ -516,6 +559,7 @@ record_vars (tree vars)
we don't need to care about. */
if (TREE_CODE (var) != VAR_DECL)
continue;
+
/* Nothing to do in this case. */
if (DECL_EXTERNAL (var))
continue;
@@ -524,6 +568,18 @@ record_vars (tree vars)
cfun->unexpanded_var_list = tree_cons (NULL_TREE, var,
cfun->unexpanded_var_list);
}
+
+ if (fn != current_function_decl)
+ cfun = saved_cfun;
+}
+
+
+/* Record the variables in VARS into current_function_decl. */
+
+void
+record_vars (tree vars)
+{
+ record_vars_into (vars, current_function_decl);
}
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index acd0468a338..7d7b80f6104 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -92,16 +92,6 @@ struct gimplify_ctx
int conditions;
bool save_stack;
bool into_ssa;
-
- /* When gimplifying combined omp parallel directives (omp parallel
- loop and omp parallel sections), any prefix code needed to setup
- the associated worksharing construct needs to be emitted in the
- pre-queue of its parent parallel, otherwise the lowering process
- will move that code to the child function. Similarly, we need to
- move up to the gimplification context of the parent parallel
- directive so temporaries are declared in the right context. */
- tree *combined_pre_p;
- struct gimplify_ctx *combined_ctxp;
};
static struct gimplify_ctx *gimplify_ctxp;
@@ -634,6 +624,10 @@ internal_get_tmp_var (tree val, tree *pre_p, tree *post_p, bool is_formal)
return t;
}
+/* Returns a formal temporary variable initialized with VAL. PRE_P
+ points to a statement list where side-effects needed to compute VAL
+ should be stored. */
+
tree
get_formal_tmp_var (tree val, tree *pre_p)
{
@@ -2297,7 +2291,7 @@ shortcut_cond_expr (tree expr)
/* EXPR is used in a boolean context; make sure it has BOOLEAN_TYPE. */
-static tree
+tree
gimple_boolify (tree expr)
{
tree type = TREE_TYPE (expr);
@@ -4131,29 +4125,6 @@ gimplify_to_stmt_list (tree *stmt_p)
}
}
-/* Gimplify *EXPR_P as if it had been used inside the gimplification
- context CTX_P. The other arguments are as in gimplify_expr. */
-
-static enum gimplify_status
-gimplify_expr_in_ctx (tree *expr_p, tree *pre_p, tree *post_p,
- bool (* gimple_test_f) (tree), fallback_t fallback,
- struct gimplify_ctx *ctx_p,
- struct gimplify_omp_ctx *omp_ctx_p)
-{
- enum gimplify_status ret;
- struct gimplify_ctx *prev_ctxp;
- struct gimplify_omp_ctx *prev_omp_ctxp;
-
- prev_ctxp = gimplify_ctxp;
- gimplify_ctxp = ctx_p;
- prev_omp_ctxp = gimplify_omp_ctxp;
- gimplify_omp_ctxp = omp_ctx_p;
- ret = gimplify_expr (expr_p, pre_p, post_p, gimple_test_f, fallback);
- gimplify_ctxp = prev_ctxp;
- gimplify_omp_ctxp = prev_omp_ctxp;
-
- return ret;
-}
 /* Add FIRSTPRIVATE entries for DECL in the surrounding OpenMP parallels
to CTX. If entries already exist, force them to be some flavor of private.
@@ -4531,19 +4502,6 @@ gimplify_scan_omp_clauses (tree *list_p, tree *pre_p, bool in_parallel)
break;
case OMP_CLAUSE_SCHEDULE:
- if (gimplify_ctxp->combined_pre_p)
- {
- gcc_assert (gimplify_omp_ctxp == outer_ctx);
- gs = gimplify_expr_in_ctx (&OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (c),
- gimplify_ctxp->combined_pre_p, NULL,
- is_gimple_val, fb_rvalue,
- gimplify_ctxp->combined_ctxp,
- outer_ctx->outer_context);
- if (gs == GS_ERROR)
- remove = true;
- break;
- }
- /* FALLTHRU */
case OMP_CLAUSE_IF:
case OMP_CLAUSE_NUM_THREADS:
gs = gimplify_expr (&TREE_OPERAND (c, 0), pre_p, NULL,
@@ -4708,17 +4666,12 @@ gimplify_omp_parallel (tree *expr_p, tree *pre_p)
push_gimplify_context ();
- if (determine_parallel_type (expr) == IS_COMBINED_PARALLEL)
- {
- gimplify_ctxp->combined_pre_p = pre_p;
- gimplify_ctxp->combined_ctxp = gimplify_ctxp->prev_context;
- }
-
gimplify_stmt (&OMP_PARALLEL_BODY (expr));
- pop_gimplify_context (OMP_PARALLEL_BODY (expr));
- gimplify_ctxp->combined_pre_p = NULL;
- gimplify_ctxp->combined_ctxp = NULL;
+ if (TREE_CODE (OMP_PARALLEL_BODY (expr)) == BIND_EXPR)
+ pop_gimplify_context (OMP_PARALLEL_BODY (expr));
+ else
+ pop_gimplify_context (NULL_TREE);
gimplify_adjust_omp_clauses (&OMP_PARALLEL_CLAUSES (expr));
@@ -4732,13 +4685,9 @@ gimplify_omp_for (tree *expr_p, tree *pre_p)
{
tree for_stmt, decl, t;
enum gimplify_status ret = 0;
- struct gimplify_omp_ctx *outer_combined_omp_ctxp = NULL;
for_stmt = *expr_p;
- if (gimplify_ctxp->combined_pre_p)
- outer_combined_omp_ctxp = gimplify_omp_ctxp->outer_context;
-
gimplify_scan_omp_clauses (&OMP_FOR_CLAUSES (for_stmt), pre_p, false);
t = OMP_FOR_INIT (for_stmt);
@@ -4754,33 +4703,15 @@ gimplify_omp_for (tree *expr_p, tree *pre_p)
else
omp_add_variable (gimplify_omp_ctxp, decl, GOVD_PRIVATE | GOVD_SEEN);
- /* Gimplify inside our parent's context if this is part of a combined
- parallel+workshare directive. */
- if (gimplify_ctxp->combined_pre_p)
- ret |= gimplify_expr_in_ctx (&TREE_OPERAND (t, 1),
- gimplify_ctxp->combined_pre_p, NULL,
- is_gimple_val, fb_rvalue,
- gimplify_ctxp->combined_ctxp,
- outer_combined_omp_ctxp);
- else
- ret |= gimplify_expr (&TREE_OPERAND (t, 1), &OMP_FOR_PRE_BODY (for_stmt),
- NULL, is_gimple_val, fb_rvalue);
+ ret |= gimplify_expr (&TREE_OPERAND (t, 1), &OMP_FOR_PRE_BODY (for_stmt),
+ NULL, is_gimple_val, fb_rvalue);
t = OMP_FOR_COND (for_stmt);
gcc_assert (COMPARISON_CLASS_P (t));
gcc_assert (TREE_OPERAND (t, 0) == decl);
- /* Gimplify inside our parent's context if this is part of a combined
- parallel+workshare directive. */
- if (gimplify_ctxp->combined_pre_p)
- ret |= gimplify_expr_in_ctx (&TREE_OPERAND (t, 1),
- gimplify_ctxp->combined_pre_p, NULL,
- is_gimple_val, fb_rvalue,
- gimplify_ctxp->combined_ctxp,
- outer_combined_omp_ctxp);
- else
- ret |= gimplify_expr (&TREE_OPERAND (t, 1), &OMP_FOR_PRE_BODY (for_stmt),
- NULL, is_gimple_val, fb_rvalue);
+ ret |= gimplify_expr (&TREE_OPERAND (t, 1), &OMP_FOR_PRE_BODY (for_stmt),
+ NULL, is_gimple_val, fb_rvalue);
t = OMP_FOR_INCR (for_stmt);
switch (TREE_CODE (t))
@@ -4818,18 +4749,8 @@ gimplify_omp_for (tree *expr_p, tree *pre_p)
gcc_unreachable ();
}
- /* Gimplify inside our parent's context if this is part of a
- combined parallel+workshare directive. */
- if (gimplify_ctxp->combined_pre_p)
- ret |= gimplify_expr_in_ctx (&TREE_OPERAND (t, 1),
- gimplify_ctxp->combined_pre_p, NULL,
- is_gimple_val, fb_rvalue,
- gimplify_ctxp->combined_ctxp,
- outer_combined_omp_ctxp);
- else
- ret |= gimplify_expr (&TREE_OPERAND (t, 1),
- &OMP_FOR_PRE_BODY (for_stmt), NULL,
- is_gimple_val, fb_rvalue);
+ ret |= gimplify_expr (&TREE_OPERAND (t, 1), &OMP_FOR_PRE_BODY (for_stmt),
+ NULL, is_gimple_val, fb_rvalue);
break;
default:
@@ -5622,6 +5543,10 @@ gimplify_expr (tree *expr_p, tree *pre_p, tree *post_p,
ret = gimplify_omp_atomic (expr_p, pre_p);
break;
+ case OMP_RETURN_EXPR:
+ ret = GS_ALL_DONE;
+ break;
+
default:
switch (TREE_CODE_CLASS (TREE_CODE (*expr_p)))
{
diff --git a/gcc/ipa-reference.c b/gcc/ipa-reference.c
index ec8d3b8e595..fe2f807d3e8 100644
--- a/gcc/ipa-reference.c
+++ b/gcc/ipa-reference.c
@@ -286,7 +286,7 @@ check_operand (ipa_reference_local_vars_info_t local,
{
if (!t) return;
- if ((TREE_CODE (t) == VAR_DECL)
+ if ((TREE_CODE (t) == VAR_DECL || TREE_CODE (t) == FUNCTION_DECL)
&& (has_proper_scope_for_analysis (t)))
{
if (checking_write)
@@ -343,7 +343,7 @@ look_for_address_of (tree t)
if (TREE_CODE (t) == ADDR_EXPR)
{
tree x = get_base_var (t);
- if (TREE_CODE (x) == VAR_DECL)
+ if (TREE_CODE (x) == VAR_DECL || TREE_CODE (x) == FUNCTION_DECL)
if (has_proper_scope_for_analysis (x))
bitmap_set_bit (module_statics_escape, DECL_UID (x));
}
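With FUNCTION_DECLs now tracked alongside statics, taking the address of a function marks it as escaping the module. A minimal sketch (illustrative names only) of code that reaches the new path:

  /* The address of HELPER escapes through CALLBACK, so its
     FUNCTION_DECL ends up in module_statics_escape.  */
  static void helper (void) {}
  void (*callback) (void) = helper;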
@@ -741,6 +741,7 @@ merge_callee_local_info (struct cgraph_node *target,
static void
ipa_init (void)
{
+ struct cgraph_node *node;
memory_identifier_string = build_string(7, "memory");
reference_vars_to_consider =
@@ -751,6 +752,10 @@ ipa_init (void)
module_statics_written = BITMAP_ALLOC (&ipa_obstack);
all_module_statics = BITMAP_ALLOC (&ipa_obstack);
+ /* This will add NODE->DECL to the splay trees. */
+ for (node = cgraph_nodes; node; node = node->next)
+ has_proper_scope_for_analysis (node->decl);
+
/* There are some shared nodes, in particular the initializers on
static declarations. We do not need to scan them more than once
since all we would be interested in are the addressof
@@ -964,6 +969,11 @@ static_execute (void)
{
tree var = get_static_decl (index);
+ /* Readonly on a function decl is very different from readonly
+ on a variable. */
+ if (TREE_CODE (var) == FUNCTION_DECL)
+ continue;
+
/* Ignore variables in named sections - changing TREE_READONLY
changes the section flags, potentially causing conflicts with
other variables in the same named section. */
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 65907f0089c..f5bdcb99791 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -59,6 +59,40 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
optimal, but lexically nested parallels effectively only happens in
test suites. */
+/* Parallel region information. Every parallel and workshare
+ directive is enclosed between two markers, the OMP_* directive
+ and a corresponding OMP_RETURN_EXPR statement. */
+
+struct omp_region GTY(())
+{
+ /* The enclosing region. */
+ struct omp_region *outer;
+
+ /* First child region. */
+ struct omp_region *inner;
+
+ /* Next peer region. */
+ struct omp_region *next;
+
+ /* Entry point to this region. */
+ tree entry;
+
+ /* Exit label from this region. */
+ tree exit;
+
+ /* Region number. */
+ int num;
+
+ /* True if this is a combined parallel+workshare region. */
+ bool is_combined_parallel;
+
+ /* If this is a combined parallel+workshare region, this is a list
+ of additional arguments needed by the combined parallel+workshare
+ library call. */
+ tree ws_args;
+};
+
+
/* Context structure. Used to store information about each parallel
directive in the code. */
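The omp_region tree mirrors the lexical nesting of the directives. A minimal sketch of an input (illustrative only, compiled with -fopenmp): the parallel directive below gets one region whose inner field reaches its two workshare regions, chained to each other through next; each region's entry and exit fields point at the directive and its OMP_RETURN_EXPR marker.

  void
  example (int *a, int *b, int n)
  {
    int i;
  #pragma omp parallel       /* one omp_region, outer == NULL      */
    {
  #pragma omp for            /* child region of the parallel       */
      for (i = 0; i < n; i++)
        a[i] = i;
  #pragma omp single         /* sibling workshare region (->next)  */
      b[0] = a[0];
    }
  }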
@@ -94,48 +128,34 @@ typedef struct omp_context
reserved for the main body of the function. */
int depth;
- /* Type of parallel construct. Used to distinguish regular parallel
- regions from combined parallel+workshare directives (parallel,
- parallel loop and parallel sections). */
- enum omp_parallel_type parallel_type;
-
/* True if this parallel directive is nested within another. */
bool is_nested;
-
- /* For combined parallel constructs, the built-in index for the
- library call used to launch the children threads. */
- int parallel_start_ix;
-
- /* If the combined parallel directive needs additional arguments for
- the call to GOMP_parallel_start_foo, they are added here. */
- tree parallel_start_additional_args;
} omp_context;
-/* A structure describing the main elements of a parallel loop.
- Mostly used to communicate between the various subroutines of
- expand_omp_for_1. */
+/* A structure describing the main elements of a parallel loop. */
-struct expand_omp_for_data
+struct omp_for_data
{
tree v, n1, n2, step, chunk_size, for_stmt;
enum tree_code cond_code;
tree pre;
- omp_context *ctx;
bool have_nowait, have_ordered;
enum omp_clause_schedule_kind sched_kind;
};
+
static splay_tree all_contexts;
static int parallel_nesting_level;
+static splay_tree omp_regions;
+static struct omp_region *root_omp_region;
static void scan_omp (tree *, omp_context *);
-static void expand_omp (tree *, omp_context *);
-
+static void lower_omp (tree *, omp_context *);
/* Find an OpenMP clause of type KIND within CLAUSES. */
-tree
+static tree
find_omp_clause (tree clauses, enum tree_code kind)
{
for (; clauses ; clauses = OMP_CLAUSE_CHAIN (clauses))
@@ -150,17 +170,290 @@ find_omp_clause (tree clauses, enum tree_code kind)
static inline bool
is_parallel_ctx (omp_context *ctx)
{
- return ctx->parallel_type != IS_NOT_PARALLEL;
+ return TREE_CODE (ctx->stmt) == OMP_PARALLEL;
}
-/* Return true if CTX is inside a combined omp parallel + workshare. */
+
+/* Return true if REGION is a combined parallel+workshare region. */
static inline bool
-is_in_combined_parallel_ctx (omp_context *ctx)
+is_combined_parallel (struct omp_region *region)
+{
+ return region->is_combined_parallel;
+}
+
+
+/* Extract the header elements of parallel loop FOR_STMT and store
+ them into *FD. */
+
+static void
+extract_omp_for_data (tree for_stmt, struct omp_for_data *fd)
+{
+ tree t;
+
+ fd->for_stmt = for_stmt;
+ fd->pre = NULL;
+
+ t = OMP_FOR_INIT (for_stmt);
+ gcc_assert (TREE_CODE (t) == MODIFY_EXPR);
+ fd->v = TREE_OPERAND (t, 0);
+ gcc_assert (DECL_P (fd->v));
+ gcc_assert (TREE_CODE (TREE_TYPE (fd->v)) == INTEGER_TYPE);
+ fd->n1 = TREE_OPERAND (t, 1);
+
+ t = OMP_FOR_COND (for_stmt);
+ fd->cond_code = TREE_CODE (t);
+ gcc_assert (TREE_OPERAND (t, 0) == fd->v);
+ fd->n2 = TREE_OPERAND (t, 1);
+ switch (fd->cond_code)
+ {
+ case LT_EXPR:
+ case GT_EXPR:
+ break;
+ case LE_EXPR:
+ fd->n2 = fold_build2 (PLUS_EXPR, TREE_TYPE (fd->n2), fd->n2,
+ build_int_cst (TREE_TYPE (fd->n2), 1));
+ fd->cond_code = LT_EXPR;
+ break;
+ case GE_EXPR:
+ fd->n2 = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->n2), fd->n2,
+ build_int_cst (TREE_TYPE (fd->n2), 1));
+ fd->cond_code = GT_EXPR;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ t = OMP_FOR_INCR (fd->for_stmt);
+ gcc_assert (TREE_CODE (t) == MODIFY_EXPR);
+ gcc_assert (TREE_OPERAND (t, 0) == fd->v);
+ t = TREE_OPERAND (t, 1);
+ gcc_assert (TREE_OPERAND (t, 0) == fd->v);
+ switch (TREE_CODE (t))
+ {
+ case PLUS_EXPR:
+ fd->step = TREE_OPERAND (t, 1);
+ break;
+ case MINUS_EXPR:
+ fd->step = TREE_OPERAND (t, 1);
+ fd->step = fold_build1 (NEGATE_EXPR, TREE_TYPE (fd->step), fd->step);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ fd->have_nowait = fd->have_ordered = false;
+ fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
+ fd->chunk_size = NULL_TREE;
+
+ for (t = OMP_FOR_CLAUSES (for_stmt); t ; t = OMP_CLAUSE_CHAIN (t))
+ switch (TREE_CODE (t))
+ {
+ case OMP_CLAUSE_NOWAIT:
+ fd->have_nowait = true;
+ break;
+ case OMP_CLAUSE_ORDERED:
+ fd->have_ordered = true;
+ break;
+ case OMP_CLAUSE_SCHEDULE:
+ fd->sched_kind = OMP_CLAUSE_SCHEDULE_KIND (t);
+ fd->chunk_size = OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t);
+ break;
+ default:
+ break;
+ }
+
+ if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
+ gcc_assert (fd->chunk_size == NULL);
+ else if (fd->chunk_size == NULL)
+ {
+ /* We only need to compute a default chunk size for ordered
+ static loops and dynamic loops. */
+ if (fd->sched_kind != OMP_CLAUSE_SCHEDULE_STATIC || fd->have_ordered)
+ fd->chunk_size = (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC)
+ ? integer_zero_node : integer_one_node;
+ }
+}
+
+
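A minimal sketch of the canonicalization performed by extract_omp_for_data (input loops only, compiled with -fopenmp; the fd.* values in the comments are what the function would record):

  void
  le_loop (int *a, int limit)
  {
    int i;
    /* '<=' is canonicalized: fd.cond_code = LT_EXPR,
       fd.n2 = limit + 1, fd.step = 1.  */
  #pragma omp parallel for
    for (i = 0; i <= limit; i++)
      a[i] = i;
  }

  void
  down_loop (int *a, int n)
  {
    int i;
    /* A decrementing loop: fd.cond_code = GT_EXPR and the MINUS_EXPR
       increment folds into a negative step, fd.step = -2.  */
  #pragma omp parallel for
    for (i = n; i > 0; i -= 2)
      a[i - 1] = i;
  }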
+/* Given two blocks PAR_ENTRY_BB and WS_ENTRY_BB such that WS_ENTRY_BB
+ is the immediate dominator of PAR_ENTRY_BB, return true if there
+ are no data dependencies that would prevent expanding the parallel
+ directive at PAR_ENTRY_BB as a combined parallel+workshare region.
+
+ When expanding a combined parallel+workshare region, the call to
+ the child function may need additional arguments in the case of
+ OMP_FOR regions. In some cases, these arguments are computed out
+ of variables passed in from the parent to the child via 'struct
+ .omp_data_s'. For instance:
+
+ #pragma omp parallel for schedule (guided, i * 4)
+ for (j ...)
+
+ Is lowered into:
+
+ # BLOCK 2 (PAR_ENTRY_BB)
+ .omp_data_o.i = i;
+ #pragma omp parallel [child fn: bar.omp_fn.0 ( ..., D.1598)
+
+ # BLOCK 3 (WS_ENTRY_BB)
+ .omp_data_i = &.omp_data_o;
+ D.1667 = .omp_data_i->i;
+ D.1598 = D.1667 * 4;
+ #pragma omp for schedule (guided, D.1598)
+
+ When we outline the parallel region, the call to the child function
+ 'bar.omp_fn.0' will need the value D.1598 in its argument list, but
+ that value is computed *after* the call site. So, in principle we
+ cannot do the transformation.
+
+ To see whether the code in WS_ENTRY_BB blocks the combined
+ parallel+workshare call, we collect all the variables used in the
+ OMP_FOR header check whether they appear on the LHS of any
+ statement in WS_ENTRY_BB. If so, then we cannot emit the combined
+ call.
+
+ FIXME. If we had the SSA form built at this point, we could merely
+ hoist the code in block 3 into block 2 and be done with it. But at
+ this point we don't have dataflow information and though we could
+ hack something up here, it is really not worth the aggravation. */
+
+static bool
+workshare_safe_to_combine_p (basic_block par_entry_bb, basic_block ws_entry_bb)
{
- return ctx->outer && ctx->outer->parallel_type == IS_COMBINED_PARALLEL;
+ struct omp_for_data fd;
+ tree par_stmt, ws_stmt;
+
+ par_stmt = last_stmt (par_entry_bb);
+ ws_stmt = last_stmt (ws_entry_bb);
+
+ if (TREE_CODE (ws_stmt) == OMP_SECTIONS)
+ return true;
+
+ gcc_assert (TREE_CODE (ws_stmt) == OMP_FOR);
+
+ extract_omp_for_data (ws_stmt, &fd);
+
+ /* FIXME. We give up too easily here. If any of these arguments
+ are not constants, they will likely involve variables that have
+ been mapped into fields of .omp_data_s for sharing with the child
+ function. With appropriate data flow, it would be possible to
+ see through this. */
+ if (!is_gimple_min_invariant (fd.n1)
+ || !is_gimple_min_invariant (fd.n2)
+ || !is_gimple_min_invariant (fd.step)
+ || (fd.chunk_size && !is_gimple_min_invariant (fd.chunk_size)))
+ return false;
+
+ return true;
}
+
+/* Collect additional arguments needed to emit a combined
+ parallel+workshare call. WS_STMT is the workshare directive being
+ expanded. */
+
+static tree
+get_ws_args_for (tree ws_stmt)
+{
+ tree t;
+
+ if (TREE_CODE (ws_stmt) == OMP_FOR)
+ {
+ struct omp_for_data fd;
+ tree ws_args;
+
+ extract_omp_for_data (ws_stmt, &fd);
+
+ ws_args = NULL_TREE;
+ if (fd.chunk_size)
+ {
+ t = fold_convert (long_integer_type_node, fd.chunk_size);
+ ws_args = tree_cons (NULL, t, ws_args);
+ }
+
+ t = fold_convert (long_integer_type_node, fd.step);
+ ws_args = tree_cons (NULL, t, ws_args);
+
+ t = fold_convert (long_integer_type_node, fd.n2);
+ ws_args = tree_cons (NULL, t, ws_args);
+
+ t = fold_convert (long_integer_type_node, fd.n1);
+ ws_args = tree_cons (NULL, t, ws_args);
+
+ return ws_args;
+ }
+ else if (TREE_CODE (ws_stmt) == OMP_SECTIONS)
+ {
+ basic_block bb = bb_for_stmt (ws_stmt);
+ t = build_int_cst (unsigned_type_node, EDGE_COUNT (bb->succs));
+ t = tree_cons (NULL, t, NULL);
+ return t;
+ }
+
+ gcc_unreachable ();
+}
+
+
+/* Discover whether REGION is a combined parallel+workshare region. */
+
+static void
+determine_parallel_type (struct omp_region *region)
+{
+ basic_block par_entry_bb, par_exit_bb;
+ basic_block ws_entry_bb, ws_exit_bb;
+
+ if (region == NULL || region->inner == NULL)
+ return;
+
+ /* We only support parallel+for and parallel+sections. */
+ if (TREE_CODE (region->entry) != OMP_PARALLEL
+ || (TREE_CODE (region->inner->entry) != OMP_FOR
+ && TREE_CODE (region->inner->entry) != OMP_SECTIONS))
+ return;
+
+ /* Check for perfect nesting PAR_ENTRY_BB -> WS_ENTRY_BB and
+ WS_EXIT_BB -> PAR_EXIT_BB. */
+ par_entry_bb = bb_for_stmt (region->entry);
+ par_exit_bb = bb_for_stmt (region->exit);
+
+ ws_entry_bb = bb_for_stmt (region->inner->entry);
+ ws_exit_bb = bb_for_stmt (region->inner->exit);
+
+ if (single_succ (par_entry_bb) == ws_entry_bb
+ && single_succ (ws_exit_bb) == par_exit_bb
+ && workshare_safe_to_combine_p (par_entry_bb, ws_entry_bb))
+ {
+ if (TREE_CODE (region->inner->entry) == OMP_FOR)
+ {
+ /* If this is a combined parallel loop, we need to determine
+ whether or not to use the combined library calls. There
+ are two cases where we do not apply the transformation:
+ static loops and any kind of ordered loop. In the former
+ case, we already open code the loop so there is no need
+ to do anything else. In the latter case, the combined
+ parallel loop call would still need extra synchronization
+ to implement ordered semantics, so there would not be any
+ gain in using the combined call. */
+ tree clauses = OMP_FOR_CLAUSES (region->inner->entry);
+ tree c = find_omp_clause (clauses, OMP_CLAUSE_SCHEDULE);
+ if (c == NULL
+ || OMP_CLAUSE_SCHEDULE_KIND (c) == OMP_CLAUSE_SCHEDULE_STATIC
+ || find_omp_clause (clauses, OMP_CLAUSE_ORDERED))
+ {
+ region->is_combined_parallel = false;
+ region->inner->is_combined_parallel = false;
+ return;
+ }
+ }
+
+ region->is_combined_parallel = true;
+ region->inner->is_combined_parallel = true;
+ region->ws_args = get_ws_args_for (region->inner->entry);
+ }
+}
+
+
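A minimal sketch of the distinction determine_parallel_type draws (input programs only; whether the combined library call is actually emitted also depends on the basic-block checks above):

  void
  uses_combined_call (int *a, int n)
  {
    int i;
    /* Runtime-scheduled loop with an invariant chunk size: eligible
       for the combined parallel+workshare expansion.  */
  #pragma omp parallel for schedule(dynamic, 4)
    for (i = 0; i < n; i++)
      a[i] = i;
  }

  void
  stays_separate (int *a, int n)
  {
    int i;
    /* Static schedule (or an ordered clause): the loop is open coded,
       so is_combined_parallel remains false.  */
  #pragma omp parallel for schedule(static)
    for (i = 0; i < n; i++)
      a[i] = i;
  }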
/* Return true if EXPR is variable sized. */
static inline bool
@@ -261,7 +554,7 @@ omp_copy_decl_2 (tree var, tree name, tree type, omp_context *ctx)
DECL_ARTIFICIAL (copy) = DECL_ARTIFICIAL (var);
DECL_IGNORED_P (copy) = DECL_IGNORED_P (var);
TREE_USED (copy) = 1;
- DECL_CONTEXT (copy) = ctx->cb.dst_fn;
+ DECL_CONTEXT (copy) = current_function_decl;
DECL_SEEN_IN_BIND_EXPR_P (copy) = 1;
TREE_CHAIN (copy) = ctx->block_vars;
@@ -426,7 +719,7 @@ omp_copy_decl (tree var, copy_body_data *cb)
if (TREE_CODE (var) == LABEL_DECL)
{
new_var = create_artificial_label ();
- DECL_CONTEXT (new_var) = ctx->cb.dst_fn;
+ DECL_CONTEXT (new_var) = current_function_decl;
insert_decl_map (&ctx->cb, var, new_var);
return new_var;
}
@@ -444,6 +737,99 @@ omp_copy_decl (tree var, copy_body_data *cb)
return error_mark_node;
}
+
+/* Return the parallel region associated with STMT. */
+
+static inline struct omp_region *
+lookup_omp_region (tree stmt)
+{
+ splay_tree_node n = splay_tree_lookup (omp_regions, (splay_tree_key) stmt);
+ return n ? (struct omp_region *) n->value : NULL;
+}
+
+
+/* Debugging dumps for parallel regions. */
+void dump_omp_region (FILE *, struct omp_region *, int);
+void debug_omp_region (struct omp_region *);
+void debug_all_omp_regions (void);
+
+/* Dump the parallel region tree rooted at REGION. */
+
+void
+dump_omp_region (FILE *file, struct omp_region *region, int indent)
+{
+ fprintf (file, "%*s", indent, "");
+ print_generic_stmt (file, region->entry, TDF_SLIM);
+
+ if (region->inner)
+ dump_omp_region (file, region->inner, indent + 4);
+
+ fprintf (file, "%*s", indent, "");
+ if (region->exit)
+ print_generic_stmt (file, region->exit, 0);
+ else
+ fprintf (file, "[no exit marker]\n");
+
+ if (region->next)
+ {
+ fprintf (file, "\n");
+ dump_omp_region (file, region->next, indent);
+ }
+}
+
+void
+debug_omp_region (struct omp_region *region)
+{
+ dump_omp_region (stderr, region, 0);
+}
+
+void
+debug_all_omp_regions (void)
+{
+ dump_omp_region (stderr, root_omp_region, 0);
+}
+
+
+/* Create a new parallel region starting at STMT inside region PARENT. */
+
+static struct omp_region *
+new_omp_region (tree stmt, struct omp_region *parent)
+{
+ struct omp_region *region = ggc_alloc_cleared (sizeof (*region));
+ static int num = 0;
+
+ region->outer = parent;
+ region->entry = stmt;
+ region->num = num++;
+
+ if (parent)
+ {
+ /* This is a nested region. Add it to the list of inner
+ regions in PARENT. */
+ region->next = parent->inner;
+ parent->inner = region;
+ }
+ else if (omp_regions)
+ {
+ /* This is a toplevel region. Add it to the list of toplevel
+ regions in ROOT_OMP_REGION. */
+ region->next = root_omp_region;
+ root_omp_region = region;
+ }
+ else
+ {
+ /* Create a new root region with the first region we find. */
+ root_omp_region = region;
+ omp_regions = splay_tree_new (splay_tree_compare_pointers, 0, 0);
+ }
+
+ splay_tree_insert (omp_regions, (splay_tree_key) stmt,
+ (splay_tree_value) region);
+
+ return region;
+}
+
+
/* Create a new context, with OUTER_CTX being the surrounding context. */
static omp_context *
@@ -742,6 +1128,7 @@ create_omp_child_function (omp_context *ctx)
DECL_UNINLINABLE (decl) = 1;
DECL_EXTERNAL (decl) = 0;
DECL_CONTEXT (decl) = NULL_TREE;
+ DECL_INITIAL (decl) = make_node (BLOCK);
t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
DECL_ARTIFICIAL (t) = 1;
@@ -751,13 +1138,13 @@ create_omp_child_function (omp_context *ctx)
t = build_decl (PARM_DECL, get_identifier (".omp_data_i"), ptr_type_node);
DECL_ARTIFICIAL (t) = 1;
DECL_ARG_TYPE (t) = ptr_type_node;
- DECL_CONTEXT (t) = decl;
+ DECL_CONTEXT (t) = current_function_decl;
TREE_USED (t) = 1;
DECL_ARGUMENTS (decl) = t;
ctx->receiver_decl = t;
/* Allocate memory for the function structure. The call to
- allocate_struct_function clobbers cfun, so we need to restore
+ allocate_struct_function clobbers CFUN, so we need to restore
it afterward. */
allocate_struct_function (decl);
DECL_SOURCE_LOCATION (decl) = EXPR_LOCATION (ctx->stmt);
@@ -765,30 +1152,6 @@ create_omp_child_function (omp_context *ctx)
cfun = ctx->cb.src_cfun;
}
-/* Given an OMP_PARALLEL statement, determine whether it is a combined
- parallel+worksharing directive. This is simply done by examining
- the body of the directive. If the body contains a single OMP_FOR
- or a single OMP_SECTIONS then this is a combined directive.
- Otherwise, it is a regular parallel directive. */
-
-enum omp_parallel_type
-determine_parallel_type (tree stmt)
-{
- enum omp_parallel_type par_type;
- tree body = BIND_EXPR_BODY (OMP_PARALLEL_BODY (stmt));
- tree t;
-
- par_type = IS_PARALLEL;
-
- t = expr_only (body);
- if (t && TREE_CODE (t) == OMP_SECTIONS)
- par_type = IS_COMBINED_PARALLEL;
- else
- par_type = IS_PARALLEL;
-
- return par_type;
-}
-
/* Scan an OpenMP parallel directive. */
@@ -809,16 +1172,16 @@ scan_omp_parallel (tree *stmt_p, omp_context *outer_ctx)
}
ctx = new_omp_context (*stmt_p, outer_ctx);
+ if (parallel_nesting_level > 1)
+ ctx->is_nested = true;
ctx->field_map = splay_tree_new (splay_tree_compare_pointers, 0, 0);
- ctx->parallel_type = determine_parallel_type (*stmt_p);
ctx->default_kind = OMP_CLAUSE_DEFAULT_SHARED;
ctx->record_type = lang_hooks.types.make_type (RECORD_TYPE);
- ctx->parallel_start_ix = BUILT_IN_GOMP_PARALLEL_START;
- ctx->parallel_start_additional_args = NULL_TREE;
name = create_tmp_var_name (".omp_data_s");
name = build_decl (TYPE_DECL, name, ctx->record_type);
TYPE_NAME (ctx->record_type) = name;
create_omp_child_function (ctx);
+ OMP_PARALLEL_FN (*stmt_p) = ctx->cb.dst_fn;
scan_sharing_clauses (OMP_PARALLEL_CLAUSES (*stmt_p), ctx);
scan_omp (&OMP_PARALLEL_BODY (*stmt_p), ctx);
@@ -833,143 +1196,19 @@ scan_omp_parallel (tree *stmt_p, omp_context *outer_ctx)
}
-/* Extract the header elements of parallel loop FOR_STMT and store
- them into *FD. */
-
-static void
-extract_omp_for_data (tree for_stmt, omp_context *ctx,
- struct expand_omp_for_data *fd)
-{
- tree t;
-
- fd->for_stmt = for_stmt;
- fd->pre = NULL;
- fd->ctx = ctx;
-
- t = OMP_FOR_INIT (for_stmt);
- gcc_assert (TREE_CODE (t) == MODIFY_EXPR);
- fd->v = TREE_OPERAND (t, 0);
- gcc_assert (DECL_P (fd->v));
- gcc_assert (TREE_CODE (TREE_TYPE (fd->v)) == INTEGER_TYPE);
- fd->n1 = TREE_OPERAND (t, 1);
-
- t = OMP_FOR_COND (for_stmt);
- fd->cond_code = TREE_CODE (t);
- gcc_assert (TREE_OPERAND (t, 0) == fd->v);
- fd->n2 = TREE_OPERAND (t, 1);
- switch (fd->cond_code)
- {
- case LT_EXPR:
- case GT_EXPR:
- break;
- case LE_EXPR:
- fd->n2 = fold_build2 (PLUS_EXPR, TREE_TYPE (fd->n2), fd->n2,
- build_int_cst (TREE_TYPE (fd->n2), 1));
- fd->cond_code = LT_EXPR;
- break;
- case GE_EXPR:
- fd->n2 = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->n2), fd->n2,
- build_int_cst (TREE_TYPE (fd->n2), 1));
- fd->cond_code = GT_EXPR;
- break;
- default:
- gcc_unreachable ();
- }
-
- t = OMP_FOR_INCR (fd->for_stmt);
- gcc_assert (TREE_CODE (t) == MODIFY_EXPR);
- gcc_assert (TREE_OPERAND (t, 0) == fd->v);
- t = TREE_OPERAND (t, 1);
- gcc_assert (TREE_OPERAND (t, 0) == fd->v);
- switch (TREE_CODE (t))
- {
- case PLUS_EXPR:
- fd->step = TREE_OPERAND (t, 1);
- break;
- case MINUS_EXPR:
- fd->step = TREE_OPERAND (t, 1);
- fd->step = fold_build1 (NEGATE_EXPR, TREE_TYPE (fd->step), fd->step);
- break;
- default:
- gcc_unreachable ();
- }
-
- fd->have_nowait = fd->have_ordered = false;
- fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
- fd->chunk_size = NULL_TREE;
-
- for (t = OMP_FOR_CLAUSES (for_stmt); t ; t = OMP_CLAUSE_CHAIN (t))
- switch (TREE_CODE (t))
- {
- case OMP_CLAUSE_NOWAIT:
- fd->have_nowait = true;
- break;
- case OMP_CLAUSE_ORDERED:
- fd->have_ordered = true;
- break;
- case OMP_CLAUSE_SCHEDULE:
- fd->sched_kind = OMP_CLAUSE_SCHEDULE_KIND (t);
- fd->chunk_size = OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t);
- break;
- default:
- break;
- }
-
- if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
- gcc_assert (fd->chunk_size == NULL);
- else if (fd->chunk_size == NULL)
- {
- /* We only need to compute a default chunk size for ordered
- static loops and dynamic loops. */
- if (fd->sched_kind != OMP_CLAUSE_SCHEDULE_STATIC || fd->have_ordered)
- fd->chunk_size = (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC)
- ? integer_zero_node : integer_one_node;
- }
-}
-
-
/* Scan an OpenMP loop directive. */
static void
scan_omp_for (tree *stmt_p, omp_context *outer_ctx)
{
omp_context *ctx;
- tree stmt = *stmt_p;
+ tree stmt;
+ stmt = *stmt_p;
ctx = new_omp_context (stmt, outer_ctx);
- /* If this is a combined parallel loop directive, we need to extract
- the bounds, step and chunk size for the loop so that we can build
- the call to GOMP_parallel_loop_foo_start. Do this before
- scanning the loop header to avoid getting the mapped variables
- from the child context. */
- if (is_in_combined_parallel_ctx (ctx))
- {
- struct expand_omp_for_data fd;
- tree t, additional_args;
-
- extract_omp_for_data (stmt, ctx, &fd);
-
- additional_args = NULL_TREE;
- if (fd.chunk_size)
- {
- t = fold_convert (long_integer_type_node, fd.chunk_size);
- additional_args = tree_cons (NULL, t, additional_args);
- }
- t = fold_convert (long_integer_type_node, fd.step);
- additional_args = tree_cons (NULL, t, additional_args);
- t = fold_convert (long_integer_type_node, fd.n2);
- additional_args = tree_cons (NULL, t, additional_args);
- t = fold_convert (long_integer_type_node, fd.n1);
- additional_args = tree_cons (NULL, t, additional_args);
- outer_ctx->parallel_start_additional_args = additional_args;
- }
-
scan_sharing_clauses (OMP_FOR_CLAUSES (stmt), ctx);
- /* FIXME. When expanding into a combined parallel loop, we may not
- need to map some of the variables in the loop header (in
- particular, FD.N1 and FD.N2 for dynamic loops). */
scan_omp (&OMP_FOR_PRE_BODY (stmt), ctx);
scan_omp (&OMP_FOR_INIT (stmt), ctx);
scan_omp (&OMP_FOR_COND (stmt), ctx);
@@ -982,9 +1221,10 @@ scan_omp_for (tree *stmt_p, omp_context *outer_ctx)
static void
scan_omp_sections (tree *stmt_p, omp_context *outer_ctx)
{
- tree stmt = *stmt_p;
+ tree stmt;
omp_context *ctx;
+ stmt = *stmt_p;
ctx = new_omp_context (stmt, outer_ctx);
scan_sharing_clauses (OMP_SECTIONS_CLAUSES (stmt), ctx);
scan_omp (&OMP_SECTIONS_BODY (stmt), ctx);
@@ -1015,79 +1255,6 @@ scan_omp_single (tree *stmt_p, omp_context *outer_ctx)
layout_type (ctx->record_type);
}
-/* Similar, except this is either a parallel nested within another
- parallel, or a workshare construct nested within a nested parallel.
- In this case we want to do minimal processing, as the real work
- will be done during lowering of the function generated by the
- outermost parallel.
-
- The minimal amount of work is processing private clauses, and simply
- scanning the rest. Private clauses are the only ones that don't
- also imply a reference in the outer parallel. We must set up a
- translation lest the default behaviour in omp_copy_decl substitute
- error_mark_node. */
-
-static void
-scan_omp_nested (tree *stmt_p, omp_context *outer_ctx)
-{
- omp_context *ctx;
- tree var_sized_list = NULL;
- tree c, decl, stmt = *stmt_p;
-
- ctx = new_omp_context (stmt, outer_ctx);
- ctx->is_nested = true;
-
- for (c = OMP_CLAUSES (stmt); c ; c = OMP_CLAUSE_CHAIN (c))
- {
- switch (TREE_CODE (c))
- {
- case OMP_CLAUSE_PRIVATE:
- decl = OMP_CLAUSE_DECL (c);
- if (is_variable_sized (decl))
- var_sized_list = tree_cons (NULL, c, var_sized_list);
- OMP_CLAUSE_DECL (c) = install_var_local (decl, ctx);
- break;
-
- case OMP_CLAUSE_FIRSTPRIVATE:
- case OMP_CLAUSE_LASTPRIVATE:
- case OMP_CLAUSE_REDUCTION:
- case OMP_CLAUSE_SHARED:
- case OMP_CLAUSE_COPYPRIVATE:
- case OMP_CLAUSE_IF:
- case OMP_CLAUSE_NUM_THREADS:
- case OMP_CLAUSE_SCHEDULE:
- scan_omp (&TREE_OPERAND (c, 0), ctx->outer);
- break;
-
- case OMP_CLAUSE_COPYIN:
- case OMP_CLAUSE_NOWAIT:
- case OMP_CLAUSE_ORDERED:
- case OMP_CLAUSE_DEFAULT:
- break;
-
- default:
- gcc_unreachable ();
- }
- }
-
- /* Instantiate the VALUE_EXPR for variable sized variables. We have
- to do this as a separate pass, since we need the pointer and size
- decls installed first. */
- for (c = var_sized_list; c ; c = TREE_CHAIN (c))
- fixup_remapped_decl (OMP_CLAUSE_DECL (TREE_VALUE (c)), ctx,
- OMP_CLAUSE_PRIVATE_DEBUG (TREE_VALUE (c)));
-
- scan_omp (&OMP_BODY (stmt), ctx);
-
- if (TREE_CODE (stmt) == OMP_FOR)
- {
- scan_omp (&OMP_FOR_PRE_BODY (stmt), ctx);
- scan_omp (&OMP_FOR_INIT (stmt), ctx);
- scan_omp (&OMP_FOR_COND (stmt), ctx);
- scan_omp (&OMP_FOR_INCR (stmt), ctx);
- }
-}
-
/* Callback for walk_stmts used to scan for OpenMP directives at TP. */
@@ -1105,32 +1272,21 @@ scan_omp_1 (tree *tp, int *walk_subtrees, void *data)
switch (TREE_CODE (t))
{
case OMP_PARALLEL:
- if (++parallel_nesting_level == 1)
- scan_omp_parallel (tp, ctx);
- else
- scan_omp_nested (tp, ctx);
+ parallel_nesting_level++;
+ scan_omp_parallel (tp, ctx);
parallel_nesting_level--;
break;
case OMP_FOR:
- if (parallel_nesting_level <= 1)
- scan_omp_for (tp, ctx);
- else
- scan_omp_nested (tp, ctx);
+ scan_omp_for (tp, ctx);
break;
case OMP_SECTIONS:
- if (parallel_nesting_level <= 1)
- scan_omp_sections (tp, ctx);
- else
- scan_omp_nested (tp, ctx);
+ scan_omp_sections (tp, ctx);
break;
case OMP_SINGLE:
- if (parallel_nesting_level <= 1)
- scan_omp_single (tp, ctx);
- else
- scan_omp_nested (tp, ctx);
+ scan_omp_single (tp, ctx);
break;
case OMP_SECTION:
@@ -1147,11 +1303,7 @@ scan_omp_1 (tree *tp, int *walk_subtrees, void *data)
*walk_subtrees = 1;
for (var = BIND_EXPR_VARS (t); var ; var = TREE_CHAIN (var))
- {
- if (DECL_CONTEXT (var) == ctx->cb.src_fn)
- DECL_CONTEXT (var) = ctx->cb.dst_fn;
- insert_decl_map (&ctx->cb, var, var);
- }
+ insert_decl_map (&ctx->cb, var, var);
}
break;
@@ -1219,6 +1371,73 @@ maybe_lookup_ctx (tree stmt)
return n ? (omp_context *) n->value : NULL;
}
+
+/* Find the mapping for DECL in CTX or the immediately enclosing
+ context that has a mapping for DECL.
+
+ If CTX is a nested parallel directive, we may have to use the decl
+ mappings created in CTX's parent context. Suppose that we have the
+ following parallel nesting (variable UIDs shown for clarity):
+
+ iD.1562 = 0;
+ #omp parallel shared(iD.1562) -> outer parallel
+ iD.1562 = iD.1562 + 1;
+
+ #omp parallel shared (iD.1562) -> inner parallel
+ iD.1562 = iD.1562 - 1;
+
+ Each parallel structure will create a distinct .omp_data_s structure
+ for copying iD.1562 in/out of the directive:
+
+ outer parallel .omp_data_s.1.i -> iD.1562
+ inner parallel .omp_data_s.2.i -> iD.1562
+
+ A shared variable mapping will produce a copy-out operation before
+ the parallel directive and a copy-in operation after it. So, in
+ this case we would have:
+
+ iD.1562 = 0;
+ .omp_data_o.1.i = iD.1562;
+ #omp parallel shared(iD.1562) -> outer parallel
+ .omp_data_i.1 = &.omp_data_o.1
+ .omp_data_i.1->i = .omp_data_i.1->i + 1;
+
+ .omp_data_o.2.i = iD.1562; -> **
+ #omp parallel shared(iD.1562) -> inner parallel
+ .omp_data_i.2 = &.omp_data_o.2
+ .omp_data_i.2->i = .omp_data_i.2->i - 1;
+
+
+ ** This is a problem. The symbol iD.1562 cannot be referenced
+ inside the body of the outer parallel region. But since we are
+ emitting this copy operation while expanding the inner parallel
+ directive, we need to access the CTX structure of the outer
+ parallel directive to get the correct mapping:
+
+ .omp_data_o.2.i = .omp_data_i.1->i
+
+ Since there may be other workshare or parallel directives enclosing
+ the parallel directive, it may be necessary to walk up the context
+ parent chain. This is not a problem in general because nested
+ parallelism happens only rarely. */
+
+static tree
+lookup_decl_in_outer_ctx (tree decl, omp_context *ctx)
+{
+ tree t;
+ omp_context *up;
+
+ gcc_assert (ctx->is_nested);
+
+ for (up = ctx->outer, t = NULL; up && t == NULL; up = up->outer)
+ t = maybe_lookup_decl (decl, up);
+
+ gcc_assert (t);
+
+ return t;
+}
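At the source level, the situation the comment above describes looks roughly like the following C/OpenMP sketch (purely illustrative; the variable plays the role of iD.1562 in the example):

      int i = 0;
      #pragma omp parallel shared (i)     /* outer parallel */
      {
        i = i + 1;

        #pragma omp parallel shared (i)   /* inner parallel */
        i = i - 1;
      }

Each directive gets its own .omp_data_s record for i, and the copy-out feeding the inner directive must read the outer directive's received copy (.omp_data_i.1->i) rather than the original i, which is exactly the mapping lookup_decl_in_outer_ctx resolves.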
+
+
/* Construct the initialization value for reduction CLAUSE. */
tree
@@ -1291,7 +1510,7 @@ omp_reduction_init (tree clause, tree type)
to destructors go in DLIST. */
static void
-expand_rec_input_clauses (tree clauses, tree *ilist, tree *dlist,
+lower_rec_input_clauses (tree clauses, tree *ilist, tree *dlist,
omp_context *ctx)
{
tree_stmt_iterator diter;
@@ -1340,11 +1559,11 @@ expand_rec_input_clauses (tree clauses, tree *ilist, tree *dlist,
if (pass != 0)
continue;
}
- /* For variable sized types, we need to allocate the actual
- storage here. Call alloca and store the result in the pointer
- decl that we created elsewhere. */
else if (is_variable_sized (var))
{
+ /* For variable sized types, we need to allocate the
+ actual storage here. Call alloca and store the
+ result in the pointer decl that we created elsewhere. */
if (pass == 0)
continue;
@@ -1361,14 +1580,15 @@ expand_rec_input_clauses (tree clauses, tree *ilist, tree *dlist,
x = build2 (MODIFY_EXPR, void_type_node, ptr, x);
gimplify_and_add (x, ilist);
}
- /* For references that are being privatized for Fortran, allocate
- new backing storage for the new pointer variable. This allows
- us to avoid changing all the code that expects a pointer to
- something that expects a direct variable. Note that this
- doesn't apply to C++, since reference types are disallowed in
- data sharing clauses there. */
else if (is_reference (var))
{
+ /* For references that are being privatized for Fortran,
+ allocate new backing storage for the new pointer
+ variable. This allows us to avoid changing all the
+ code that expects a pointer to something that expects
+ a direct variable. Note that this doesn't apply to
+ C++, since reference types are disallowed in data
+ sharing clauses there. */
if (pass == 0)
continue;
@@ -1501,12 +1721,13 @@ expand_rec_input_clauses (tree clauses, tree *ilist, tree *dlist,
build_omp_barrier (ilist);
}
+
/* Generate code to implement the LASTPRIVATE clauses. This is used for
both parallel and workshare constructs. PREDICATE may be NULL if it's
always true. */
static void
-expand_lastprivate_clauses (tree clauses, tree predicate, tree *stmt_list,
+lower_lastprivate_clauses (tree clauses, tree predicate, tree *stmt_list,
omp_context *ctx)
{
tree sub_list, x, c;
@@ -1554,13 +1775,15 @@ expand_lastprivate_clauses (tree clauses, tree predicate, tree *stmt_list,
x = build3 (COND_EXPR, void_type_node, predicate, sub_list, NULL);
else
x = sub_list;
+
gimplify_and_add (x, stmt_list);
}
+
/* Generate code to implement the REDUCTION clauses. */
static void
-expand_reduction_clauses (tree clauses, tree *stmt_list, omp_context *ctx)
+lower_reduction_clauses (tree clauses, tree *stmt_list, omp_context *ctx)
{
tree sub_list = NULL, x, c;
int count = 0;
@@ -1596,8 +1819,9 @@ expand_reduction_clauses (tree clauses, tree *stmt_list, omp_context *ctx)
new_var = build_fold_indirect_ref (new_var);
ref = build_outer_var_ref (var, ctx);
code = OMP_CLAUSE_REDUCTION_CODE (c);
- /* reduction(-:var) sums up the partial results, so it acts identically
- to reduction(+:var). */
+
+ /* reduction(-:var) sums up the partial results, so it acts
+ identically to reduction(+:var). */
if (code == MINUS_EXPR)
code = PLUS_EXPR;
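A small worked example (numbers invented for illustration) of why this substitution is safe: with reduction(-:x), each thread's private copy starts at 0 and accumulates its subtractions, so the partial results are already negated and are folded back into the original value with +:

      x = 10;  thread 0 computes  0 - 2 - 3 = -5;  thread 1 computes  0 - 4 = -4
      combined with PLUS_EXPR:  x = 10 + (-5) + (-4) = 1  ==  10 - 2 - 3 - 4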
@@ -1645,10 +1869,11 @@ expand_reduction_clauses (tree clauses, tree *stmt_list, omp_context *ctx)
gimplify_and_add (x, stmt_list);
}
+
/* Generate code to implement the COPYPRIVATE clauses. */
static void
-expand_copyprivate_clauses (tree clauses, tree *slist, tree *rlist,
+lower_copyprivate_clauses (tree clauses, tree *slist, tree *rlist,
omp_context *ctx)
{
tree c;
@@ -1665,7 +1890,8 @@ expand_copyprivate_clauses (tree clauses, tree *slist, tree *rlist,
by_ref = use_pointer_for_field (var, false);
ref = build_sender_ref (var, ctx);
- x = by_ref ? build_fold_addr_expr (var) : var;
+ x = (ctx->is_nested) ? lookup_decl_in_outer_ctx (var, ctx) : var;
+ x = by_ref ? build_fold_addr_expr (x) : x;
x = build2 (MODIFY_EXPR, void_type_node, ref, x);
gimplify_and_add (x, slist);
@@ -1680,17 +1906,18 @@ expand_copyprivate_clauses (tree clauses, tree *slist, tree *rlist,
}
}
+
/* Generate code to implement the clauses, FIRSTPRIVATE, COPYIN, LASTPRIVATE,
and REDUCTION from the sender (aka parent) side. */
static void
-expand_send_clauses (tree clauses, tree *ilist, tree *olist, omp_context *ctx)
+lower_send_clauses (tree clauses, tree *ilist, tree *olist, omp_context *ctx)
{
tree c;
for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
{
- tree val, ref, x;
+ tree val, ref, x, var;
bool by_ref, do_in = false, do_out = false;
switch (TREE_CODE (c))
@@ -1704,7 +1931,10 @@ expand_send_clauses (tree clauses, tree *ilist, tree *olist, omp_context *ctx)
continue;
}
- val = OMP_CLAUSE_DECL (c);
+ var = val = OMP_CLAUSE_DECL (c);
+ if (ctx->is_nested)
+ var = lookup_decl_in_outer_ctx (val, ctx);
+
if (is_variable_sized (val))
continue;
by_ref = use_pointer_for_field (val, false);
@@ -1739,14 +1969,15 @@ expand_send_clauses (tree clauses, tree *ilist, tree *olist, omp_context *ctx)
if (do_in)
{
ref = build_sender_ref (val, ctx);
- x = by_ref ? build_fold_addr_expr (val) : val;
+ x = by_ref ? build_fold_addr_expr (var) : var;
x = build2 (MODIFY_EXPR, void_type_node, ref, x);
gimplify_and_add (x, ilist);
}
+
if (do_out)
{
ref = build_sender_ref (val, ctx);
- x = build2 (MODIFY_EXPR, void_type_node, val, ref);
+ x = build2 (MODIFY_EXPR, void_type_node, var, ref);
gimplify_and_add (x, olist);
}
}
@@ -1757,13 +1988,13 @@ expand_send_clauses (tree clauses, tree *ilist, tree *olist, omp_context *ctx)
got automatically shared. */
static void
-expand_send_shared_vars (tree *ilist, tree *olist, omp_context *ctx)
+lower_send_shared_vars (tree *ilist, tree *olist, omp_context *ctx)
{
- tree ovar, nvar, f, x;
+ tree var, ovar, nvar, f, x;
if (ctx->record_type == NULL)
return;
-
+
for (f = TYPE_FIELDS (ctx->record_type); f ; f = TREE_CHAIN (f))
{
ovar = DECL_ABSTRACT_ORIGIN (f);
@@ -1771,33 +2002,69 @@ expand_send_shared_vars (tree *ilist, tree *olist, omp_context *ctx)
if (!nvar || !DECL_HAS_VALUE_EXPR_P (nvar))
continue;
+ var = ovar;
+
+      /* If CTX is a nested parallel directive, find the immediately
+ enclosing parallel or workshare construct that contains a
+ mapping for OVAR. */
+ if (ctx->is_nested)
+ var = lookup_decl_in_outer_ctx (ovar, ctx);
+
if (use_pointer_for_field (ovar, true))
{
x = build_sender_ref (ovar, ctx);
- ovar = build_fold_addr_expr (ovar);
- x = build2 (MODIFY_EXPR, void_type_node, x, ovar);
+ var = build_fold_addr_expr (var);
+ x = build2 (MODIFY_EXPR, void_type_node, x, var);
gimplify_and_add (x, ilist);
}
else
{
x = build_sender_ref (ovar, ctx);
- x = build2 (MODIFY_EXPR, void_type_node, x, ovar);
+ x = build2 (MODIFY_EXPR, void_type_node, x, var);
gimplify_and_add (x, ilist);
x = build_sender_ref (ovar, ctx);
- x = build2 (MODIFY_EXPR, void_type_node, ovar, x);
+ x = build2 (MODIFY_EXPR, void_type_node, var, x);
gimplify_and_add (x, olist);
}
}
}
/* Build the function calls to GOMP_parallel_start etc to actually
- generate the parallel operation. */
+ generate the parallel operation. REGION is the parallel region
+   being expanded.  BB is the block into which the code should be
+   inserted.  WS_ARGS will be set if this is a call to a combined
+   parallel+workshare construct; it contains the list of additional
+   arguments needed by the workshare construct.  */
static void
-build_parallel_call (tree clauses, tree *stmt_list, omp_context *ctx)
+expand_parallel_call (struct omp_region *region, basic_block bb, tree ws_args)
{
- tree t, args, val, cond, c;
+ tree t, args, val, cond, c, list, clauses;
+ block_stmt_iterator si;
+ int start_ix;
+
+ clauses = OMP_PARALLEL_CLAUSES (region->entry);
+ push_gimplify_context ();
+
+ /* Determine what flavour of GOMP_parallel_start we will be
+ emitting. */
+ start_ix = BUILT_IN_GOMP_PARALLEL_START;
+ if (is_combined_parallel (region))
+ {
+ tree stmt = region->inner->entry;
+
+ if (TREE_CODE (stmt) == OMP_FOR)
+ {
+ struct omp_for_data fd;
+ extract_omp_for_data (stmt, &fd);
+ start_ix = BUILT_IN_GOMP_PARALLEL_LOOP_STATIC_START + fd.sched_kind;
+ }
+ else if (TREE_CODE (stmt) == OMP_SECTIONS)
+ start_ix = BUILT_IN_GOMP_PARALLEL_SECTIONS_START;
+ else
+ gcc_unreachable ();
+ }
/* By default, the value of NUM_THREADS is zero (selected at run time)
and there is no conditional. */
@@ -1819,43 +2086,103 @@ build_parallel_call (tree clauses, tree *stmt_list, omp_context *ctx)
(cond != 0) or (cond ? val : 1u). */
if (cond)
{
+ block_stmt_iterator si;
+
+ cond = gimple_boolify (cond);
+
if (integer_zerop (val))
val = build2 (EQ_EXPR, unsigned_type_node, cond,
build_int_cst (TREE_TYPE (cond), 0));
else
- val = build3 (COND_EXPR, unsigned_type_node, cond, val,
- build_int_cst (unsigned_type_node, 1));
+ {
+ basic_block cond_bb, then_bb, else_bb;
+ edge e;
+ tree t, then_lab, else_lab, tmp;
+
+ tmp = create_tmp_var (TREE_TYPE (val), NULL);
+ e = split_block (bb, NULL);
+ cond_bb = e->src;
+ bb = e->dest;
+ remove_edge (e);
+
+ then_bb = create_empty_bb (cond_bb);
+ else_bb = create_empty_bb (then_bb);
+ then_lab = create_artificial_label ();
+ else_lab = create_artificial_label ();
+
+ t = build3 (COND_EXPR, void_type_node,
+ cond,
+ build_and_jump (&then_lab),
+ build_and_jump (&else_lab));
+
+ si = bsi_start (cond_bb);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+
+ si = bsi_start (then_bb);
+ t = build1 (LABEL_EXPR, void_type_node, then_lab);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+ t = build2 (MODIFY_EXPR, void_type_node, tmp, val);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+
+ si = bsi_start (else_bb);
+ t = build1 (LABEL_EXPR, void_type_node, else_lab);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+ t = build2 (MODIFY_EXPR, void_type_node, tmp,
+ build_int_cst (unsigned_type_node, 1));
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+
+ make_edge (cond_bb, then_bb, EDGE_TRUE_VALUE);
+ make_edge (cond_bb, else_bb, EDGE_FALSE_VALUE);
+ make_edge (then_bb, bb, EDGE_FALLTHRU);
+ make_edge (else_bb, bb, EDGE_FALLTHRU);
+
+ val = tmp;
+ }
+
+ list = NULL_TREE;
+ val = get_formal_tmp_var (val, &list);
+ si = bsi_start (bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
}
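For instance, '#pragma omp parallel if (c) num_threads (n)' ends up computing the thread-count argument as 'c ? n : 1' through the small diamond built above; in pseudo-GIMPLE (temporary and label names are made up here):

      if (c) goto then_lab; else goto else_lab;
    then_lab:
      tmp = n;        /* use the requested team size */
      goto after;
    else_lab:
      tmp = 1;        /* IF clause false: run with a single thread */
    after:
      val = tmp;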
+ list = NULL_TREE;
args = tree_cons (NULL, val, NULL);
- t = ctx->sender_decl;
+ t = OMP_PARALLEL_DATA_ARG (region->entry);
if (t == NULL)
t = null_pointer_node;
else
t = build_fold_addr_expr (t);
args = tree_cons (NULL, t, args);
- t = build_fold_addr_expr (ctx->cb.dst_fn);
+ t = build_fold_addr_expr (OMP_PARALLEL_FN (region->entry));
args = tree_cons (NULL, t, args);
- if (ctx->parallel_start_additional_args)
- args = chainon (args, ctx->parallel_start_additional_args);
- t = built_in_decls[ctx->parallel_start_ix];
+
+ if (ws_args)
+ args = chainon (args, ws_args);
+
+ t = built_in_decls[start_ix];
t = build_function_call_expr (t, args);
- gimplify_and_add (t, stmt_list);
+ gimplify_and_add (t, &list);
- t = ctx->sender_decl;
+ t = OMP_PARALLEL_DATA_ARG (region->entry);
if (t == NULL)
t = null_pointer_node;
else
t = build_fold_addr_expr (t);
args = tree_cons (NULL, t, NULL);
- t = build_function_call_expr (ctx->cb.dst_fn, args);
- gimplify_and_add (t, stmt_list);
+ t = build_function_call_expr (OMP_PARALLEL_FN (region->entry), args);
+ gimplify_and_add (t, &list);
t = built_in_decls[BUILT_IN_GOMP_PARALLEL_END];
t = build_function_call_expr (t, NULL);
- gimplify_and_add (t, stmt_list);
+ gimplify_and_add (t, &list);
+
+ si = bsi_last (bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ pop_gimplify_context (NULL_TREE);
}
+
/* If exceptions are enabled, wrap *STMT_P in a MUST_NOT_THROW catch
handler. This prevents programs from violating the structured
block semantics with throws. */
@@ -1886,112 +2213,185 @@ maybe_catch_exception (tree *stmt_p)
append_to_statement_list (t, stmt_p);
}
+/* Chain all the DECLs in LIST by their TREE_CHAIN fields. */
-/* Expand the OpenMP parallel directive pointed to by STMT_P. CTX
- holds context information for *STMT_P. Expansion proceeds in
- two main phases:
-
- (1) The body of the parallel is expanded in-situ.
- All the input and reduction clauses are expanded (from the
- child's perspective). The body of the parallel is then
- inserted as the body of CTX->CB.DST_FUN (the function spawned
- to execute each child thread).
-
- (2) Back in the original function, the original body of the
- directive is replaced with the expansion of clauses (from the
- parent's perspective), and the thread library call to launch
- all the children threads. */
-
-static void
-expand_omp_parallel (tree *stmt_p, omp_context *ctx)
+static tree
+list2chain (tree list)
{
- tree clauses, block, bind, body, olist;
-
- current_function_decl = ctx->cb.dst_fn;
- cfun = DECL_STRUCT_FUNCTION (current_function_decl);
-
- push_gimplify_context ();
-
- /* First phase. Expand the body of the children threads, emit
- receiving code for data copying clauses. */
- clauses = OMP_PARALLEL_CLAUSES (*stmt_p);
- bind = OMP_PARALLEL_BODY (*stmt_p);
- block = BIND_EXPR_BLOCK (bind);
- body = BIND_EXPR_BODY (bind);
- BIND_EXPR_BODY (bind) = alloc_stmt_list ();
-
- expand_rec_input_clauses (clauses, &BIND_EXPR_BODY (bind), &olist, ctx);
-
- expand_omp (&body, ctx);
- append_to_statement_list (body, &BIND_EXPR_BODY (bind));
+ tree t;
- expand_reduction_clauses (clauses, &BIND_EXPR_BODY (bind), ctx);
- append_to_statement_list (olist, &BIND_EXPR_BODY (bind));
- maybe_catch_exception (&BIND_EXPR_BODY (bind));
+ for (t = list; t; t = TREE_CHAIN (t))
+ {
+ tree var = TREE_VALUE (t);
+ if (TREE_CHAIN (t))
+ TREE_CHAIN (var) = TREE_VALUE (TREE_CHAIN (t));
+ else
+ TREE_CHAIN (var) = NULL_TREE;
+ }
- pop_gimplify_context (bind);
- BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
- BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+ return list ? TREE_VALUE (list) : NULL_TREE;
+}
- DECL_INITIAL (ctx->cb.dst_fn) = block;
- DECL_SAVED_TREE (ctx->cb.dst_fn) = bind;
- cgraph_add_new_function (ctx->cb.dst_fn);
- current_function_decl = ctx->cb.src_fn;
- cfun = DECL_STRUCT_FUNCTION (current_function_decl);
+/* Remove barriers in REGION->EXIT's block. Note that this is only
+ valid for OMP_PARALLEL regions. Since the end of a parallel region
+   is an implicit barrier, any barrier that a workshare inside the
+   OMP_PARALLEL left at the end of the OMP_PARALLEL region can now
+   be removed.  */
- block = make_node (BLOCK);
- bind = build3 (BIND_EXPR, void_type_node, NULL, NULL, block);
- *stmt_p = bind;
+static void
+remove_exit_barrier (struct omp_region *region)
+{
+ block_stmt_iterator si;
+ basic_block exit_bb;
+ tree t;
- push_gimplify_context ();
+ gcc_assert (TREE_CODE (region->entry) == OMP_PARALLEL);
- /* Second phase. Build the sender decl now that we're in the
- correct context. Replace the original body of the directive with
- sending code for data copying clauses and the parallel call to
- launch children threads. */
- if (ctx->record_type)
- ctx->sender_decl = create_tmp_var (ctx->record_type, ".omp_data_o");
+ exit_bb = bb_for_stmt (region->exit);
- olist = NULL;
- expand_send_clauses (clauses, &BIND_EXPR_BODY (bind), &olist, ctx);
- expand_send_shared_vars (&BIND_EXPR_BODY (bind), &olist, ctx);
- build_parallel_call (clauses, &BIND_EXPR_BODY (bind), ctx);
- append_to_statement_list (olist, &BIND_EXPR_BODY (bind));
+ /* The barrier should be immediately before OMP_RETURN_EXPR.
+ Otherwise, we cannot remove it. */
+ si = bsi_last (exit_bb);
+ t = bsi_stmt (si);
+ gcc_assert (TREE_CODE (t) == OMP_RETURN_EXPR);
+ bsi_prev (&si);
+ if (bsi_end_p (si))
+ return;
- pop_gimplify_context (bind);
- BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+ t = bsi_stmt (si);
+ if (TREE_CODE (t) == CALL_EXPR
+ && get_callee_fndecl (t) == built_in_decls[BUILT_IN_GOMP_BARRIER])
+ bsi_remove (&si, true);
}
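As an illustration of the barrier this deletes (function and variable names are made up):

      #pragma omp parallel
      {
        #pragma omp for            /* the loop's implicit barrier lands here... */
        for (i = 0; i < n; i++)
          work (i);
      }                            /* ...right before the parallel region's own
                                      implicit barrier, so it is redundant.  */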
-/* A subroutine of expand_omp_for_1. Generate code to emit the
- for for a lastprivate clause. Given a loop control predicate
- of (V cond N2), we gate the clause on (!(V cond N2)). */
+
+/* Expand the OpenMP parallel directive starting at REGION. */
static void
-expand_omp_for_lastprivate (struct expand_omp_for_data *fd)
+expand_omp_parallel (struct omp_region *region)
{
- tree clauses, cond;
- enum tree_code cond_code;
-
- cond_code = fd->cond_code;
- cond_code = cond_code == LT_EXPR ? GE_EXPR : LE_EXPR;
+ basic_block entry_bb, exit_bb, new_bb;
+ struct function *child_cfun, *saved_cfun;
+ tree child_fn, block, t, ws_args;
+ block_stmt_iterator si;
+ edge e;
+
+ child_fn = OMP_PARALLEL_FN (region->entry);
+ child_cfun = DECL_STRUCT_FUNCTION (child_fn);
+ saved_cfun = cfun;
+
+ entry_bb = bb_for_stmt (region->entry);
+ exit_bb = bb_for_stmt (region->exit);
+
+ /* Barriers at the end of the function are not necessary and can be
+ removed. Since the caller will have a barrier of its own, this
+ one is superfluous. */
+ remove_exit_barrier (region);
+
+ if (is_combined_parallel (region))
+ ws_args = region->ws_args;
+ else
+ ws_args = NULL_TREE;
- /* When possible, use a strict equality expression. This can let VRP
- type optimizations deduce the value and remove a copy. */
- if (host_integerp (fd->step, 0))
+ if (DECL_STRUCT_FUNCTION (OMP_PARALLEL_FN (region->entry))->cfg)
{
- HOST_WIDE_INT step = TREE_INT_CST_LOW (fd->step);
- if (step == 1 || step == -1)
- cond_code = EQ_EXPR;
+ /* Due to inlining, it may happen that we have already outlined
+ the region, in which case all we need to do is make the
+ sub-graph unreachable and emit the parallel call. */
+ edge entry_succ_e, exit_succ_e;
+ block_stmt_iterator si;
+
+ entry_succ_e = single_succ_edge (entry_bb);
+ exit_succ_e = single_succ_edge (exit_bb);
+
+ si = bsi_last (entry_bb);
+ gcc_assert (!bsi_end_p (si) && TREE_CODE (bsi_stmt (si)) == OMP_PARALLEL);
+ bsi_remove (&si, true);
+
+ new_bb = entry_bb;
+ remove_edge (entry_succ_e);
+ make_edge (new_bb, exit_succ_e->dest, EDGE_FALLTHRU);
}
+ else
+ {
+ /* If the parallel region needs data sent from the parent
+ function, then the very first statement of the parallel body
+ is a copy assignment .OMP_DATA_I = &.OMP_DATA_O. Since
+ &.OMP_DATA_O is passed as an argument to the child function,
+ we need to replace it with the argument as seen by the child
+ function.
+
+ In most cases, this will end up being the identity assignment
+ .OMP_DATA_I = .OMP_DATA_I. However, if the parallel body had
+ a function call that has been inlined, the original PARM_DECL
+ .OMP_DATA_I may have been converted into a different local
+	 variable, in which case we need to keep the assignment.  */
+ if (OMP_PARALLEL_DATA_ARG (region->entry))
+ {
+ basic_block entry_succ_bb = single_succ (entry_bb);
+ block_stmt_iterator si = bsi_start (entry_succ_bb);
+ tree stmt;
- cond = build2 (cond_code, boolean_type_node, fd->v, fd->n2);
+ gcc_assert (!bsi_end_p (si));
- clauses = OMP_FOR_CLAUSES (fd->for_stmt);
- expand_lastprivate_clauses (clauses, cond, &fd->pre, fd->ctx);
+ stmt = bsi_stmt (si);
+ gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR
+ && TREE_CODE (TREE_OPERAND (stmt, 1)) == ADDR_EXPR
+ && TREE_OPERAND (TREE_OPERAND (stmt, 1), 0)
+ == OMP_PARALLEL_DATA_ARG (region->entry));
+
+ if (TREE_OPERAND (stmt, 0) == DECL_ARGUMENTS (child_fn))
+ bsi_remove (&si, true);
+ else
+ TREE_OPERAND (stmt, 1) = DECL_ARGUMENTS (child_fn);
+ }
+
+ /* Declare local variables needed in CHILD_CFUN. */
+ block = DECL_INITIAL (child_fn);
+ BLOCK_VARS (block) = list2chain (child_cfun->unexpanded_var_list);
+ DECL_SAVED_TREE (child_fn) = single_succ (entry_bb)->stmt_list;
+
+ /* Reset DECL_CONTEXT on locals and function arguments. */
+ for (t = BLOCK_VARS (block); t; t = TREE_CHAIN (t))
+ DECL_CONTEXT (t) = child_fn;
+
+ for (t = DECL_ARGUMENTS (child_fn); t; t = TREE_CHAIN (t))
+ DECL_CONTEXT (t) = child_fn;
+
+ /* Split ENTRY_BB at OMP_PARALLEL so that it can be moved to the
+ child function. */
+ si = bsi_last (entry_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_PARALLEL);
+ bsi_remove (&si, true);
+ e = split_block (entry_bb, t);
+ entry_bb = e->dest;
+ single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
+
+ /* Move the parallel region into CHILD_CFUN. We need to reset
+ dominance information because the expansion of the inner
+ regions has invalidated it. */
+ free_dominance_info (CDI_DOMINATORS);
+ new_bb = move_sese_region_to_fn (child_cfun, entry_bb, exit_bb);
+ single_succ_edge (new_bb)->flags = EDGE_FALLTHRU;
+ cgraph_add_new_function (child_fn);
+
+ /* Convert OMP_RETURN into a RETURN_EXPR. */
+ si = bsi_last (exit_bb);
+ gcc_assert (!bsi_end_p (si)
+ && TREE_CODE (bsi_stmt (si)) == OMP_RETURN_EXPR);
+ t = build1 (RETURN_EXPR, void_type_node, NULL);
+ bsi_insert_after (&si, t, TSI_SAME_STMT);
+ bsi_remove (&si, true);
+ }
+
+ /* Emit a library call to launch the children threads. */
+ expand_parallel_call (region, new_bb, ws_args);
}
-/* A subroutine of expand_omp_for_1. Generate code for a parallel
+
+/* A subroutine of expand_omp_for. Generate code for a parallel
loop with any schedule. Given parameters:
for (V = N1; V cond N2; V += STEP) BODY;
@@ -1999,44 +2399,34 @@ expand_omp_for_lastprivate (struct expand_omp_for_data *fd)
where COND is "<" or ">", we generate pseudocode
more = GOMP_loop_foo_start (N1, N2, STEP, CHUNK, &istart0, &iend0);
- if (more) goto L0; else goto L2;
+ if (more) goto L0; else goto L3;
L0:
V = istart0;
iend = iend0;
L1:
BODY;
V += STEP;
- if (V cond iend) goto L1;
- more = GOMP_loop_foo_next (&istart0, &iend0);
- if (more) goto L0;
- lastprivate;
+ if (V cond iend) goto L1; else goto L2;
L2:
+ if (GOMP_loop_foo_next (&istart0, &iend0)) goto L0; else goto L3;
+ L3:
- If this is a combined omp parallel loop, we can skip the call
- to GOMP_loop_foo_start and generate
+ If this is a combined omp parallel loop, instead of the call to
+ GOMP_loop_foo_start, we emit 'goto L3'. */
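In terms of the libgomp entry points that START_FN and NEXT_FN resolve to, e.g. for schedule(dynamic), the pseudocode above behaves like the following sketch (chunk handling and integer types simplified):

      if (GOMP_loop_dynamic_start (N1, N2, STEP, CHUNK, &istart0, &iend0))
        do
          for (V = istart0; V cond iend0; V += STEP)        /* L0, L1 */
            BODY;
        while (GOMP_loop_dynamic_next (&istart0, &iend0));   /* L2 */
      /* L3 */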
- L0:
- if (!GOMP_loop_foo_next (&istart0, &iend0)) goto L2;
- V = istart0;
- iend = iend0;
- L1:
- BODY;
- V += STEP;
- if (V cond iend) goto L1;
- goto L0;
- L2:
- lastprivate;
-*/
-
-static void
-expand_omp_for_generic (struct expand_omp_for_data *fd,
+static basic_block
+expand_omp_for_generic (struct omp_region *region,
+ struct omp_for_data *fd,
enum built_in_function start_fn,
enum built_in_function next_fn)
{
- tree l0, l1, l2;
+ tree l0, l1, l2, l3;
tree type, istart0, iend0, iend;
- tree t, args;
- bool in_combined_parallel = is_in_combined_parallel_ctx (fd->ctx);
+ tree t, args, list;
+ basic_block entry_bb, exit_bb, l0_bb, l1_bb, l2_bb;
+ edge exit_edge;
+ block_stmt_iterator si;
+ bool in_combined_parallel = is_combined_parallel (region);
type = TREE_TYPE (fd->v);
@@ -2046,25 +2436,22 @@ expand_omp_for_generic (struct expand_omp_for_data *fd,
l0 = create_artificial_label ();
l1 = create_artificial_label ();
l2 = create_artificial_label ();
+ l3 = create_artificial_label ();
iend = create_tmp_var (type, NULL);
- /* If this is a combined parallel loop, skip the call to
- GOMP_loop_foo_start and call GOMP_loop_foo_next directly. */
- if (in_combined_parallel)
- {
- t = build1 (LABEL_EXPR, void_type_node, l0);
- gimplify_and_add (t, &fd->pre);
- t = build_fold_addr_expr (iend0);
- args = tree_cons (NULL, t, NULL);
- t = build_fold_addr_expr (istart0);
- args = tree_cons (NULL, t, args);
- t = build_function_call_expr (built_in_decls[next_fn], args);
- t = build1 (TRUTH_NOT_EXPR, TREE_TYPE (t), t);
- t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l2), NULL);
- gimplify_and_add (t, &fd->pre);
- }
- else
+ entry_bb = bb_for_stmt (region->entry);
+ l1_bb = single_succ (entry_bb);
+ exit_bb = bb_for_stmt (region->exit);
+
+ si = bsi_last (entry_bb);
+ gcc_assert (bsi_stmt (si) && TREE_CODE (bsi_stmt (si)) == OMP_FOR);
+ bsi_remove (&si, true);
+ list = alloc_stmt_list ();
+
+ if (!in_combined_parallel)
{
+ /* If this is not a combined parallel loop, emit a call to
+ GOMP_loop_foo_start in ENTRY_BB. */
t = build_fold_addr_expr (iend0);
args = tree_cons (NULL, t, NULL);
t = build_fold_addr_expr (istart0);
@@ -2081,61 +2468,109 @@ expand_omp_for_generic (struct expand_omp_for_data *fd,
t = fold_convert (long_integer_type_node, fd->n1);
args = tree_cons (NULL, t, args);
t = build_function_call_expr (built_in_decls[start_fn], args);
- t = build3 (COND_EXPR, void_type_node, t,
- build_and_jump (&l0), build_and_jump (&l2));
- gimplify_and_add (t, &fd->pre);
- t = build1 (LABEL_EXPR, void_type_node, l0);
- gimplify_and_add (t, &fd->pre);
+ t = get_formal_tmp_var (t, &list);
+ t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l0),
+ build_and_jump (&l3));
+ append_to_statement_list (t, &list);
+ si = bsi_last (entry_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
}
+ /* Iteration setup for sequential loop goes in L0_BB. */
+ list = alloc_stmt_list ();
+ t = build1 (LABEL_EXPR, void_type_node, l0);
+ gimplify_and_add (t, &list);
+
t = fold_convert (type, istart0);
t = build2 (MODIFY_EXPR, void_type_node, fd->v, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = fold_convert (type, iend0);
t = build2 (MODIFY_EXPR, void_type_node, iend, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
- t = build1 (LABEL_EXPR, void_type_node, l1);
- gimplify_and_add (t, &fd->pre);
+ l0_bb = create_empty_bb (entry_bb);
+ si = bsi_start (l0_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ /* Loop body goes in L1_BB. */
+ list = alloc_stmt_list ();
+ si = bsi_start (l1_bb);
+ bsi_insert_before (&si, build1 (LABEL_EXPR, void_type_node, l1),
+ BSI_CONTINUE_LINKING);
- append_to_statement_list (OMP_FOR_BODY (fd->for_stmt), &fd->pre);
+ /* Code to control the increment and predicate for the sequential
+ loop goes in the first half of EXIT_BB (we split EXIT_BB so
+ that we can inherit all the edges going out of the loop
+ body). */
+ list = alloc_stmt_list ();
t = build2 (PLUS_EXPR, type, fd->v, fd->step);
t = build2 (MODIFY_EXPR, void_type_node, fd->v, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = build2 (fd->cond_code, boolean_type_node, fd->v, iend);
- t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l1), NULL);
- gimplify_and_add (t, &fd->pre);
+ t = get_formal_tmp_var (t, &list);
+ t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l1),
+ build_and_jump (&l2));
+ append_to_statement_list (t, &list);
+
+ si = bsi_last (exit_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_RETURN_EXPR);
+ bsi_remove (&si, true);
+ exit_edge = split_block (exit_bb, t);
+ exit_edge->flags = EDGE_FALSE_VALUE;
+
+ si = bsi_last (exit_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ /* Emit code to get the next parallel iteration in L2_BB. */
+ list = alloc_stmt_list ();
+ t = build1 (LABEL_EXPR, void_type_node, l2);
+ gimplify_and_add (t, &list);
- /* If emitting a combined parallel loop, we only need to emit a jump
- back to L0 to call GOMP_loop_foo_next again. */
+ t = build_fold_addr_expr (iend0);
+ args = tree_cons (NULL, t, NULL);
+ t = build_fold_addr_expr (istart0);
+ args = tree_cons (NULL, t, args);
+ t = build_function_call_expr (built_in_decls[next_fn], args);
+ t = get_formal_tmp_var (t, &list);
+ t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l0),
+ build_and_jump (&l3));
+ append_to_statement_list (t, &list);
+
+ l2_bb = exit_edge->dest;
+ si = bsi_start (l2_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ /* Insert exit label on EXIT_EDGE. */
+ exit_edge = single_succ_edge (l2_bb);
+ t = build1 (LABEL_EXPR, void_type_node, l3);
+ bsi_insert_on_edge_immediate (exit_edge, t);
+ exit_edge->flags = EDGE_FALSE_VALUE;
+
+ /* Connect the new blocks. */
+ remove_edge (single_succ_edge (entry_bb));
if (in_combined_parallel)
- {
- t = build_and_jump (&l0);
- gimplify_and_add (t, &fd->pre);
- }
+ make_edge (entry_bb, l2_bb, EDGE_FALLTHRU);
else
{
- t = build_fold_addr_expr (iend0);
- args = tree_cons (NULL, t, NULL);
- t = build_fold_addr_expr (istart0);
- args = tree_cons (NULL, t, args);
- t = build_function_call_expr (built_in_decls[next_fn], args);
- t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l0), NULL);
- gimplify_and_add (t, &fd->pre);
+ make_edge (entry_bb, l0_bb, EDGE_TRUE_VALUE);
+ make_edge (entry_bb, exit_edge->dest, EDGE_FALSE_VALUE);
}
- expand_omp_for_lastprivate (fd);
-
- t = build1 (LABEL_EXPR, void_type_node, l2);
- gimplify_and_add (t, &fd->pre);
+ make_edge (l0_bb, l1_bb, EDGE_FALLTHRU);
+ make_edge (exit_bb, l1_bb, EDGE_TRUE_VALUE);
+ make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
+
+ return exit_edge->dest;
}
-/* A subroutine of expand_omp_for_1. Generate code for a parallel
- loop with static schedule and no specified chunk size. Given parameters:
+/* A subroutine of expand_omp_for. Generate code for a parallel
+ loop with static schedule and no specified chunk size. Given
+ parameters:
for (V = N1; V cond N2; V += STEP) BODY;
@@ -2158,15 +2593,18 @@ expand_omp_for_generic (struct expand_omp_for_data *fd,
BODY;
V += STEP;
if (V cond e) goto L1;
- lastprivate;
L2:
*/
-static void
-expand_omp_for_static_nochunk (struct expand_omp_for_data *fd)
+static basic_block
+expand_omp_for_static_nochunk (struct omp_region *region,
+ struct omp_for_data *fd)
{
tree l0, l1, l2, n, q, s0, e0, e, t, nthreads, threadid;
- tree type, utype;
+ tree type, utype, list;
+ basic_block entry_bb, exit_bb, seq_start_bb, body_bb, new_exit_bb;
+ block_stmt_iterator si;
+ edge exit_edge;
l0 = create_artificial_label ();
l1 = create_artificial_label ();
@@ -2175,27 +2613,33 @@ expand_omp_for_static_nochunk (struct expand_omp_for_data *fd)
type = TREE_TYPE (fd->v);
utype = lang_hooks.types.unsigned_type (type);
+ entry_bb = bb_for_stmt (region->entry);
+ body_bb = single_succ (entry_bb);
+ exit_bb = bb_for_stmt (region->exit);
+
+ /* Iteration space partitioning goes in ENTRY_BB. */
+ list = alloc_stmt_list ();
t = built_in_decls[BUILT_IN_OMP_GET_NUM_THREADS];
t = build_function_call_expr (t, NULL);
t = fold_convert (utype, t);
- nthreads = get_formal_tmp_var (t, &fd->pre);
+ nthreads = get_formal_tmp_var (t, &list);
t = built_in_decls[BUILT_IN_OMP_GET_THREAD_NUM];
t = build_function_call_expr (t, NULL);
t = fold_convert (utype, t);
- threadid = get_formal_tmp_var (t, &fd->pre);
+ threadid = get_formal_tmp_var (t, &list);
fd->n1 = fold_convert (type, fd->n1);
if (!is_gimple_val (fd->n1))
- fd->n1 = get_formal_tmp_var (fd->n1, &fd->pre);
+ fd->n1 = get_formal_tmp_var (fd->n1, &list);
fd->n2 = fold_convert (type, fd->n2);
if (!is_gimple_val (fd->n2))
- fd->n2 = get_formal_tmp_var (fd->n2, &fd->pre);
+ fd->n2 = get_formal_tmp_var (fd->n2, &list);
fd->step = fold_convert (type, fd->step);
if (!is_gimple_val (fd->step))
- fd->step = get_formal_tmp_var (fd->step, &fd->pre);
+ fd->step = get_formal_tmp_var (fd->step, &list);
t = build_int_cst (type, (fd->cond_code == LT_EXPR ? -1 : 1));
t = fold_build2 (PLUS_EXPR, type, fd->step, t);
@@ -2206,63 +2650,107 @@ expand_omp_for_static_nochunk (struct expand_omp_for_data *fd)
if (is_gimple_val (t))
n = t;
else
- n = get_formal_tmp_var (t, &fd->pre);
+ n = get_formal_tmp_var (t, &list);
t = build2 (TRUNC_DIV_EXPR, utype, n, nthreads);
- q = get_formal_tmp_var (t, &fd->pre);
+ q = get_formal_tmp_var (t, &list);
t = build2 (MULT_EXPR, utype, q, nthreads);
t = build2 (NE_EXPR, utype, t, n);
t = build2 (PLUS_EXPR, utype, q, t);
- q = get_formal_tmp_var (t, &fd->pre);
+ q = get_formal_tmp_var (t, &list);
t = build2 (MULT_EXPR, utype, q, threadid);
- s0 = get_formal_tmp_var (t, &fd->pre);
+ s0 = get_formal_tmp_var (t, &list);
t = build2 (PLUS_EXPR, utype, s0, q);
t = build2 (MIN_EXPR, utype, t, n);
- e0 = get_formal_tmp_var (t, &fd->pre);
+ e0 = get_formal_tmp_var (t, &list);
t = build2 (GE_EXPR, boolean_type_node, s0, e0);
- t = build3 (COND_EXPR, void_type_node, t,
- build_and_jump (&l2), build_and_jump (&l0));
- gimplify_and_add (t, &fd->pre);
+ t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l2),
+ build_and_jump (&l0));
+ append_to_statement_list (t, &list);
+
+ si = bsi_last (entry_bb);
+ gcc_assert (bsi_stmt (si) && TREE_CODE (bsi_stmt (si)) == OMP_FOR);
+ bsi_remove (&si, true);
+ si = bsi_last (entry_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
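A worked instance of the partitioning above (numbers picked for illustration): with n = 10 iterations and nthreads = 4, q = 10 / 4 = 2, and because q * nthreads != n it is bumped to q = 3, so each thread receives the half-open chunk [s0, e0):

      threadid 0:  s0 = 0,  e0 = min (0 + 3, 10) = 3
      threadid 1:  s0 = 3,  e0 = min (3 + 3, 10) = 6
      threadid 2:  s0 = 6,  e0 = min (6 + 3, 10) = 9
      threadid 3:  s0 = 9,  e0 = min (9 + 3, 10) = 10

A thread for which s0 >= e0 (possible when nthreads exceeds the iteration count) takes the branch to L2 and executes no iterations.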
+ /* Setup code for sequential iteration goes in SEQ_START_BB. */
+ list = alloc_stmt_list ();
t = build1 (LABEL_EXPR, void_type_node, l0);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = fold_convert (type, s0);
t = build2 (MULT_EXPR, type, t, fd->step);
t = build2 (PLUS_EXPR, type, t, fd->n1);
t = build2 (MODIFY_EXPR, void_type_node, fd->v, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = fold_convert (type, e0);
t = build2 (MULT_EXPR, type, t, fd->step);
t = build2 (PLUS_EXPR, type, t, fd->n1);
- e = get_formal_tmp_var (t, &fd->pre);
+ e = get_formal_tmp_var (t, &list);
- t = build1 (LABEL_EXPR, void_type_node, l1);
- gimplify_and_add (t, &fd->pre);
+ seq_start_bb = create_empty_bb (entry_bb);
+ si = bsi_start (seq_start_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
- append_to_statement_list (OMP_FOR_BODY (fd->for_stmt), &fd->pre);
+ /* Original body goes in BODY_BB. */
+ si = bsi_start (body_bb);
+ t = build1 (LABEL_EXPR, void_type_node, l1);
+ bsi_insert_before (&si, t, BSI_CONTINUE_LINKING);
+
+ /* Split EXIT_BB at the OMP_RETURN. The code controlling the
+ sequential loop goes in the original EXIT_BB. The exit out of
+ the parallel loop goes in the new block (NEW_EXIT_BB). */
+ si = bsi_last (exit_bb);
+ t = bsi_stmt (si);
+ bsi_remove (&si, true);
+ gcc_assert (t && TREE_CODE (t) == OMP_RETURN_EXPR);
+ exit_edge = split_block (exit_bb, t);
+ new_exit_bb = exit_edge->dest;
+ list = alloc_stmt_list ();
t = build2 (PLUS_EXPR, type, fd->v, fd->step);
t = build2 (MODIFY_EXPR, void_type_node, fd->v, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = build2 (fd->cond_code, boolean_type_node, fd->v, e);
- t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l1), NULL);
- gimplify_and_add (t, &fd->pre);
+ t = get_formal_tmp_var (t, &list);
+ t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l1),
+ build_and_jump (&l2));
+ append_to_statement_list (t, &list);
- expand_omp_for_lastprivate (fd);
-
+ si = bsi_last (exit_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ /* Add the exit label to NEW_EXIT_BB. */
+ si = bsi_start (new_exit_bb);
t = build1 (LABEL_EXPR, void_type_node, l2);
- gimplify_and_add (t, &fd->pre);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+ single_succ_edge (new_exit_bb)->flags = EDGE_FALLTHRU;
+
+ /* Connect all the blocks. */
+ make_edge (seq_start_bb, body_bb, EDGE_FALLTHRU);
+
+ remove_edge (single_succ_edge (entry_bb));
+ make_edge (entry_bb, new_exit_bb, EDGE_TRUE_VALUE);
+ make_edge (entry_bb, seq_start_bb, EDGE_FALSE_VALUE);
+
+ make_edge (exit_bb, body_bb, EDGE_TRUE_VALUE);
+ find_edge (exit_bb, new_exit_bb)->flags = EDGE_FALSE_VALUE;
+
+ return new_exit_bb;
}
-/* A subroutine of expand_omp_for_1. Generate code for a parallel
- loop with static schedule and a specified chunk size. Given parameters:
+
+/* A subroutine of expand_omp_for. Generate code for a parallel
+ loop with static schedule and a specified chunk size. Given
+ parameters:
for (V = N1; V cond N2; V += STEP) BODY;
@@ -2289,53 +2777,62 @@ expand_omp_for_static_nochunk (struct expand_omp_for_data *fd)
trip += 1;
goto L0;
L4:
- if (trip == 0) goto L5;
- lastprivate;
- L5:
*/
-static void
-expand_omp_for_static_chunk (struct expand_omp_for_data *fd)
+static basic_block
+expand_omp_for_static_chunk (struct omp_region *region, struct omp_for_data *fd)
{
- tree l0, l1, l2, l3, l4, l5, n, s0, e0, e, t;
+ tree l0, l1, l2, l3, l4, n, s0, e0, e, t;
tree trip, nthreads, threadid;
tree type, utype;
+ basic_block entry_bb, exit_bb, body_bb, seq_start_bb, iter_part_bb;
+ basic_block trip_update_bb, new_exit_bb;
+ edge exit_edge;
+ tree list;
+ block_stmt_iterator si;
l0 = create_artificial_label ();
l1 = create_artificial_label ();
l2 = create_artificial_label ();
l3 = create_artificial_label ();
l4 = create_artificial_label ();
- l5 = create_artificial_label ();
type = TREE_TYPE (fd->v);
utype = lang_hooks.types.unsigned_type (type);
+ entry_bb = bb_for_stmt (region->entry);
+ body_bb = single_succ (entry_bb);
+
+ exit_bb = bb_for_stmt (region->exit);
+
+ /* Trip and adjustment setup goes in ENTRY_BB. */
+ list = alloc_stmt_list ();
+
t = built_in_decls[BUILT_IN_OMP_GET_NUM_THREADS];
t = build_function_call_expr (t, NULL);
t = fold_convert (utype, t);
- nthreads = get_formal_tmp_var (t, &fd->pre);
+ nthreads = get_formal_tmp_var (t, &list);
t = built_in_decls[BUILT_IN_OMP_GET_THREAD_NUM];
t = build_function_call_expr (t, NULL);
t = fold_convert (utype, t);
- threadid = get_formal_tmp_var (t, &fd->pre);
+ threadid = get_formal_tmp_var (t, &list);
fd->n1 = fold_convert (type, fd->n1);
if (!is_gimple_val (fd->n1))
- fd->n1 = get_formal_tmp_var (fd->n1, &fd->pre);
+ fd->n1 = get_formal_tmp_var (fd->n1, &list);
fd->n2 = fold_convert (type, fd->n2);
if (!is_gimple_val (fd->n2))
- fd->n2 = get_formal_tmp_var (fd->n2, &fd->pre);
+ fd->n2 = get_formal_tmp_var (fd->n2, &list);
fd->step = fold_convert (type, fd->step);
if (!is_gimple_val (fd->step))
- fd->step = get_formal_tmp_var (fd->step, &fd->pre);
+ fd->step = get_formal_tmp_var (fd->step, &list);
fd->chunk_size = fold_convert (utype, fd->chunk_size);
if (!is_gimple_val (fd->chunk_size))
- fd->chunk_size = get_formal_tmp_var (fd->chunk_size, &fd->pre);
+ fd->chunk_size = get_formal_tmp_var (fd->chunk_size, &list);
t = build_int_cst (type, (fd->cond_code == LT_EXPR ? -1 : 1));
t = fold_build2 (PLUS_EXPR, type, fd->step, t);
@@ -2346,155 +2843,164 @@ expand_omp_for_static_chunk (struct expand_omp_for_data *fd)
if (is_gimple_val (t))
n = t;
else
- n = get_formal_tmp_var (t, &fd->pre);
+ n = get_formal_tmp_var (t, &list);
t = build_int_cst (utype, 0);
- trip = get_initialized_tmp_var (t, &fd->pre, NULL);
+ trip = get_initialized_tmp_var (t, &list, NULL);
+
+ si = bsi_last (entry_bb);
+ gcc_assert (bsi_stmt (si) && TREE_CODE (bsi_stmt (si)) == OMP_FOR);
+ bsi_remove (&si, true);
+ si = bsi_last (entry_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ /* Iteration space partitioning goes in ITER_PART_BB. */
+ list = alloc_stmt_list ();
t = build1 (LABEL_EXPR, void_type_node, l0);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = build2 (MULT_EXPR, utype, trip, nthreads);
t = build2 (PLUS_EXPR, utype, t, threadid);
t = build2 (MULT_EXPR, utype, t, fd->chunk_size);
- s0 = get_formal_tmp_var (t, &fd->pre);
+ s0 = get_formal_tmp_var (t, &list);
t = build2 (PLUS_EXPR, utype, s0, fd->chunk_size);
t = build2 (MIN_EXPR, utype, t, n);
- e0 = get_formal_tmp_var (t, &fd->pre);
+ e0 = get_formal_tmp_var (t, &list);
t = build2 (LT_EXPR, boolean_type_node, s0, n);
t = build3 (COND_EXPR, void_type_node, t,
build_and_jump (&l1), build_and_jump (&l4));
- gimplify_and_add (t, &fd->pre);
+ append_to_statement_list (t, &list);
+
+ iter_part_bb = create_empty_bb (entry_bb);
+ si = bsi_start (iter_part_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
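For comparison with the nochunk case, a worked instance of this chunked partitioning (illustrative numbers): with n = 10, nthreads = 2 and chunk_size = 2, the code above computes s0 = (trip * nthreads + threadid) * chunk_size and e0 = min (s0 + chunk_size, n), giving:

      trip 0:  thread 0 -> [0, 2)    thread 1 -> [2, 4)
      trip 1:  thread 0 -> [4, 6)    thread 1 -> [6, 8)
      trip 2:  thread 0 -> [8, 10)   thread 1 -> s0 = 10, s0 < n fails, jump to L4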
+ /* Setup code for sequential iteration goes in SEQ_START_BB. */
+ list = alloc_stmt_list ();
t = build1 (LABEL_EXPR, void_type_node, l1);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = fold_convert (type, s0);
t = build2 (MULT_EXPR, type, t, fd->step);
t = build2 (PLUS_EXPR, type, t, fd->n1);
t = build2 (MODIFY_EXPR, void_type_node, fd->v, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = fold_convert (type, e0);
t = build2 (MULT_EXPR, type, t, fd->step);
t = build2 (PLUS_EXPR, type, t, fd->n1);
- e = get_formal_tmp_var (t, &fd->pre);
+ e = get_formal_tmp_var (t, &list);
+
+ seq_start_bb = create_empty_bb (iter_part_bb);
+ si = bsi_start (seq_start_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+ /* Main loop body goes in BODY_BB. */
+ si = bsi_start (body_bb);
t = build1 (LABEL_EXPR, void_type_node, l2);
- gimplify_and_add (t, &fd->pre);
+ bsi_insert_before (&si, t, BSI_CONTINUE_LINKING);
- append_to_statement_list (OMP_FOR_BODY (fd->for_stmt), &fd->pre);
+ /* Split EXIT_BB. The code controlling the sequential loop goes in
+ the first half. The trip update code goes into the second half
+ (TRIP_UPDATE_BB). */
+ list = alloc_stmt_list ();
t = build2 (PLUS_EXPR, type, fd->v, fd->step);
t = build2 (MODIFY_EXPR, void_type_node, fd->v, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = build2 (fd->cond_code, boolean_type_node, fd->v, e);
+ t = get_formal_tmp_var (t, &list);
t = build3 (COND_EXPR, void_type_node, t,
build_and_jump (&l2), build_and_jump (&l3));
- gimplify_and_add (t, &fd->pre);
+ append_to_statement_list (t, &list);
+
+ si = bsi_last (exit_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_RETURN_EXPR);
+ bsi_remove (&si, true);
+ exit_edge = split_block (exit_bb, t);
+ si = bsi_last (exit_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+
+ /* Trip update code goes into TRIP_UPDATE_BB. */
+ trip_update_bb = exit_edge->dest;
+ list = alloc_stmt_list ();
t = build1 (LABEL_EXPR, void_type_node, l3);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
t = build_int_cst (utype, 1);
t = build2 (PLUS_EXPR, utype, trip, t);
t = build2 (MODIFY_EXPR, void_type_node, trip, t);
- gimplify_and_add (t, &fd->pre);
+ gimplify_and_add (t, &list);
- t = build1 (GOTO_EXPR, void_type_node, l0);
- gimplify_and_add (t, &fd->pre);
+ si = bsi_start (trip_update_bb);
+ bsi_insert_after (&si, list, BSI_CONTINUE_LINKING);
+ exit_edge = single_succ_edge (trip_update_bb);
+ exit_edge->flags = EDGE_FALLTHRU;
+ new_exit_bb = exit_edge->dest;
+ /* Insert exit label on EXIT_EDGE. */
t = build1 (LABEL_EXPR, void_type_node, l4);
- gimplify_and_add (t, &fd->pre);
+ bsi_insert_on_edge_immediate (exit_edge, t);
- t = build_int_cst (utype, 0);
- t = build2 (EQ_EXPR, boolean_type_node, trip, t);
- t = build3 (COND_EXPR, void_type_node, t, build_and_jump (&l5), NULL);
+ /* Connect the new blocks. */
+ remove_edge (single_succ_edge (entry_bb));
+ make_edge (entry_bb, iter_part_bb, EDGE_FALLTHRU);
- expand_omp_for_lastprivate (fd);
-
- t = build1 (LABEL_EXPR, void_type_node, l5);
- gimplify_and_add (t, &fd->pre);
+ make_edge (iter_part_bb, seq_start_bb, EDGE_TRUE_VALUE);
+ make_edge (iter_part_bb, new_exit_bb, EDGE_FALSE_VALUE);
+ remove_edge (exit_edge);
+
+ make_edge (seq_start_bb, body_bb, EDGE_FALLTHRU);
+
+ make_edge (exit_bb, body_bb, EDGE_TRUE_VALUE);
+ find_edge (exit_bb, trip_update_bb)->flags = EDGE_FALSE_VALUE;
+
+ make_edge (trip_update_bb, iter_part_bb, EDGE_FALLTHRU);
+
+ return new_exit_bb;
}
-/* A subroutine of expand_omp_for. Expand the logic of the loop itself. */
-static tree
-expand_omp_for_1 (tree *stmt_p, omp_context *ctx)
-{
- struct expand_omp_for_data fd;
- tree dlist;
+/* Expand the OpenMP loop defined by REGION. */
- extract_omp_for_data (*stmt_p, ctx, &fd);
+static void
+expand_omp_for (struct omp_region *region)
+{
+ struct omp_for_data fd;
+ basic_block last_bb = NULL;
- expand_rec_input_clauses (OMP_FOR_CLAUSES (fd.for_stmt),
- &fd.pre, &dlist, ctx);
+ push_gimplify_context ();
- expand_omp (&OMP_FOR_PRE_BODY (fd.for_stmt), ctx);
- append_to_statement_list (OMP_FOR_PRE_BODY (fd.for_stmt), &fd.pre);
+ extract_omp_for_data (region->entry, &fd);
if (fd.sched_kind == OMP_CLAUSE_SCHEDULE_STATIC && !fd.have_ordered)
{
if (fd.chunk_size == NULL)
- expand_omp_for_static_nochunk (&fd);
+ last_bb = expand_omp_for_static_nochunk (region, &fd);
else
- expand_omp_for_static_chunk (&fd);
+ last_bb = expand_omp_for_static_chunk (region, &fd);
}
else
{
- int fn_index;
-
- fn_index = fd.sched_kind + fd.have_ordered * 4;
-
- expand_omp_for_generic (&fd, BUILT_IN_GOMP_LOOP_STATIC_START + fn_index,
- BUILT_IN_GOMP_LOOP_STATIC_NEXT + fn_index);
+ int fn_index = fd.sched_kind + fd.have_ordered * 4;
+ int start_ix = BUILT_IN_GOMP_LOOP_STATIC_START + fn_index;
+ int next_ix = BUILT_IN_GOMP_LOOP_STATIC_NEXT + fn_index;
+ last_bb = expand_omp_for_generic (region, &fd, start_ix, next_ix);
}
- expand_reduction_clauses (OMP_FOR_CLAUSES (fd.for_stmt), &fd.pre, ctx);
- append_to_statement_list (dlist, &fd.pre);
-
- /* If this parallel loop was part of a combined parallel loop
- directive, inform the parent parallel what flavour of
- GOMP_parallel_loop_XXX_start to use. */
- if (is_in_combined_parallel_ctx (ctx))
- {
- int start_ix = BUILT_IN_GOMP_PARALLEL_LOOP_STATIC_START + fd.sched_kind;
- ctx->outer->parallel_start_ix = start_ix;
- }
- else if (!fd.have_nowait)
- build_omp_barrier (&fd.pre);
-
- return fd.pre;
+ pop_gimplify_context (NULL);
}
-/* Expand code for an OpenMP loop directive. */
-
-static void
-expand_omp_for (tree *stmt_p, omp_context *ctx)
-{
- tree bind, block, stmt_list;
-
- push_gimplify_context ();
-
- expand_omp (&OMP_FOR_BODY (*stmt_p), ctx);
-
- stmt_list = expand_omp_for_1 (stmt_p, ctx);
- block = make_node (BLOCK);
- bind = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, block);
- maybe_catch_exception (&BIND_EXPR_BODY (bind));
- *stmt_p = bind;
-
- pop_gimplify_context (bind);
- BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
- BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
-}
/* Expand code for an OpenMP sections directive. In pseudo code, we generate
- firstprivate;
v = GOMP_sections_start (n);
L0:
switch (v)
@@ -2508,7 +3014,6 @@ expand_omp_for (tree *stmt_p, omp_context *ctx)
...
case n:
...
- lastprivate;
default:
abort ();
}
@@ -2518,156 +3023,468 @@ expand_omp_for (tree *stmt_p, omp_context *ctx)
L2:
reduction;
- If this is a combined parallel sections skip the call to
- GOMP_sections_start and emit the call to GOMP_sections_next right
- before the switch(). */
+ If this is a combined parallel sections, replace the call to
+ GOMP_sections_start with 'goto L1'. */
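For a concrete (illustrative) input such as:

      #pragma omp sections
      {
        #pragma omp section
          foo ();
        #pragma omp section
          bar ();
      }

the switch built below dispatches 'case 1' to foo (), 'case 2' to bar (), sends 'case 0' to the exit label L2, and traps in the default case; foo and bar are made-up placeholders.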
static void
-expand_omp_sections (tree *stmt_p, omp_context *ctx)
+expand_omp_sections (struct omp_region *region)
{
- tree sec_stmt, label_vec, bind, block, stmt_list, l0, l1, l2, t, u, v;
- tree_stmt_iterator tsi;
- tree dlist;
+ tree label_vec, l0, l1, l2, t, u, v;
unsigned i, len;
- bool in_combined_parallel = is_in_combined_parallel_ctx (ctx);
-
- sec_stmt = *stmt_p;
- stmt_list = NULL;
-
- push_gimplify_context ();
+ basic_block entry_bb, exit_bb, l0_bb, l1_bb, default_bb;
+ edge e, entry_edge, exit_edge;
+ edge_iterator ei;
+ block_stmt_iterator si;
- expand_rec_input_clauses (OMP_SECTIONS_CLAUSES (sec_stmt),
- &stmt_list, &dlist, ctx);
-
- tsi = tsi_start (OMP_SECTIONS_BODY (sec_stmt));
- for (len = 0; !tsi_end_p (tsi); len++, tsi_next (&tsi))
- continue;
+ entry_bb = bb_for_stmt (region->entry);
+ exit_bb = bb_for_stmt (region->exit);
l0 = create_artificial_label ();
l1 = create_artificial_label ();
l2 = create_artificial_label ();
+
v = create_tmp_var (unsigned_type_node, ".section");
+
+ /* We will build a switch() with enough cases for all the
+     OMP_SECTION regions, a '0' case to signal that there is no more
+     work, and a default case to abort if something goes wrong.  */
+ len = EDGE_COUNT (entry_bb->succs);
label_vec = make_tree_vec (len + 2);
- t = build_int_cst (unsigned_type_node, len);
- t = tree_cons (NULL, t, NULL);
+ /* Split ENTRY_BB. The call to GOMP_sections_start goes in the
+ first half. The second half contains the switch(). */
+ si = bsi_last (entry_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_SECTIONS);
+ bsi_remove (&si, true);
+ entry_edge = split_block (entry_bb, t);
+ l0_bb = entry_edge->dest;
- if (in_combined_parallel)
- {
- /* Nothing to do. Just inform our parent of the additional
- arguments to invoke GOMP_parallel_sections_start. */
- ctx->outer->parallel_start_ix = BUILT_IN_GOMP_PARALLEL_SECTIONS_START;
- ctx->outer->parallel_start_additional_args = t;
- }
- else
+ if (!is_combined_parallel (region))
{
+ /* If we are not inside a combined parallel+sections region,
+ call GOMP_sections_start. */
+ t = build_int_cst (unsigned_type_node, len);
+ t = tree_cons (NULL, t, NULL);
u = built_in_decls[BUILT_IN_GOMP_SECTIONS_START];
t = build_function_call_expr (u, t);
t = build2 (MODIFY_EXPR, void_type_node, v, t);
- gimplify_and_add (t, &stmt_list);
+ si = bsi_last (entry_bb);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
}
- t = build1 (LABEL_EXPR, void_type_node, l0);
- gimplify_and_add (t, &stmt_list);
+ /* The switch() statement replacing OMP_SECTIONS goes in L0_BB. */
+ si = bsi_last (l0_bb);
- if (in_combined_parallel)
- {
- /* Combined parallel sections need the call to GOMP_sections_next
- before the switch(). */
- t = built_in_decls[BUILT_IN_GOMP_SECTIONS_NEXT];
- t = build_function_call_expr (t, NULL);
- t = build2 (MODIFY_EXPR, void_type_node, v, t);
- gimplify_and_add (t, &stmt_list);
- }
+ t = build1 (LABEL_EXPR, void_type_node, l0);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
t = build3 (SWITCH_EXPR, void_type_node, v, NULL, label_vec);
- gimplify_and_add (t, &stmt_list);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
t = build3 (CASE_LABEL_EXPR, void_type_node,
build_int_cst (unsigned_type_node, 0), NULL, l2);
TREE_VEC_ELT (label_vec, 0) = t;
- tsi = tsi_start (OMP_SECTIONS_BODY (sec_stmt));
- for (i = 0; i < len; i++, tsi_next (&tsi))
+ /* Convert each OMP_SECTION into a CASE_LABEL_EXPR. */
+ i = 1;
+ FOR_EACH_EDGE (e, ei, l0_bb->succs)
{
- omp_context *sctx;
+ basic_block s_entry_bb, s_exit_bb;
+
+ e->flags = 0;
+ s_entry_bb = e->dest;
+ si = bsi_last (s_entry_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_SECTION);
+ s_exit_bb = bb_for_stmt (lookup_omp_region (t)->exit);
+ bsi_remove (&si, true);
t = create_artificial_label ();
- u = build_int_cst (unsigned_type_node, i + 1);
+ u = build_int_cst (unsigned_type_node, i);
u = build3 (CASE_LABEL_EXPR, void_type_node, u, NULL, t);
- TREE_VEC_ELT (label_vec, i + 1) = u;
+ TREE_VEC_ELT (label_vec, i) = u;
t = build1 (LABEL_EXPR, void_type_node, t);
- gimplify_and_add (t, &stmt_list);
-
- t = tsi_stmt (tsi);
- sctx = maybe_lookup_ctx (t);
- gcc_assert (sctx);
- expand_omp (&OMP_SECTION_BODY (t), sctx);
- append_to_statement_list (OMP_SECTION_BODY (t), &stmt_list);
-
- if (i == len - 1)
- expand_lastprivate_clauses (OMP_SECTIONS_CLAUSES (sec_stmt),
- NULL, &stmt_list, ctx);
-
- t = build1 (GOTO_EXPR, void_type_node, l1);
- gimplify_and_add (t, &stmt_list);
+ si = bsi_last (s_entry_bb);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+ i++;
+ single_succ_edge (s_entry_bb)->flags = EDGE_FALLTHRU;
+
+ si = bsi_last (s_exit_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_RETURN_EXPR);
+ bsi_remove (&si, true);
+ single_succ_edge (s_exit_bb)->flags = EDGE_FALLTHRU;
}
+ /* Error handling code goes in DEFAULT_BB. */
+ default_bb = create_empty_bb (entry_bb);
+ si = bsi_start (default_bb);
t = create_artificial_label ();
u = build3 (CASE_LABEL_EXPR, void_type_node, NULL, NULL, t);
TREE_VEC_ELT (label_vec, len + 1) = u;
t = build1 (LABEL_EXPR, void_type_node, t);
- gimplify_and_add (t, &stmt_list);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
t = built_in_decls[BUILT_IN_TRAP];
t = build_function_call_expr (t, NULL);
- gimplify_and_add (t, &stmt_list);
-
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+
+ make_edge (l0_bb, default_bb, 0);
+
+ /* Code to get the next section goes in L1_BB. */
+ si = bsi_last (exit_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_RETURN_EXPR);
+ bsi_remove (&si, true);
+ exit_edge = split_block (exit_bb, t);
+ l1_bb = exit_edge->src;
+ exit_bb = exit_edge->dest;
+ si = bsi_start (l1_bb);
t = build1 (LABEL_EXPR, void_type_node, l1);
- gimplify_and_add (t, &stmt_list);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
- if (!in_combined_parallel)
+ t = built_in_decls[BUILT_IN_GOMP_SECTIONS_NEXT];
+ t = build_function_call_expr (t, NULL);
+ t = build2 (MODIFY_EXPR, void_type_node, v, t);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+
+ remove_edge (single_succ_edge (l1_bb));
+ make_edge (l1_bb, l0_bb, EDGE_FALLTHRU);
+
+ /* Exit label in EXIT_BB. */
+ si = bsi_last (exit_bb);
+ t = build1 (LABEL_EXPR, void_type_node, l2);
+ bsi_insert_after (&si, t, BSI_CONTINUE_LINKING);
+
+ make_edge (l0_bb, exit_bb, 0);
+ single_succ_edge (exit_bb)->flags = EDGE_FALLTHRU;
+
+ if (is_combined_parallel (region))
{
- t = built_in_decls[BUILT_IN_GOMP_SECTIONS_NEXT];
- t = build_function_call_expr (t, NULL);
- t = build2 (MODIFY_EXPR, void_type_node, v, t);
- gimplify_and_add (t, &stmt_list);
+ /* If this was a combined parallel+sections region, we did not
+ emit a GOMP_sections_start in the entry block, so we just
+ need to jump to L1_BB to get the next section. */
+ remove_edge (single_succ_edge (entry_bb));
+ make_edge (entry_bb, l1_bb, EDGE_FALLTHRU);
}
+}
- t = build1 (GOTO_EXPR, void_type_node, l0);
- gimplify_and_add (t, &stmt_list);
- t = build1 (LABEL_EXPR, void_type_node, l2);
- gimplify_and_add (t, &stmt_list);
+/* Generic expansion for OpenMP synchronization directives: single,
+ master, ordered and critical. All we need to do here is remove the
+ entry and exit markers for REGION. */
- expand_reduction_clauses (OMP_SECTIONS_CLAUSES (sec_stmt), &stmt_list, ctx);
- append_to_statement_list (dlist, &stmt_list);
+static void
+expand_omp_synch (struct omp_region *region)
+{
+ basic_block entry_bb, exit_bb;
+ block_stmt_iterator si;
+ tree t;
- /* Unless there's a nowait clause, add a barrier afterward. */
- if (!find_omp_clause (OMP_SECTIONS_CLAUSES (sec_stmt), OMP_CLAUSE_NOWAIT))
- build_omp_barrier (&stmt_list);
+ entry_bb = bb_for_stmt (region->entry);
+ exit_bb = bb_for_stmt (region->exit);
+
+ si = bsi_last (entry_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t
+ && (TREE_CODE (t) == OMP_SINGLE
+ || TREE_CODE (t) == OMP_MASTER
+ || TREE_CODE (t) == OMP_ORDERED
+ || TREE_CODE (t) == OMP_CRITICAL));
+ bsi_remove (&si, true);
+ single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
+
+ si = bsi_last (exit_bb);
+ t = bsi_stmt (si);
+ gcc_assert (t && TREE_CODE (t) == OMP_RETURN_EXPR);
+ bsi_remove (&si, true);
+ single_succ_edge (exit_bb)->flags = EDGE_FALLTHRU;
+}
+
+
+/* Expand the parallel region tree rooted at REGION. Expansion
+ proceeds in depth-first order. Innermost regions are expanded
+ first. This way, parallel regions that require a new function to
+ be created (e.g., OMP_PARALLEL) can be expanded without having any
+ internal dependencies in their body. */
+
+static void
+expand_omp (struct omp_region *region)
+{
+ while (region)
+ {
+ enum tree_code code = TREE_CODE (region->entry);
+
+ if (region->inner)
+ expand_omp (region->inner);
+
+ switch (code)
+ {
+ case OMP_PARALLEL:
+ expand_omp_parallel (region);
+ break;
+
+ case OMP_FOR:
+ expand_omp_for (region);
+ break;
+
+ case OMP_SECTIONS:
+ expand_omp_sections (region);
+ break;
+
+ case OMP_SECTION:
+ /* Individual omp sections are handled together with their
+ parent OMP_SECTIONS region. */
+ break;
+
+ case OMP_SINGLE:
+ case OMP_MASTER:
+ case OMP_ORDERED:
+ case OMP_CRITICAL:
+ expand_omp_synch (region);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ region = region->next;
+ }
+}
+
+
+/* Helper for build_omp_regions. Scan the dominator tree starting at
+ block BB. PARENT is the region that contains BB. */
+
+static void
+build_omp_regions_1 (basic_block bb, struct omp_region *parent)
+{
+ block_stmt_iterator si;
+ tree stmt;
+ basic_block son;
+
+ si = bsi_last (bb);
+ if (!bsi_end_p (si) && OMP_DIRECTIVE_P (bsi_stmt (si)))
+ {
+ struct omp_region *region;
+
+ stmt = bsi_stmt (si);
+
+ if (TREE_CODE (stmt) == OMP_RETURN_EXPR)
+ {
+ /* STMT is the return point out of region PARENT. Mark it
+ as the exit point and make PARENT the immediately
+ enclosing region. */
+ gcc_assert (parent);
+ region = parent;
+ region->exit = stmt;
+ parent = parent->outer;
+
+ /* If REGION is a parallel region, determine whether it is
+ a combined parallel+workshare region. */
+ if (TREE_CODE (region->entry) == OMP_PARALLEL)
+ determine_parallel_type (region);
+ }
+ else
+ {
+ /* Otherwise, this directive becomes the parent for a new
+ region. */
+ region = new_omp_region (stmt, parent);
+ parent = region;
+ }
+
+ gcc_assert (region);
+ if (omp_regions == NULL)
+ {
+ omp_regions = splay_tree_new (splay_tree_compare_pointers, 0, 0);
+ root_omp_region = region;
+ }
+
+ splay_tree_insert (omp_regions, (splay_tree_key) stmt,
+ (splay_tree_value) region);
+ }
+
+ for (son = first_dom_son (CDI_DOMINATORS, bb);
+ son;
+ son = next_dom_son (CDI_DOMINATORS, son))
+ build_omp_regions_1 (son, parent);
+}
+
+
+/* Scan the CFG and build a tree of OMP regions.  The root of the
+   region tree is left in root_omp_region. */
+
+static void
+build_omp_regions (void)
+{
+ gcc_assert (omp_regions == NULL && root_omp_region == NULL);
+ calculate_dominance_info (CDI_DOMINATORS);
+ build_omp_regions_1 (ENTRY_BLOCK_PTR, NULL);
+}
+
+
+/* Main entry point for expanding OMP-GIMPLE into runtime calls. */
+
+static void
+execute_expand_omp (void)
+{
+ build_omp_regions ();
+
+ if (root_omp_region)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nOMP region tree\n\n");
+ dump_omp_region (dump_file, root_omp_region, 0);
+ fprintf (dump_file, "\n");
+ }
+
+ expand_omp (root_omp_region);
+ splay_tree_delete (omp_regions);
+ root_omp_region = NULL;
+ omp_regions = NULL;
+ free_dominance_info (CDI_DOMINATORS);
+ free_dominance_info (CDI_POST_DOMINATORS);
+ }
+
+ /* Expansion adds basic blocks that may be merged. */
+ cleanup_tree_cfg ();
+}
+
+static bool
+gate_expand_omp (void)
+{
+ return flag_openmp != 0 && errorcount == 0;
+}
+
+struct tree_opt_pass pass_expand_omp =
+{
+ "ompexp", /* name */
+ gate_expand_omp, /* gate */
+ execute_expand_omp, /* execute */
+ NULL, /* sub */
+ NULL, /* next */
+ 0, /* static_pass_number */
+ 0, /* tv_id */
+ PROP_gimple_any, /* properties_required */
+ PROP_gimple_lomp, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_dump_func, /* todo_flags_finish */
+ 0 /* letter */
+};
+
+/* Routines to lower OpenMP directives into OMP-GIMPLE. */
+
+/* Lower the OpenMP sections directive in *STMT_P. */
+
+static void
+lower_omp_sections (tree *stmt_p, omp_context *ctx)
+{
+ tree new_stmt, stmt, body, bind, block, ilist, olist, new_body;
+ tree dlist, region_exit;
+ tree_stmt_iterator tsi;
+ unsigned i, len;
+
+ stmt = *stmt_p;
+
+ gcc_assert (OMP_SECTIONS_SECTIONS (stmt) == NULL_TREE);
+
+ push_gimplify_context ();
+
+ dlist = NULL;
+ ilist = NULL;
+ lower_rec_input_clauses (OMP_SECTIONS_CLAUSES (stmt), &ilist, &dlist, ctx);
+
+ tsi = tsi_start (OMP_SECTIONS_BODY (stmt));
+ for (len = 0; !tsi_end_p (tsi); len++, tsi_next (&tsi))
+ continue;
+
+ /* There are two markers per section and one end marker for the
+ whole construct. */
+ OMP_SECTIONS_SECTIONS (stmt) = make_tree_vec (2 * len + 1);
+
+ tsi = tsi_start (OMP_SECTIONS_BODY (stmt));
+ body = alloc_stmt_list ();
+ for (i = 0; i < len; i++, tsi_next (&tsi))
+ {
+ omp_context *sctx;
+ tree sec_start, sec_end, sec_body;
+
+ sec_start = tsi_stmt (tsi);
+ sec_body = alloc_stmt_list ();
+ sctx = maybe_lookup_ctx (sec_start);
+ gcc_assert (sctx);
+
+ lower_omp (&OMP_SECTION_BODY (sec_start), sctx);
+ append_to_statement_list (OMP_SECTION_BODY (sec_start), &sec_body);
+
+ if (i == len - 1)
+ {
+ tree l = alloc_stmt_list ();
+ lower_lastprivate_clauses (OMP_SECTIONS_CLAUSES (stmt), NULL,
+ &l, ctx);
+ append_to_statement_list (l, &sec_body);
+ }
+
+ sec_end = make_node (OMP_RETURN_EXPR);
+
+ OMP_SECTION_BODY (sec_start) = sec_body;
+ append_to_statement_list (sec_start, &body);
+ append_to_statement_list (sec_end, &body);
+
+ TREE_VEC_ELT (OMP_SECTIONS_SECTIONS (stmt), i * 2) = sec_start;
+ TREE_VEC_ELT (OMP_SECTIONS_SECTIONS (stmt), i * 2 + 1) = sec_end;
+ }
block = make_node (BLOCK);
- bind = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, block);
+ bind = build3 (BIND_EXPR, void_type_node, NULL, body, block);
maybe_catch_exception (&BIND_EXPR_BODY (bind));
- *stmt_p = bind;
- pop_gimplify_context (bind);
- BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
- BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+ olist = NULL_TREE;
+ lower_reduction_clauses (OMP_SECTIONS_CLAUSES (stmt), &olist, ctx);
+
+ /* Unless there's a nowait clause, add a barrier afterward. */
+ if (!find_omp_clause (OMP_SECTIONS_CLAUSES (stmt), OMP_CLAUSE_NOWAIT))
+ build_omp_barrier (&olist);
+
+ pop_gimplify_context (NULL_TREE);
+ record_vars_into (ctx->block_vars, ctx->cb.dst_fn);
+
+ new_stmt = build3 (BIND_EXPR, void_type_node, NULL, NULL, NULL);
+ TREE_SIDE_EFFECTS (new_stmt) = 1;
+ OMP_SECTIONS_BODY (stmt) = body;
+
+ region_exit = make_node (OMP_RETURN_EXPR);
+
+ new_body = alloc_stmt_list ();
+ append_to_statement_list (ilist, &new_body);
+ append_to_statement_list (stmt, &new_body);
+ append_to_statement_list (region_exit, &new_body);
+ append_to_statement_list (olist, &new_body);
+ append_to_statement_list (dlist, &new_body);
+ BIND_EXPR_BODY (new_stmt) = new_body;
+
+ TREE_VEC_ELT (OMP_SECTIONS_SECTIONS (stmt), 2 * len) = region_exit;
+
+ *stmt_p = new_stmt;
}
-/* A subroutine of expand_omp_single. Expand the simple form of
+/* A subroutine of lower_omp_single. Expand the simple form of
an OMP_SINGLE, without a copyprivate clause:
if (GOMP_single_start ())
BODY;
[ GOMP_barrier (); ] -> unless 'nowait' is present.
-*/
+
+ FIXME. It may be better to delay expanding the logic of this until
+ pass_expand_omp. The expanded logic may make the job more difficult
+ for a synchronization analysis pass. */
static void
-expand_omp_single_simple (tree single_stmt, tree *pre_p)
+lower_omp_single_simple (tree single_stmt, tree *pre_p)
{
tree t;
@@ -2681,7 +3498,8 @@ expand_omp_single_simple (tree single_stmt, tree *pre_p)
build_omp_barrier (pre_p);
}
-/* A subroutine of expand_omp_single. Expand the simple form of
+
+/* A subroutine of lower_omp_single. Expand the simple form of
an OMP_SINGLE, with a copyprivate clause:
#pragma omp single copyprivate (a, b, c)
@@ -2705,10 +3523,13 @@ expand_omp_single_simple (tree single_stmt, tree *pre_p)
}
GOMP_barrier ();
}
-*/
+
+ FIXME. It may be better to delay expanding the logic of this until
+ pass_expand_omp. The expanded logic may make the job more difficult
+ for a synchronization analysis pass. */
static void
-expand_omp_single_copy (tree single_stmt, tree *pre_p, omp_context *ctx)
+lower_omp_single_copy (tree single_stmt, tree *pre_p, omp_context *ctx)
{
tree ptr_type, t, args, l0, l1, l2, copyin_seq;
@@ -2739,7 +3560,7 @@ expand_omp_single_copy (tree single_stmt, tree *pre_p, omp_context *ctx)
append_to_statement_list (OMP_SINGLE_BODY (single_stmt), pre_p);
copyin_seq = NULL;
- expand_copyprivate_clauses (OMP_SINGLE_CLAUSES (single_stmt), pre_p,
+ lower_copyprivate_clauses (OMP_SINGLE_CLAUSES (single_stmt), pre_p,
&copyin_seq, ctx);
t = build_fold_addr_expr (ctx->sender_decl);
@@ -2762,41 +3583,47 @@ expand_omp_single_copy (tree single_stmt, tree *pre_p, omp_context *ctx)
build_omp_barrier (pre_p);
}
+
/* Expand code for an OpenMP single directive. */
static void
-expand_omp_single (tree *stmt_p, omp_context *ctx)
+lower_omp_single (tree *stmt_p, omp_context *ctx)
{
- tree bind, block, single_stmt = *stmt_p, dlist;
+ tree t, bind, block, single_stmt = *stmt_p, dlist;
push_gimplify_context ();
block = make_node (BLOCK);
bind = build3 (BIND_EXPR, void_type_node, NULL, NULL, block);
- *stmt_p = bind;
+ TREE_SIDE_EFFECTS (bind) = 1;
- expand_rec_input_clauses (OMP_SINGLE_CLAUSES (single_stmt),
- &BIND_EXPR_BODY (bind), &dlist, ctx);
-
- expand_omp (&OMP_SINGLE_BODY (single_stmt), ctx);
+ lower_rec_input_clauses (OMP_SINGLE_CLAUSES (single_stmt),
+ &BIND_EXPR_BODY (bind), &dlist, ctx);
+ lower_omp (&OMP_SINGLE_BODY (single_stmt), ctx);
if (ctx->record_type)
- expand_omp_single_copy (single_stmt, &BIND_EXPR_BODY (bind), ctx);
+ lower_omp_single_copy (single_stmt, &BIND_EXPR_BODY (bind), ctx);
else
- expand_omp_single_simple (single_stmt, &BIND_EXPR_BODY (bind));
+ lower_omp_single_simple (single_stmt, &BIND_EXPR_BODY (bind));
append_to_statement_list (dlist, &BIND_EXPR_BODY (bind));
-
+ t = make_node (OMP_RETURN_EXPR);
+ append_to_statement_list (t, &BIND_EXPR_BODY (bind));
maybe_catch_exception (&BIND_EXPR_BODY (bind));
pop_gimplify_context (bind);
+
BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+
+ OMP_SINGLE_BODY (single_stmt) = alloc_stmt_list ();
+ append_to_statement_list (bind, &OMP_SINGLE_BODY (single_stmt));
}
+
/* Expand code for an OpenMP master directive. */
static void
-expand_omp_master (tree *stmt_p, omp_context *ctx)
+lower_omp_master (tree *stmt_p, omp_context *ctx)
{
tree bind, block, stmt = *stmt_p, lab = NULL, x;
@@ -2804,7 +3631,7 @@ expand_omp_master (tree *stmt_p, omp_context *ctx)
block = make_node (BLOCK);
bind = build3 (BIND_EXPR, void_type_node, NULL, NULL, block);
- *stmt_p = bind;
+ TREE_SIDE_EFFECTS (bind) = 1;
x = built_in_decls[BUILT_IN_OMP_GET_THREAD_NUM];
x = build_function_call_expr (x, NULL);
@@ -2812,22 +3639,28 @@ expand_omp_master (tree *stmt_p, omp_context *ctx)
x = build3 (COND_EXPR, void_type_node, x, NULL, build_and_jump (&lab));
gimplify_and_add (x, &BIND_EXPR_BODY (bind));
- expand_omp (&OMP_MASTER_BODY (stmt), ctx);
+ lower_omp (&OMP_MASTER_BODY (stmt), ctx);
append_to_statement_list (OMP_MASTER_BODY (stmt), &BIND_EXPR_BODY (bind));
x = build1 (LABEL_EXPR, void_type_node, lab);
gimplify_and_add (x, &BIND_EXPR_BODY (bind));
-
+ x = make_node (OMP_RETURN_EXPR);
+ append_to_statement_list (x, &BIND_EXPR_BODY (bind));
maybe_catch_exception (&BIND_EXPR_BODY (bind));
pop_gimplify_context (bind);
+
BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+
+ OMP_MASTER_BODY (stmt) = alloc_stmt_list ();
+ append_to_statement_list (bind, &OMP_MASTER_BODY (stmt));
}
+
/* Expand code for an OpenMP ordered directive. */
static void
-expand_omp_ordered (tree *stmt_p, omp_context *ctx)
+lower_omp_ordered (tree *stmt_p, omp_context *ctx)
{
tree bind, block, stmt = *stmt_p, x;
@@ -2835,26 +3668,30 @@ expand_omp_ordered (tree *stmt_p, omp_context *ctx)
block = make_node (BLOCK);
bind = build3 (BIND_EXPR, void_type_node, NULL, NULL, block);
- *stmt_p = bind;
+ TREE_SIDE_EFFECTS (bind) = 1;
x = built_in_decls[BUILT_IN_GOMP_ORDERED_START];
x = build_function_call_expr (x, NULL);
gimplify_and_add (x, &BIND_EXPR_BODY (bind));
- expand_omp (&OMP_ORDERED_BODY (stmt), ctx);
+ lower_omp (&OMP_ORDERED_BODY (stmt), ctx);
append_to_statement_list (OMP_ORDERED_BODY (stmt), &BIND_EXPR_BODY (bind));
x = built_in_decls[BUILT_IN_GOMP_ORDERED_END];
x = build_function_call_expr (x, NULL);
gimplify_and_add (x, &BIND_EXPR_BODY (bind));
-
+ x = make_node (OMP_RETURN_EXPR);
+ append_to_statement_list (x, &BIND_EXPR_BODY (bind));
maybe_catch_exception (&BIND_EXPR_BODY (bind));
pop_gimplify_context (bind);
+
BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+
+ OMP_ORDERED_BODY (stmt) = alloc_stmt_list ();
+ append_to_statement_list (bind, &OMP_ORDERED_BODY (stmt));
}
-/* Expand code for an OpenMP critical directive. */
/* Gimplify an OMP_CRITICAL statement. This is a relatively simple
substitution of a couple of function calls. But in the NAMED case,
@@ -2865,10 +3702,10 @@ static GTY((param1_is (tree), param2_is (tree)))
splay_tree critical_name_mutexes;
static void
-expand_omp_critical (tree *stmt_p, omp_context *ctx)
+lower_omp_critical (tree *stmt_p, omp_context *ctx)
{
tree bind, block, stmt = *stmt_p;
- tree lock, unlock, name;
+ tree t, lock, unlock, name;
name = OMP_CRITICAL_NAME (stmt);
if (name)
@@ -2924,27 +3761,217 @@ expand_omp_critical (tree *stmt_p, omp_context *ctx)
block = make_node (BLOCK);
bind = build3 (BIND_EXPR, void_type_node, NULL, NULL, block);
- *stmt_p = bind;
+ TREE_SIDE_EFFECTS (bind) = 1;
gimplify_and_add (lock, &BIND_EXPR_BODY (bind));
- expand_omp (&OMP_CRITICAL_BODY (stmt), ctx);
+ lower_omp (&OMP_CRITICAL_BODY (stmt), ctx);
maybe_catch_exception (&OMP_CRITICAL_BODY (stmt));
append_to_statement_list (OMP_CRITICAL_BODY (stmt), &BIND_EXPR_BODY (bind));
gimplify_and_add (unlock, &BIND_EXPR_BODY (bind));
+ t = make_node (OMP_RETURN_EXPR);
+ append_to_statement_list (t, &BIND_EXPR_BODY (bind));
pop_gimplify_context (bind);
BIND_EXPR_VARS (bind) = chainon (BIND_EXPR_VARS (bind), ctx->block_vars);
BLOCK_VARS (block) = BIND_EXPR_VARS (bind);
+
+ OMP_CRITICAL_BODY (stmt) = alloc_stmt_list ();
+ append_to_statement_list (bind, &OMP_CRITICAL_BODY (stmt));
+}
+
+
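As a rough C sketch (illustrative only, not part of this change), the lowered critical region amounts to a pair of runtime calls around the body, matching the GOMP entry points that gcc.dg/gomp/critical-1.c below scans for:

/* Illustrative only: roughly what
       #pragma omp critical
         body ();
   amounts to after lowering and after the OMP_RETURN_EXPR marker is
   expanded away.  */

extern void GOMP_critical_start (void);  /* provided by libgomp */
extern void GOMP_critical_end (void);    /* provided by libgomp */

static void body (void) { /* the structured block */ }

static void
critical_example (void)
{
  GOMP_critical_start ();   /* acquire the critical-section mutex */
  body ();
  GOMP_critical_end ();     /* release it */
}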
+/* A subroutine of lower_omp_for. Generate code to emit the predicate
+ for a lastprivate clause. Given a loop control predicate of (V
+ cond N2), we gate the clause on (!(V cond N2)). The lowered form
+ is appended to *BODY_P. */
+
+static void
+lower_omp_for_lastprivate (struct omp_for_data *fd, tree *body_p,
+ struct omp_context *ctx)
+{
+ tree clauses, cond;
+ enum tree_code cond_code;
+
+ cond_code = fd->cond_code;
+ cond_code = cond_code == LT_EXPR ? GE_EXPR : LE_EXPR;
+
+ /* When possible, use a strict equality expression. This can let
+ VRP-type optimizations deduce the value and remove a copy. */
+ if (host_integerp (fd->step, 0))
+ {
+ HOST_WIDE_INT step = TREE_INT_CST_LOW (fd->step);
+ if (step == 1 || step == -1)
+ cond_code = EQ_EXPR;
+ }
+
+ cond = build2 (cond_code, boolean_type_node, fd->v, fd->n2);
+
+ clauses = OMP_FOR_CLAUSES (fd->for_stmt);
+ lower_lastprivate_clauses (clauses, cond, body_p, ctx);
+}
+
+
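A hedged sketch of the gating described in the comment above lower_omp_for_lastprivate, with illustrative names (x_priv stands for the thread-private copy): for a header of the form (i = 0; i < n; i++) the flipped predicate !(i < n) is strengthened to i == n because the step is 1, so the copy-out runs only where the final iteration was executed.

/* Sketch only; names are illustrative.  Conceptually, for
       #pragma omp for lastprivate (x)
       for (i = 0; i < n; i++)
         x = f (i);
   the copy-out appended after the loop is guarded as follows.  */
static void
lastprivate_copy_out (int i, int n, int *x, int x_priv)
{
  if (i == n)        /* only after the last iteration was executed */
    *x = x_priv;     /* write the private value back to the original */
}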
+/* Lower code for an OpenMP loop directive. */
+
+static void
+lower_omp_for (tree *stmt_p, omp_context *ctx)
+{
+ tree t, stmt, ilist, dlist, new_stmt, *body_p, *rhs_p;
+ struct omp_for_data fd;
+
+ stmt = *stmt_p;
+
+ push_gimplify_context ();
+
+ lower_omp (&OMP_FOR_PRE_BODY (stmt), ctx);
+ lower_omp (&OMP_FOR_BODY (stmt), ctx);
+
+ /* Move declarations of temporaries in the loop body before we make
+ it go away. */
+ if (TREE_CODE (OMP_FOR_BODY (stmt)) == BIND_EXPR)
+ record_vars_into (BIND_EXPR_VARS (OMP_FOR_BODY (stmt)), ctx->cb.dst_fn);
+
+ new_stmt = build3 (BIND_EXPR, void_type_node, NULL, NULL, NULL);
+ TREE_SIDE_EFFECTS (new_stmt) = 1;
+ body_p = &BIND_EXPR_BODY (new_stmt);
+
+ /* The pre-body and input clauses go before the lowered OMP_FOR. */
+ ilist = NULL;
+ dlist = NULL;
+ append_to_statement_list (OMP_FOR_PRE_BODY (stmt), body_p);
+ lower_rec_input_clauses (OMP_FOR_CLAUSES (stmt), body_p, &dlist, ctx);
+
+ /* Lower the header expressions. At this point, we can assume that
+ the header is of the form:
+
+ #pragma omp for (V = VAL1; V {<|>|<=|>=} VAL2; V = V [+-] VAL3)
+
+ We just need to make sure that VAL1, VAL2 and VAL3 are lowered
+ using the .omp_data_s mapping, if needed. */
+ rhs_p = &TREE_OPERAND (OMP_FOR_INIT (stmt), 1);
+ if (!is_gimple_min_invariant (*rhs_p))
+ *rhs_p = get_formal_tmp_var (*rhs_p, body_p);
+
+ rhs_p = &TREE_OPERAND (OMP_FOR_COND (stmt), 1);
+ if (!is_gimple_min_invariant (*rhs_p))
+ *rhs_p = get_formal_tmp_var (*rhs_p, body_p);
+
+ rhs_p = &TREE_OPERAND (TREE_OPERAND (OMP_FOR_INCR (stmt), 1), 1);
+ if (!is_gimple_min_invariant (*rhs_p))
+ *rhs_p = get_formal_tmp_var (*rhs_p, body_p);
+
+ /* Once lowered, extract the bounds and clauses. */
+ extract_omp_for_data (stmt, &fd);
+
+ /* Region exit marker goes at the end of the loop body. */
+ t = make_node (OMP_RETURN_EXPR);
+ append_to_statement_list (t, &OMP_FOR_BODY (stmt));
+ maybe_catch_exception (&OMP_FOR_BODY (stmt));
+ append_to_statement_list (stmt, body_p);
+
+ /* After the loop, add exit clauses. */
+ lower_omp_for_lastprivate (&fd, &dlist, ctx);
+ lower_reduction_clauses (OMP_FOR_CLAUSES (stmt), body_p, ctx);
+ append_to_statement_list (dlist, body_p);
+
+ /* Add a barrier unless the user specified NOWAIT. Note that if
+ this is a combined parallel+loop construct, the barrier will be
+ optimized away during expansion (see expand_omp_for). */
+ if (!fd.have_nowait)
+ {
+ tree stmt = alloc_stmt_list ();
+ build_omp_barrier (&stmt);
+ append_to_statement_list (stmt, body_p);
+ }
+
+ pop_gimplify_context (NULL_TREE);
+ record_vars_into (ctx->block_vars, ctx->cb.dst_fn);
+
+ OMP_FOR_PRE_BODY (stmt) = NULL_TREE;
+ *stmt_p = new_stmt;
+}
+
+
+/* Lower the OpenMP parallel directive in *STMT_P. CTX holds context
+ information for the directive. */
+
+static void
+lower_omp_parallel (tree *stmt_p, omp_context *ctx)
+{
+ tree clauses, par_bind, par_body, new_body, bind;
+ tree olist, ilist, par_olist, par_ilist;
+ tree stmt, child_fn, t;
+
+ stmt = *stmt_p;
+
+ clauses = OMP_PARALLEL_CLAUSES (stmt);
+ par_bind = OMP_PARALLEL_BODY (stmt);
+ par_body = BIND_EXPR_BODY (par_bind);
+ child_fn = ctx->cb.dst_fn;
+
+ push_gimplify_context ();
+
+ par_olist = NULL_TREE;
+ par_ilist = NULL_TREE;
+ lower_rec_input_clauses (clauses, &par_ilist, &par_olist, ctx);
+ lower_omp (&par_body, ctx);
+ maybe_catch_exception (&par_body);
+ lower_reduction_clauses (clauses, &par_olist, ctx);
+
+ /* Declare all the variables created by mapping and the variables
+ declared in the scope of the parallel body. */
+ record_vars_into (ctx->block_vars, child_fn);
+ record_vars_into (BIND_EXPR_VARS (par_bind), child_fn);
+
+ if (ctx->record_type)
+ {
+ ctx->sender_decl = create_tmp_var (ctx->record_type, ".omp_data_o");
+ OMP_PARALLEL_DATA_ARG (stmt) = ctx->sender_decl;
+ }
+
+ olist = NULL_TREE;
+ ilist = NULL_TREE;
+ lower_send_clauses (clauses, &ilist, &olist, ctx);
+ lower_send_shared_vars (&ilist, &olist, ctx);
+
+ /* Once all the expansions are done, sequence all the different
+ fragments inside OMP_PARALLEL_BODY. */
+ bind = build3 (BIND_EXPR, void_type_node, NULL, NULL, NULL);
+ append_to_statement_list (ilist, &BIND_EXPR_BODY (bind));
+
+ new_body = alloc_stmt_list ();
+
+ if (ctx->record_type)
+ {
+ t = build_fold_addr_expr (ctx->sender_decl);
+ t = build2 (MODIFY_EXPR, void_type_node, ctx->receiver_decl, t);
+ append_to_statement_list (t, &new_body);
+ }
+
+ append_to_statement_list (par_ilist, &new_body);
+ append_to_statement_list (par_body, &new_body);
+ append_to_statement_list (par_olist, &new_body);
+ t = make_node (OMP_RETURN_EXPR);
+ append_to_statement_list (t, &new_body);
+ OMP_PARALLEL_BODY (stmt) = new_body;
+
+ append_to_statement_list (stmt, &BIND_EXPR_BODY (bind));
+ append_to_statement_list (olist, &BIND_EXPR_BODY (bind));
+
+ *stmt_p = bind;
+
+ pop_gimplify_context (NULL_TREE);
}
+
/* Pass *TP back through the gimplifier within the context determined by WI.
This handles replacement of DECL_VALUE_EXPR, as well as adjusting the
flags on ADDR_EXPR. */
static void
-expand_regimplify (tree *tp, struct walk_stmt_info *wi)
+lower_regimplify (tree *tp, struct walk_stmt_info *wi)
{
enum gimplify_status gs;
tree pre = NULL;
@@ -2961,66 +3988,77 @@ expand_regimplify (tree *tp, struct walk_stmt_info *wi)
tsi_link_before (&wi->tsi, pre, TSI_SAME_STMT);
}
+
+/* Callback for walk_stmts. Lower the OpenMP directive pointed to by TP. */
+
static tree
-expand_omp_1 (tree *tp, int *walk_subtrees, void *data)
+lower_omp_1 (tree *tp, int *walk_subtrees, void *data)
{
struct walk_stmt_info *wi = data;
omp_context *ctx = wi->info;
tree t = *tp;
+ /* If we have issued syntax errors, avoid doing any heavy lifting.
+ Just replace the OpenMP directives with a NOP to avoid
+ confusing RTL expansion. */
+ if (errorcount && OMP_DIRECTIVE_P (*tp))
+ {
+ *tp = build_empty_stmt ();
+ return NULL_TREE;
+ }
+
*walk_subtrees = 0;
switch (TREE_CODE (*tp))
{
case OMP_PARALLEL:
ctx = maybe_lookup_ctx (t);
- if (!ctx->is_nested)
- expand_omp_parallel (tp, ctx);
+ lower_omp_parallel (tp, ctx);
break;
case OMP_FOR:
ctx = maybe_lookup_ctx (t);
gcc_assert (ctx);
- expand_omp_for (tp, ctx);
+ lower_omp_for (tp, ctx);
break;
case OMP_SECTIONS:
ctx = maybe_lookup_ctx (t);
gcc_assert (ctx);
- expand_omp_sections (tp, ctx);
+ lower_omp_sections (tp, ctx);
break;
case OMP_SINGLE:
ctx = maybe_lookup_ctx (t);
gcc_assert (ctx);
- expand_omp_single (tp, ctx);
+ lower_omp_single (tp, ctx);
break;
case OMP_MASTER:
ctx = maybe_lookup_ctx (t);
gcc_assert (ctx);
- expand_omp_master (tp, ctx);
+ lower_omp_master (tp, ctx);
break;
case OMP_ORDERED:
ctx = maybe_lookup_ctx (t);
gcc_assert (ctx);
- expand_omp_ordered (tp, ctx);
+ lower_omp_ordered (tp, ctx);
break;
case OMP_CRITICAL:
ctx = maybe_lookup_ctx (t);
gcc_assert (ctx);
- expand_omp_critical (tp, ctx);
+ lower_omp_critical (tp, ctx);
break;
case VAR_DECL:
if (ctx && DECL_HAS_VALUE_EXPR_P (t))
- expand_regimplify (tp, wi);
+ lower_regimplify (tp, wi);
break;
case ADDR_EXPR:
if (ctx)
- expand_regimplify (tp, wi);
+ lower_regimplify (tp, wi);
break;
case ARRAY_REF:
@@ -3030,7 +4068,7 @@ expand_omp_1 (tree *tp, int *walk_subtrees, void *data)
case COMPONENT_REF:
case VIEW_CONVERT_EXPR:
if (ctx)
- expand_regimplify (tp, wi);
+ lower_regimplify (tp, wi);
break;
case INDIRECT_REF:
@@ -3038,7 +4076,7 @@ expand_omp_1 (tree *tp, int *walk_subtrees, void *data)
{
wi->is_lhs = false;
wi->val_only = true;
- expand_regimplify (&TREE_OPERAND (t, 0), wi);
+ lower_regimplify (&TREE_OPERAND (t, 0), wi);
}
break;
@@ -3052,12 +4090,12 @@ expand_omp_1 (tree *tp, int *walk_subtrees, void *data)
}
static void
-expand_omp (tree *stmt_p, omp_context *ctx)
+lower_omp (tree *stmt_p, omp_context *ctx)
{
struct walk_stmt_info wi;
memset (&wi, 0, sizeof (wi));
- wi.callback = expand_omp_1;
+ wi.callback = lower_omp_1;
wi.info = ctx;
wi.val_only = true;
wi.want_locations = true;
@@ -3077,10 +4115,13 @@ execute_lower_omp (void)
gcc_assert (parallel_nesting_level == 0);
if (all_contexts->root)
- expand_omp (&DECL_SAVED_TREE (current_function_decl), NULL);
+ lower_omp (&DECL_SAVED_TREE (current_function_decl), NULL);
- splay_tree_delete (all_contexts);
- all_contexts = NULL;
+ if (all_contexts)
+ {
+ splay_tree_delete (all_contexts);
+ all_contexts = NULL;
+ }
}
static bool
@@ -3105,7 +4146,6 @@ struct tree_opt_pass pass_lower_omp =
TODO_dump_func, /* todo_flags_finish */
0 /* letter */
};
-
/* The following is a utility to diagnose OpenMP structured block violations.
It's part of the "omplower" pass, as that's invoked too late. It should
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 5a87ac0335a..da5251c554b 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -294,6 +294,12 @@ optab_for_tree_code (enum tree_code code, tree type)
case REALIGN_LOAD_EXPR:
return vec_realign_load_optab;
+ case WIDEN_SUM_EXPR:
+ return TYPE_UNSIGNED (type) ? usum_widen_optab : ssum_widen_optab;
+
+ case DOT_PROD_EXPR:
+ return TYPE_UNSIGNED (type) ? udot_prod_optab : sdot_prod_optab;
+
case REDUC_MAX_EXPR:
return TYPE_UNSIGNED (type) ? reduc_umax_optab : reduc_smax_optab;
@@ -337,6 +343,154 @@ optab_for_tree_code (enum tree_code code, tree type)
}
+/* Expand vector widening operations.
+
+ There are two different classes of operations handled here:
+ 1) Operations whose result is wider than all the arguments to the operation.
+ Examples: VEC_UNPACK_HI/LO_EXPR, VEC_WIDEN_MULT_HI/LO_EXPR
+ In this case OP0 and optionally OP1 would be initialized,
+ but WIDE_OP wouldn't (not relevant for this case).
+ 2) Operations whose result is of the same size as the last argument to the
+ operation, but wider than all the other arguments to the operation.
+ Examples: WIDEN_SUM_EXPR, DOT_PROD_EXPR.
+ In this case WIDE_OP, OP0 and optionally OP1 would be initialized.
+
+ E.g., when called to expand the following operations, this is how
+ the arguments will be initialized:
+ nops OP0 OP1 WIDE_OP
+ widening-sum 2 oprnd0 - oprnd1
+ widening-dot-product 3 oprnd0 oprnd1 oprnd2
+ widening-mult 2 oprnd0 oprnd1 -
+ type-promotion (vec-unpack) 1 oprnd0 - - */
+
+rtx
+expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op, rtx target,
+ int unsignedp)
+{
+ tree oprnd0, oprnd1, oprnd2;
+ enum machine_mode wmode = 0, tmode0, tmode1 = 0;
+ optab widen_pattern_optab;
+ int icode;
+ enum machine_mode xmode0, xmode1 = 0, wxmode = 0;
+ rtx temp;
+ rtx pat;
+ rtx xop0, xop1, wxop;
+ int nops = TREE_CODE_LENGTH (TREE_CODE (exp));
+
+ oprnd0 = TREE_OPERAND (exp, 0);
+ tmode0 = TYPE_MODE (TREE_TYPE (oprnd0));
+ widen_pattern_optab =
+ optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0));
+ icode = (int) widen_pattern_optab->handlers[(int) tmode0].insn_code;
+ gcc_assert (icode != CODE_FOR_nothing);
+ xmode0 = insn_data[icode].operand[1].mode;
+
+ if (nops >= 2)
+ {
+ oprnd1 = TREE_OPERAND (exp, 1);
+ tmode1 = TYPE_MODE (TREE_TYPE (oprnd1));
+ xmode1 = insn_data[icode].operand[2].mode;
+ }
+
+ /* The last operand is of a wider mode than the rest of the operands. */
+ if (nops == 2)
+ {
+ wmode = tmode1;
+ wxmode = xmode1;
+ }
+ else if (nops == 3)
+ {
+ gcc_assert (tmode1 == tmode0);
+ gcc_assert (op1);
+ oprnd2 = TREE_OPERAND (exp, 2);
+ wmode = TYPE_MODE (TREE_TYPE (oprnd2));
+ wxmode = insn_data[icode].operand[3].mode;
+ }
+
+ if (!wide_op)
+ wmode = wxmode = insn_data[icode].operand[0].mode;
+
+ if (!target
+ || ! (*insn_data[icode].operand[0].predicate) (target, wmode))
+ temp = gen_reg_rtx (wmode);
+ else
+ temp = target;
+
+ xop0 = op0;
+ xop1 = op1;
+ wxop = wide_op;
+
+ /* In case the insn wants input operands in modes different from
+ those of the actual operands, convert the operands. It would
+ seem that we don't need to convert CONST_INTs, but we do, so
+ that they're properly zero-extended, sign-extended or truncated
+ for their mode. */
+
+ if (GET_MODE (op0) != xmode0 && xmode0 != VOIDmode)
+ xop0 = convert_modes (xmode0,
+ GET_MODE (op0) != VOIDmode
+ ? GET_MODE (op0)
+ : tmode0,
+ xop0, unsignedp);
+
+ if (op1)
+ if (GET_MODE (op1) != xmode1 && xmode1 != VOIDmode)
+ xop1 = convert_modes (xmode1,
+ GET_MODE (op1) != VOIDmode
+ ? GET_MODE (op1)
+ : tmode1,
+ xop1, unsignedp);
+
+ if (wide_op)
+ if (GET_MODE (wide_op) != wxmode && wxmode != VOIDmode)
+ wxop = convert_modes (wxmode,
+ GET_MODE (wide_op) != VOIDmode
+ ? GET_MODE (wide_op)
+ : wmode,
+ wxop, unsignedp);
+
+ /* Now, if insn's predicates don't allow our operands, put them into
+ pseudo regs. */
+
+ if (! (*insn_data[icode].operand[1].predicate) (xop0, xmode0)
+ && xmode0 != VOIDmode)
+ xop0 = copy_to_mode_reg (xmode0, xop0);
+
+ if (op1)
+ {
+ if (! (*insn_data[icode].operand[2].predicate) (xop1, xmode1)
+ && xmode1 != VOIDmode)
+ xop1 = copy_to_mode_reg (xmode1, xop1);
+
+ if (wide_op)
+ {
+ if (! (*insn_data[icode].operand[3].predicate) (wxop, wxmode)
+ && wxmode != VOIDmode)
+ wxop = copy_to_mode_reg (wxmode, wxop);
+
+ pat = GEN_FCN (icode) (temp, xop0, xop1, wxop);
+ }
+ else
+ pat = GEN_FCN (icode) (temp, xop0, xop1);
+ }
+ else
+ {
+ if (wide_op)
+ {
+ if (! (*insn_data[icode].operand[2].predicate) (wxop, wxmode)
+ && wxmode != VOIDmode)
+ wxop = copy_to_mode_reg (wxmode, wxop);
+
+ pat = GEN_FCN (icode) (temp, xop0, wxop);
+ }
+ else
+ pat = GEN_FCN (icode) (temp, xop0);
+ }
+
+ emit_insn (pat);
+ return temp;
+}
+
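To make the operand table in the comment above concrete, this is the kind of source reduction that ends up as a DOT_PROD_EXPR and therefore reaches expand_widen_pattern_expr with nops == 3 (oprnd0 and oprnd1 are the narrow multiplicands, the accumulator is passed as WIDE_OP). A hedged, self-contained example, mirroring foo2 in vect-reduc-dot-s16.c later in this patch:

/* Widening dot-product reduction: short * short products accumulated
   into an int result, the shape described by DOT_PROD_EXPR and the
   sdot_prod/udot_prod optabs.  */
int
dot_product (const short *x, const short *y, int n)
{
  int i, sum = 0;

  for (i = 0; i < n; i++)
    sum += x[i] * y[i];   /* narrow multiply, wide accumulate */

  return sum;
}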
/* Generate code to perform an operation specified by TERNARY_OPTAB
on operands OP0, OP1 and OP2, with result having machine-mode MODE.
@@ -5139,6 +5293,11 @@ init_optabs (void)
reduc_splus_optab = init_optab (UNKNOWN);
reduc_uplus_optab = init_optab (UNKNOWN);
+ ssum_widen_optab = init_optab (UNKNOWN);
+ usum_widen_optab = init_optab (UNKNOWN);
+ sdot_prod_optab = init_optab (UNKNOWN);
+ udot_prod_optab = init_optab (UNKNOWN);
+
vec_extract_optab = init_optab (UNKNOWN);
vec_set_optab = init_optab (UNKNOWN);
vec_init_optab = init_optab (UNKNOWN);
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 78cf53b15ff..58fb6905175 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -1,5 +1,6 @@
/* Definitions for code generation pass of GNU compiler.
- Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
This file is part of GCC.
@@ -241,6 +242,14 @@ enum optab_index
OTI_reduc_splus,
OTI_reduc_uplus,
+ /* Summation, with result machine mode wider than the argument modes. */
+ OTI_ssum_widen,
+ OTI_usum_widen,
+
+ /* Dot product, with result machine mode wider than the argument modes. */
+ OTI_sdot_prod,
+ OTI_udot_prod,
+
/* Set specified field of vector operand. */
OTI_vec_set,
/* Extract specified field of vector operand. */
@@ -367,6 +376,11 @@ extern GTY(()) optab optab_table[OTI_MAX];
#define reduc_umin_optab (optab_table[OTI_reduc_umin])
#define reduc_splus_optab (optab_table[OTI_reduc_splus])
#define reduc_uplus_optab (optab_table[OTI_reduc_uplus])
+
+#define ssum_widen_optab (optab_table[OTI_ssum_widen])
+#define usum_widen_optab (optab_table[OTI_usum_widen])
+#define sdot_prod_optab (optab_table[OTI_sdot_prod])
+#define udot_prod_optab (optab_table[OTI_udot_prod])
#define vec_set_optab (optab_table[OTI_vec_set])
#define vec_extract_optab (optab_table[OTI_vec_extract])
@@ -495,6 +509,9 @@ extern enum insn_code sync_lock_release[NUM_MACHINE_MODES];
/* Define functions given in optabs.c. */
+extern rtx expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op,
+ rtx target, int unsignedp);
+
extern rtx expand_ternary_op (enum machine_mode mode, optab ternary_optab,
rtx op0, rtx op1, rtx op2, rtx target,
int unsignedp);
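Along the same lines, a hedged example (function name illustrative) of the widening-summation shape that the new ssum_widen/usum_widen optabs describe: narrow elements accumulated into a wider result, with no multiplication involved.

/* Widening-sum reduction: short values summed into an int accumulator,
   matching WIDEN_SUM_EXPR and the ssum_widen/usum_widen optabs.  */
int
widen_sum (const short *x, int n)
{
  int i, sum = 0;

  for (i = 0; i < n; i++)
    sum += x[i];   /* each short is widened before being added */

  return sum;
}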
diff --git a/gcc/passes.c b/gcc/passes.c
index e2d18c9593f..8301590d98a 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -481,6 +481,7 @@ init_optimization_passes (void)
p = &all_passes;
NEXT_PASS (pass_fixup_cfg);
NEXT_PASS (pass_init_datastructures);
+ NEXT_PASS (pass_expand_omp);
NEXT_PASS (pass_all_optimizations);
NEXT_PASS (pass_warn_function_noreturn);
NEXT_PASS (pass_mudflap_2);
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index deca25d9850..4e4c7e0c683 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,81 @@
+2006-01-19 Volker Reichelt <reichelt@igpm.rwth-aachen.de>
+
+ PR c++/25854
+ * g++.dg/template/spec28.C: New test.
+
+2006-01-19 Andrew Pinski <pinskia@physics.uc.edu>
+
+ * objc.dg/gnu-encoding/struct-layout-encoding-1_generate.c (switchfiles):
+ Fix fprintf's by adding srcdir or removing it.
+
+2006-01-19 Diego Novillo <dnovillo@redhat.com>
+
+ * gcc.dg/gomp/for-13.c: Use -fdump-tree-ompexp.
+ * gcc.dg/gomp/critical-1.c: Likewise.
+ * gcc.dg/gomp/critical-3.c: Likewise.
+ * gcc.dg/gomp/empty.c: Likewise.
+ * gcc.dg/gomp/ordered-1.c: Likewise.
+ * gcc.dg/gomp/for-4.c: Likewise.
+ * gcc.dg/gomp/for-6.c: Likewise.
+ * gcc.dg/gomp/master-3.c: Likewise.
+ * gcc.dg/gomp/for-8.c: Likewise.
+ * gcc.dg/gomp/for-10.c: Likewise.
+ * gcc.dg/gomp/for-18.c: Likewise.
+ * gcc.dg/gomp/for-5.c: Likewise.
+ * gcc.dg/gomp/for-7.c: Likewise.
+ * gcc.dg/gomp/for-9.c: Likewise.
+
+2006-01-18 Jeff Law <law@redhat.com>
+
+ * gcc.dg/tree-ssa/vrp25.c: New test.
+
+2006-01-19 Richard Sandiford <richard@codesourcery.com>
+
+ * gcc.dg/pr25805.c: Fix misapplied patch.
+
+2006-01-19 Dorit Nuzman <dorit@il.ibm.com>
+
+ * lib/target-supports.exp (check_effective_target_vect_sdot_qi): New.
+ (check_effective_target_vect_udot_qi): New.
+ (check_effective_target_vect_sdot_hi): New.
+ (check_effective_target_vect_udot_hi): New.
+ * gcc.dg/vect/vect.exp: Use dump-details, and compile testcases
+ prefixed with "wrapv-" with -fwrapv.
+ * gcc.dg/vect/wrapv-vect-reduc-dot-s8.c: New.
+ * gcc.dg/vect/vect-reduc-dot-u8.c: New.
+ * gcc.dg/vect/vect-reduc-dot-u16.c: New.
+ * gcc.dg/vect/vect-reduc-dot-s8.c: New.
+ * gcc.dg/vect/vect-reduc-dot-s16.c: New.
+
+ * lib/target-supports.exp (check_effective_target_vect_widen_sum): New.
+ * gcc.dg/vect/vect-reduc-pattern-1.c: New.
+ * gcc.dg/vect/vect-reduc-pattern-2.c: New.
+ * gcc.dg/vect/wrapv-vect-reduc-pattern-2.c: New.
+
+2006-01-19 Volker Reichelt <reichelt@igpm.rwth-aachen.de>
+
+ PR c++/16829
+ * g++.dg/other/default2.C: New test.
+ * g++.dg/other/default3.C: New test.
+
+2006-01-19 Richard Sandiford <richard@codesourcery.com>
+
+ PR c/25805
+ * gcc.dg/pr25805.c: New file.
+
+2006-01-18 Mark Mitchell <mark@codesourcery.com>
+
+ PR c++/25836
+ * g++.dg/template/init6.C: New test.
+
+2006-01-18 Daniel Berlin <dberlin@dberlin.org>
+
+ * gcc.dg/tree-ssa/pr24287.c: New test
+
+2006-01-18 Eric Christopher <echristo@apple.com>
+
+ * g++.dg/eh/table.C: New.
+
2006-01-18 DJ Delorie <dj@redhat.com>
* gcc.dg/Werror-1.c, gcc.dg/Werror-2.c, gcc.dg/Werror-3.c,
diff --git a/gcc/testsuite/g++.dg/eh/table.C b/gcc/testsuite/g++.dg/eh/table.C
new file mode 100644
index 00000000000..6f36bc6cf91
--- /dev/null
+++ b/gcc/testsuite/g++.dg/eh/table.C
@@ -0,0 +1,33 @@
+// { dg-do compile { target *-*-darwin* } }
+// { dg-final { scan-assembler "GCC_except_table0" } }
+void needed();
+void unneeded();
+
+class Bar
+{
+public:
+ Bar() {}
+ virtual ~Bar() {}
+
+ void unneeded();
+};
+
+void needed()
+{
+ Bar b;
+}
+
+//#if 0
+void unneeded()
+{
+ Bar b;
+ b.unneeded();
+}
+//#endif
+
+int main()
+{
+ needed();
+
+ return 0;
+}
diff --git a/gcc/testsuite/g++.dg/other/default2.C b/gcc/testsuite/g++.dg/other/default2.C
new file mode 100644
index 00000000000..be0e5c32413
--- /dev/null
+++ b/gcc/testsuite/g++.dg/other/default2.C
@@ -0,0 +1,9 @@
+// PR c++/16829
+// { dg-do "compile" }
+
+template<typename T> void foo(T, int = 0, int) {} // { dg-error "default" }
+
+void bar()
+{
+ foo(0);
+}
diff --git a/gcc/testsuite/g++.dg/other/default3.C b/gcc/testsuite/g++.dg/other/default3.C
new file mode 100644
index 00000000000..324ba7146fe
--- /dev/null
+++ b/gcc/testsuite/g++.dg/other/default3.C
@@ -0,0 +1,109 @@
+// PR c++/16829
+// { dg-do "compile" }
+
+void f1(int = 0, int); // { dg-error "default" }
+
+void f2(int = 0, int) {} // { dg-error "default" }
+
+void f3(int, int);
+void f3(int = 0, int); // { dg-error "default" }
+
+void f4(int, int);
+void f4(int = 0, int) {} // { dg-error "default" }
+
+void f5();
+void f5(int = 0, int); // { dg-error "default" }
+
+void f6();
+void f6(int = 0, int) {} // { dg-error "default" }
+
+template<typename> void g1(int = 0, int); // { dg-error "default" }
+
+template<typename> void g2(int = 0, int) {} // { dg-error "default" }
+
+template<typename> void g3(int, int);
+template<typename> void g3(int = 0, int); // { dg-error "default" }
+
+template<typename> void g4(int, int);
+template<typename> void g4(int = 0, int) {} // { dg-error "default" }
+
+template<typename> void g5();
+template<typename> void g5(int = 0, int); // { dg-error "default" }
+
+template<typename> void g6();
+template<typename> void g6(int = 0, int) {} // { dg-error "default" }
+
+template<typename T> void g7(T, T) {}
+template<typename T> void g7(T* = 0, T*) {} // { dg-error "default" }
+
+
+struct A
+{
+ void F1(int = 0, int); // { dg-error "default" }
+
+ void F2(int = 0, int) {} // { dg-error "default" }
+
+ void F3(int, int);
+
+ void F4();
+ void F4(int = 0, int); // { dg-error "default" }
+
+ void F5();
+ void F5(int = 0, int) {} // { dg-error "default" }
+
+ template<typename> void G1(int = 0, int); // { dg-error "default" }
+
+ template<typename> void G2(int = 0, int) {} // { dg-error "default" }
+
+ template<typename> void G3(int, int);
+
+ template<typename> void G4();
+ template<typename> void G4(int = 0, int); // { dg-error "default" }
+
+ template<typename> void G5();
+ template<typename> void G5(int = 0, int) {} // { dg-error "default" }
+
+ template<typename T> void G6(T, T) {}
+ template<typename T> void G6(T* = 0, T*) {} // { dg-error "default" }
+};
+
+void A::F3(int = 0, int) {} // { dg-error "default" }
+
+template<typename> void A::G3(int = 0, int) {} // { dg-error "default" }
+
+
+template<typename> struct B
+{
+ void F1(int = 0, int); // { dg-error "default" }
+
+ void F2(int = 0, int) {} // { dg-error "default" }
+
+ void F3(int, int);
+
+ void F4();
+ void F4(int = 0, int); // { dg-error "default" }
+
+ void F5();
+ void F5(int = 0, int) {} // { dg-error "default" }
+
+ template<typename> void G1(int = 0, int); // { dg-error "default" }
+
+ template<typename> void G2(int = 0, int) {} // { dg-error "default" }
+
+ template<typename> void G3(int, int);
+
+ template<typename> void G4();
+ template<typename> void G4(int = 0, int); // { dg-error "default" }
+
+ template<typename> void G5();
+ template<typename> void G5(int = 0, int) {} // { dg-error "default" }
+
+ template<typename T> void G6(T, T) {}
+ template<typename T> void G6(T* = 0, T*) {} // { dg-error "default" }
+};
+
+template<typename T>
+void B<T>::F3(int = 0, int) {} // { dg-error "default" }
+
+template<typename T> template<typename>
+void B<T>::G3(int = 0, int) {} // { dg-error "default" }
diff --git a/gcc/testsuite/g++.dg/template/init6.C b/gcc/testsuite/g++.dg/template/init6.C
new file mode 100644
index 00000000000..143746642fc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/init6.C
@@ -0,0 +1,31 @@
+// PR c++/25836
+
+template <class T>
+class Iter {};
+
+template <class T>
+class SubIter : public Iter<T> {
+ void insert(T);
+};
+
+class GraphBase {
+public:
+ class Node;
+};
+
+template<class T>
+class Graph : public GraphBase {
+ class Inner {
+ Iter<typename Graph<T>::Node*> *get();
+ };
+};
+
+template<class T>
+Iter<typename Graph<T>::Node*> *Graph<T>::Inner::get() {
+ SubIter<typename Graph<T>::Node*> *iter;
+ iter->insert(0);
+}
+
+int main() {
+ Iter<Graph<int>::Node*> *n2_iter = new SubIter<Graph<int>::Node*>();
+}
diff --git a/gcc/testsuite/g++.dg/template/spec28.C b/gcc/testsuite/g++.dg/template/spec28.C
new file mode 100644
index 00000000000..f0bb22caeb4
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/spec28.C
@@ -0,0 +1,6 @@
+// PR c++/25854
+// Bad diagnostic
+// { dg-do compile }
+
+template<typename> struct A {}; // { dg-error "provided" }
+template<> struct A<> {}; // { dg-error "wrong number" }
diff --git a/gcc/testsuite/gcc.dg/gomp/critical-1.c b/gcc/testsuite/gcc.dg/gomp/critical-1.c
index bdc7bad7b82..6f3348c8884 100644
--- a/gcc/testsuite/gcc.dg/gomp/critical-1.c
+++ b/gcc/testsuite/gcc.dg/gomp/critical-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-omplower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -21,8 +21,8 @@ void foo (void)
bar(3);
}
-/* { dg-final { scan-tree-dump-times "GOMP_critical_start" 2 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_critical_end" 2 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_critical_name_start" 2 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_critical_name_end" 2 "omplower" } } */
-/* { dg-final { cleanup-tree-dump "omplower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_critical_start" 2 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_critical_end" 2 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_critical_name_start" 2 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_critical_name_end" 2 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/critical-3.c b/gcc/testsuite/gcc.dg/gomp/critical-3.c
index 9cd73ac046c..6726e6a03d4 100644
--- a/gcc/testsuite/gcc.dg/gomp/critical-3.c
+++ b/gcc/testsuite/gcc.dg/gomp/critical-3.c
@@ -1,5 +1,5 @@
// { dg-do compile }
-// { dg-options "-fopenmp -fdump-tree-omplower" }
+// { dg-options "-fopenmp -fdump-tree-ompexp" }
void bar(void);
void foo(void)
@@ -8,4 +8,4 @@ void foo(void)
bar();
}
-// { dg-final { scan-tree-dump-times "\\&\\.gomp_critical_user_xyzzy" 2 "omplower" } }
+// { dg-final { scan-tree-dump-times "\\&\\.gomp_critical_user_xyzzy" 2 "ompexp" } }
diff --git a/gcc/testsuite/gcc.dg/gomp/empty.c b/gcc/testsuite/gcc.dg/gomp/empty.c
index 18af1d80d11..6a21c0460ec 100644
--- a/gcc/testsuite/gcc.dg/gomp/empty.c
+++ b/gcc/testsuite/gcc.dg/gomp/empty.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O -fopenmp -fdump-tree-omplower" } */
+/* { dg-options "-O -fopenmp -fdump-tree-ompexp" } */
main()
{
@@ -8,5 +8,5 @@ main()
}
/* There should not be a GOMP_parallel_start call. */
-/* { dg-final { scan-tree-dump-times "GOMP_parallel_start" 0 "omplower"} } */
-/* { dg-final { cleanup-tree-dump "omplower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_parallel_start" 0 "ompexp"} } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-10.c b/gcc/testsuite/gcc.dg/gomp/for-10.c
index 9dfac165b0a..f21404249c7 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-10.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-10.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_runtime_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_runtime_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_runtime_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_runtime_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-13.c b/gcc/testsuite/gcc.dg/gomp/for-13.c
index 16e971f1927..607de49c749 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-13.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-13.c
@@ -2,7 +2,7 @@
// for iteration variable as private.
// { dg-do compile }
-// { dg-options "-fopenmp -fdump-tree-lower" }
+// { dg-options "-fopenmp -fdump-tree-ompexp" }
extern void bar(int);
void foo(void)
@@ -14,5 +14,5 @@ void foo(void)
bar(i);
}
-// { dg-final { scan-tree-dump-times "omp_data_o" 0 "lower" } }
-// { dg-final { cleanup-tree-dump "lower" } }
+// { dg-final { scan-tree-dump-times "omp_data_o" 0 "ompexp" } }
+// { dg-final { cleanup-tree-dump "ompexp" } }
diff --git a/gcc/testsuite/gcc.dg/gomp/for-18.c b/gcc/testsuite/gcc.dg/gomp/for-18.c
index c875a0c5f81..545f271c80b 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-18.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-18.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O -fopenmp -fdump-tree-omplower" } */
+/* { dg-options "-O -fopenmp -fdump-tree-ompexp" } */
void
foo (int *a, int i)
@@ -37,6 +37,6 @@ bar (int *a, int i)
a[j] = 4;
}
-/* { dg-final { scan-tree-dump-times "GOMP_parallel_loop_dynamic_start" 4 "omplower" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump-times "GOMP_parallel_loop_guided_start" 4 "omplower" { xfail *-*-* } } } */
-/* { dg-final { cleanup-tree-dump "omplower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_parallel_loop_dynamic_start" 4 "ompexp" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "GOMP_parallel_loop_guided_start" 4 "ompexp" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-4.c b/gcc/testsuite/gcc.dg/gomp/for-4.c
index c5f1bb8d13d..fb6994ea20b 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-4.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-4.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_dynamic_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_dynamic_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_dynamic_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_dynamic_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-5.c b/gcc/testsuite/gcc.dg/gomp/for-5.c
index 6d9722a97f4..5912a4e5561 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-5.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-5.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_guided_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_guided_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_guided_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_guided_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-6.c b/gcc/testsuite/gcc.dg/gomp/for-6.c
index 9361205e757..100ee2c8c21 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-6.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-6.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_runtime_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_runtime_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_runtime_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_runtime_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-7.c b/gcc/testsuite/gcc.dg/gomp/for-7.c
index b3eb997cb38..10763dc596c 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-7.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-7.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_static_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_static_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_static_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_static_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-8.c b/gcc/testsuite/gcc.dg/gomp/for-8.c
index c1386ce4a41..1bc66c49a0d 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-8.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-8.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_dynamic_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_dynamic_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_dynamic_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_dynamic_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/for-9.c b/gcc/testsuite/gcc.dg/gomp/for-9.c
index 2a554d51527..af99e216e79 100644
--- a/gcc/testsuite/gcc.dg/gomp/for-9.c
+++ b/gcc/testsuite/gcc.dg/gomp/for-9.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-lower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -12,6 +12,6 @@ void foo (int n)
bar(i);
}
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_guided_start" 1 "lower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_guided_next" 1 "lower" } } */
-/* { dg-final { cleanup-tree-dump "lower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_guided_start" 1 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_loop_ordered_guided_next" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/master-3.c b/gcc/testsuite/gcc.dg/gomp/master-3.c
index 37966106df5..fee09ddd798 100644
--- a/gcc/testsuite/gcc.dg/gomp/master-3.c
+++ b/gcc/testsuite/gcc.dg/gomp/master-3.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-omplower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -9,5 +9,5 @@ void foo (void)
bar(0);
}
-/* { dg-final { scan-tree-dump-times "omp_get_thread_num" 1 "omplower" } } */
-/* { dg-final { cleanup-tree-dump "omplower" } } */
+/* { dg-final { scan-tree-dump-times "omp_get_thread_num" 1 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/ordered-1.c b/gcc/testsuite/gcc.dg/gomp/ordered-1.c
index a1cd7f48602..de5e116ebd2 100644
--- a/gcc/testsuite/gcc.dg/gomp/ordered-1.c
+++ b/gcc/testsuite/gcc.dg/gomp/ordered-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-fopenmp -fdump-tree-omplower" } */
+/* { dg-options "-fopenmp -fdump-tree-ompexp" } */
extern void bar(int);
@@ -15,6 +15,6 @@ void foo (void)
}
}
-/* { dg-final { scan-tree-dump-times "GOMP_ordered_start" 2 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "GOMP_ordered_end" 2 "omplower" } } */
-/* { dg-final { cleanup-tree-dump "omplower" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_ordered_start" 2 "ompexp" } } */
+/* { dg-final { scan-tree-dump-times "GOMP_ordered_end" 2 "ompexp" } } */
+/* { dg-final { cleanup-tree-dump "ompexp" } } */
diff --git a/gcc/testsuite/gcc.dg/pr25805.c b/gcc/testsuite/gcc.dg/pr25805.c
new file mode 100644
index 00000000000..71182c52f2e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr25805.c
@@ -0,0 +1,20 @@
+/* When -fzero-initialized-in-bss was in effect, we used to only allocate
+ storage for d1.a. */
+/* { dg-do run } */
+/* { dg-options "" } */
+extern void abort (void);
+extern void exit (int);
+
+struct { int a; int x[]; } d1 = { 0, 0 };
+int d2 = 0;
+
+int
+main ()
+{
+ d2 = 1;
+ if (sizeof (d1) != sizeof (int))
+ abort ();
+ if (d1.x[0] != 0)
+ abort ();
+ exit (0);
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr24287.c b/gcc/testsuite/gcc.dg/tree-ssa/pr24287.c
new file mode 100644
index 00000000000..8e7f18691dc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr24287.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+int g1(int);
+int h(int *a, int *b)__attribute__((pure));
+void link_error();
+
+/* The calls to link_error should be eliminated, since nothing escapes to
+ non-pure functions. */
+int g(void)
+{
+ int t = 0, t1 = 2;
+ int t2 = h(&t, &t1);
+ if (t != 0)
+ link_error ();
+ if (t1 != 2)
+ link_error ();
+ g1(t2);
+ if (t != 0)
+ link_error ();
+ if (t1 != 2)
+ link_error ();
+ return t2 == 2;
+}
+/* { dg-final { scan-tree-dump-times "link_error" 0 "optimized"} } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp25.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp25.c
new file mode 100644
index 00000000000..52f9ea2fea5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp25.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-vrp-details" } */
+
+extern void abort ();
+int tree_code_length[100];
+
+blah (int code1)
+{
+ unsigned char D18670;
+
+ if (code1 != 53) goto L0; else goto L1;
+
+L0:
+ abort ();
+
+L1:
+ D18670 = tree_code_length[53];
+ if (D18670 <= 1) goto L2; else goto L3;
+
+L2:
+ abort ();
+
+L3:
+ if (D18670 == 2) goto L4; else goto L5;
+
+L4:
+ abort ();
+
+L5:
+ arf ();
+ if (code1 != 53) goto L6; else goto L7;
+
+L6:
+ abort ();
+
+L7:
+ if (D18670 <= 2) goto L8; else goto L9;
+
+L8:
+ abort ();
+
+L9:
+ return;
+
+}
+
+/* The second test of (code1 != 53) and the test (D18670 <= 2) are
+ both totally subsumed by earlier tests and thus should be folded
+ away using VRP. */
+/* { dg-final { scan-tree-dump-times "Folding predicate" 2 "vrp" } } */
+/* { dg-final { cleanup-tree-dump "vrp" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c
new file mode 100644
index 00000000000..ddffc109d35
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c
@@ -0,0 +1,70 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 43680
+
+signed short X[N] __attribute__ ((__aligned__(16)));
+signed short Y[N] __attribute__ ((__aligned__(16)));
+
+/* short->short->int dot product.
+ Not detected as a dot-product pattern.
+ Currently fails to be vectorized due to presence of type conversions. */
+int
+foo1(int len) {
+ int i;
+ int result = 0;
+ short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* short->int->int dot product.
+ Detected as a dot-product pattern.
+ Vectorized on targets that support dot-product for signed shorts. */
+int
+foo2(int len) {
+ int i;
+ int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+
+int main (void)
+{
+ int i, dot1, dot2;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sdot_hi } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c
new file mode 100644
index 00000000000..8e5d48035b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c
@@ -0,0 +1,111 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 -21856
+#define DOT3 43680
+
+signed char X[N] __attribute__ ((__aligned__(16)));
+signed char Y[N] __attribute__ ((__aligned__(16)));
+
+/* char->short->int dot product.
+ The dot-product pattern should be detected.
+ Vectorizable on vect_sdot_qi targets (targets that support dot-product of
+ signed chars).
+
+ In the future could also be vectorized as widening-mult + widening-summation,
+ or with type-conversion support.
+ */
+int
+foo1(int len) {
+ int i;
+ int result = 0;
+ short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* char->short->short dot product.
+ The dot-product pattern should be detected.
+ The reduction is currently not vectorized because of the signed->unsigned->signed
+ casts, since this patch:
+
+ 2005-12-26 Kazu Hirata <kazu@codesourcery.com>
+
+ PR tree-optimization/25125
+
+ When the dot-product is detected, the loop should be vectorized on vect_sdot_qi
+ targets (targets that support dot-product of signed char).
+ This test would currently fail to vectorize on targets that support
+ dot-product of chars when the accumulator is int.
+
+ In the future could also be vectorized as widening-mult + summation,
+ or with type-conversion support.
+ */
+short
+foo2(int len) {
+ int i;
+ short result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+/* char->int->int dot product.
+ Not detected as a dot-product pattern.
+ Currently fails to be vectorized due to presence of type conversions. */
+int
+foo3(int len) {
+ int i;
+ int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+int main (void)
+{
+ int i, dot1, dot3;
+ short dot2;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ dot3 = foo3 (N);
+ if (dot3 != DOT3)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sdot_qi } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c
new file mode 100644
index 00000000000..03db7e0b6a6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c
@@ -0,0 +1,77 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 43680
+
+unsigned short X[N] __attribute__ ((__aligned__(16)));
+unsigned short Y[N] __attribute__ ((__aligned__(16)));
+
+/* short->short->int dot product.
+ Not detected as a dot-product pattern.
+ Not vectorized due to presence of type-conversions. */
+unsigned int
+foo1(int len) {
+ int i;
+ unsigned int result = 0;
+ unsigned short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* short->int->int dot product.
+ Currently not detected as a dot-product pattern: the multiplication
+ promotes the ushorts to int, and then the product is promoted to unsigned
+   int for the addition.  This results in an int->unsigned int cast which,
+   since no bits are modified by it, should be trivially vectorizable. */
+unsigned int
+foo2(int len) {
+ int i;
+ unsigned int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+
+int main (void)
+{
+ unsigned int dot1, dot2;
+ int i;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
+
+/* Once the dot-product pattern is detected in the second loop, we expect
+ that loop to be vectorized on vect_udot_hi targets (targets that support
+ dot-product of unsigned shorts). */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
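
The comment in foo2 of vect-reduc-dot-u16.c above relies on the usual C integer
promotions.  The following is a minimal standalone sketch (not part of this
patch) that illustrates them: on typical targets where int is wider than short,
both unsigned short operands are promoted to int, so the product has type int,
and accumulating it into an unsigned int adds the int -> unsigned int
conversion the comment refers to, which changes no bits.

/* Standalone illustration only; not part of the patch.  */
#include <stdio.h>

int
main (void)
{
  unsigned short a = 3, b = 7;
  unsigned int acc = 0;

  /* a * b is computed in int after promotion; the compound assignment
     then converts the int result to unsigned int.  */
  acc += a * b;

  printf ("width of a * b: %zu bytes, acc = %u\n", sizeof (a * b), acc);
  return 0;
}
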
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c
new file mode 100644
index 00000000000..ad68bc752c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c
@@ -0,0 +1,101 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 43680
+#define DOT3 43680
+
+unsigned char X[N] __attribute__ ((__aligned__(16)));
+unsigned char Y[N] __attribute__ ((__aligned__(16)));
+
+/* char->short->int dot product.
+ Detected as a dot-product pattern.
+ Should be vectorized on targets that support dot-product for unsigned chars.
+ */
+unsigned int
+foo1(int len) {
+ int i;
+ unsigned int result = 0;
+ unsigned short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* char->short->short dot product.
+ Detected as a dot-product pattern.
+ Should be vectorized on targets that support dot-product for unsigned chars.
+ This test currently fails to vectorize on targets that support dot-product
+ of chars only when the accumulator is int.
+ */
+unsigned short
+foo2(int len) {
+ int i;
+ unsigned short result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (unsigned short)(X[i] * Y[i]);
+ }
+ return result;
+}
+
+/* char->int->int dot product.
+ Not detected as a dot-product.
+   Doesn't get vectorized due to presence of type conversions. */
+unsigned int
+foo3(int len) {
+ int i;
+ unsigned int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+int main (void)
+{
+ unsigned int dot1, dot3;
+ unsigned short dot2;
+ int i;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ dot3 = foo3 (N);
+ if (dot3 != DOT3)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 2 "vect" } } */
+
+/* When the vectorizer is enhanced to vectorize foo2 (accumulation into short) for
+ targets that support accumulation into int (powerpc, ia64) we'd have:
+dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_udot_qi } }
+*/
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_udot_qi } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
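
For reference, here is a minimal standalone sketch (not part of this patch) of
the scalar idiom the new vect_recog_dot_prod_pattern targets in the tests
above: a widening multiply feeding a widening summation.  Roughly, the
recognizer rewrites the accumulation as DOT_PROD_EXPR <x, y, acc>, i.e. acc
plus the sum of the widened products, which can then map to a target
dot-product instruction on vect_udot_qi / vect_sdot_qi targets.

/* Standalone illustration only; not part of the patch.  */
#include <stdio.h>

#define N 64

static unsigned char x[N], y[N];

static unsigned int
udot (int len)
{
  unsigned int acc = 0;
  int i;

  for (i = 0; i < len; i++)
    {
      unsigned short prod = x[i] * y[i];   /* uchar * uchar widened to ushort */
      acc += prod;                         /* ushort widened into the uint sum */
    }
  return acc;
}

int
main (void)
{
  int i;

  for (i = 0; i < N; i++)
    {
      x[i] = i;
      y[i] = 64 - i;
    }
  printf ("%u\n", udot (N));   /* prints 43680, matching DOT1 in the tests above */
  return 0;
}
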
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c
new file mode 100644
index 00000000000..61f1da19d6e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c
@@ -0,0 +1,60 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define SH_SUM 210
+#define CH_SUM 120
+
+int main1 ()
+{
+ int i;
+ unsigned short udata_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28};
+ unsigned char udata_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ unsigned int intsum = 0;
+ unsigned short shortsum = 0;
+
+  /* widening sum: sum shorts into int.  */
+ for (i = 0; i < N; i++){
+ intsum += udata_sh[i];
+ }
+
+ /* check results: */
+ if (intsum != SH_SUM)
+ abort ();
+
+  /* widening sum: sum chars into int.  */
+ intsum = 0;
+ for (i = 0; i < N; i++){
+ intsum += udata_ch[i];
+ }
+
+ /* check results: */
+ if (intsum != CH_SUM)
+ abort ();
+
+  /* widening sum: sum chars into short.
+     Pattern detected, but not vectorized yet.  */
+ for (i = 0; i < N; i++){
+ shortsum += udata_ch[i];
+ }
+
+ /* check results: */
+ if (shortsum != CH_SUM)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
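
The widening-sum loops above follow a single idiom.  Below is a minimal
standalone sketch (not part of this patch) of what vect_recog_widen_sum_pattern
looks for: a narrow array summed into a wider accumulator.  Roughly, the
recognizer rewrites the accumulation as WIDEN_SUM_EXPR <narrow_value, acc>,
which can then map to a widening-sum instruction on vect_widen_sum targets.

/* Standalone illustration only; not part of the patch.  */
#include <stdio.h>

#define N 16

int
main (void)
{
  unsigned char data[N];
  unsigned int sum = 0;
  int i;

  for (i = 0; i < N; i++)
    data[i] = i;

  /* Each char value is widened into the int accumulator.  */
  for (i = 0; i < N; i++)
    sum += data[i];

  printf ("%u\n", sum);   /* prints 120, matching CH_SUM above */
  return 0;
}
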
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c
new file mode 100644
index 00000000000..5423c4376d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c
@@ -0,0 +1,67 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define SH_SUM 210
+#define CH_SUM 120
+
+int main1 ()
+{
+ int i;
+ signed short data_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28};
+ signed char data_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ signed int intsum = 0;
+ signed short shortsum = 0;
+
+  /* widening sum: sum shorts into int.  */
+ for (i = 0; i < N; i++){
+ intsum += data_sh[i];
+ }
+
+ /* check results: */
+ if (intsum != SH_SUM)
+ abort ();
+
+  /* widening sum: sum chars into int.  */
+ intsum = 0;
+ for (i = 0; i < N; i++){
+ intsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (intsum != CH_SUM)
+ abort ();
+
+  /* widening sum: sum chars into short.
+ The widening-summation pattern is currently not detected because of this
+ patch:
+
+ 2005-12-26 Kazu Hirata <kazu@codesourcery.com>
+
+ PR tree-optimization/25125
+ */
+ for (i = 0; i < N; i++){
+ shortsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (shortsum != CH_SUM)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
index bfa6dced9b1..9cf78ff8519 100644
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@@ -1,4 +1,4 @@
-# Copyright (C) 1997, 2004 Free Software Foundation, Inc.
+# Copyright (C) 1997, 2004, 2005, 2006 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -78,7 +78,7 @@ dg-init
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/nodump-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
-lappend DEFAULT_VECTCFLAGS "-ftree-vectorizer-verbose=4" "-fdump-tree-vect-stats"
+lappend DEFAULT_VECTCFLAGS "-fdump-tree-vect-details"
# Main loop.
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \
@@ -96,6 +96,12 @@ lappend DEFAULT_VECTCFLAGS "-ffast-math"
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+# -fwrapv tests
+set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+lappend DEFAULT_VECTCFLAGS "-fwrapv"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-vect*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
+
# -ftrapv tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ftrapv"
diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c
new file mode 100644
index 00000000000..b11b9c70086
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c
@@ -0,0 +1,108 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 -21856
+#define DOT3 43680
+
+signed char X[N] __attribute__ ((__aligned__(16)));
+signed char Y[N] __attribute__ ((__aligned__(16)));
+
+/* char->short->int dot product.
+ The dot-product pattern should be detected.
+ Vectorizable on vect_sdot_qi targets (targets that support dot-product of
+ signed chars).
+
+ In the future could also be vectorized as widening-mult + widening-summation,
+ or with type-conversion support.
+ */
+int
+foo1(int len) {
+ int i;
+ int result = 0;
+ short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* char->short->short dot product.
+ The dot-product pattern should be detected.
+ Should be vectorized on vect_sdot_qi targets (targets that support
+ dot-product of signed char).
+ This test currently fails to vectorize on targets that support
+ dot-product of chars when the accumulator is int.
+
+ In the future could also be vectorized as widening-mult + summation,
+ or with type-conversion support.
+ */
+short
+foo2(int len) {
+ int i;
+ short result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+/* char->int->int dot product.
+ Not detected as a dot-product pattern.
+ Currently fails to be vectorized due to presence of type conversions. */
+int
+foo3(int len) {
+ int i;
+ int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+int main (void)
+{
+ int i, dot1, dot3;
+ short dot2;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ dot3 = foo3 (N);
+ if (dot3 != DOT3)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 2 "vect" } } */
+
+/* When vectorizer is enhanced to vectorize foo2 (accumulation into short) for targets
+ that support accumulation into int (ia64) we'd have:
+dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_sdot_qi } }
+*/
+/* In the meantime expect: */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sdot_qi } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c
new file mode 100755
index 00000000000..6c844eac38e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c
@@ -0,0 +1,59 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define SH_SUM 210
+#define CH_SUM 120
+
+int main1 ()
+{
+ int i;
+ signed short data_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28};
+ signed char data_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ signed int intsum = 0;
+ signed short shortsum = 0;
+
+  /* widening sum: sum shorts into int.  */
+ for (i = 0; i < N; i++){
+ intsum += data_sh[i];
+ }
+
+ /* check results: */
+ if (intsum != SH_SUM)
+ abort ();
+
+  /* widening sum: sum chars into int.  */
+ intsum = 0;
+ for (i = 0; i < N; i++){
+ intsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (intsum != CH_SUM)
+ abort ();
+
+  /* widening sum: sum chars into short.  */
+ for (i = 0; i < N; i++){
+ shortsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (shortsum != CH_SUM)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index cadef960460..05a180eb9cc 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1364,6 +1364,112 @@ proc check_effective_target_vect_no_bitwise { } {
return $et_vect_no_bitwise_saved
}
+# Return 1 if the target plus current options supports a vector
+# widening summation, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_widen_sum { } {
+ global et_vect_widen_sum
+
+ if [info exists et_vect_widen_sum_saved] {
+ verbose "check_effective_target_vect_widen_sum: using cached result" 2
+ } else {
+ set et_vect_widen_sum_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget ia64-*-*] } {
+ set et_vect_widen_sum_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_widen_sum: returning $et_vect_widen_sum_saved" 2
+ return $et_vect_widen_sum_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of signed chars, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_sdot_qi { } {
+ global et_vect_sdot_qi
+
+ if [info exists et_vect_sdot_qi_saved] {
+ verbose "check_effective_target_vect_sdot_qi: using cached result" 2
+ } else {
+ set et_vect_sdot_qi_saved 0
+ if { [istarget ia64-*-*] } {
+ set et_vect_sdot_qi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_sdot_qi: returning $et_vect_sdot_qi_saved" 2
+ return $et_vect_sdot_qi_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of unsigned chars, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_udot_qi { } {
+ global et_vect_udot_qi
+
+ if [info exists et_vect_udot_qi_saved] {
+ verbose "check_effective_target_vect_udot_qi: using cached result" 2
+ } else {
+ set et_vect_udot_qi_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget ia64-*-*] } {
+ set et_vect_udot_qi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_udot_qi: returning $et_vect_udot_qi_saved" 2
+ return $et_vect_udot_qi_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of signed shorts, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_sdot_hi { } {
+ global et_vect_sdot_hi
+
+ if [info exists et_vect_sdot_hi_saved] {
+ verbose "check_effective_target_vect_sdot_hi: using cached result" 2
+ } else {
+ set et_vect_sdot_hi_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget i?86-*-*]
+ || [istarget x86_64-*-*]
+ || [istarget ia64-*-*] } {
+ set et_vect_sdot_hi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_sdot_hi: returning $et_vect_sdot_hi_saved" 2
+ return $et_vect_sdot_hi_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of unsigned shorts, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_udot_hi { } {
+ global et_vect_udot_hi
+
+ if [info exists et_vect_udot_hi_saved] {
+ verbose "check_effective_target_vect_udot_hi: using cached result" 2
+ } else {
+ set et_vect_udot_hi_saved 0
+ if { [istarget powerpc*-*-*] } {
+ set et_vect_udot_hi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_udot_hi: returning $et_vect_udot_hi_saved" 2
+ return $et_vect_udot_hi_saved
+}
+
+
# Return 1 if the target plus current options does not support a vector
# alignment mechanism, 0 otherwise.
#
diff --git a/gcc/testsuite/objc.dg/gnu-encoding/struct-layout-encoding-1_generate.c b/gcc/testsuite/objc.dg/gnu-encoding/struct-layout-encoding-1_generate.c
index 31f328ee9a9..4b5d5c6dae7 100644
--- a/gcc/testsuite/objc.dg/gnu-encoding/struct-layout-encoding-1_generate.c
+++ b/gcc/testsuite/objc.dg/gnu-encoding/struct-layout-encoding-1_generate.c
@@ -230,13 +230,13 @@ switchfiles (int fields)
{
fprintf (outfile, "\
/* { dg-do run { xfail *-*-* } } */\n\
-/* { dg-options \"-w -I%s -fgnu-runtime\" } */\n");
+/* { dg-options \"-w -I%s -fgnu-runtime\" } */\n", srcdir);
}
else
{
fprintf (outfile, "\
/* { dg-do run } */\n\
-/* { dg-options \"-w -I%s -fgnu-runtime\" } */\n");
+/* { dg-options \"-w -I%s -fgnu-runtime\" } */\n", srcdir);
}
fprintf(outfile, "#include <objc/encoding.h> \n\
#include \"struct-layout-1.h\"\n\
@@ -274,7 +274,7 @@ int main (void)\n\
abort ();\n\
}\n\
exit (0);\n\
-}\n", srcdir, filecnt, filecnt);
+}\n", filecnt, filecnt);
fclose (outfile);
sprintf (destptr, "t%03d_test.h", filecnt);
outfile = fopen (destbuf, "w");
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 45e78ddf2e2..f76f663ef69 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -486,6 +486,37 @@ make_edges (void)
}
+/* Link an OMP_SECTIONS block to all the OMP_SECTION blocks in its body. */
+
+static void
+make_omp_sections_edges (basic_block bb)
+{
+ basic_block exit_bb;
+ size_t i, n;
+ tree vec, stmt;
+
+ stmt = last_stmt (bb);
+ vec = OMP_SECTIONS_SECTIONS (stmt);
+ n = TREE_VEC_LENGTH (vec);
+ exit_bb = bb_for_stmt (TREE_VEC_ELT (vec, n - 1));
+
+ for (i = 0; i < n - 1; i += 2)
+ {
+ basic_block start_bb = bb_for_stmt (TREE_VEC_ELT (vec, i));
+ basic_block end_bb = bb_for_stmt (TREE_VEC_ELT (vec, i + 1));
+ make_edge (bb, start_bb, EDGE_ABNORMAL);
+ make_edge (end_bb, exit_bb, EDGE_FALLTHRU);
+ }
+
+ /* Once the CFG has been built, the vector of sections is no longer
+ useful. The region can be easily obtained with build_omp_regions.
+ Furthermore, this sharing of tree expressions is not allowed by the
+ statement verifier. */
+ OMP_SECTIONS_SECTIONS (stmt) = NULL_TREE;
+}
+
+
+
/* Create edges for control statement at basic block BB. */
static void
@@ -581,6 +612,27 @@ make_exit_edges (basic_block bb)
make_edge (bb, bb->next_bb, EDGE_FALLTHRU);
break;
+ case OMP_PARALLEL:
+ case OMP_FOR:
+ case OMP_SINGLE:
+ case OMP_MASTER:
+ case OMP_ORDERED:
+ case OMP_CRITICAL:
+ make_edge (bb, bb->next_bb, EDGE_ABNORMAL);
+
+ case OMP_RETURN_EXPR:
+ if (EDGE_COUNT (bb->succs) == 0)
+ make_edge (bb, bb->next_bb, EDGE_FALLTHRU);
+ break;
+
+ case OMP_SECTIONS:
+ make_omp_sections_edges (bb);
+ break;
+
+ case OMP_SECTION:
+ make_edge (bb, bb->next_bb, EDGE_FALLTHRU);
+ break;
+
default:
gcc_unreachable ();
}
@@ -2503,6 +2555,10 @@ is_ctrl_altering_stmt (tree t)
return true;
}
+ /* OpenMP directives alter control flow. */
+ if (flag_openmp && OMP_DIRECTIVE_P (t))
+ return true;
+
/* If a statement can throw, it alters control flow. */
return tree_can_throw_internal (t);
}
@@ -2746,12 +2802,9 @@ set_bb_for_stmt (tree t, basic_block bb)
stmt_ann_t ann = get_stmt_ann (t);
ann->bb = bb;
- /* If the statement is a label, add the label to block-to-labels
- map so that we can speed up edge creation for GOTO_EXPRs.
- Note that LABEL_TO_BLOCK_MAP may not exist if we are
- currently expanding into RTL (in which case, this mapping is
- unnecessary, anyway). */
- if (TREE_CODE (t) == LABEL_EXPR && !currently_expanding_to_rtl)
+ /* If the statement is a label, add the label to block-to-labels map
+ so that we can speed up edge creation for GOTO_EXPRs. */
+ if (TREE_CODE (t) == LABEL_EXPR)
{
int uid;
@@ -3432,6 +3485,17 @@ verify_stmt (tree stmt, bool last_in_block)
{
tree addr;
+ if (OMP_DIRECTIVE_P (stmt))
+ {
+ /* OpenMP directives are validated by the FE and never operated
+ on by the optimizers. Furthermore, OMP_FOR may contain
+ non-gimple expressions when the main index variable has had
+ its address taken. This does not affect the loop itself
+ because the header of an OMP_FOR is merely used to determine
+	 how to set up the parallel iteration.  */
+ return false;
+ }
+
if (!is_gimple_stmt (stmt))
{
error ("is not a valid GIMPLE statement");
@@ -4494,6 +4558,329 @@ tree_duplicate_sese_region (edge entry, edge exit,
return true;
}
+/*
+DEF_VEC_P(basic_block);
+DEF_VEC_ALLOC_P(basic_block,heap);
+*/
+
+/* Add all the blocks dominated by ENTRY to the array BBS_P. Stop
+ adding blocks when the dominator traversal reaches EXIT. This
+ function silently assumes that ENTRY strictly dominates EXIT. */
+
+static void
+gather_blocks_in_sese_region (basic_block entry, basic_block exit,
+ VEC(basic_block,heap) **bbs_p)
+{
+ basic_block son;
+
+ for (son = first_dom_son (CDI_DOMINATORS, entry);
+ son;
+ son = next_dom_son (CDI_DOMINATORS, son))
+ {
+ VEC_safe_push (basic_block, heap, *bbs_p, son);
+ if (son != exit)
+ gather_blocks_in_sese_region (son, exit, bbs_p);
+ }
+}
+
+
+struct move_stmt_d
+{
+ tree block;
+ tree from_context;
+ tree to_context;
+ bitmap vars_to_remove;
+ bool remap_decls_p;
+};
+
+/* Helper for move_block_to_fn. Set TREE_BLOCK in every expression
+ contained in *TP and change the DECL_CONTEXT of every local
+ variable referenced in *TP. */
+
+static tree
+move_stmt_r (tree *tp, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
+{
+ struct move_stmt_d *p = (struct move_stmt_d *) data;
+
+ if (p->block && IS_EXPR_CODE_CLASS (TREE_CODE_CLASS (TREE_CODE (*tp))))
+ TREE_BLOCK (*tp) = p->block;
+
+ if (OMP_DIRECTIVE_P (*tp))
+ {
+ /* Do not remap variables inside OMP directives. Variables
+ referenced in clauses and directive header belong to the
+ parent function and should not be moved into the child
+ function. */
+ p->remap_decls_p = false;
+ }
+
+ if (p->remap_decls_p
+ && DECL_P (*tp)
+ && DECL_CONTEXT (*tp) == p->from_context)
+ {
+ DECL_CONTEXT (*tp) = p->to_context;
+
+ if (TREE_CODE (*tp) == VAR_DECL)
+ {
+ struct function *f = DECL_STRUCT_FUNCTION (p->to_context);
+ f->unexpanded_var_list = tree_cons (0, *tp, f->unexpanded_var_list);
+
+ /* Mark *TP to be removed from the original function,
+ otherwise it will be given a DECL_RTL when the original
+ function is expanded. */
+ bitmap_set_bit (p->vars_to_remove, DECL_UID (*tp));
+ }
+ }
+
+ return NULL_TREE;
+}
+
+
+/* Move basic block BB from function CFUN to function DEST_FN. The
+ block is moved out of the original linked list and placed after
+ block AFTER in the new list. Also, the block is removed from the
+ original array of blocks and placed in DEST_FN's array of blocks.
+   If UPDATE_EDGE_COUNT_P is true, the edge counts on both CFGs are
+ updated to reflect the moved edges.
+
+ On exit, local variables that need to be removed from
+ CFUN->UNEXPANDED_VAR_LIST will have been added to VARS_TO_REMOVE. */
+
+static void
+move_block_to_fn (struct function *dest_cfun, basic_block bb,
+ basic_block after, bool update_edge_count_p,
+ bitmap vars_to_remove)
+{
+ struct control_flow_graph *cfg;
+ edge_iterator ei;
+ edge e;
+ block_stmt_iterator si;
+ struct move_stmt_d d;
+ unsigned sz;
+
+ /* Link BB to the new linked list. */
+ move_block_after (bb, after);
+
+ /* Update the edge count in the corresponding flowgraphs. */
+ if (update_edge_count_p)
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ cfun->cfg->x_n_edges--;
+ dest_cfun->cfg->x_n_edges++;
+ }
+
+ /* Remove BB from the original basic block array. */
+ VEC_replace (basic_block, cfun->cfg->x_basic_block_info, bb->index, NULL);
+ cfun->cfg->x_n_basic_blocks--;
+
+ /* Grow DEST_CFUN's basic block array if needed. */
+ cfg = dest_cfun->cfg;
+ cfg->x_n_basic_blocks++;
+ if (bb->index > cfg->x_last_basic_block)
+ cfg->x_last_basic_block = bb->index;
+
+ sz = VEC_length (basic_block, cfg->x_basic_block_info);
+ if ((unsigned) cfg->x_last_basic_block >= sz)
+ {
+ sz = cfg->x_last_basic_block + (cfg->x_last_basic_block + 3) / 4;
+ VEC_safe_grow (basic_block, gc, cfg->x_basic_block_info, sz);
+ }
+
+ VEC_replace (basic_block, cfg->x_basic_block_info,
+ cfg->x_last_basic_block, bb);
+
+ /* The statements in BB need to be associated with a new TREE_BLOCK.
+ Labels need to be associated with a new label-to-block map. */
+ memset (&d, 0, sizeof (d));
+ d.vars_to_remove = vars_to_remove;
+
+ for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ {
+ tree stmt = bsi_stmt (si);
+
+ d.from_context = cfun->decl;
+ d.to_context = dest_cfun->decl;
+ d.remap_decls_p = true;
+ if (TREE_BLOCK (stmt))
+ d.block = DECL_INITIAL (dest_cfun->decl);
+
+ walk_tree (&stmt, move_stmt_r, &d, NULL);
+
+ if (TREE_CODE (stmt) == LABEL_EXPR)
+ {
+ unsigned old_len;
+ tree label = LABEL_EXPR_LABEL (stmt);
+ int uid = LABEL_DECL_UID (label);
+
+ gcc_assert (uid > -1);
+
+ old_len = VEC_length (basic_block, cfg->x_label_to_block_map);
+ if (old_len <= (unsigned) uid)
+ {
+ basic_block *addr;
+ unsigned new_len = 3 * uid / 2;
+ VEC_safe_grow (basic_block, gc, cfg->x_label_to_block_map,
+ new_len);
+ addr = VEC_address (basic_block, cfg->x_label_to_block_map);
+ memset (&addr[old_len], 0,
+ sizeof (basic_block) * (new_len - old_len));
+ }
+
+ VEC_replace (basic_block, cfg->x_label_to_block_map, uid, bb);
+ VEC_replace (basic_block, cfun->cfg->x_label_to_block_map, uid, NULL);
+
+ gcc_assert (DECL_CONTEXT (label) == dest_cfun->decl);
+
+ if (uid >= dest_cfun->last_label_uid)
+ dest_cfun->last_label_uid = uid + 1;
+ }
+ }
+}
+
+
+/* Move a single-entry, single-exit region delimited by ENTRY_BB and
+ EXIT_BB to function DEST_CFUN. The whole region is replaced by a
+ single basic block in the original CFG and the new basic block is
+ returned. DEST_CFUN must not have a CFG yet.
+
+ Note that the region need not be a pure SESE region. Blocks inside
+ the region may contain calls to abort/exit. The only restriction
+ is that ENTRY_BB should be the only entry point and it must
+ dominate EXIT_BB.
+
+ All local variables referenced in the region are assumed to be in
+ the corresponding BLOCK_VARS and unexpanded variable lists
+ associated with DEST_CFUN. */
+
+basic_block
+move_sese_region_to_fn (struct function *dest_cfun, basic_block entry_bb,
+ basic_block exit_bb)
+{
+ VEC(basic_block,heap) *bbs;
+ basic_block after, bb, *entry_pred, *exit_succ;
+ struct function *saved_cfun;
+ int *entry_flag, *exit_flag;
+ unsigned i, num_entry_edges, num_exit_edges;
+ edge e;
+ edge_iterator ei;
+ bitmap vars_to_remove;
+
+ saved_cfun = cfun;
+
+ /* Collect all the blocks in the region. Manually add ENTRY_BB
+     because it won't be added by gather_blocks_in_sese_region.  */
+ calculate_dominance_info (CDI_DOMINATORS);
+
+ /* If ENTRY does not strictly dominate EXIT, this cannot be an SESE
+ region. */
+ gcc_assert (entry_bb != exit_bb
+ && dominated_by_p (CDI_DOMINATORS, exit_bb, entry_bb));
+
+ bbs = NULL;
+ VEC_safe_push (basic_block, heap, bbs, entry_bb);
+ gather_blocks_in_sese_region (entry_bb, exit_bb, &bbs);
+
+ /* Detach ENTRY_BB and EXIT_BB from CFUN->CFG. We need to remember
+ the predecessor edges to ENTRY_BB and the successor edges to
+ EXIT_BB so that we can re-attach them to the new basic block that
+ will replace the region. */
+ num_entry_edges = EDGE_COUNT (entry_bb->preds);
+ entry_pred = (basic_block *) xcalloc (num_entry_edges, sizeof (basic_block));
+ entry_flag = (int *) xcalloc (num_entry_edges, sizeof (int));
+ i = 0;
+ for (ei = ei_start (entry_bb->preds); (e = ei_safe_edge (ei)) != NULL;)
+ {
+ entry_flag[i] = e->flags;
+ entry_pred[i++] = e->src;
+ remove_edge (e);
+ }
+
+ num_exit_edges = EDGE_COUNT (exit_bb->succs);
+ exit_succ = (basic_block *) xcalloc (num_exit_edges, sizeof (basic_block));
+ exit_flag = (int *) xcalloc (num_exit_edges, sizeof (int));
+ i = 0;
+ for (ei = ei_start (exit_bb->succs); (e = ei_safe_edge (ei)) != NULL;)
+ {
+ exit_flag[i] = e->flags;
+ exit_succ[i++] = e->dest;
+ remove_edge (e);
+ }
+
+ /* Switch context to the child function to initialize DEST_FN's CFG. */
+ gcc_assert (dest_cfun->cfg == NULL);
+ cfun = dest_cfun;
+ init_empty_tree_cfg ();
+ cfun = saved_cfun;
+
+ /* Move blocks from BBS into DEST_CFUN. */
+ gcc_assert (VEC_length (basic_block, bbs) >= 2);
+ after = dest_cfun->cfg->x_entry_block_ptr;
+ vars_to_remove = BITMAP_ALLOC (NULL);
+ for (i = 0; VEC_iterate (basic_block, bbs, i, bb); i++)
+ {
+ /* No need to update edge counts on the last block. It has
+ already been updated earlier when we detached the region from
+ the original CFG. */
+ move_block_to_fn (dest_cfun, bb, after, bb != exit_bb, vars_to_remove);
+ after = bb;
+ }
+
+ /* Remove the variables marked in VARS_TO_REMOVE from
+ CFUN->UNEXPANDED_VAR_LIST. Otherwise, they will be given a
+ DECL_RTL in the context of CFUN. */
+ if (!bitmap_empty_p (vars_to_remove))
+ {
+ tree *p;
+
+ for (p = &cfun->unexpanded_var_list; *p; )
+ {
+ tree var = TREE_VALUE (*p);
+ if (bitmap_bit_p (vars_to_remove, DECL_UID (var)))
+ {
+ *p = TREE_CHAIN (*p);
+ continue;
+ }
+
+ p = &TREE_CHAIN (*p);
+ }
+ }
+
+ BITMAP_FREE (vars_to_remove);
+
+ /* Rewire the entry and exit blocks. The successor to the entry
+ block turns into the successor of DEST_FN's ENTRY_BLOCK_PTR in
+ the child function. Similarly, the predecessor of DEST_FN's
+ EXIT_BLOCK_PTR turns into the predecessor of EXIT_BLOCK_PTR. We
+ need to switch CFUN between DEST_CFUN and SAVED_CFUN so that the
+     various CFG manipulation functions get to the right CFG.
+
+ FIXME, this is silly. The CFG ought to become a parameter to
+ these helpers. */
+ cfun = dest_cfun;
+ make_edge (ENTRY_BLOCK_PTR, entry_bb, EDGE_FALLTHRU);
+ make_edge (exit_bb, EXIT_BLOCK_PTR, 0);
+ cfun = saved_cfun;
+
+ /* Back in the original function, the SESE region has disappeared,
+ create a new basic block in its place. */
+ bb = create_empty_bb (entry_pred[0]);
+ for (i = 0; i < num_entry_edges; i++)
+ make_edge (entry_pred[i], bb, entry_flag[i]);
+
+ for (i = 0; i < num_exit_edges; i++)
+ make_edge (bb, exit_succ[i], exit_flag[i]);
+
+ free (exit_flag);
+ free (entry_flag);
+ free (entry_pred);
+ free (exit_succ);
+ free_dominance_info (CDI_DOMINATORS);
+ free_dominance_info (CDI_POST_DOMINATORS);
+ VEC_free (basic_block, heap, bbs);
+
+ return bb;
+}
+
/* Dump FUNCTION_DECL FN to file FILE using FLAGS (see TDF_* in tree.h) */
diff --git a/gcc/tree-dfa.c b/gcc/tree-dfa.c
index 36b4a5bf953..19453780d42 100644
--- a/gcc/tree-dfa.c
+++ b/gcc/tree-dfa.c
@@ -363,7 +363,35 @@ dump_variable (FILE *file, tree var)
fprintf (file, ", is volatile");
if (is_call_clobbered (var))
- fprintf (file, ", call clobbered");
+ {
+ fprintf (file, ", call clobbered");
+ if (dump_flags & TDF_DETAILS)
+ {
+ var_ann_t va = var_ann (var);
+ unsigned int escape_mask = va->escape_mask;
+
+ fprintf (file, " (");
+ if (escape_mask & ESCAPE_STORED_IN_GLOBAL)
+ fprintf (file, ", stored in global");
+ if (escape_mask & ESCAPE_TO_ASM)
+ fprintf (file, ", goes through ASM");
+ if (escape_mask & ESCAPE_TO_CALL)
+ fprintf (file, ", passed to call");
+ if (escape_mask & ESCAPE_BAD_CAST)
+ fprintf (file, ", bad cast");
+ if (escape_mask & ESCAPE_TO_RETURN)
+ fprintf (file, ", returned from func");
+ if (escape_mask & ESCAPE_TO_PURE_CONST)
+ fprintf (file, ", passed to pure/const");
+ if (escape_mask & ESCAPE_IS_GLOBAL)
+ fprintf (file, ", is global var");
+ if (escape_mask & ESCAPE_IS_PARM)
+ fprintf (file, ", is incoming pointer");
+ if (escape_mask & ESCAPE_UNKNOWN)
+ fprintf (file, ", unknown escape");
+ fprintf (file, " )");
+ }
+ }
if (default_def (var))
{
@@ -719,15 +747,11 @@ add_referenced_var (tree var, struct walk_state *walk_state)
*slot = (void *) var;
referenced_var_insert (DECL_UID (var), var);
-
- /* Global variables are always call-clobbered. */
- if (is_global_var (var))
- mark_call_clobbered (var);
-
+
/* Tag's don't have DECL_INITIAL. */
if (MTAG_P (var))
return;
-
+
/* Scan DECL_INITIAL for pointer variables as they may contain
address arithmetic referencing the address of other
variables. */
diff --git a/gcc/tree-flow-inline.h b/gcc/tree-flow-inline.h
index 7e36ccc0e2a..69bef68f9ac 100644
--- a/gcc/tree-flow-inline.h
+++ b/gcc/tree-flow-inline.h
@@ -843,34 +843,26 @@ loop_containing_stmt (tree stmt)
static inline bool
is_call_clobbered (tree var)
{
- return is_global_var (var)
- || bitmap_bit_p (call_clobbered_vars, DECL_UID (var));
+ return bitmap_bit_p (call_clobbered_vars, DECL_UID (var));
}
/* Mark variable VAR as being clobbered by function calls. */
static inline void
-mark_call_clobbered (tree var)
+mark_call_clobbered (tree var, unsigned int escape_type)
{
- /* If VAR is a memory tag, then we need to consider it a global
- variable. This is because the pointer that VAR represents has
- been found to point to either an arbitrary location or to a known
- location in global memory. */
- if (MTAG_P (var) && TREE_CODE (var) != STRUCT_FIELD_TAG)
- MTAG_GLOBAL (var) = 1;
+ var_ann (var)->escape_mask |= escape_type;
bitmap_set_bit (call_clobbered_vars, DECL_UID (var));
- ssa_call_clobbered_cache_valid = false;
- ssa_ro_call_cache_valid = false;
}
/* Clear the call-clobbered attribute from variable VAR. */
static inline void
clear_call_clobbered (tree var)
{
+ var_ann_t ann = var_ann (var);
+ ann->escape_mask = 0;
if (MTAG_P (var) && TREE_CODE (var) != STRUCT_FIELD_TAG)
MTAG_GLOBAL (var) = 0;
bitmap_clear_bit (call_clobbered_vars, DECL_UID (var));
- ssa_call_clobbered_cache_valid = false;
- ssa_ro_call_cache_valid = false;
}
/* Mark variable VAR as being non-addressable. */
@@ -879,8 +871,6 @@ mark_non_addressable (tree var)
{
bitmap_clear_bit (call_clobbered_vars, DECL_UID (var));
TREE_ADDRESSABLE (var) = 0;
- ssa_call_clobbered_cache_valid = false;
- ssa_ro_call_cache_valid = false;
}
/* Return the common annotation for T. Return NULL if the annotation
diff --git a/gcc/tree-flow.h b/gcc/tree-flow.h
index 92a6035c6a9..0c5e96b669b 100644
--- a/gcc/tree-flow.h
+++ b/gcc/tree-flow.h
@@ -92,6 +92,9 @@ struct ptr_info_def GTY(())
pointer will be represented by this memory tag, instead of the type
tag computed by TBAA. */
tree name_mem_tag;
+
+  /* Mask of reasons this pointer's value escapes the function.  */
+ unsigned int escape_mask;
};
@@ -213,6 +216,10 @@ struct var_ann_d GTY(())
/* If this variable is a structure, this fields holds a list of
symbols representing each of the fields of the structure. */
subvar_t subvars;
+
+  /* Mask of reasons why this variable has escaped the function.  */
+ unsigned int escape_mask;
};
struct function_ann_d GTY(())
@@ -533,6 +540,8 @@ extern void fold_cond_expr_cond (void);
extern void replace_uses_by (tree, tree);
extern void start_recording_case_labels (void);
extern void end_recording_case_labels (void);
+extern basic_block move_sese_region_to_fn (struct function *, basic_block,
+ basic_block);
/* In tree-cfgcleanup.c */
extern bool cleanup_tree_cfg (void);
@@ -573,8 +582,9 @@ extern void remove_phi_node (tree, tree);
extern tree phi_reverse (tree);
/* In gimple-low.c */
+extern void record_vars_into (tree, tree);
extern void record_vars (tree);
-extern bool block_may_fallthru (tree block);
+extern bool block_may_fallthru (tree);
/* In tree-ssa-alias.c */
extern void dump_may_aliases_for (FILE *, tree);
@@ -751,9 +761,27 @@ enum move_pos
};
extern enum move_pos movement_possibility (tree);
+/* The reasons a variable may escape a function. */
+enum escape_type
+ {
+ NO_ESCAPE = 0, /* Doesn't escape. */
+ ESCAPE_STORED_IN_GLOBAL = 1 << 1,
+ ESCAPE_TO_ASM = 1 << 2, /* Passed by address to an assembly
+ statement. */
+ ESCAPE_TO_CALL = 1 << 3, /* Escapes to a function call. */
+ ESCAPE_BAD_CAST = 1 << 4, /* Cast from pointer to integer */
+ ESCAPE_TO_RETURN = 1 << 5, /* Returned from function. */
+ ESCAPE_TO_PURE_CONST = 1 << 6, /* Escapes to a pure or constant
+ function call. */
+ ESCAPE_IS_GLOBAL = 1 << 7, /* Is a global variable. */
+ ESCAPE_IS_PARM = 1 << 8, /* Is an incoming function parameter. */
+ ESCAPE_UNKNOWN = 1 << 9 /* We believe it escapes for some reason
+ not enumerated above. */
+ };
+
/* In tree-flow-inline.h */
static inline bool is_call_clobbered (tree);
-static inline void mark_call_clobbered (tree);
+static inline void mark_call_clobbered (tree, unsigned int);
static inline void set_is_used (tree);
static inline bool unmodifiable_var_p (tree);
@@ -861,6 +889,7 @@ tree create_mem_ref (block_stmt_iterator *, tree,
rtx addr_for_mem_ref (struct mem_address *, bool);
void get_address_description (tree, struct mem_address *);
tree maybe_fold_tmr (tree);
+
/* This structure is simply used during pushing fields onto the fieldstack
to track the offset of the field, since bitpos_of_field gives it relative
to its immediate containing type, and we want it relative to the ultimate
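
The escape_type values added to tree-flow.h above form a bitmask.  Below is a
minimal standalone sketch (not part of this patch) of how the mask is meant to
be used: reasons are OR-ed in (as the reworked mark_call_clobbered does) and
later tested individually (as dump_variable does under TDF_DETAILS).  The enum
here merely mirrors a subset of the patch's values for illustration.

/* Standalone illustration only; not part of the patch.  */
#include <stdio.h>

enum escape_type
{
  NO_ESCAPE               = 0,
  ESCAPE_STORED_IN_GLOBAL = 1 << 1,
  ESCAPE_TO_ASM           = 1 << 2,
  ESCAPE_TO_CALL          = 1 << 3,
  ESCAPE_TO_RETURN        = 1 << 5
};

int
main (void)
{
  unsigned int escape_mask = NO_ESCAPE;

  /* Each escape site contributes its reason, so one variable can carry
     several reasons at once.  */
  escape_mask |= ESCAPE_TO_CALL;
  escape_mask |= ESCAPE_TO_RETURN;

  if (escape_mask & ESCAPE_TO_CALL)
    printf (", passed to call");
  if (escape_mask & ESCAPE_TO_RETURN)
    printf (", returned from func");
  printf ("\n");

  return 0;
}
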
diff --git a/gcc/tree-gimple.c b/gcc/tree-gimple.c
index 5edf55833f3..b47b0012c68 100644
--- a/gcc/tree-gimple.c
+++ b/gcc/tree-gimple.c
@@ -224,6 +224,7 @@ is_gimple_stmt (tree t)
case OMP_MASTER:
case OMP_ORDERED:
case OMP_CRITICAL:
+ case OMP_RETURN_EXPR:
/* These are always void. */
return true;
diff --git a/gcc/tree-gimple.h b/gcc/tree-gimple.h
index ff1a6d20a39..9cba07f5597 100644
--- a/gcc/tree-gimple.h
+++ b/gcc/tree-gimple.h
@@ -109,21 +109,6 @@ enum gimplify_status {
GS_ALL_DONE = 1 /* The expression is fully gimplified. */
};
-/* Type of parallel constructs. Used to decide what runtime function
- to use for launching children threads and the gimplification
- strategy. */
-
-enum omp_parallel_type {
- IS_NOT_PARALLEL = 0,
-
- /* Regular omp parallel */
- IS_PARALLEL,
-
- /* Combined parallel + workshare (parallel loop and parallel
- sections). */
- IS_COMBINED_PARALLEL
-};
-
extern enum gimplify_status gimplify_expr (tree *, tree *, tree *,
bool (*) (tree), fallback_t);
extern void gimplify_type_sizes (tree, tree *);
@@ -147,12 +132,11 @@ extern tree force_labels_r (tree *, int *, void *);
extern enum gimplify_status gimplify_va_arg_expr (tree *, tree *, tree *);
struct gimplify_omp_ctx;
extern void omp_firstprivatize_variable (struct gimplify_omp_ctx *, tree);
+extern tree gimple_boolify (tree);
/* In omp-low.c. */
-extern tree find_omp_clause (tree, enum tree_code);
extern void diagnose_omp_structured_block_errors (tree);
extern tree omp_reduction_init (tree, tree);
-enum omp_parallel_type determine_parallel_type (tree stmt);
/* In tree-nested.c. */
extern void lower_nested_functions (tree);
diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
index 7030b92b89d..c75cc9b7bd2 100644
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@@ -1,5 +1,5 @@
/* Tree inlining.
- Copyright 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
Contributed by Alexandre Oliva <aoliva@redhat.com>
This file is part of GCC.
@@ -1598,6 +1598,29 @@ estimate_num_insns_1 (tree *tp, int *walk_subtrees, void *data)
case LOOP_EXPR:
case PHI_NODE:
case WITH_SIZE_EXPR:
+ case OMP_PARALLEL:
+ case OMP_FOR:
+ case OMP_SECTIONS:
+ case OMP_SINGLE:
+ case OMP_SECTION:
+ case OMP_MASTER:
+ case OMP_ORDERED:
+ case OMP_CRITICAL:
+ case OMP_ATOMIC:
+ case OMP_CLAUSE_PRIVATE:
+ case OMP_CLAUSE_SHARED:
+ case OMP_CLAUSE_FIRSTPRIVATE:
+ case OMP_CLAUSE_LASTPRIVATE:
+ case OMP_CLAUSE_REDUCTION:
+ case OMP_CLAUSE_COPYIN:
+ case OMP_CLAUSE_COPYPRIVATE:
+ case OMP_CLAUSE_IF:
+ case OMP_CLAUSE_NUM_THREADS:
+ case OMP_CLAUSE_SCHEDULE:
+ case OMP_CLAUSE_NOWAIT:
+ case OMP_CLAUSE_ORDERED:
+ case OMP_CLAUSE_DEFAULT:
+ case OMP_RETURN_EXPR:
break;
/* We don't account constants for now. Assume that the cost is amortized
@@ -1728,6 +1751,10 @@ estimate_num_insns_1 (tree *tp, int *walk_subtrees, void *data)
case REDUC_MAX_EXPR:
case REDUC_MIN_EXPR:
case REDUC_PLUS_EXPR:
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
+
+ case WIDEN_MULT_EXPR:
case RESX_EXPR:
*count += 1;
@@ -2281,7 +2308,22 @@ copy_tree_r (tree *tp, int *walk_subtrees, void *data ATTRIBUTE_UNUSED)
/* Now, restore the chain, if appropriate. That will cause
walk_tree to walk into the chain as well. */
- if (code == PARM_DECL || code == TREE_LIST)
+ if (code == PARM_DECL
+ || code == TREE_LIST
+ /* OpenMP clauses are linked through TREE_CHAIN. */
+ || code == OMP_CLAUSE_PRIVATE
+ || code == OMP_CLAUSE_SHARED
+ || code == OMP_CLAUSE_FIRSTPRIVATE
+ || code == OMP_CLAUSE_LASTPRIVATE
+ || code == OMP_CLAUSE_REDUCTION
+ || code == OMP_CLAUSE_COPYIN
+ || code == OMP_CLAUSE_COPYPRIVATE
+ || code == OMP_CLAUSE_IF
+ || code == OMP_CLAUSE_NUM_THREADS
+ || code == OMP_CLAUSE_SCHEDULE
+ || code == OMP_CLAUSE_NOWAIT
+ || code == OMP_CLAUSE_ORDERED
+ || code == OMP_CLAUSE_DEFAULT)
TREE_CHAIN (*tp) = chain;
/* For now, we don't update BLOCKs when we make copies. So, we
diff --git a/gcc/tree-iterator.c b/gcc/tree-iterator.c
index c4c30104731..ad2b47ea8cd 100644
--- a/gcc/tree-iterator.c
+++ b/gcc/tree-iterator.c
@@ -40,6 +40,7 @@ alloc_stmt_list (void)
if (list)
{
stmt_list_cache = TREE_CHAIN (list);
+ gcc_assert (stmt_list_cache != list);
memset (list, 0, sizeof(struct tree_common));
TREE_SET_CODE (list, STATEMENT_LIST);
}
@@ -54,6 +55,9 @@ free_stmt_list (tree t)
{
gcc_assert (!STATEMENT_LIST_HEAD (t));
gcc_assert (!STATEMENT_LIST_TAIL (t));
+ /* If this triggers, it's a sign that the same list is being freed
+ twice. */
+ gcc_assert (t != stmt_list_cache || stmt_list_cache == NULL);
TREE_CHAIN (t) = stmt_list_cache;
stmt_list_cache = t;
}
diff --git a/gcc/tree-outof-ssa.c b/gcc/tree-outof-ssa.c
index 553f7a1f1fc..2f36cc6cc81 100644
--- a/gcc/tree-outof-ssa.c
+++ b/gcc/tree-outof-ssa.c
@@ -177,7 +177,7 @@ create_temp (tree t)
inherit from our original variable. */
var_ann (tmp)->type_mem_tag = var_ann (t)->type_mem_tag;
if (is_call_clobbered (t))
- mark_call_clobbered (tmp);
+ mark_call_clobbered (tmp, var_ann (t)->escape_mask);
return tmp;
}
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 90327ba8864..82e8c107765 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -263,6 +263,7 @@ extern struct tree_opt_pass pass_lower_complex;
extern struct tree_opt_pass pass_lower_vector;
extern struct tree_opt_pass pass_lower_vector_ssa;
extern struct tree_opt_pass pass_lower_omp;
+extern struct tree_opt_pass pass_expand_omp;
extern struct tree_opt_pass pass_object_sizes;
extern struct tree_opt_pass pass_fold_builtins;
extern struct tree_opt_pass pass_stdarg;
diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index d7e3391a3e4..b23416df306 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -1,5 +1,6 @@
/* Pretty formatting of GENERIC trees in C syntax.
- Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
Adapted from c-pretty-print.c by Diego Novillo <dnovillo@redhat.com>
This file is part of GCC.
@@ -1168,6 +1169,8 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
break;
/* Binary arithmetic and logic expressions. */
+ case WIDEN_SUM_EXPR:
+ case WIDEN_MULT_EXPR:
case MULT_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
@@ -1686,9 +1689,34 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
pp_string (buffer, " > ");
break;
+ case DOT_PROD_EXPR:
+ pp_string (buffer, " DOT_PROD_EXPR < ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
+ pp_string (buffer, " , ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false);
+ pp_string (buffer, " , ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false);
+ pp_string (buffer, " > ");
+ break;
+
case OMP_PARALLEL:
pp_string (buffer, "#pragma omp parallel");
dump_omp_clauses (buffer, OMP_PARALLEL_CLAUSES (node), spc, flags);
+ if (OMP_PARALLEL_FN (node))
+ {
+ pp_string (buffer, " [child fn: ");
+ dump_generic_node (buffer, OMP_PARALLEL_FN (node), spc, flags, false);
+
+ pp_string (buffer, " (");
+
+ if (OMP_PARALLEL_DATA_ARG (node))
+ dump_generic_node (buffer, OMP_PARALLEL_DATA_ARG (node), spc, flags,
+ false);
+ else
+ pp_string (buffer, "???");
+
+ pp_string (buffer, ")]");
+ }
dump_omp_body:
if (!(flags & TDF_SLIM) && OMP_BODY (node))
@@ -1790,6 +1818,11 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
dump_omp_clauses (buffer, OMP_SINGLE_CLAUSES (node), spc, flags);
goto dump_omp_body;
+ case OMP_RETURN_EXPR:
+ pp_string (buffer, "OMP_RETURN");
+ is_expr = false;
+ break;
+
case REDUC_MAX_EXPR:
pp_string (buffer, " REDUC_MAX_EXPR < ");
dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
@@ -2105,10 +2138,13 @@ op_prio (tree op)
case RROTATE_EXPR:
return 11;
+ case WIDEN_SUM_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
return 12;
+ case WIDEN_MULT_EXPR:
+ case DOT_PROD_EXPR:
case MULT_EXPR:
case TRUNC_DIV_EXPR:
case CEIL_DIV_EXPR:
@@ -2263,6 +2299,12 @@ op_symbol_1 (enum tree_code code)
case REDUC_PLUS_EXPR:
return "r+";
+ case WIDEN_SUM_EXPR:
+ return "w+";
+
+ case WIDEN_MULT_EXPR:
+ return "w*";
+
case NEGATE_EXPR:
case MINUS_EXPR:
return "-";
diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
index 5370747fc62..a890e11ce75 100644
--- a/gcc/tree-ssa-alias.c
+++ b/gcc/tree-ssa-alias.c
@@ -135,6 +135,287 @@ bitmap addressable_vars;
having to keep track of too many V_MAY_DEF expressions at call sites. */
tree global_var;
+DEF_VEC_I(int);
+DEF_VEC_ALLOC_I(int,heap);
+
+/* qsort comparison function to sort type/name tags by DECL_UID. */
+
+static int
+sort_tags_by_id (const void *pa, const void *pb)
+{
+ tree a = *(tree *)pa;
+ tree b = *(tree *)pb;
+
+ return DECL_UID (a) - DECL_UID (b);
+}
+
+/* Initialize WORKLIST to contain those memory tags that are marked call
+   clobbered.  Initialize WORKLIST2 to contain the reasons these
+ memory tags escaped. */
+
+static void
+init_transitive_clobber_worklist (VEC (tree, heap) **worklist,
+ VEC (int, heap) **worklist2)
+{
+ referenced_var_iterator rvi;
+ tree curr;
+
+ FOR_EACH_REFERENCED_VAR (curr, rvi)
+ {
+ if (MTAG_P (curr) && is_call_clobbered (curr))
+ {
+ VEC_safe_push (tree, heap, *worklist, curr);
+ VEC_safe_push (int, heap, *worklist2, var_ann (curr)->escape_mask);
+ }
+ }
+}
+
+/* Add ALIAS to WORKLIST (and the reason for escaping REASON to WORKLIST2) if
+ ALIAS is not already marked call clobbered, and is a memory
+ tag. */
+
+static void
+add_to_worklist (tree alias, VEC (tree, heap) **worklist,
+ VEC (int, heap) **worklist2,
+ int reason)
+{
+ if (MTAG_P (alias) && !is_call_clobbered (alias))
+ {
+ VEC_safe_push (tree, heap, *worklist, alias);
+ VEC_safe_push (int, heap, *worklist2, reason);
+ }
+}
+
+/* Mark aliases of TAG as call clobbered, and place any tags on the
+   alias list that were not already call clobbered onto WORKLIST.  */
+
+static void
+mark_aliases_call_clobbered (tree tag, VEC (tree, heap) **worklist,
+ VEC (int, heap) **worklist2)
+{
+ unsigned int i;
+ VEC (tree, gc) *ma;
+ tree entry;
+ var_ann_t ta = var_ann (tag);
+
+ if (!MTAG_P (tag))
+ return;
+ ma = may_aliases (tag);
+ if (!ma)
+ return;
+
+ for (i = 0; VEC_iterate (tree, ma, i, entry); i++)
+ {
+ if (!unmodifiable_var_p (entry))
+ {
+ add_to_worklist (entry, worklist, worklist2, ta->escape_mask);
+ mark_call_clobbered (entry, ta->escape_mask);
+ }
+ }
+}
+
+/* Tags containing global vars need to be marked as global.
+ Tags containing call clobbered vars need to be marked as call
+ clobbered. */
+
+static void
+compute_tag_properties (void)
+{
+ referenced_var_iterator rvi;
+ tree tag;
+ bool changed = true;
+ VEC (tree, heap) *taglist = NULL;
+
+ FOR_EACH_REFERENCED_VAR (tag, rvi)
+ {
+ if (!MTAG_P (tag) || TREE_CODE (tag) == STRUCT_FIELD_TAG)
+ continue;
+ VEC_safe_push (tree, heap, taglist, tag);
+ }
+
+ /* We sort the taglist by DECL_UID, for two reasons.
+ 1. To get a sequential ordering to make the bitmap accesses
+ faster.
+ 2. Because of the way we compute aliases, it's more likely that
+ an earlier tag is included in a later tag, and this will reduce
+ the number of iterations.
+
+ If we had a real tag graph, we would just topo-order it and be
+ done with it. */
+ qsort (VEC_address (tree, taglist),
+ VEC_length (tree, taglist),
+ sizeof (tree),
+ sort_tags_by_id);
+
+ /* Go through each tag not marked as global, and if it aliases
+ global vars, mark it global.
+
+ If the tag contains call clobbered vars, mark it call
+ clobbered.
+
+ This loop iterates because tags may appear in the may-aliases
+ list of other tags when we group. */
+
+ while (changed)
+ {
+ unsigned int k;
+
+ changed = false;
+ for (k = 0; VEC_iterate (tree, taglist, k, tag); k++)
+ {
+ VEC (tree, gc) *ma;
+ unsigned int i;
+ tree entry;
+ bool tagcc = is_call_clobbered (tag);
+ bool tagglobal = MTAG_GLOBAL (tag);
+
+ if (tagcc && tagglobal)
+ continue;
+
+ ma = may_aliases (tag);
+ if (!ma)
+ continue;
+
+ for (i = 0; VEC_iterate (tree, ma, i, entry); i++)
+ {
+ /* Call clobbered entries cause the tag to be marked
+ call clobbered. */
+ if (!tagcc && is_call_clobbered (entry))
+ {
+ mark_call_clobbered (tag, var_ann (entry)->escape_mask);
+ tagcc = true;
+ changed = true;
+ }
+
+ /* Global vars cause the tag to be marked global. */
+ if (!tagglobal && is_global_var (entry))
+ {
+ MTAG_GLOBAL (tag) = true;
+ changed = true;
+ tagglobal = true;
+ }
+
+ /* Early exit once both global and cc are set, since the
+ loop can't do any more than that. */
+ if (tagcc && tagglobal)
+ break;
+ }
+ }
+ }
+ VEC_free (tree, heap, taglist);
+}
+
+/* Set up the initial variable clobbers and globalness.
+ When this function completes, only tags whose aliases need to be
+ clobbered will be set clobbered. Tags clobbered because they
+ contain call clobbered vars are handled in compute_tag_properties. */
+
+static void
+set_initial_properties (struct alias_info *ai)
+{
+ unsigned int i;
+ referenced_var_iterator rvi;
+ tree var;
+
+ FOR_EACH_REFERENCED_VAR (var, rvi)
+ {
+ if (is_global_var (var)
+ && (!var_can_have_subvars (var)
+ || get_subvars_for_var (var) == NULL))
+ {
+ if (!unmodifiable_var_p (var))
+ mark_call_clobbered (var, ESCAPE_IS_GLOBAL);
+ }
+ else if (TREE_CODE (var) == PARM_DECL
+ && default_def (var)
+ && POINTER_TYPE_P (TREE_TYPE (var)))
+ {
+ tree def = default_def (var);
+ get_ptr_info (def)->value_escapes_p = 1;
+ get_ptr_info (def)->escape_mask |= ESCAPE_IS_PARM;
+ }
+ }
+
+ for (i = 0; i < VARRAY_ACTIVE_SIZE (ai->processed_ptrs); i++)
+ {
+ tree ptr = VARRAY_TREE (ai->processed_ptrs, i);
+ struct ptr_info_def *pi = SSA_NAME_PTR_INFO (ptr);
+ var_ann_t v_ann = var_ann (SSA_NAME_VAR (ptr));
+
+ if (pi->value_escapes_p)
+ {
+ /* If PTR escapes then its associated memory tags and
+ pointed-to variables are call-clobbered. */
+ if (pi->name_mem_tag)
+ mark_call_clobbered (pi->name_mem_tag, pi->escape_mask);
+
+ if (v_ann->type_mem_tag)
+ mark_call_clobbered (v_ann->type_mem_tag, pi->escape_mask);
+
+ if (pi->pt_vars)
+ {
+ bitmap_iterator bi;
+ unsigned int j;
+ EXECUTE_IF_SET_IN_BITMAP (pi->pt_vars, 0, j, bi)
+ if (!unmodifiable_var_p (referenced_var (j)))
+ mark_call_clobbered (referenced_var (j), pi->escape_mask);
+ }
+ }
+ /* If the name tag is call clobbered, so is the type tag
+ associated with the base VAR_DECL. */
+ if (pi->name_mem_tag
+ && v_ann->type_mem_tag
+ && is_call_clobbered (pi->name_mem_tag))
+ mark_call_clobbered (v_ann->type_mem_tag, pi->escape_mask);
+
+      /* Name tags and type tags whose points-to sets are unknown might
+	 point to global memory, and thus are clobbered.
+
+ FIXME: This is not quite right. They should only be
+ clobbered if value_escapes_p is true, regardless of whether
+ they point to global memory or not.
+ So removing this code and fixing all the bugs would be nice.
+ It is the cause of a bunch of clobbering. */
+ if ((pi->pt_global_mem || pi->pt_anything)
+ && pi->is_dereferenced && pi->name_mem_tag)
+ {
+ mark_call_clobbered (pi->name_mem_tag, ESCAPE_IS_GLOBAL);
+ MTAG_GLOBAL (pi->name_mem_tag) = true;
+ }
+
+ if ((pi->pt_global_mem || pi->pt_anything)
+ && pi->is_dereferenced && v_ann->type_mem_tag)
+ {
+ mark_call_clobbered (v_ann->type_mem_tag, ESCAPE_IS_GLOBAL);
+ MTAG_GLOBAL (v_ann->type_mem_tag) = true;
+ }
+ }
+}
+
+/* Compute which variables need to be marked call clobbered because
+ their tag is call clobbered, and which tags need to be marked
+ global because they contain global variables. */
+
+static void
+compute_call_clobbered (struct alias_info *ai)
+{
+ VEC (tree, heap) *worklist = NULL;
+ VEC(int,heap) *worklist2 = NULL;
+
+ set_initial_properties (ai);
+ init_transitive_clobber_worklist (&worklist, &worklist2);
+ while (VEC_length (tree, worklist) != 0)
+ {
+ tree curr = VEC_pop (tree, worklist);
+ int reason = VEC_pop (int, worklist2);
+
+ mark_call_clobbered (curr, reason);
+ mark_aliases_call_clobbered (curr, &worklist, &worklist2);
+ }
+ VEC_free (tree, heap, worklist);
+ VEC_free (int, heap, worklist2);
+ compute_tag_properties ();
+}
/* Compute may-alias information for every variable referenced in function
FNDECL.
@@ -277,6 +558,13 @@ compute_may_aliases (void)
memory tags. */
compute_flow_insensitive_aliasing (ai);
+ /* Determine if we need to enable alias grouping. */
+ if (ai->total_alias_vops >= MAX_ALIASED_VOPS)
+ group_aliases (ai);
+
+ /* Compute call clobbering information. */
+ compute_call_clobbered (ai);
+
/* If the program has too many call-clobbered variables and/or function
calls, create .GLOBAL_VAR and use it to model call-clobbering
semantics at call sites. This reduces the number of virtual operands
@@ -703,20 +991,6 @@ compute_flow_sensitive_aliasing (struct alias_info *ai)
var_ann_t v_ann = var_ann (SSA_NAME_VAR (ptr));
bitmap_iterator bi;
- if (pi->value_escapes_p || pi->pt_anything)
- {
- /* If PTR escapes or may point to anything, then its associated
- memory tags and pointed-to variables are call-clobbered. */
- if (pi->name_mem_tag)
- mark_call_clobbered (pi->name_mem_tag);
-
- if (v_ann->type_mem_tag)
- mark_call_clobbered (v_ann->type_mem_tag);
-
- if (pi->pt_vars)
- EXECUTE_IF_SET_IN_BITMAP (pi->pt_vars, 0, j, bi)
- mark_call_clobbered (referenced_var (j));
- }
/* Set up aliasing information for PTR's name memory tag (if it has
one). Note that only pointers that have been dereferenced will
@@ -727,13 +1001,6 @@ compute_flow_sensitive_aliasing (struct alias_info *ai)
add_may_alias (pi->name_mem_tag, referenced_var (j));
add_may_alias (v_ann->type_mem_tag, referenced_var (j));
}
-
- /* If the name tag is call clobbered, so is the type tag
- associated with the base VAR_DECL. */
- if (pi->name_mem_tag
- && v_ann->type_mem_tag
- && is_call_clobbered (pi->name_mem_tag))
- mark_call_clobbered (v_ann->type_mem_tag);
}
}
@@ -897,10 +1164,6 @@ compute_flow_insensitive_aliasing (struct alias_info *ai)
fprintf (dump_file, "\n%s: Total number of aliased vops: %ld\n",
get_name (current_function_decl),
ai->total_alias_vops);
-
- /* Determine if we need to enable alias grouping. */
- if (ai->total_alias_vops >= MAX_ALIASED_VOPS)
- group_aliases (ai);
}
@@ -1308,12 +1571,6 @@ setup_pointers_and_addressables (struct alias_info *ai)
if (bitmap_bit_p (ai->dereferenced_ptrs_store, DECL_UID (var)))
bitmap_set_bit (ai->written_vars, DECL_UID (tag));
- /* If pointer VAR is a global variable or a PARM_DECL,
- then its memory tag should be considered a global
- variable. */
- if (TREE_CODE (var) == PARM_DECL || is_global_var (var))
- mark_call_clobbered (tag);
-
/* All the dereferences of pointer VAR count as
references of TAG. Since TAG can be associated with
several pointers, add the dereferences of VAR to the
@@ -1598,16 +1855,6 @@ add_may_alias (tree var, tree alias)
if (alias == al)
return;
- /* If VAR is a call-clobbered variable, so is its new ALIAS.
- FIXME, call-clobbering should only depend on whether an address
- escapes. It should be independent of aliasing. */
- if (is_call_clobbered (var))
- mark_call_clobbered (alias);
-
- /* Likewise. If ALIAS is call-clobbered, so is VAR. */
- else if (is_call_clobbered (alias))
- mark_call_clobbered (var);
-
VEC_safe_push (tree, gc, v_ann->may_aliases, alias);
a_ann->is_alias_tag = 1;
}
@@ -1620,16 +1867,6 @@ replace_may_alias (tree var, size_t i, tree new_alias)
{
var_ann_t v_ann = var_ann (var);
VEC_replace (tree, v_ann->may_aliases, i, new_alias);
-
- /* If VAR is a call-clobbered variable, so is NEW_ALIAS.
- FIXME, call-clobbering should only depend on whether an address
- escapes. It should be independent of aliasing. */
- if (is_call_clobbered (var))
- mark_call_clobbered (new_alias);
-
- /* Likewise. If NEW_ALIAS is call-clobbered, so is VAR. */
- else if (is_call_clobbered (new_alias))
- mark_call_clobbered (var);
}
@@ -1663,9 +1900,12 @@ set_pt_anything (tree ptr)
3- STMT is an assignment to a non-local variable, or
4- STMT is a return statement.
- AI points to the alias information collected so far. */
+ AI points to the alias information collected so far.
-bool
+ Return the type of escape site found, if we found one, or NO_ESCAPE
+ if none. */
+
+enum escape_type
is_escape_site (tree stmt, struct alias_info *ai)
{
tree call = get_call_expr_in (stmt);
@@ -1674,12 +1914,15 @@ is_escape_site (tree stmt, struct alias_info *ai)
ai->num_calls_found++;
if (!TREE_SIDE_EFFECTS (call))
- ai->num_pure_const_calls_found++;
+ {
+ ai->num_pure_const_calls_found++;
+ return ESCAPE_TO_PURE_CONST;
+ }
- return true;
+ return ESCAPE_TO_CALL;
}
else if (TREE_CODE (stmt) == ASM_EXPR)
- return true;
+ return ESCAPE_TO_ASM;
else if (TREE_CODE (stmt) == MODIFY_EXPR)
{
tree lhs = TREE_OPERAND (stmt, 0);
@@ -1691,7 +1934,7 @@ is_escape_site (tree stmt, struct alias_info *ai)
/* If we couldn't recognize the LHS of the assignment, assume that it
is a non-local store. */
if (lhs == NULL_TREE)
- return true;
+ return ESCAPE_UNKNOWN;
/* If the RHS is a conversion between a pointer and an integer, the
pointer escapes since we can't track the integer. */
@@ -1701,12 +1944,12 @@ is_escape_site (tree stmt, struct alias_info *ai)
&& POINTER_TYPE_P (TREE_TYPE (TREE_OPERAND
(TREE_OPERAND (stmt, 1), 0)))
&& !POINTER_TYPE_P (TREE_TYPE (TREE_OPERAND (stmt, 1))))
- return true;
+ return ESCAPE_BAD_CAST;
/* If the LHS is an SSA name, it can't possibly represent a non-local
memory store. */
if (TREE_CODE (lhs) == SSA_NAME)
- return false;
+ return NO_ESCAPE;
/* FIXME: LHS is not an SSA_NAME. Even if it's an assignment to a
local variables we cannot be sure if it will escape, because we
@@ -1717,12 +1960,12 @@ is_escape_site (tree stmt, struct alias_info *ai)
Midkiff, ``Escape analysis for java,'' in Proceedings of the
Conference on Object-Oriented Programming Systems, Languages, and
Applications (OOPSLA), pp. 1-19, 1999. */
- return true;
+ return ESCAPE_STORED_IN_GLOBAL;
}
else if (TREE_CODE (stmt) == RETURN_EXPR)
- return true;
+ return ESCAPE_TO_RETURN;
- return false;
+ return NO_ESCAPE;
}
/* Create a new memory tag of type TYPE.
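[Editorial note, not part of the patch] is_escape_site above now reports *why*
a pointer escapes instead of a plain yes/no, and update_alias_info in
tree-ssa-structalias.c below accumulates those reasons with
"pi->escape_mask |= stmt_escape_type".  A minimal stand-alone sketch of that
usage, assuming the escape_type enumerators are distinct bit flags (the names
come from the patch; the bit values are illustrative):

    enum escape_type
    {
      NO_ESCAPE               = 0,
      ESCAPE_STORED_IN_GLOBAL = 1 << 0,
      ESCAPE_TO_ASM           = 1 << 1,
      ESCAPE_TO_CALL          = 1 << 2,
      ESCAPE_BAD_CAST         = 1 << 3,
      ESCAPE_TO_RETURN        = 1 << 4,
      ESCAPE_TO_PURE_CONST    = 1 << 5,
      ESCAPE_UNKNOWN          = 1 << 6
    };

    /* Accumulate one reason per escape site found for the pointer.  */
    static unsigned int
    collect_escape_mask (void)
    {
      unsigned int escape_mask = NO_ESCAPE;
      escape_mask |= ESCAPE_TO_CALL;        /* passed to a regular call  */
      escape_mask |= ESCAPE_TO_PURE_CONST;  /* also seen by a pure call  */
      return escape_mask;
    }

    /* Pointers escaping only to pure/const calls can then be treated
       specially, as the rewritten add_call_clobber_ops does below.  */
    static int
    escapes_only_to_pure_const (unsigned int escape_mask)
    {
      return (escape_mask & ~(unsigned int) ESCAPE_TO_PURE_CONST) == 0;
    }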
@@ -1793,13 +2036,6 @@ get_nmt_for (tree ptr)
if (tag == NULL_TREE)
tag = create_memory_tag (TREE_TYPE (TREE_TYPE (ptr)), false);
-
- /* If PTR is a PARM_DECL, it points to a global variable or malloc,
- then its name tag should be considered a global variable. */
- if (TREE_CODE (SSA_NAME_VAR (ptr)) == PARM_DECL
- || pi->pt_global_mem)
- mark_call_clobbered (tag);
-
return tag;
}
@@ -1896,6 +2132,8 @@ create_global_var (void)
TREE_THIS_VOLATILE (global_var) = 0;
TREE_ADDRESSABLE (global_var) = 0;
+ create_var_ann (global_var);
+ mark_call_clobbered (global_var, ESCAPE_UNKNOWN);
add_referenced_tmp_var (global_var);
mark_sym_for_renaming (global_var);
}
diff --git a/gcc/tree-ssa-operands.c b/gcc/tree-ssa-operands.c
index 87a1fc6eb1d..fbee0b915e1 100644
--- a/gcc/tree-ssa-operands.c
+++ b/gcc/tree-ssa-operands.c
@@ -1,5 +1,5 @@
/* SSA operands management for trees.
- Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -119,14 +119,8 @@ static VEC(tree,heap) *build_vuses;
/* Array for building all the v_must_def operands. */
static VEC(tree,heap) *build_v_must_defs;
-/* True if the operands for call clobbered vars are cached and valid. */
-bool ssa_call_clobbered_cache_valid;
-bool ssa_ro_call_cache_valid;
/* These arrays are the cached operand vectors for call clobbered calls. */
-static VEC(tree,heap) *clobbered_v_may_defs;
-static VEC(tree,heap) *clobbered_vuses;
-static VEC(tree,heap) *ro_call_vuses;
static bool ops_active = false;
static GTY (()) struct ssa_operand_memory_d *operand_memory = NULL;
@@ -142,7 +136,7 @@ static inline void append_use (tree *);
static void append_v_may_def (tree);
static void append_v_must_def (tree);
static void add_call_clobber_ops (tree, tree);
-static void add_call_read_ops (tree);
+static void add_call_read_ops (tree, tree);
static void add_stmt_operand (tree *, stmt_ann_t, int);
static void build_ssa_operands (tree stmt);
@@ -220,7 +214,34 @@ ssa_operands_active (void)
return ops_active;
}
+/* Structure storing statistics on how many call clobbers we have, and
+   how many were avoided.  */
+static struct
+{
+ /* Number of call-clobbered ops we attempt to add to calls in
+ add_call_clobber_ops. */
+ unsigned int clobbered_vars;
+
+ /* Number of write-clobbers (v_may_defs) avoided by using
+ not_written information. */
+ unsigned int static_write_clobbers_avoided;
+
+ /* Number of reads (vuses) avoided by using not_read
+ information. */
+ unsigned int static_read_clobbers_avoided;
+
+ /* Number of write-clobbers avoided because the variable can't escape to
+ this call. */
+ unsigned int unescapable_clobbers_avoided;
+ /* Number of readonly uses we attempt to add to calls in
+ add_call_read_ops. */
+ unsigned int readonly_clobbers;
+
+ /* Number of readonly uses we avoid using not_read information. */
+ unsigned int static_readonly_clobbers_avoided;
+} clobber_stats;
+
/* Initialize the operand cache routines. */
void
@@ -235,6 +256,8 @@ init_ssa_operands (void)
gcc_assert (operand_memory == NULL);
operand_memory_index = SSA_OPERAND_MEMORY_SIZE;
ops_active = true;
+ memset (&clobber_stats, 0, sizeof (clobber_stats));
+
}
@@ -260,10 +283,17 @@ fini_ssa_operands (void)
ggc_free (ptr);
}
- VEC_free (tree, heap, clobbered_v_may_defs);
- VEC_free (tree, heap, clobbered_vuses);
- VEC_free (tree, heap, ro_call_vuses);
ops_active = false;
+
+ if (dump_file && (dump_flags & TDF_STATS))
+ {
+ fprintf (dump_file, "Original clobbered vars:%d\n", clobber_stats.clobbered_vars);
+ fprintf (dump_file, "Static write clobbers avoided:%d\n", clobber_stats.static_write_clobbers_avoided);
+ fprintf (dump_file, "Static read clobbers avoided:%d\n", clobber_stats.static_read_clobbers_avoided);
+ fprintf (dump_file, "Unescapable clobbers avoided:%d\n", clobber_stats.unescapable_clobbers_avoided);
+ fprintf (dump_file, "Original readonly clobbers:%d\n", clobber_stats.readonly_clobbers);
+ fprintf (dump_file, "Static readonly clobbers avoided:%d\n", clobber_stats.static_readonly_clobbers_avoided);
+ }
}
@@ -1243,6 +1273,7 @@ get_expr_operands (tree stmt, tree *expr_p, int flags)
return;
}
+ case DOT_PROD_EXPR:
case REALIGN_LOAD_EXPR:
{
get_expr_operands (stmt, &TREE_OPERAND (expr, 0), flags);
@@ -1257,6 +1288,14 @@ get_expr_operands (tree stmt, tree *expr_p, int flags)
case FILTER_EXPR:
case LABEL_DECL:
case CONST_DECL:
+ case OMP_PARALLEL:
+ case OMP_SECTIONS:
+ case OMP_FOR:
+ case OMP_RETURN_EXPR:
+ case OMP_SINGLE:
+ case OMP_MASTER:
+ case OMP_ORDERED:
+ case OMP_CRITICAL:
/* Expressions that make no memory references. */
return;
@@ -1528,7 +1567,7 @@ get_call_expr_operands (tree stmt, tree expr)
&& !(call_flags & (ECF_PURE | ECF_CONST | ECF_NORETURN)))
add_call_clobber_ops (stmt, get_callee_fndecl (expr));
else if (!(call_flags & ECF_CONST))
- add_call_read_ops (stmt);
+ add_call_read_ops (stmt, get_callee_fndecl (expr));
}
/* Find uses in the called function. */
@@ -1715,7 +1754,6 @@ add_to_addressable_set (tree ref, bitmap *addresses_taken)
}
}
-
/* Add clobbering definitions for .GLOBAL_VAR or for each of the call
clobbered variables in the function. */
@@ -1723,12 +1761,10 @@ static void
add_call_clobber_ops (tree stmt, tree callee)
{
unsigned u;
- tree t;
bitmap_iterator bi;
stmt_ann_t s_ann = stmt_ann (stmt);
- struct stmt_ann_d empty_ann;
bitmap not_read_b, not_written_b;
-
+
/* Functions that are not const, pure or never return may clobber
call-clobbered variables. */
if (s_ann)
@@ -1742,100 +1778,67 @@ add_call_clobber_ops (tree stmt, tree callee)
return;
}
- /* FIXME - if we have better information from the static vars
- analysis, we need to make the cache call site specific. This way
- we can have the performance benefits even if we are doing good
- optimization. */
-
/* Get info for local and module level statics. There is a bit
set for each static if the call being processed does not read
or write that variable. */
not_read_b = callee ? ipa_reference_get_not_read_global (callee) : NULL;
not_written_b = callee ? ipa_reference_get_not_written_global (callee) : NULL;
-
- /* If cache is valid, copy the elements into the build vectors. */
- if (ssa_call_clobbered_cache_valid
- && (!not_read_b || bitmap_empty_p (not_read_b))
- && (!not_written_b || bitmap_empty_p (not_written_b)))
- {
- for (u = 0 ; u < VEC_length (tree, clobbered_vuses); u++)
- {
- t = VEC_index (tree, clobbered_vuses, u);
- gcc_assert (TREE_CODE (t) != SSA_NAME);
- var_ann (t)->in_vuse_list = 1;
- VEC_safe_push (tree, heap, build_vuses, (tree)t);
- }
- for (u = 0; u < VEC_length (tree, clobbered_v_may_defs); u++)
- {
- t = VEC_index (tree, clobbered_v_may_defs, u);
- gcc_assert (TREE_CODE (t) != SSA_NAME);
- var_ann (t)->in_v_may_def_list = 1;
- VEC_safe_push (tree, heap, build_v_may_defs, (tree)t);
- }
- return;
- }
-
- memset (&empty_ann, 0, sizeof (struct stmt_ann_d));
-
/* Add a V_MAY_DEF operand for every call clobbered variable. */
EXECUTE_IF_SET_IN_BITMAP (call_clobbered_vars, 0, u, bi)
{
- tree var = referenced_var (u);
- unsigned int uid = u;
+ tree var = referenced_var_lookup (u);
+ unsigned int escape_mask = var_ann (var)->escape_mask;
+ tree real_var = var;
+ bool not_read;
+ bool not_written;
+
+ /* Not read and not written are computed on regular vars, not
+ subvars, so look at the parent var if this is an SFT. */
- if (unmodifiable_var_p (var))
- add_stmt_operand (&var, &empty_ann, opf_none);
- else
+ if (TREE_CODE (var) == STRUCT_FIELD_TAG)
+ real_var = SFT_PARENT_VAR (var);
+
+ not_read = not_read_b ? bitmap_bit_p (not_read_b,
+ DECL_UID (real_var)) : false;
+ not_written = not_written_b ? bitmap_bit_p (not_written_b,
+ DECL_UID (real_var)) : false;
+ gcc_assert (!unmodifiable_var_p (var));
+
+ clobber_stats.clobbered_vars++;
+
+ /* See if this variable is really clobbered by this function. */
+
+ /* Trivial case: Things escaping only to pure/const are not
+ clobbered by non-pure-const, and only read by pure/const. */
+ if ((escape_mask & ~(ESCAPE_TO_PURE_CONST)) == 0)
{
- bool not_read;
- bool not_written;
-
- /* Not read and not written are computed on regular vars, not
- subvars, so look at the parent var if this is an SFT. */
-
- if (TREE_CODE (var) == STRUCT_FIELD_TAG)
- uid = DECL_UID (SFT_PARENT_VAR (var));
-
- not_read =
- not_read_b ? bitmap_bit_p (not_read_b, uid) : false;
- not_written =
- not_written_b ? bitmap_bit_p (not_written_b, uid) : false;
-
- if (not_written)
+ tree call = get_call_expr_in (stmt);
+ if (call_expr_flags (call) & (ECF_CONST | ECF_PURE))
{
- if (!not_read)
- add_stmt_operand (&var, &empty_ann, opf_none);
+ add_stmt_operand (&var, s_ann, opf_none);
+ clobber_stats.unescapable_clobbers_avoided++;
+ continue;
}
else
- add_stmt_operand (&var, &empty_ann, opf_is_def);
+ {
+ clobber_stats.unescapable_clobbers_avoided++;
+ continue;
+ }
}
+
+ if (not_written)
+ {
+ clobber_stats.static_write_clobbers_avoided++;
+ if (!not_read)
+ add_stmt_operand (&var, s_ann, opf_none);
+ else
+ clobber_stats.static_read_clobbers_avoided++;
+ }
+ else
+ add_stmt_operand (&var, s_ann, opf_is_def);
}
-
- if ((!not_read_b || bitmap_empty_p (not_read_b))
- && (!not_written_b || bitmap_empty_p (not_written_b)))
- {
- /* Prepare empty cache vectors. */
- VEC_truncate (tree, clobbered_vuses, 0);
- VEC_truncate (tree, clobbered_v_may_defs, 0);
-
- /* Now fill the clobbered cache with the values that have been found. */
- for (u = 0; u < VEC_length (tree, build_vuses); u++)
- VEC_safe_push (tree, heap, clobbered_vuses,
- VEC_index (tree, build_vuses, u));
-
- gcc_assert (VEC_length (tree, build_vuses)
- == VEC_length (tree, clobbered_vuses));
-
- for (u = 0; u < VEC_length (tree, build_v_may_defs); u++)
- VEC_safe_push (tree, heap, clobbered_v_may_defs,
- VEC_index (tree, build_v_may_defs, u));
-
- gcc_assert (VEC_length (tree, build_v_may_defs)
- == VEC_length (tree, clobbered_v_may_defs));
-
- ssa_call_clobbered_cache_valid = true;
- }
+
}
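[Editorial note, not part of the patch] The loop above chooses, for each
call-clobbered variable at a call site, between adding nothing, a VUSE, or a
V_MAY_DEF.  A stand-alone restatement of that decision under the same inputs
the patch computes (the action names and the helper are illustrative, not
GCC API):

    enum clobber_action { WANT_NOTHING, WANT_VUSE, WANT_V_MAY_DEF };

    static enum clobber_action
    call_clobber_action (int escapes_only_to_pure_const,
                         int callee_is_pure_const,
                         int not_read, int not_written)
    {
      /* A variable escaping only to pure/const calls is at most read by a
         pure/const callee and untouched by any other callee.  */
      if (escapes_only_to_pure_const)
        return callee_is_pure_const ? WANT_VUSE : WANT_NOTHING;

      /* IPA reference info says the callee never writes this variable.  */
      if (not_written)
        return not_read ? WANT_NOTHING : WANT_VUSE;

      /* Otherwise assume the call may store into it.  */
      return WANT_V_MAY_DEF;
    }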
@@ -1843,13 +1846,12 @@ add_call_clobber_ops (tree stmt, tree callee)
function. */
static void
-add_call_read_ops (tree stmt)
+add_call_read_ops (tree stmt, tree callee)
{
unsigned u;
- tree t;
bitmap_iterator bi;
stmt_ann_t s_ann = stmt_ann (stmt);
- struct stmt_ann_d empty_ann;
+ bitmap not_read_b;
/* if the function is not pure, it may reference memory. Add
a VUSE for .GLOBAL_VAR if it has been created. See add_referenced_var
@@ -1860,40 +1862,34 @@ add_call_read_ops (tree stmt)
return;
}
- /* If cache is valid, copy the elements into the build vector. */
- if (ssa_ro_call_cache_valid)
- {
- for (u = 0; u < VEC_length (tree, ro_call_vuses); u++)
- {
- t = VEC_index (tree, ro_call_vuses, u);
- gcc_assert (TREE_CODE (t) != SSA_NAME);
- var_ann (t)->in_vuse_list = 1;
- VEC_safe_push (tree, heap, build_vuses, (tree)t);
- }
- return;
- }
-
- memset (&empty_ann, 0, sizeof (struct stmt_ann_d));
+ not_read_b = callee ? ipa_reference_get_not_read_global (callee) : NULL;
/* Add a VUSE for each call-clobbered variable. */
EXECUTE_IF_SET_IN_BITMAP (call_clobbered_vars, 0, u, bi)
{
tree var = referenced_var (u);
- add_stmt_operand (&var, &empty_ann, opf_none | opf_non_specific);
- }
-
- /* Prepare empty cache vectors. */
- VEC_truncate (tree, ro_call_vuses, 0);
+ tree real_var = var;
+ bool not_read;
+
+ clobber_stats.readonly_clobbers++;
- /* Now fill the clobbered cache with the values that have been found. */
- for (u = 0; u < VEC_length (tree, build_vuses); u++)
- VEC_safe_push (tree, heap, ro_call_vuses,
- VEC_index (tree, build_vuses, u));
+ /* Not read and not written are computed on regular vars, not
+ subvars, so look at the parent var if this is an SFT. */
- gcc_assert (VEC_length (tree, build_vuses)
- == VEC_length (tree, ro_call_vuses));
+ if (TREE_CODE (var) == STRUCT_FIELD_TAG)
+ real_var = SFT_PARENT_VAR (var);
- ssa_ro_call_cache_valid = true;
+ not_read = not_read_b ? bitmap_bit_p (not_read_b,
+ DECL_UID (real_var)) : false;
+
+ if (not_read)
+ {
+ clobber_stats.static_readonly_clobbers_avoided++;
+ continue;
+ }
+
+ add_stmt_operand (&var, s_ann, opf_none | opf_non_specific);
+ }
}
diff --git a/gcc/tree-ssa-operands.h b/gcc/tree-ssa-operands.h
index c1ec3650ee1..daf2dce05f7 100644
--- a/gcc/tree-ssa-operands.h
+++ b/gcc/tree-ssa-operands.h
@@ -165,9 +165,6 @@ extern void dump_immediate_uses_for (FILE *file, tree var);
extern void debug_immediate_uses (void);
extern void debug_immediate_uses_for (tree var);
-extern bool ssa_call_clobbered_cache_valid;
-extern bool ssa_ro_call_cache_valid;
-
extern bool ssa_operands_active (void);
extern void add_to_addressable_set (tree, bitmap *);
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 90bd037e313..923bdb77b3c 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -2953,7 +2953,7 @@ update_alias_info (tree stmt, struct alias_info *ai)
bitmap addr_taken;
use_operand_p use_p;
ssa_op_iter iter;
- bool stmt_escapes_p = is_escape_site (stmt, ai);
+ enum escape_type stmt_escape_type = is_escape_site (stmt, ai);
tree op;
/* Mark all the variables whose address are taken by the statement. */
@@ -2964,13 +2964,17 @@ update_alias_info (tree stmt, struct alias_info *ai)
/* If STMT is an escape point, all the addresses taken by it are
call-clobbered. */
- if (stmt_escapes_p)
+ if (stmt_escape_type != NO_ESCAPE)
{
bitmap_iterator bi;
unsigned i;
EXECUTE_IF_SET_IN_BITMAP (addr_taken, 0, i, bi)
- mark_call_clobbered (referenced_var (i));
+ {
+ tree rvar = referenced_var (i);
+ if (!unmodifiable_var_p (rvar))
+ mark_call_clobbered (rvar, stmt_escape_type);
+ }
}
}
@@ -3094,13 +3098,14 @@ update_alias_info (tree stmt, struct alias_info *ai)
bitmap_set_bit (ai->dereferenced_ptrs_load, DECL_UID (var));
}
- if (stmt_escapes_p && num_derefs < num_uses)
+ if (stmt_escape_type != NO_ESCAPE && num_derefs < num_uses)
{
/* If STMT is an escape point and STMT contains at
least one direct use of OP, then the value of OP
escapes and so the pointed-to variables need to
be marked call-clobbered. */
pi->value_escapes_p = 1;
+ pi->escape_mask |= stmt_escape_type;
/* If the statement makes a function call, assume
that pointer OP will be dereferenced in a store
diff --git a/gcc/tree-ssa-structalias.h b/gcc/tree-ssa-structalias.h
index ddabd6d1799..bc129dde134 100644
--- a/gcc/tree-ssa-structalias.h
+++ b/gcc/tree-ssa-structalias.h
@@ -80,7 +80,7 @@ struct alias_info
#define NUM_REFERENCES_SET(ANN, VAL) (ANN)->common.aux = (void*) ((void *)(VAL))
/* In tree-ssa-alias.c. */
-bool is_escape_site (tree, struct alias_info *);
+enum escape_type is_escape_site (tree, struct alias_info *);
/* In tree-ssa-structalias.c. */
extern void compute_points_to_sets (struct alias_info *);
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index ab749fba34a..c5882d42afb 100644
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -1,5 +1,5 @@
/* Analysis Utilities for Loop Vectorization.
- Copyright (C) 2003,2004,2005 Free Software Foundation, Inc.
+ Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -142,35 +142,46 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
return false;
}
- if (STMT_VINFO_DATA_REF (stmt_info))
- scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
- else if (TREE_CODE (stmt) == MODIFY_EXPR)
- scalar_type = TREE_TYPE (TREE_OPERAND (stmt, 0));
- else
- scalar_type = TREE_TYPE (stmt);
+ if (STMT_VINFO_VECTYPE (stmt_info))
+ {
+ vectype = STMT_VINFO_VECTYPE (stmt_info);
+ scalar_type = TREE_TYPE (vectype);
+ }
+ else
+ {
+ if (STMT_VINFO_DATA_REF (stmt_info))
+ scalar_type =
+ TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
+ else if (TREE_CODE (stmt) == MODIFY_EXPR)
+ scalar_type = TREE_TYPE (TREE_OPERAND (stmt, 0));
+ else
+ scalar_type = TREE_TYPE (stmt);
- if (vect_print_dump_info (REPORT_DETAILS))
- {
- fprintf (vect_dump, "get vectype for scalar type: ");
- print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
- }
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "get vectype for scalar type: ");
+ print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+ }
- vectype = get_vectype_for_scalar_type (scalar_type);
- if (!vectype)
- {
- if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
- {
- fprintf (vect_dump, "not vectorized: unsupported data-type ");
- print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
- }
- return false;
+ vectype = get_vectype_for_scalar_type (scalar_type);
+ if (!vectype)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+ {
+ fprintf (vect_dump,
+ "not vectorized: unsupported data-type ");
+ print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+ }
+ return false;
+ }
+ STMT_VINFO_VECTYPE (stmt_info) = vectype;
}
+
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "vectype: ");
print_generic_expr (vect_dump, vectype, TDF_SLIM);
}
- STMT_VINFO_VECTYPE (stmt_info) = vectype;
nunits = TYPE_VECTOR_SUBPARTS (vectype);
if (vect_print_dump_info (REPORT_DETAILS))
@@ -1439,6 +1450,24 @@ vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt,
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "mark relevant %d, live %d.",relevant_p, live_p);
+ if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+ {
+ tree pattern_stmt;
+
+ /* This is the last stmt in a sequence that was detected as a
+ pattern that can potentially be vectorized. Don't mark the stmt
+         as relevant/live because it's not going to be vectorized.
+ Instead mark the pattern-stmt that replaces it. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
+ pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ stmt_info = vinfo_for_stmt (pattern_stmt);
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
+ save_relevant_p = STMT_VINFO_RELEVANT_P (stmt_info);
+ save_live_p = STMT_VINFO_LIVE_P (stmt_info);
+ stmt = pattern_stmt;
+ }
+
STMT_VINFO_LIVE_P (stmt_info) |= live_p;
STMT_VINFO_RELEVANT_P (stmt_info) |= relevant_p;
@@ -2002,6 +2031,8 @@ vect_analyze_loop (struct loop *loop)
vect_analyze_scalar_cycles (loop_vinfo);
+ vect_pattern_recog (loop_vinfo);
+
/* Data-flow analysis to detect stmts that do not need to be vectorized. */
ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
index cc834e4f231..dd58cb914e8 100644
--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@@ -1,5 +1,5 @@
/* Lower vector operations to scalar operations.
- Copyright (C) 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -411,6 +411,11 @@ expand_vector_operations_1 (block_stmt_iterator *bsi)
gcc_assert (code != CONVERT_EXPR);
op = optab_for_tree_code (code, type);
+  /* For widening vector operations, the relevant type is that of the arguments,
+ not the widened result. */
+ if (code == WIDEN_SUM_EXPR)
+ type = TREE_TYPE (TREE_OPERAND (rhs, 0));
+
/* Optabs will try converting a negation into a subtraction, so
look for it as well. TODO: negation of floating-point vectors
might be turned into an exclusive OR toggling the sign bit. */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
new file mode 100644
index 00000000000..006965cb99e
--- /dev/null
+++ b/gcc/tree-vect-patterns.c
@@ -0,0 +1,637 @@
+/* Analysis Utilities for Loop Vectorization.
+ Copyright (C) 2006 Free Software Foundation, Inc.
+ Contributed by Dorit Nuzman <dorit@il.ibm.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING. If not, write to the Free
+Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "ggc.h"
+#include "tree.h"
+
+#include "target.h"
+#include "basic-block.h"
+#include "diagnostic.h"
+#include "tree-flow.h"
+#include "tree-dump.h"
+#include "timevar.h"
+#include "cfgloop.h"
+#include "expr.h"
+#include "optabs.h"
+#include "params.h"
+#include "tree-data-ref.h"
+#include "tree-vectorizer.h"
+#include "recog.h"
+#include "toplev.h"
+
+/* Function prototypes */
+static void vect_pattern_recog_1
+ (tree (* ) (tree, tree *, tree *), block_stmt_iterator);
+static bool widened_name_p (tree, tree, tree *, tree *);
+
+/* Pattern recognition functions */
+static tree vect_recog_widen_sum_pattern (tree, tree *, tree *);
+static tree vect_recog_widen_mult_pattern (tree, tree *, tree *);
+static tree vect_recog_dot_prod_pattern (tree, tree *, tree *);
+static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
+ vect_recog_widen_mult_pattern,
+ vect_recog_widen_sum_pattern,
+ vect_recog_dot_prod_pattern};
+
+
+/* Function widened_name_p
+
+ Check whether NAME, an ssa-name used in USE_STMT,
+ is a result of a type-promotion, such that:
+ DEF_STMT: NAME = NOP (name0)
+ where the type of name0 (HALF_TYPE) is smaller than the type of NAME.
+*/
+
+static bool
+widened_name_p (tree name, tree use_stmt, tree *half_type, tree *def_stmt)
+{
+ tree dummy;
+ loop_vec_info loop_vinfo;
+ stmt_vec_info stmt_vinfo;
+ tree expr;
+ tree type = TREE_TYPE (name);
+ tree oprnd0;
+ enum vect_def_type dt;
+ tree def;
+
+ stmt_vinfo = vinfo_for_stmt (use_stmt);
+ loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
+
+ if (!vect_is_simple_use (name, loop_vinfo, def_stmt, &def, &dt))
+ return false;
+
+ if (dt != vect_loop_def
+ && dt != vect_invariant_def && dt != vect_constant_def)
+ return false;
+
+ if (! *def_stmt)
+ return false;
+
+ if (TREE_CODE (*def_stmt) != MODIFY_EXPR)
+ return false;
+
+ expr = TREE_OPERAND (*def_stmt, 1);
+ if (TREE_CODE (expr) != NOP_EXPR)
+ return false;
+
+ oprnd0 = TREE_OPERAND (expr, 0);
+
+ *half_type = TREE_TYPE (oprnd0);
+ if (!INTEGRAL_TYPE_P (type) || !INTEGRAL_TYPE_P (*half_type)
+ || (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (*half_type))
+ || (TYPE_PRECISION (type) < (TYPE_PRECISION (*half_type) * 2)))
+ return false;
+
+ if (!vect_is_simple_use (oprnd0, loop_vinfo, &dummy, &dummy, &dt))
+ return false;
+
+ if (dt != vect_invariant_def && dt != vect_constant_def
+ && dt != vect_loop_def)
+ return false;
+
+ return true;
+}
+
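+
[Editorial note, not part of the patch] The core type test applied by
widened_name_p above, restated as a stand-alone predicate (illustrative
names; in the patch both types must additionally be INTEGRAL_TYPE_P):

    /* NAME = (TYPE) name0 counts as a widening only if both types have the
       same signedness and TYPE is at least twice as wide as HALF_TYPE.  */
    static int
    is_widening_pair (unsigned int half_precision, int half_unsigned,
                      unsigned int full_precision, int full_unsigned)
    {
      return half_unsigned == full_unsigned
             && full_precision >= 2 * half_precision;
    }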
+
+/* Function vect_recog_dot_prod_pattern
+
+ Try to find the following pattern:
+
+ type x_t, y_t;
+ TYPE1 prod;
+ TYPE2 sum = init;
+ loop:
+ sum_0 = phi <init, sum_1>
+ S1 x_t = ...
+ S2 y_t = ...
+ S3 x_T = (TYPE1) x_t;
+ S4 y_T = (TYPE1) y_t;
+ S5 prod = x_T * y_T;
+ [S6 prod = (TYPE2) prod; #optional]
+ S7 sum_1 = prod + sum_0;
+
+ where 'TYPE1' is exactly double the size of type 'type', and 'TYPE2' is the
+   same size as 'TYPE1' or bigger. This is a special case of a reduction
+ computation.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins. In the example,
+ when this function is called with S7, the pattern {S3,S4,S5,S6,S7} will be
+ detected.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ WIDEN_DOT_PRODUCT <x_t, y_t, sum_0>
+*/
+
+static tree
+vect_recog_dot_prod_pattern (tree last_stmt, tree *type_in, tree *type_out)
+{
+ tree stmt, expr;
+ tree oprnd0, oprnd1;
+ tree oprnd00, oprnd01;
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+ tree type, half_type;
+ tree pattern_expr;
+ tree prod_type;
+
+ if (TREE_CODE (last_stmt) != MODIFY_EXPR)
+ return NULL;
+
+ expr = TREE_OPERAND (last_stmt, 1);
+ type = TREE_TYPE (expr);
+
+ /* Look for the following pattern
+ DX = (TYPE1) X;
+ DY = (TYPE1) Y;
+ DPROD = DX * DY;
+ DDPROD = (TYPE2) DPROD;
+ sum_1 = DDPROD + sum_0;
+ In which
+ - DX is double the size of X
+ - DY is double the size of Y
+ - DX, DY, DPROD all have the same type
+     - sum is the same size as DPROD or bigger
+ - sum has been recognized as a reduction variable.
+
+ This is equivalent to:
+ DPROD = X w* Y; #widen mult
+ sum_1 = DPROD w+ sum_0; #widen summation
+ or
+ DPROD = X w* Y; #widen mult
+ sum_1 = DPROD + sum_0; #summation
+ */
+
+ /* Starting from LAST_STMT, follow the defs of its uses in search
+ of the above pattern. */
+
+ if (TREE_CODE (expr) != PLUS_EXPR)
+ return NULL;
+
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ {
+ /* Has been detected as widening-summation? */
+
+ stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ expr = TREE_OPERAND (stmt, 1);
+ type = TREE_TYPE (expr);
+ if (TREE_CODE (expr) != WIDEN_SUM_EXPR)
+ return NULL;
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ half_type = TREE_TYPE (oprnd0);
+ }
+ else
+ {
+ tree def_stmt;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def)
+ return NULL;
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type)
+ || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type))
+ return NULL;
+ stmt = last_stmt;
+
+ if (widened_name_p (oprnd0, stmt, &half_type, &def_stmt))
+ {
+ stmt = def_stmt;
+ expr = TREE_OPERAND (stmt, 1);
+ oprnd0 = TREE_OPERAND (expr, 0);
+ }
+ else
+ half_type = type;
+ }
+
+ /* So far so good. Since last_stmt was detected as a (summation) reduction,
+ we know that oprnd1 is the reduction variable (defined by a loop-header
+ phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
+ Left to check that oprnd0 is defined by a (widen_)mult_expr */
+
+ prod_type = half_type;
+ stmt = SSA_NAME_DEF_STMT (oprnd0);
+ gcc_assert (stmt);
+ stmt_vinfo = vinfo_for_stmt (stmt);
+ gcc_assert (stmt_vinfo);
+ gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_loop_def);
+ expr = TREE_OPERAND (stmt, 1);
+ if (TREE_CODE (expr) != MULT_EXPR)
+ return NULL;
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ {
+ /* Has been detected as a widening multiplication? */
+
+ stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ expr = TREE_OPERAND (stmt, 1);
+ if (TREE_CODE (expr) != WIDEN_MULT_EXPR)
+ return NULL;
+ stmt_vinfo = vinfo_for_stmt (stmt);
+ gcc_assert (stmt_vinfo);
+ gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_loop_def);
+ oprnd00 = TREE_OPERAND (expr, 0);
+ oprnd01 = TREE_OPERAND (expr, 1);
+ }
+ else
+ {
+ tree half_type0, half_type1;
+ tree def_stmt;
+ tree oprnd0, oprnd1;
+
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0))
+ != TYPE_MAIN_VARIANT (prod_type)
+ || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1))
+ != TYPE_MAIN_VARIANT (prod_type))
+ return NULL;
+ if (!widened_name_p (oprnd0, stmt, &half_type0, &def_stmt))
+ return NULL;
+ oprnd00 = TREE_OPERAND (TREE_OPERAND (def_stmt, 1), 0);
+ if (!widened_name_p (oprnd1, stmt, &half_type1, &def_stmt))
+ return NULL;
+ oprnd01 = TREE_OPERAND (TREE_OPERAND (def_stmt, 1), 0);
+ if (TYPE_MAIN_VARIANT (half_type0) != TYPE_MAIN_VARIANT (half_type1))
+ return NULL;
+ if (TYPE_PRECISION (prod_type) != TYPE_PRECISION (half_type0) * 2)
+ return NULL;
+ }
+
+ half_type = TREE_TYPE (oprnd00);
+ *type_in = half_type;
+ *type_out = type;
+
+ /* Pattern detected. Create a stmt to be used to replace the pattern: */
+ pattern_expr = build3 (DOT_PROD_EXPR, type, oprnd00, oprnd01, oprnd1);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "vect_recog_dot_prod_pattern: detected: ");
+ print_generic_expr (vect_dump, pattern_expr, TDF_SLIM);
+ }
+ return pattern_expr;
+}
+
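[Editorial note, not part of the patch] An illustrative C loop of the shape
this recognizer targets; short/int is just one valid type pair (here TYPE1
and TYPE2 are both int):

    int
    dot_prod (short *x, short *y, int n)
    {
      int i, sum = 0;
      for (i = 0; i < n; i++)
        sum += x[i] * y[i];   /* widen, multiply, accumulate:
                                 DOT_PROD_EXPR <x[i], y[i], sum>  */
      return sum;
    }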
+
+/* Function vect_recog_widen_mult_pattern
+
+ Try to find the following pattern:
+
+ type a_t, b_t;
+ TYPE a_T, b_T, prod_T;
+
+ S1 a_t = ;
+ S2 b_t = ;
+ S3 a_T = (TYPE) a_t;
+ S4 b_T = (TYPE) b_t;
+ S5 prod_T = a_T * b_T;
+
+ where type 'TYPE' is at least double the size of type 'type'.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins. In the example,
+   when this function is called with S5, the pattern {S3,S4,S5} will be detected.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ WIDEN_MULT <a_t, b_t>
+*/
+
+static tree
+vect_recog_widen_mult_pattern (tree last_stmt ATTRIBUTE_UNUSED,
+ tree *type_in ATTRIBUTE_UNUSED,
+ tree *type_out ATTRIBUTE_UNUSED)
+{
+ /* Yet to be implemented. */
+ return NULL;
+}
+
+
+/* Function vect_recog_widen_sum_pattern
+
+ Try to find the following pattern:
+
+ type x_t;
+ TYPE x_T, sum = init;
+ loop:
+ sum_0 = phi <init, sum_1>
+ S1 x_t = *p;
+ S2 x_T = (TYPE) x_t;
+ S3 sum_1 = x_T + sum_0;
+
+ where type 'TYPE' is at least double the size of type 'type', i.e - we're
+ summing elements of type 'type' into an accumulator of type 'TYPE'. This is
+   a special case of a reduction computation.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins. In the example,
+ when this function is called with S3, the pattern {S2,S3} will be detected.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ WIDEN_SUM <x_t, sum_0>
+*/
+
+static tree
+vect_recog_widen_sum_pattern (tree last_stmt, tree *type_in, tree *type_out)
+{
+ tree stmt, expr;
+ tree oprnd0, oprnd1;
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+ tree type, half_type;
+ tree pattern_expr;
+
+ if (TREE_CODE (last_stmt) != MODIFY_EXPR)
+ return NULL;
+
+ expr = TREE_OPERAND (last_stmt, 1);
+ type = TREE_TYPE (expr);
+
+ /* Look for the following pattern
+ DX = (TYPE) X;
+ sum_1 = DX + sum_0;
+ In which DX is at least double the size of X, and sum_1 has been
+ recognized as a reduction variable.
+ */
+
+ /* Starting from LAST_STMT, follow the defs of its uses in search
+ of the above pattern. */
+
+ if (TREE_CODE (expr) != PLUS_EXPR)
+ return NULL;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def)
+ return NULL;
+
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type)
+ || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type))
+ return NULL;
+
+ /* So far so good. Since last_stmt was detected as a (summation) reduction,
+ we know that oprnd1 is the reduction variable (defined by a loop-header
+ phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
+ Left to check that oprnd0 is defined by a cast from type 'type' to type
+ 'TYPE'. */
+
+ if (!widened_name_p (oprnd0, last_stmt, &half_type, &stmt))
+ return NULL;
+
+ oprnd0 = TREE_OPERAND (TREE_OPERAND (stmt, 1), 0);
+ *type_in = half_type;
+ *type_out = type;
+
+ /* Pattern detected. Create a stmt to be used to replace the pattern: */
+ pattern_expr = build2 (WIDEN_SUM_EXPR, type, oprnd0, oprnd1);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "vect_recog_widen_sum_pattern: detected: ");
+ print_generic_expr (vect_dump, pattern_expr, TDF_SLIM);
+ }
+ return pattern_expr;
+}
+
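[Editorial note, not part of the patch] An illustrative C loop of the shape
this recognizer targets; short elements summed into an int accumulator is
just one valid type pair:

    int
    widen_sum (short *x, int n)
    {
      int i, sum = 0;
      for (i = 0; i < n; i++)
        sum += x[i];          /* widen then accumulate:
                                 WIDEN_SUM_EXPR <x[i], sum>  */
      return sum;
    }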
+
+/* Function vect_pattern_recog_1
+
+ Input:
+ PATTERN_RECOG_FUNC: A pointer to a function that detects a certain
+ computation pattern.
+ STMT: A stmt from which the pattern search should start.
+
+ If PATTERN_RECOG_FUNC successfully detected the pattern, it creates an
+ expression that computes the same functionality and can be used to
+ replace the sequence of stmts that are involved in the pattern.
+
+ Output:
+ This function checks if the expression returned by PATTERN_RECOG_FUNC is
+ supported in vector form by the target. We use 'TYPE_IN' to obtain the
+ relevant vector type. If 'TYPE_IN' is already a vector type, then this
+ indicates that target support had already been checked by PATTERN_RECOG_FUNC.
+ If 'TYPE_OUT' is also returned by PATTERN_RECOG_FUNC, we check that it fits
+ to the available target pattern.
+
+   This function also does some bookkeeping, as explained in the documentation
+   for vect_pattern_recog.  */
+
+static void
+vect_pattern_recog_1 (
+ tree (* vect_recog_func) (tree, tree *, tree *),
+ block_stmt_iterator si)
+{
+ tree stmt = bsi_stmt (si);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ stmt_vec_info pattern_stmt_info;
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ tree pattern_expr;
+ tree pattern_vectype;
+ tree type_in, type_out;
+ tree pattern_type;
+ enum tree_code code;
+ tree var, var_name;
+ stmt_ann_t ann;
+
+ pattern_expr = (* vect_recog_func) (stmt, &type_in, &type_out);
+ if (!pattern_expr)
+ return;
+
+ if (VECTOR_MODE_P (TYPE_MODE (type_in)))
+ {
+ /* No need to check target support (already checked by the pattern
+ recognition function). */
+ pattern_vectype = type_in;
+ }
+ else
+ {
+ enum tree_code vec_mode;
+ enum insn_code icode;
+ optab optab;
+
+ /* Check target support */
+ pattern_vectype = get_vectype_for_scalar_type (type_in);
+ optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype);
+ vec_mode = TYPE_MODE (pattern_vectype);
+ if (!optab
+ || (icode = optab->handlers[(int) vec_mode].insn_code) ==
+ CODE_FOR_nothing
+ || (type_out
+ && (insn_data[icode].operand[0].mode !=
+ TYPE_MODE (get_vectype_for_scalar_type (type_out)))))
+ return;
+ }
+
+ /* Found a vectorizable pattern. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "pattern recognized: ");
+ print_generic_expr (vect_dump, pattern_expr, TDF_SLIM);
+ }
+
+ /* Mark the stmts that are involved in the pattern,
+ create a new stmt to express the pattern and insert it. */
+ code = TREE_CODE (pattern_expr);
+ pattern_type = TREE_TYPE (pattern_expr);
+ var = create_tmp_var (pattern_type, "patt");
+ add_referenced_tmp_var (var);
+ var_name = make_ssa_name (var, NULL_TREE);
+ pattern_expr = build2 (MODIFY_EXPR, void_type_node, var_name, pattern_expr);
+ SSA_NAME_DEF_STMT (var_name) = pattern_expr;
+ bsi_insert_before (&si, pattern_expr, BSI_SAME_STMT);
+ ann = stmt_ann (pattern_expr);
+ set_stmt_info ((tree_ann_t)ann, new_stmt_vec_info (pattern_expr, loop_vinfo));
+ pattern_stmt_info = vinfo_for_stmt (pattern_expr);
+
+ STMT_VINFO_RELATED_STMT (pattern_stmt_info) = stmt;
+ STMT_VINFO_DEF_TYPE (pattern_stmt_info) = STMT_VINFO_DEF_TYPE (stmt_info);
+ STMT_VINFO_VECTYPE (pattern_stmt_info) = pattern_vectype;
+ STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
+ STMT_VINFO_RELATED_STMT (stmt_info) = pattern_expr;
+
+ return;
+}
+
+
+/* Function vect_pattern_recog
+
+ Input:
+ LOOP_VINFO - a struct_loop_info of a loop in which we want to look for
+ computation idioms.
+
+ Output - for each computation idiom that is detected we insert a new stmt
+ that provides the same functionality and that can be vectorized. We
+ also record some information in the struct_stmt_info of the relevant
+ stmts, as explained below:
+
+ At the entry to this function we have the following stmts, with the
+ following initial value in the STMT_VINFO fields:
+
+ stmt in_pattern_p related_stmt vec_stmt
+ S1: a_i = .... - - -
+ S2: a_2 = ..use(a_i).. - - -
+ S3: a_1 = ..use(a_2).. - - -
+ S4: a_0 = ..use(a_1).. - - -
+ S5: ... = ..use(a_0).. - - -
+
+ Say the sequence {S1,S2,S3,S4} was detected as a pattern that can be
+ represented by a single stmt. We then:
+ - create a new stmt S6 that will replace the pattern.
+ - insert the new stmt S6 before the last stmt in the pattern
+ - fill in the STMT_VINFO fields as follows:
+
+ in_pattern_p related_stmt vec_stmt
+ S1: a_i = .... - - -
+ S2: a_2 = ..use(a_i).. - - -
+ S3: a_1 = ..use(a_2).. - - -
+ > S6: a_new = .... - S4 -
+ S4: a_0 = ..use(a_1).. true S6 -
+ S5: ... = ..use(a_0).. - - -
+
+ (the last stmt in the pattern (S4) and the new pattern stmt (S6) point
+ to each other through the RELATED_STMT field).
+
+ S6 will be marked as relevant in vect_mark_stmts_to_be_vectorized instead
+ of S4 because it will replace all its uses. Stmts {S1,S2,S3} will
+ remain irrelevant unless used by stmts other than S4.
+
+ If vectorization succeeds, vect_transform_stmt will skip over {S1,S2,S3}
+   (because they are marked as irrelevant). It will vectorize S6, and record
+ a pointer to the new vector stmt VS6 both from S6 (as usual), and also
+ from S4. We do that so that when we get to vectorizing stmts that use the
+ def of S4 (like S5 that uses a_0), we'll know where to take the relevant
+ vector-def from. S4 will be skipped, and S5 will be vectorized as usual:
+
+ in_pattern_p related_stmt vec_stmt
+ S1: a_i = .... - - -
+ S2: a_2 = ..use(a_i).. - - -
+ S3: a_1 = ..use(a_2).. - - -
+ > VS6: va_new = .... - - -
+ S6: a_new = .... - S4 VS6
+ S4: a_0 = ..use(a_1).. true S6 VS6
+ > VS5: ... = ..vuse(va_new).. - - -
+ S5: ... = ..use(a_0).. - - -
+
+ DCE could then get rid of {S1,S2,S3,S4,S5,S6} (if their defs are not used
+ elsewhere), and we'll end up with:
+
+ VS6: va_new = ....
+ VS5: ... = ..vuse(va_new)..
+
+ If vectorization does not succeed, DCE will clean S6 away (its def is
+ not used), and we'll end up with the original sequence.
+*/
+
+void
+vect_pattern_recog (loop_vec_info loop_vinfo)
+{
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+ unsigned int nbbs = loop->num_nodes;
+ block_stmt_iterator si;
+ tree stmt;
+ unsigned int i, j;
+ tree (* vect_recog_func_ptr) (tree, tree *, tree *);
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vect_pattern_recog ===");
+
+ /* Scan through the loop stmts, applying the pattern recognition
+ functions starting at each stmt visited: */
+ for (i = 0; i < nbbs; i++)
+ {
+ basic_block bb = bbs[i];
+ for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ {
+ stmt = bsi_stmt (si);
+
+ /* Scan over all generic vect_recog_xxx_pattern functions. */
+ for (j = 0; j < NUM_PATTERNS; j++)
+ {
+ vect_recog_func_ptr = vect_vect_recog_func_ptrs[j];
+ vect_pattern_recog_1 (vect_recog_func_ptr, si);
+ }
+ }
+ }
+}
diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c
index 42090f7a20e..db0573ce6f5 100644
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -1,5 +1,5 @@
/* Transformation Utilities for Loop Vectorization.
- Copyright (C) 2003,2004,2005 Free Software Foundation, Inc.
+ Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -59,6 +59,7 @@ static void vect_finish_stmt_generation
(tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
+static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);
/* Utility function dealing with loop peeling (not peeling itself). */
@@ -656,6 +657,8 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
switch (code)
{
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
case PLUS_EXPR:
if (INTEGRAL_TYPE_P (type))
def = build_int_cst (type, 0);
@@ -711,66 +714,66 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
}
-/* Function vect_create_epilog_for_reduction:
+/* Function vect_create_epilog_for_reduction
Create code at the loop-epilog to finalize the result of a reduction
- computation.
+ computation.
- LOOP_EXIT_VECT_DEF is a vector of partial results. We need to "reduce" it
- into a single result, by applying the operation REDUC_CODE on the
- partial-results-vector. For this, we need to create a new phi node at the
- loop exit to preserve loop-closed form, as illustrated below.
-
- STMT is the original scalar reduction stmt that is being vectorized.
- REDUCTION_OP is the scalar reduction-variable.
+ VECT_DEF is a vector of partial results.
+ REDUC_CODE is the tree-code for the epilog reduction.
+ STMT is the scalar reduction stmt that is being vectorized.
REDUCTION_PHI is the phi-node that carries the reduction computation.
- This function also sets the arguments for the REDUCTION_PHI:
- The loop-entry argument is the (vectorized) initial-value of REDUCTION_OP.
- The loop-latch argument is VECT_DEF - the vector of partial sums.
- This function transforms this:
+ This function:
+   1. Creates the reduction def-use cycle: sets the arguments for
+ REDUCTION_PHI:
+ The loop-entry argument is the vectorized initial-value of the reduction.
+ The loop-latch argument is VECT_DEF - the vector of partial sums.
+ 2. "Reduces" the vector of partial results VECT_DEF into a single result,
+ by applying the operation specified by REDUC_CODE if available, or by
+ other means (whole-vector shifts or a scalar loop).
+ The function also creates a new phi node at the loop exit to preserve
+ loop-closed form, as illustrated below.
+
+ The flow at the entry to this function:
loop:
- vec_def = phi <null, null> # REDUCTION_PHI
- ....
- VECT_DEF = ...
-
+ vec_def = phi <null, null> # REDUCTION_PHI
+ VECT_DEF = vector_stmt # vectorized form of STMT
+ s_loop = scalar_stmt # (scalar) STMT
loop_exit:
- s_out0 = phi <s_loop> # EXIT_PHI
-
+ s_out0 = phi <s_loop> # (scalar) EXIT_PHI
use <s_out0>
use <s_out0>
- Into:
+ The above is transformed by this function into:
loop:
- vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
- ....
- VECT_DEF = ...
-
+ vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
+ VECT_DEF = vector_stmt # vectorized form of STMT
+ s_loop = scalar_stmt # (scalar) STMT
loop_exit:
- s_out0 = phi <s_loop> # EXIT_PHI
- v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
-
- v_out2 = reduc_expr <v_out1>
+ s_out0 = phi <s_loop> # (scalar) EXIT_PHI
+ v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
+ v_out2 = reduce <v_out1>
s_out3 = extract_field <v_out2, 0>
-
- use <s_out3>
- use <s_out3>
+ s_out4 = adjust_result <s_out3>
+ use <s_out4>
+ use <s_out4>
*/
static void
-vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
+vect_create_epilog_for_reduction (tree vect_def, tree stmt,
enum tree_code reduc_code, tree reduction_phi)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- enum machine_mode mode = TYPE_MODE (vectype);
+ tree vectype;
+ enum machine_mode mode;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block exit_bb;
- tree scalar_dest = TREE_OPERAND (stmt, 0);
- tree scalar_type = TREE_TYPE (scalar_dest);
+ tree scalar_dest;
+ tree scalar_type;
tree new_phi;
block_stmt_iterator exit_bsi;
tree vec_dest;
@@ -786,7 +789,16 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
imm_use_iterator imm_iter;
use_operand_p use_p;
bool extract_scalar_result;
+ tree reduction_op;
+ tree orig_stmt;
+ tree operation = TREE_OPERAND (stmt, 1);
+ int op_type;
+ op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
+ reduction_op = TREE_OPERAND (operation, op_type-1);
+ vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ mode = TYPE_MODE (vectype);
+
/*** 1. Create the reduction def-use cycle ***/
/* 1.1 set the loop-entry arg of the reduction-phi: */
@@ -797,7 +809,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
&scalar_initial_def);
add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
-
/* 1.2 set the loop-latch arg for the reduction-phi: */
add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
@@ -810,7 +821,32 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
}
- /*** 2. Create epilog code ***/
+ /*** 2. Create epilog code
+ The reduction epilog code operates across the elements of the vector
+ of partial results computed by the vectorized loop.
+ The reduction epilog code consists of:
+ step 1: compute the scalar result in a vector (v_out2)
+ step 2: extract the scalar result (s_out3) from the vector (v_out2)
+ step 3: adjust the scalar result (s_out3) if needed.
+
+   Step 1 can be accomplished using one of the following three schemes:
+ (scheme 1) using reduc_code, if available.
+ (scheme 2) using whole-vector shifts, if available.
+ (scheme 3) using a scalar loop. In this case steps 1+2 above are
+ combined.
+
+ The overall epilog code looks like this:
+
+ s_out0 = phi <s_loop> # original EXIT_PHI
+ v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
+ v_out2 = reduce <v_out1> # step 1
+ s_out3 = extract_field <v_out2, 0> # step 2
+ s_out4 = adjust_result <s_out3> # step 3
+
+   (step 3 is optional, and steps 1 and 2 may be combined).
+ Lastly, the uses of s_out0 are replaced by s_out4.
+
+ ***/
/* 2.1 Create new loop-exit-phi to preserve loop-closed form:
v_out1 = phi <v_loop> */
@@ -818,15 +854,39 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
exit_bb = loop->single_exit->dest;
new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
SET_PHI_ARG_DEF (new_phi, loop->single_exit->dest_idx, vect_def);
-
exit_bsi = bsi_start (exit_bb);
-
+ /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
+     (i.e. when reduc_code is not available) and in the final adjustment code
+ (if needed). Also get the original scalar reduction variable as
+ defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
+ represents a reduction pattern), the tree-code and scalar-def are
+ taken from the original stmt that the pattern-stmt (STMT) replaces.
+ Otherwise (it is a regular reduction) - the tree-code and scalar-def
+ are taken from STMT. */
+
+ orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (!orig_stmt)
+ {
+ /* Regular reduction */
+ orig_stmt = stmt;
+ }
+ else
+ {
+ /* Reduction pattern */
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
+ gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
+ }
+ code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
+ scalar_dest = TREE_OPERAND (orig_stmt, 0);
+ scalar_type = TREE_TYPE (scalar_dest);
new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
bitsize = TYPE_SIZE (scalar_type);
bytesize = TYPE_SIZE_UNIT (scalar_type);
- /* 2.2 Create the reduction code. */
+ /* 2.3 Create the reduction code, using one of the three schemes described
+ above. */
if (reduc_code < NUM_TREE_CODES)
{
@@ -849,16 +909,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
{
enum tree_code shift_code = 0;
bool have_whole_vector_shift = true;
- enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */
int bit_offset;
int element_bitsize = tree_low_cst (bitsize, 1);
int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
tree vec_temp;
- /* The result of the reduction is expected to be at the least
- significant bits of the vector. This is merely convention,
- as it's the extraction later that really matters, and that
- is also under our control. */
if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
shift_code = VEC_RSHIFT_EXPR;
else
@@ -881,7 +936,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
if (have_whole_vector_shift)
{
- /*** Case 2:
+ /*** Case 2: Create:
for (offset = VS/2; offset >= element_size; offset/=2)
{
Create: va' = vec_shift <va, offset>
@@ -905,17 +960,12 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_name = make_ssa_name (vec_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_name;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
-
epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
build2 (code, vectype, new_name, new_temp));
new_temp = make_ssa_name (vec_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
extract_scalar_result = true;
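[Editorial note, not part of the patch] A scalar emulation of the
whole-vector-shift scheme (case 2) on a four-element "vector", showing how
log2(VS) shift-and-add steps leave the complete sum in element 0 before the
final extraction (element ordering simplified; names are illustrative):

    static int
    reduce_by_shifts (int v[4])
    {
      int step, i, shifted[4];
      for (step = 2; step >= 1; step /= 2)
        {
          for (i = 0; i < 4; i++)
            shifted[i] = (i + step < 4) ? v[i + step] : 0;  /* vec_shift  */
          for (i = 0; i < 4; i++)
            v[i] += shifted[i];                             /* vector add */
        }
      return v[0];   /* extract_field <v_out2, 0>  */
    }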
@@ -924,10 +974,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
{
tree rhs;
- /*** Case 3:
- Create:
+ /*** Case 3: Create:
s = extract_field <v_out2, 0>
- for (offset=element_size; offset<vector_size; offset+=element_size;)
+ for (offset = element_size;
+ offset < vector_size;
+ offset += element_size;)
{
Create: s' = extract_field <v_out2, offset>
Create: s = op <s, s'>
@@ -938,18 +989,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
vec_temp = PHI_RESULT (new_phi);
vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
-
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
-
BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
- epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
- rhs);
+ epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
@@ -965,25 +1011,19 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_name;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
-
epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
build2 (code, scalar_type, new_name, new_temp));
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
extract_scalar_result = false;
}
}
-
- /* 2.3 Extract the final scalar result. Create:
+ /* 2.4 Extract the final scalar result. Create:
s_out3 = extract_field <v_out2, bitpos> */
if (extract_scalar_result)
@@ -993,7 +1033,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "extract scalar result");
- /* The result is in the low order bits. */
if (BYTES_BIG_ENDIAN)
bitpos = size_binop (MULT_EXPR,
bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
@@ -1007,17 +1046,14 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
-
/* 2.4 Adjust the final result by the initial value of the reduction
- variable. (when such adjustment is not needed, then
+ variable. (When such adjustment is not needed, then
'scalar_initial_def' is zero).
Create:
- s_out = scalar_expr <s_out, scalar_initial_def> */
+ s_out4 = scalar_expr <s_out3, scalar_initial_def> */
if (scalar_initial_def)
{
@@ -1026,18 +1062,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
-
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
+ /* 2.6 Replace uses of s_out0 with uses of s_out3 */
- /* 2.5 Replace uses of s_out0 with uses of s_out3 */
-
- /* Find the loop-closed-use at the loop exit of the original
- scalar result. (The reduction result is expected to have
- two immediate uses - one at the latch block, and one at the
- loop exit). */
+ /* Find the loop-closed-use at the loop exit of the original scalar result.
+ (The reduction result is expected to have two immediate uses - one at the
+ latch block, and one at the loop exit). */
exit_phi = NULL;
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
{
@@ -1047,9 +1078,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
break;
}
}
-
+ /* We expect to have found an exit_phi because of loop-closed-ssa form. */
+ gcc_assert (exit_phi);
+ /* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
-
FOR_EACH_IMM_USE_SAFE (use_p, imm_iter, orig_name)
SET_USE (use_p, new_temp);
}
@@ -1060,33 +1092,69 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
Check if STMT performs a reduction operation that can be vectorized.
If VEC_STMT is also passed, vectorize the STMT: create a vectorized
stmt to replace it, put it in VEC_STMT, and insert it at BSI.
- Return FALSE if not a vectorizable STMT, TRUE otherwise. */
+ Return FALSE if not a vectorizable STMT, TRUE otherwise.
+
+ This function also handles reduction idioms (patterns) that have been
+ recognized in advance during vect_pattern_recog. In this case, STMT may be
+ of this form:
+ X = pattern_expr (arg0, arg1, ..., X)
+   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
+ sequence that had been detected and replaced by the pattern-stmt (STMT).
+
+ In some cases of reduction patterns, the type of the reduction variable X is
+ different than the type of the other arguments of STMT.
+ In such cases, the vectype that is used when transforming STMT into a vector
+ stmt is different than the vectype that is used to determine the
+ vectorization factor, because it consists of a different number of elements
+ than the actual number of elements that are being operated upon in parallel.
+
+ For example, consider an accumulation of shorts into an int accumulator.
+ On some targets it's possible to vectorize this pattern operating on 8
+ shorts at a time (hence, the vectype for purposes of determining the
+ vectorization factor should be V8HI); on the other hand, the vectype that
+ is used to create the vector form is actually V4SI (the type of the result).
+
+ Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
+ indicates what is the actual level of parallelism (V8HI in the example), so
+ that the right vectorization factor would be derived. This vectype
+ corresponds to the type of arguments to the reduction stmt, and should *NOT*
+ be used to create the vectorized stmt. The right vectype for the vectorized
+ stmt is obtained from the type of the result X:
+ get_vectype_for_scalar_type (TREE_TYPE (X))
+
+ This means that, contrary to "regular" reductions (or "regular" stmts in
+ general), the following equation:
+ STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
+ does *NOT* necessarily hold for reduction patterns. */
bool
vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
tree vec_dest;
tree scalar_dest;
- tree op0, op1;
- tree loop_vec_def;
+ tree op;
+ tree loop_vec_def0, loop_vec_def1;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree operation;
- enum tree_code code, reduc_code = 0;
+ enum tree_code code, orig_code, epilog_reduc_code = 0;
enum machine_mode vec_mode;
int op_type;
optab optab, reduc_optab;
tree new_temp;
- tree def0, def1, def_stmt0, def_stmt1;
- enum vect_def_type dt0, dt1;
+ tree def, def_stmt;
+ enum vect_def_type dt;
tree new_phi;
tree scalar_type;
- bool is_simple_use0;
- bool is_simple_use1;
+ bool is_simple_use;
+ tree orig_stmt;
+ stmt_vec_info orig_stmt_info;
+ tree expr = NULL_TREE;
+ int i;
- /* Is vectorizable reduction? */
+ /* 1. Is vectorizable reduction? */
/* Not supportable if the reduction variable is used in the loop. */
if (STMT_VINFO_RELEVANT_P (stmt_info))
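
A concrete, source-level illustration of the vectype distinction spelled out in the comment above; assuming 128-bit vectors, the narrow operands give V8HI (which fixes the vectorization factor) while the accumulator update itself is a V4SI statement. Illustrative C only:

    int
    acc_shorts (const short *a, int n)
    {
      int acc = 0;
      int i;

      for (i = 0; i < n; i++)
        acc += a[i];    /* recognized as: acc = widen_sum <a[i], acc>  */
      return acc;
    }
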
@@ -1095,43 +1163,68 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (!STMT_VINFO_LIVE_P (stmt_info))
return false;
- /* Make sure it was already recognized as a reduction pattern. */
+ /* Make sure it was already recognized as a reduction computation. */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
return false;
+ /* 2. Has this been recognized as a reduction pattern?
+
+ Check if STMT represents a pattern that has been recognized
+ in earlier analysis stages. For stmts that represent a pattern,
+ the STMT_VINFO_RELATED_STMT field records the last stmt in
+ the original sequence that constitutes the pattern. */
+
+ orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (orig_stmt)
+ {
+ orig_stmt_info = vinfo_for_stmt (orig_stmt);
+ gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
+ gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
+ gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
+ }
+
+ /* 3. Check the operands of the operation. The first operands are defined
+ inside the loop body. The last operand is the reduction variable,
+ which is defined by the loop-header-phi. */
+
gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR);
operation = TREE_OPERAND (stmt, 1);
code = TREE_CODE (operation);
op_type = TREE_CODE_LENGTH (code);
- if (op_type != binary_op)
+ if (op_type != binary_op && op_type != ternary_op)
return false;
-
- op0 = TREE_OPERAND (operation, 0);
- op1 = TREE_OPERAND (operation, 1);
scalar_dest = TREE_OPERAND (stmt, 0);
scalar_type = TREE_TYPE (scalar_dest);
- /* Check the first operand. It is expected to be defined inside the loop. */
- is_simple_use0 =
- vect_is_simple_use (op0, loop_vinfo, &def_stmt0, &def0, &dt0);
- is_simple_use1 =
- vect_is_simple_use (op1, loop_vinfo, &def_stmt1, &def1, &dt1);
-
- gcc_assert (is_simple_use0);
- gcc_assert (is_simple_use1);
- gcc_assert (dt0 == vect_loop_def);
- gcc_assert (dt1 == vect_reduction_def);
- gcc_assert (TREE_CODE (def_stmt1) == PHI_NODE);
- gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt1));
+ /* All uses but the last are expected to be defined in the loop.
+ The last use is the reduction variable. */
+ for (i = 0; i < op_type-1; i++)
+ {
+ op = TREE_OPERAND (operation, i);
+ is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+ gcc_assert (is_simple_use);
+ gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
+ dt == vect_constant_def);
+ }
- if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt1)))
- return false;
+ op = TREE_OPERAND (operation, i);
+ is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+ gcc_assert (is_simple_use);
+ gcc_assert (dt == vect_reduction_def);
+ gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
+ if (orig_stmt)
+ gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
+ else
+ gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
+
+ if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
+ return false;
- /* Supportable by target? */
+ /* 4. Supportable by target? */
- /* check support for the operation in the loop */
+ /* 4.1. check support for the operation in the loop */
optab = optab_for_tree_code (code, vectype);
if (!optab)
{
@@ -1162,21 +1255,69 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
return false;
}
- /* check support for the epilog operation */
- if (!reduction_code_for_scalar_code (code, &reduc_code))
+ /* 4.2. Check support for the epilog operation.
+
+ If STMT represents a reduction pattern, then the type of the
+ reduction variable may be different than the type of the rest
+ of the arguments. For example, consider the case of accumulation
+ of shorts into an int accumulator; the original code:
+ S1: int_a = (int) short_a;
+ orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
+
+ was replaced with:
+ STMT: int_acc = widen_sum <short_a, int_acc>
+
+ This means that:
+ 1. The tree-code that is used to create the vector operation in the
+ epilog code (that reduces the partial results) is not the
+ tree-code of STMT, but is rather the tree-code of the original
+ stmt from the pattern that STMT is replacing. I.e., in the example
+ above we want to use 'widen_sum' in the loop, but 'plus' in the
+ epilog.
+ 2. The type (mode) we use to check available target support
+ for the vector operation to be created in the *epilog*, is
+ determined by the type of the reduction variable (in the example
+ above we'd check this: plus_optab[vect_int_mode]).
+ However the type (mode) we use to check available target support
+ for the vector operation to be created *inside the loop*, is
+ determined by the type of the other arguments to STMT (in the
+ example we'd check this: widen_sum_optab[vect_short_mode]).
+
+ This is contrary to "regular" reductions, in which the types of all
+ the arguments are the same as the type of the reduction variable.
+ For "regular" reductions we can therefore use the same vector type
+ (and also the same tree-code) when generating the epilog code and
+ when generating the code inside the loop. */
+
+ if (orig_stmt)
+ {
+ /* This is a reduction pattern: get the vectype from the type of the
+ reduction variable, and get the tree-code from orig_stmt. */
+ orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
+ vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
+ vec_mode = TYPE_MODE (vectype);
+ }
+ else
+ {
+ /* Regular reduction: the same vectype and tree-code that are used for
+ the vector code inside the loop can be used for the epilog code. */
+ orig_code = code;
+ }
+
+ if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
return false;
- reduc_optab = optab_for_tree_code (reduc_code, vectype);
+ reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
if (!reduc_optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "no optab for reduction.");
- reduc_code = NUM_TREE_CODES;
+ epilog_reduc_code = NUM_TREE_CODES;
}
if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "reduc op not supported by target.");
- reduc_code = NUM_TREE_CODES;
+ epilog_reduc_code = NUM_TREE_CODES;
}
if (!vec_stmt) /* transformation not required. */
@@ -1193,25 +1334,31 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Create the destination vector */
vec_dest = vect_create_destination_var (scalar_dest, vectype);
-
/* Create the reduction-phi that defines the reduction-operand. */
new_phi = create_phi_node (vec_dest, loop->header);
-
/* Prepare the operand that is defined inside the loop body */
- loop_vec_def = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ op = TREE_OPERAND (operation, 0);
+ loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
+ if (op_type == binary_op)
+ expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi));
+ else if (op_type == ternary_op)
+ {
+ op = TREE_OPERAND (operation, 1);
+ loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
+ expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
+ PHI_RESULT (new_phi));
+ }
/* Create the vectorized operation that computes the partial results */
- *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
- build2 (code, vectype, loop_vec_def, PHI_RESULT (new_phi)));
+ *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr);
new_temp = make_ssa_name (vec_dest, *vec_stmt);
TREE_OPERAND (*vec_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
-
/* Finalize the reduction-phi (set its arguments) and create the
epilog reduction code. */
- vect_create_epilog_for_reduction (new_temp, stmt, op1, reduc_code, new_phi);
+ vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
return true;
}
@@ -2040,6 +2187,7 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi)
bool is_store = false;
tree vec_stmt = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree orig_stmt_in_pattern;
bool done;
if (STMT_VINFO_RELEVANT_P (stmt_info))
@@ -2078,7 +2226,25 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi)
gcc_unreachable ();
}
+ gcc_assert (vec_stmt);
STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
+ orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (orig_stmt_in_pattern)
+ {
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ {
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
+
+ /* STMT was inserted by the vectorizer to replace a computation
+ idiom. ORIG_STMT_IN_PATTERN is a stmt in the original
+ sequence that computed this idiom. We need to record a pointer
+ to VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN. See more
+ detail in the documentation of vect_pattern_recog. */
+
+ STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
+ }
+ }
}
if (STMT_VINFO_LIVE_P (stmt_info))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index d4c6989fdc3..f03a2a2ce9a 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1,5 +1,5 @@
/* Loop Vectorization
- Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -1361,6 +1361,8 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
STMT_VINFO_LIVE_P (res) = 0;
STMT_VINFO_VECTYPE (res) = NULL;
STMT_VINFO_VEC_STMT (res) = NULL;
+ STMT_VINFO_IN_PATTERN_P (res) = false;
+ STMT_VINFO_RELATED_STMT (res) = NULL;
STMT_VINFO_DATA_REF (res) = NULL;
if (TREE_CODE (stmt) == PHI_NODE)
STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4f7fd951d6d..c5b13781436 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1,5 +1,5 @@
/* Loop Vectorization
- Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -43,10 +43,11 @@ enum vect_var_kind {
vect_scalar_var
};
-/* Defines type of operation: unary or binary. */
+/* Defines type of operation. */
enum operation_type {
unary_op = 1,
- binary_op
+ binary_op,
+ ternary_op
};
/* Define type of available alignment support. */
@@ -204,6 +205,20 @@ typedef struct _stmt_vec_info {
/* Information about the data-ref (access function, etc). */
struct data_reference *data_ref_info;
+ /* Stmt is part of some pattern (computation idiom).  */
+ bool in_pattern_p;
+
+ /* Used for various bookkeeping purposes, generally holding a pointer to
+ some other stmt S that is in some way "related" to this stmt.
+ Current use of this field is:
+ If this stmt is part of a pattern (i.e. the field 'in_pattern_p' is
+ true): S is the "pattern stmt" that represents (and replaces) the
+ sequence of stmts that constitutes the pattern. Similarly, the
+ related_stmt of the "pattern stmt" points back to this stmt (which is
+ the last stmt in the original sequence of stmts that constitutes the
+ pattern). */
+ tree related_stmt;
+
/* List of datarefs that are known to have the same alignment as the dataref
of this stmt. */
VEC(dr_p,heap) *same_align_refs;
@@ -222,6 +237,8 @@ typedef struct _stmt_vec_info {
#define STMT_VINFO_VECTYPE(S) (S)->vectype
#define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt
#define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
+#define STMT_VINFO_IN_PATTERN_P(S) (S)->in_pattern_p
+#define STMT_VINFO_RELATED_STMT(S) (S)->related_stmt
#define STMT_VINFO_SAME_ALIGN_REFS(S) (S)->same_align_refs
#define STMT_VINFO_DEF_TYPE(S) (S)->def_type
@@ -312,7 +329,6 @@ extern bool vect_can_force_dr_alignment_p (tree, unsigned int);
extern enum dr_alignment_support vect_supportable_dr_alignment
(struct data_reference *);
extern bool reduction_code_for_scalar_code (enum tree_code, enum tree_code *);
-
/* Creation and deletion of loop and stmt info structs. */
extern loop_vec_info new_loop_vec_info (struct loop *loop);
extern void destroy_loop_vec_info (loop_vec_info);
@@ -320,10 +336,21 @@ extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info);
/* Main driver. */
extern void vectorize_loops (struct loops *);
+
/** In tree-vect-analyze.c **/
/* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *);
+
+/** In tree-vect-patterns.c **/
+/* Pattern recognition functions.
+ Additional pattern recognition functions can (and will) be added
+ in the future. */
+typedef tree (* vect_recog_func_ptr) (tree, tree *, tree *);
+#define NUM_PATTERNS 3
+void vect_pattern_recog (loop_vec_info);
+
+
/** In tree-vect-transform.c **/
extern bool vectorizable_load (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_store (tree, block_stmt_iterator *, tree *);
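
The in_pattern_p/related_stmt pair added above is documented as a two-way link between the last stmt of a recognized sequence and the pattern stmt that replaces it. A sketch of how that link would be wired up, using only the accessors introduced in this patch (link_pattern_stmts is a hypothetical helper, not code from the patch):

    static void
    link_pattern_stmts (tree orig_stmt, tree pattern_stmt)
    {
      stmt_vec_info orig_info = vinfo_for_stmt (orig_stmt);
      stmt_vec_info pattern_info = vinfo_for_stmt (pattern_stmt);

      /* The original stmt is now covered by a pattern...  */
      STMT_VINFO_IN_PATTERN_P (orig_info) = true;
      /* ...and the two stmts point at each other.  */
      STMT_VINFO_RELATED_STMT (orig_info) = pattern_stmt;
      STMT_VINFO_RELATED_STMT (pattern_info) = orig_stmt;
    }
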
diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index a5028b4d8f5..c4a921c50a9 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -1048,6 +1048,96 @@ extract_range_from_assert (value_range_t *vr_p, tree expr)
if (compare_values (var_vr->min, vr_p->min) == 0
&& compare_values (var_vr->max, vr_p->max) == 0)
set_value_range_to_varying (vr_p);
+ else
+ {
+ tree min, max, anti_min, anti_max, real_min, real_max;
+
+ /* We want to compute the logical AND of the two ranges;
+ there are three cases to consider.
+
+
+ 1. The VR_ANTI_RANGE is completely within the
+ VR_RANGE and the endpoints of the ranges are
+ different. In that case the resulting range
+ should be the VR_ANTI_RANGE.
+
+ 2. The VR_ANTI_RANGE is completely disjoint from
+ the VR_RANGE. In this case the resulting range
+ should be the VR_RANGE.
+
+ 3. There is some overlap between the VR_ANTI_RANGE
+ and the VR_RANGE.
+
+ 3a. If the high limit of the VR_ANTI_RANGE resides
+ within the VR_RANGE, then the result is a new
+ VR_RANGE starting at the high limit of the
+ VR_ANTI_RANGE + 1 and extending to the
+ high limit of the original VR_RANGE.
+
+ 3b. If the low limit of the VR_ANTI_RANGE resides
+ within the VR_RANGE, then the result is a new
+ VR_RANGE starting at the low limit of the original
+ VR_RANGE and extending to the low limit of the
+ VR_ANTI_RANGE - 1. */
+ if (vr_p->type == VR_ANTI_RANGE)
+ {
+ anti_min = vr_p->min;
+ anti_max = vr_p->max;
+ real_min = var_vr->min;
+ real_max = var_vr->max;
+ }
+ else
+ {
+ anti_min = var_vr->min;
+ anti_max = var_vr->max;
+ real_min = vr_p->min;
+ real_max = vr_p->max;
+ }
+
+
+ /* Case 1, VR_ANTI_RANGE completely within VR_RANGE,
+ not including any endpoints. */
+ if (compare_values (anti_max, real_max) == -1
+ && compare_values (anti_min, real_min) == 1)
+ {
+ set_value_range (vr_p, VR_ANTI_RANGE, anti_min,
+ anti_max, vr_p->equiv);
+ }
+ /* Case 2, VR_ANTI_RANGE completely disjoint from
+ VR_RANGE. */
+ else if (compare_values (anti_min, real_max) == 1
+ || compare_values (anti_max, real_min) == -1)
+ {
+ set_value_range (vr_p, VR_RANGE, real_min,
+ real_max, vr_p->equiv);
+ }
+ /* Case 3a, the anti-range extends into the low
+ part of the real range, creating a new low
+ bound for the real range. */
+ else if ((compare_values (anti_max, real_min) == 1
+ || compare_values (anti_max, real_min) == 0)
+ && compare_values (anti_max, real_max) == -1)
+ {
+ min = fold_build2 (PLUS_EXPR, TREE_TYPE (var_vr->min),
+ anti_max,
+ build_int_cst (TREE_TYPE (var_vr->min), 1));
+ max = real_max;
+ set_value_range (vr_p, VR_RANGE, min, max, vr_p->equiv);
+ }
+ /* Case 3b, the anti-range extends into the high
+ part of the real range, creating a new high
+ bound for the real range. */
+ else if (compare_values (anti_min, real_min) == 1
+ && (compare_values (anti_min, real_max) == -1
+ || compare_values (anti_min, real_max) == 0))
+ {
+ max = fold_build2 (MINUS_EXPR, TREE_TYPE (var_vr->min),
+ anti_min,
+ build_int_cst (TREE_TYPE (var_vr->min), 1));
+ min = real_min;
+ set_value_range (vr_p, VR_RANGE, min, max, vr_p->equiv);
+ }
+ }
}
/* Remove names from the equivalence set that have ranges
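
Worked instances of the four outcomes described in the comment, using made-up integer ranges (range written [a, b], anti-range written ~[a, b]):

    Case 1:  [0, 100] with ~[10, 20]  ->  ~[10, 20]   (anti-range strictly inside)
    Case 2:  [0, 10]  with ~[20, 30]  ->  [0, 10]     (completely disjoint)
    Case 3a: [5, 100] with ~[0, 10]   ->  [11, 100]   (new low bound = anti_max + 1)
    Case 3b: [0, 50]  with ~[40, 90]  ->  [0, 39]     (new high bound = anti_min - 1)
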
diff --git a/gcc/tree.def b/gcc/tree.def
index 9e7e5b011d5..f99b2474f7f 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1,7 +1,7 @@
/* This file contains the definitions and documentation for the
tree codes used in GCC.
- Copyright (C) 1987, 1988, 1993, 1995, 1997, 1998, 2000, 2001, 2004, 2005
- Free Software Foundation, Inc.
+ Copyright (C) 1987, 1988, 1993, 1995, 1997, 1998, 2000, 2001, 2004, 2005,
+ 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -65,7 +65,6 @@ DEFTREECODE (TREE_VEC, "tree_vec", tcc_exceptional, 0)
For a block which represents the outermost scope of a function, it
points to the FUNCTION_DECL node.
BLOCK_VARS points to a chain of decl nodes.
- BLOCK_TYPE_TAGS points to a chain of types which have their own names.
BLOCK_CHAIN points to the next BLOCK at the same level.
BLOCK_ABSTRACT_ORIGIN points to the original (abstract) tree node which
this block is an instance of, or else is NULL to indicate that this
@@ -957,8 +956,15 @@ DEFTREECODE (TARGET_MEM_REF, "target_mem_ref", tcc_reference, 7)
exposed to TREE_RANGE_CHECK. */
/* OpenMP - #pragma omp parallel [clause1 ... clauseN]
Operand 0: OMP_PARALLEL_BODY: Code to be executed by all threads.
- Operand 1: OMP_PARALLEL_CLAUSES: List of clauses. */
-DEFTREECODE (OMP_PARALLEL, "omp_parallel", tcc_statement, 2)
+ Operand 1: OMP_PARALLEL_CLAUSES: List of clauses.
+ Operand 2: OMP_PARALLEL_FN: FUNCTION_DECL used when outlining the
+ body of the parallel region. Only valid after
+ pass_lower_omp.
+ Operand 3: OMP_PARALLEL_DATA_ARG: Local variable in the parent
+ function containing data to be shared with the child
+ function. */
+
+DEFTREECODE (OMP_PARALLEL, "omp_parallel", tcc_statement, 4)
/* OpenMP - #pragma omp for [clause1 ... clauseN]
Operand 0: OMP_FOR_BODY: Loop body.
@@ -983,8 +989,11 @@ DEFTREECODE (OMP_FOR, "omp_for", tcc_statement, 6)
/* OpenMP - #pragma omp sections [clause1 ... clauseN]
Operand 0: OMP_SECTIONS_BODY: Sections body.
- Operand 1: OMP_SECTIONS_CLAUSES: List of clauses. */
-DEFTREECODE (OMP_SECTIONS, "omp_sections", tcc_statement, 2)
+ Operand 1: OMP_SECTIONS_CLAUSES: List of clauses.
+ Operand 2: OMP_SECTIONS_SECTIONS: Vector of the different sections
+ in the body. Only valid after lowering and destroyed
+ after the CFG has been built. */
+DEFTREECODE (OMP_SECTIONS, "omp_sections", tcc_statement, 3)
/* OpenMP - #pragma omp single
Operand 0: OMP_SINGLE_BODY: Single section body.
@@ -1063,6 +1072,9 @@ DEFTREECODE (OMP_CLAUSE_ORDERED, "ordered", tcc_expression, 0)
/* OpenMP clause: default. */
DEFTREECODE (OMP_CLAUSE_DEFAULT, "default", tcc_expression, 0)
+/* Return from an OpenMP directive. */
+DEFTREECODE (OMP_RETURN_EXPR, "omp_return", tcc_statement, 0)
+
/* Reduction operations.
Operations that take a vector of elements and "reduce" it to a scalar
result (e.g. summing the elements of the vector, finding the minimum over
@@ -1073,6 +1085,33 @@ DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1)
DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1)
DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1)
+/* Widening dot-product.
+ The first two arguments are of type t1.
+ The third argument and the result are of type t2, such that t2 is at least
+ twice the size of t1. DOT_PROD_EXPR(arg1,arg2,arg3) is equivalent to:
+ tmp = WIDEN_MULT_EXPR(arg1, arg2);
+ arg3 = PLUS_EXPR (tmp, arg3);
+ or:
+ tmp = WIDEN_MULT_EXPR(arg1, arg2);
+ arg3 = WIDEN_SUM_EXPR (tmp, arg3); */
+DEFTREECODE (DOT_PROD_EXPR, "dot_prod_expr", tcc_expression, 3)
+
+/* Widening summation.
+ The first argument is of type t1.
+ The second argument is of type t2, such that t2 is at least twice
+ the size of t1. The type of the entire expression is also t2.
+ WIDEN_SUM_EXPR is equivalent to first widening (promoting)
+ the first argument from type t1 to type t2, and then summing it
+ with the second argument. */
+DEFTREECODE (WIDEN_SUM_EXPR, "widen_sum_expr", tcc_binary, 2)
+
+/* Widening multiplication.
+ The two arguments are of type t1.
+ The result is of type t2, such that t2 is at least twice
+ the size of t1. WIDEN_MULT_EXPR is equivalent to first widening (promoting)
+ the arguments from type t1 to type t2, and then multiplying them. */
+DEFTREECODE (WIDEN_MULT_EXPR, "widen_mult_expr", tcc_binary, 2)
+
/* Whole vector left/right shift in bits.
Operand 0 is a vector to be shifted.
Operand 1 is an integer shift amount in bits. */
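
For reference, the scalar idiom DOT_PROD_EXPR is meant to capture, in plain C (with t1 = short and t2 = int in the terms used above); WIDEN_SUM_EXPR and WIDEN_MULT_EXPR are the corresponding building blocks for the sum-only and multiply-only forms. Illustrative sketch, not code from the patch:

    int
    dot_product (const short *a, const short *b, int n)
    {
      int sum = 0;
      int i;

      for (i = 0; i < n; i++)
        sum += a[i] * b[i];   /* sum = DOT_PROD_EXPR <a[i], b[i], sum>  */
      return sum;
    }
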
diff --git a/gcc/tree.h b/gcc/tree.h
index f7244fc4f74..99463aed570 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -166,6 +166,19 @@ extern const enum tree_code_class tree_code_type[];
#define EXPR_P(NODE) IS_EXPR_CODE_CLASS (TREE_CODE_CLASS (TREE_CODE (NODE)))
+/* Returns nonzero iff NODE is an OpenMP directive. */
+
+#define OMP_DIRECTIVE_P(NODE) \
+ (TREE_CODE (NODE) == OMP_PARALLEL \
+ || TREE_CODE (NODE) == OMP_SECTIONS \
+ || TREE_CODE (NODE) == OMP_SECTION \
+ || TREE_CODE (NODE) == OMP_FOR \
+ || TREE_CODE (NODE) == OMP_RETURN_EXPR \
+ || TREE_CODE (NODE) == OMP_SINGLE \
+ || TREE_CODE (NODE) == OMP_MASTER \
+ || TREE_CODE (NODE) == OMP_ORDERED \
+ || TREE_CODE (NODE) == OMP_CRITICAL)
+
/* Number of argument-words in each kind of tree-node. */
extern const unsigned char tree_code_length[];
@@ -1424,6 +1437,8 @@ struct tree_constructor GTY(())
#define OMP_PARALLEL_BODY(NODE) TREE_OPERAND (OMP_PARALLEL_CHECK (NODE), 0)
#define OMP_PARALLEL_CLAUSES(NODE) TREE_OPERAND (OMP_PARALLEL_CHECK (NODE), 1)
+#define OMP_PARALLEL_FN(NODE) TREE_OPERAND (OMP_PARALLEL_CHECK (NODE), 2)
+#define OMP_PARALLEL_DATA_ARG(NODE) TREE_OPERAND (OMP_PARALLEL_CHECK (NODE), 3)
#define OMP_FOR_BODY(NODE) TREE_OPERAND (OMP_FOR_CHECK (NODE), 0)
#define OMP_FOR_CLAUSES(NODE) TREE_OPERAND (OMP_FOR_CHECK (NODE), 1)
@@ -1434,6 +1449,7 @@ struct tree_constructor GTY(())
#define OMP_SECTIONS_BODY(NODE) TREE_OPERAND (OMP_SECTIONS_CHECK (NODE), 0)
#define OMP_SECTIONS_CLAUSES(NODE) TREE_OPERAND (OMP_SECTIONS_CHECK (NODE), 1)
+#define OMP_SECTIONS_SECTIONS(NODE) TREE_OPERAND (OMP_SECTIONS_CHECK (NODE), 2)
#define OMP_SECTION_BODY(NODE) TREE_OPERAND (OMP_SECTION_CHECK (NODE), 0)
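
The two new OMP_PARALLEL operands record the results of outlining the parallel region. A C sketch of what that outlining conceptually produces (all identifiers below are illustrative, not the names GCC actually generates):

    struct omp_data_s { int x; };            /* block of shared data            */

    static void
    parallel_body (struct omp_data_s *data)  /* what OMP_PARALLEL_FN refers to  */
    {
      data->x++;
    }

    void
    parent (void)
    {
      struct omp_data_s omp_data_o = { 0 };  /* the parent-local variable that
                                                OMP_PARALLEL_DATA_ARG records   */
      /* Conceptually: #pragma omp parallel shared(x) means every thread
         runs parallel_body (&omp_data_o).  */
      parallel_body (&omp_data_o);
    }
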
diff --git a/gcc/version.c b/gcc/version.c
index 52611603368..1693b747d0b 100644
--- a/gcc/version.c
+++ b/gcc/version.c
@@ -8,7 +8,7 @@
in parentheses. You may also wish to include a number indicating
the revision of your modified compiler. */
-#define VERSUFFIX " (merged 20060118)"
+#define VERSUFFIX " (merged 20060119)"
/* This is the location of the online document giving instructions for
reporting bugs. If you distribute a modified version of GCC,
diff --git a/libjava/ChangeLog b/libjava/ChangeLog
index 032a98db4ce..31f66e6a04d 100644
--- a/libjava/ChangeLog
+++ b/libjava/ChangeLog
@@ -1,3 +1,8 @@
+2006-01-18 Tom Tromey <tromey@redhat.com>
+
+ * jni.cc (_Jv_JNI_AttachCurrentThread): Return environment if it
+ already exists.
+
2006-01-18 Keith Seitz <keiths@redhat.com>
* include/java-interp.h (_Jv_CompileMethod): Add declaration.
diff --git a/libjava/jni.cc b/libjava/jni.cc
index fd827065718..5a93753f006 100644
--- a/libjava/jni.cc
+++ b/libjava/jni.cc
@@ -1,6 +1,6 @@
// jni.cc - JNI implementation, including the jump table.
-/* Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
+/* Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
Free Software Foundation
This file is part of libgcj.
@@ -2352,10 +2352,14 @@ _Jv_JNI_AttachCurrentThread (JavaVM *, jstring name, void **penv,
}
// Attaching an already-attached thread is a no-op.
- if (_Jv_GetCurrentJNIEnv () != NULL)
- return 0;
+ JNIEnv *env = _Jv_GetCurrentJNIEnv ();
+ if (env != NULL)
+ {
+ *penv = reinterpret_cast<void *> (env);
+ return 0;
+ }
- JNIEnv *env = (JNIEnv *) _Jv_MallocUnchecked (sizeof (JNIEnv));
+ env = (JNIEnv *) _Jv_MallocUnchecked (sizeof (JNIEnv));
if (env == NULL)
return JNI_ERR;
env->p = &_Jv_JNIFunctions;
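
An illustrative caller of the public JNI invocation API (not libgcj internals) showing why the change matters: a second attach used to return success while leaving *penv untouched, whereas it now receives the existing environment as well:

    #include <jni.h>

    static JNIEnv *
    attach_twice (JavaVM *vm)
    {
      JNIEnv *env = NULL;

      if ((*vm)->AttachCurrentThread (vm, (void **) &env, NULL) != 0)
        return NULL;
      /* Attaching again is a no-op, but env is still filled in.  */
      if ((*vm)->AttachCurrentThread (vm, (void **) &env, NULL) != 0)
        return NULL;
      return env;
    }
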