17 files changed, 1012 insertions, 400 deletions
diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index f630d1f8450..03e372e04fa 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -1,6 +1,6 @@
 /* Operating system specific defines to be used when targeting GCC for
    hosting on Windows32, using a Unix style C library and tools.
-   Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002
+   Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
    Free Software Foundation, Inc.
 
 This file is part of GNU CC.
@@ -134,7 +134,8 @@ Boston, MA 02111-1307, USA.  */
    by calling the init function from the prologue.  */
 
 #undef LIBGCC_SPEC
-#define LIBGCC_SPEC "%{mno-cygwin: %{mthreads:-lmingwthrd} -lmingw32} -lgcc %{mno-cygwin:-lmoldname -lmsvcrt}"
+#define LIBGCC_SPEC "%{mno-cygwin: %{mthreads:-lmingwthrd} -lmingw32}	\
+  -lgcc %{mno-cygwin:-lmoldname -lmingwex -lmsvcrt}"
 
 /* This macro defines names of additional specifications to put in the specs
    that can be used in various specifications like CC1_SPEC.  Its definition
@@ -306,11 +307,13 @@ do {							\
 #define CHECK_STACK_LIMIT 4000
 
 /* By default, target has a 80387, uses IEEE compatible arithmetic,
-   and returns float values in the 387 and needs stack probes */
-#undef TARGET_SUBTARGET_DEFAULT
+   returns float values in the 387 and needs stack probes.
+   We also align doubles to 64 bits for MSVC default compatibility.  */
 
+#undef TARGET_SUBTARGET_DEFAULT
 #define TARGET_SUBTARGET_DEFAULT \
-   (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_STACK_PROBE) 
+   (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_STACK_PROBE \
+    | MASK_ALIGN_DOUBLE)
 
 /* This is how to output an assembler line
    that says to advance the location counter
@@ -397,6 +400,15 @@ extern void i386_pe_unique_section PARAMS ((TREE, int));
 	const0_rtx));							\
     }
 
+/* Java Native Interface (JNI) methods on Win32 are invoked using the
+   stdcall calling convention.  */
+#undef MODIFY_JNI_METHOD_CALL
+#define MODIFY_JNI_METHOD_CALL(MDECL)					      \
+  build_type_attribute_variant ((MDECL),				      \
+			       build_tree_list (get_identifier ("stdcall"),   \
+						NULL))
+
+
 /* External function declarations.  */
 
 extern void i386_pe_record_external_function PARAMS ((const char *));
diff --git a/gcc/config/i386/djgpp.h b/gcc/config/i386/djgpp.h
index a271aa47cde..67807804501 100644
--- a/gcc/config/i386/djgpp.h
+++ b/gcc/config/i386/djgpp.h
@@ -136,6 +136,8 @@ Boston, MA 02111-1307, USA.  */
 #undef ASM_FILE_START
 #define ASM_FILE_START(FILE)						\
   do {									\
+	if (ix86_asm_dialect == ASM_INTEL)				\
+	  fputs ("\t.intel_syntax\n", FILE);				\
 	output_file_directive (FILE, main_input_filename);		\
   } while (0)
 
diff --git a/gcc/config/i386/freebsd-aout.h b/gcc/config/i386/freebsd-aout.h
index a2b616e700b..85e2703f42c 100644
--- a/gcc/config/i386/freebsd-aout.h
+++ b/gcc/config/i386/freebsd-aout.h
@@ -1,6 +1,6 @@
 /* Definitions of target machine for GNU compiler for Intel 80386
    running FreeBSD.
-   Copyright (C) 1988, 1992, 1994, 1996, 1997, 1999, 2000, 2002
+   Copyright (C) 1988, 1992, 1994, 1996, 1997, 1999, 2000, 2002, 2003
    Free Software Foundation, Inc.
    Contributed by Poul-Henning Kamp <phk@login.dkuug.dk>
    Continued development by David O'Brien <obrien@NUXI.org>
@@ -94,6 +94,9 @@ Boston, MA 02111-1307, USA.  */
 
 /* Profiling routines, partially copied from i386/osfrose.h.  */
 
+/* Tell final.c that we don't need a label passed to mcount.  */
+#define NO_PROFILE_COUNTERS 1
+
 #undef MCOUNT_NAME
 #define MCOUNT_NAME "mcount"
 #undef PROFILE_COUNT_REGISTER
@@ -112,6 +115,7 @@ Boston, MA 02111-1307, USA.  */
 
 #define TYPE_ASM_OP	"\t.type\t"
 #define SIZE_ASM_OP	"\t.size\t"
+#define SET_ASM_OP	"\t.set\t"
 
 /* The following macro defines the format used to output the second
    operand of the .type assembler directive.  Different svr4 assemblers
@@ -121,6 +125,12 @@ Boston, MA 02111-1307, USA.  */
 
 #define TYPE_OPERAND_FMT	"@%s"
 
+#define HANDLE_SYSV_PRAGMA	1
+
+#define ASM_WEAKEN_LABEL(FILE,NAME) \
+	do { fputs ("\t.weak\t", FILE); assemble_name (FILE, NAME); \
+	fputc ('\n', FILE); } while (0)
+
 /* Write the extra assembler code needed to declare a function's result.
    Most svr4 assemblers don't require any special declaration of the
    result value, but there are exceptions.  */
diff --git a/gcc/config/i386/freebsd64.h b/gcc/config/i386/freebsd64.h
index 699f4c4d344..12ca062301d 100644
--- a/gcc/config/i386/freebsd64.h
+++ b/gcc/config/i386/freebsd64.h
@@ -29,8 +29,7 @@ Boston, MA 02111-1307, USA.  */
 
 #undef	LINK_SPEC
 #define LINK_SPEC "\
-  %{!m32:-m elf_x86_64} \
-  %{m32:-m elf_i386} \
+  %{m32:-m elf_i386_fbsd} \
   %{Wl,*:%*} \
   %{v:-V} \
   %{assert*} %{R*} %{rpath*} %{defsym*} \
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 4afdf668bd8..d3c9d160190 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -190,6 +190,9 @@ extern void x86_function_profiler PARAMS ((FILE *, int));
 #ifdef TREE_CODE
 extern void init_cumulative_args PARAMS ((CUMULATIVE_ARGS *, tree, rtx));
 extern rtx function_arg PARAMS ((CUMULATIVE_ARGS *, enum machine_mode, tree, int));
+extern int function_arg_pass_by_reference PARAMS ((CUMULATIVE_ARGS *,
+						   enum machine_mode,
+						   tree, int));
 extern void function_arg_advance PARAMS ((CUMULATIVE_ARGS *, enum machine_mode,
 					tree, int));
 extern rtx ix86_function_value PARAMS ((tree));
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 42f6d93d3c3..0f8c8e4c3fc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1,6 +1,6 @@
 /* Subroutines used for code generation on IA-32.
    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-   2002 Free Software Foundation, Inc.
+   2002, 2003 Free Software Foundation, Inc.
 
 This file is part of GNU CC.
 
@@ -799,6 +799,7 @@ const struct attribute_spec ix86_attribute_table[];
 static tree ix86_handle_cdecl_attribute PARAMS ((tree *, tree, tree, int, bool *));
 static tree ix86_handle_regparm_attribute PARAMS ((tree *, tree, tree, int, bool *));
 static int ix86_value_regno PARAMS ((enum machine_mode));
+static bool contains_128bit_aligned_vector_p PARAMS ((tree));
 
 #if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION)
 static void ix86_svr3_asm_out_constructor PARAMS ((rtx, int));
@@ -911,6 +912,12 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class,
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
+/* The svr4 ABI for the i386 says that records and unions are returned
+   in memory.  */
+#ifndef DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 1
+#endif
+
 /* Sometimes certain combinations of command options do not make
    sense on a particular target machine.  You can define a macro
    `OVERRIDE_OPTIONS' to take account of this.  This macro, if
@@ -1021,7 +1028,7 @@ override_options ()
       if (flag_asynchronous_unwind_tables == 2)
 	flag_asynchronous_unwind_tables = 0;
       if (flag_pcc_struct_return == 2)
-	flag_pcc_struct_return = 1;
+	flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
     }
 
 #ifdef SUBTARGET_OVERRIDE_OPTIONS
@@ -2252,6 +2259,9 @@ function_arg (cum, mode, type, named)
 	break;
 
       case BLKmode:
+	if (bytes < 0)
+	  break;
+	/* FALLTHRU */
       case DImode:
       case SImode:
       case HImode:
@@ -2282,6 +2292,90 @@ function_arg (cum, mode, type, named)
   return ret;
 }
 
+/* Return true when TYPE, or a base, field or array element type within it,
+   should be 128bit aligned for the 32bit argument passing ABI.  */
+static bool
+contains_128bit_aligned_vector_p (type)
+     tree type;
+{
+  enum machine_mode mode = TYPE_MODE (type);
+  if (SSE_REG_MODE_P (mode)
+      && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
+    return true;
+  if (TYPE_ALIGN (type) < 128)
+    return false;
+
+  if (AGGREGATE_TYPE_P (type))
+    {
+      /* Walk the aggregates recursively.  */
+      if (TREE_CODE (type) == RECORD_TYPE
+	  || TREE_CODE (type) == UNION_TYPE
+	  || TREE_CODE (type) == QUAL_UNION_TYPE)
+	{
+	  tree field;
+
+	  if (TYPE_BINFO (type) != NULL
+	      && TYPE_BINFO_BASETYPES (type) != NULL)
+	    {
+	      tree bases = TYPE_BINFO_BASETYPES (type);
+	      int n_bases = TREE_VEC_LENGTH (bases);
+	      int i;
+
+	      for (i = 0; i < n_bases; ++i)
+		{
+		  tree binfo = TREE_VEC_ELT (bases, i);
+		  tree base_type = BINFO_TYPE (binfo);
+
+		  if (contains_128bit_aligned_vector_p (base_type))
+		    return true;
+		}
+	    }
+	  /* And now check the fields of the structure.  */
+	  for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+	    {
+	      if (TREE_CODE (field) == FIELD_DECL
+		  && contains_128bit_aligned_vector_p (TREE_TYPE (field)))
+		return true;
+	    }
+	}
+      /* Just in case some language passes arrays by value.  */
+      else if (TREE_CODE (type) == ARRAY_TYPE)
+	{
+	  if (contains_128bit_aligned_vector_p (TREE_TYPE (type)))
+	    return true;
+	}
+      else
+	abort ();
+    }
+  return false;
+}
+
+/* Return nonzero when an argument of TYPE must be passed by reference,
+   i.e. the caller passes a pointer to the argument instead of the
+   argument itself.  This is the case only on 64bit targets, and only
+   for types whose size int_size_in_bytes reports as -1.  CUM, MODE
+   and NAMED are unused.  */
+
+int
+function_arg_pass_by_reference (cum, mode, type, named)
+     CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED;
+     enum machine_mode mode ATTRIBUTE_UNUSED;
+     tree type;
+     int named ATTRIBUTE_UNUSED;
+{
+  if (!TARGET_64BIT)
+    return 0;
+
+  if (type && int_size_in_bytes (type) == -1)
+    {
+      if (TARGET_DEBUG_ARG)
+	fprintf (stderr, "function_arg_pass_by_reference\n");
+      return 1;
+    }
+
+  return 0;
+}
+
 /* Gives the alignment boundary, in bits, of an argument with the specified mode
    and type.   */
 
@@ -2291,14 +2385,34 @@ ix86_function_arg_boundary (mode, type)
      tree type;
 {
   int align;
-  if (!TARGET_64BIT)
-    return PARM_BOUNDARY;
   if (type)
     align = TYPE_ALIGN (type);
   else
     align = GET_MODE_ALIGNMENT (mode);
   if (align < PARM_BOUNDARY)
     align = PARM_BOUNDARY;
+  if (!TARGET_64BIT)
+    {
+      /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
+	 make an exception for SSE modes since these require 128bit
+	 alignment.  
+
+	 The handling here differs from field_alignment.  ICC aligns MMX
+	 arguments to 4 byte boundaries, while structure fields are aligned
+	 to 8 byte boundaries.  */
+      if (!type)
+	{
+	  if (!SSE_REG_MODE_P (mode))
+	    align = PARM_BOUNDARY;
+	}
+      else
+	{
+	  if (!contains_128bit_aligned_vector_p (type))
+	    align = PARM_BOUNDARY;
+	}
+      if (align != PARM_BOUNDARY && !TARGET_SSE)
+	abort();
+    }
   if (align > 128)
     align = 128;
   return align;
@@ -2488,6 +2602,8 @@ ix86_setup_incoming_varargs (cum, mode, type, pretend_size, no_rtl)
   /* Indicate to allocate space on the stack for varargs save area.  */
   ix86_save_varrargs_registers = 1;
 
+  cfun->stack_alignment_needed = 128;
+
   fntype = TREE_TYPE (current_function_decl);
   stdarg_p = (TYPE_ARG_TYPES (fntype) != 0
 	      && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (fntype)))
@@ -2637,6 +2753,7 @@ ix86_va_arg (valist, type)
   rtx lab_false, lab_over = NULL_RTX;
   rtx addr_rtx, r;
   rtx container;
+  int indirect_p = 0;
 
   /* Only 64bit target needs something special.  */
   if (!TARGET_64BIT)
@@ -2656,6 +2773,13 @@ ix86_va_arg (valist, type)
   sav = build (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav);
 
   size = int_size_in_bytes (type);
+  if (size == -1)
+    {
+      /* Passed by reference.  */
+      indirect_p = 1;
+      type = build_pointer_type (type);
+      size = int_size_in_bytes (type);
+    }
   rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 
   container = construct_container (TYPE_MODE (type), type, 0,
@@ -2760,10 +2884,12 @@ ix86_va_arg (valist, type)
 	{
 	  int i;
 	  rtx mem;
+	  rtx x;
 
 	  /* Never use the memory itself, as it has the alias set.  */
-	  addr_rtx = XEXP (assign_temp (type, 0, 1, 0), 0);
-	  mem = gen_rtx_MEM (BLKmode, addr_rtx);
+	  x = XEXP (assign_temp (type, 0, 1, 0), 0);
+	  mem = gen_rtx_MEM (BLKmode, x);
+	  force_operand (x, addr_rtx);
 	  set_mem_alias_set (mem, get_varargs_alias_set ());
 	  set_mem_align (mem, BITS_PER_UNIT);
 
@@ -2846,6 +2972,13 @@ ix86_va_arg (valist, type)
   if (container)
     emit_label (lab_over);
 
+  if (indirect_p)
+    {
+      r = gen_rtx_MEM (Pmode, addr_rtx);
+      set_mem_alias_set (r, get_varargs_alias_set ());
+      emit_move_insn (addr_rtx, r);
+    }
+
   return addr_rtx;
 }
 
@@ -3401,6 +3534,19 @@ non_q_regs_operand (op, mode)
   return NON_QI_REG_P (op);
 }
 
+/* Return 1 when OP is valid for an SSE move: nonimmediate or a zero constant.  */
+int
+vector_move_operand (op, mode)
+     rtx op;
+     enum machine_mode mode;
+{
+  if (nonimmediate_operand (op, mode))
+    return 1;
+  if (GET_MODE (op) != mode && mode != VOIDmode)
+    return 0;
+  return (op == CONST0_RTX (GET_MODE (op)));
+}
+
 /* Return 1 if OP is a comparison that can be used in the CMPSS/CMPPS
    insns.  */
 int
@@ -4225,7 +4371,8 @@ ix86_save_reg (regno, maybe_eh_return)
       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
       && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
 	  || current_function_profile
-	  || current_function_calls_eh_return))
+	  || current_function_calls_eh_return
+	  || current_function_uses_const_pool))
     {
       if (ix86_select_alt_pic_regnum () != INVALID_REGNUM)
 	return 0;
@@ -4516,6 +4663,10 @@ ix86_expand_prologue ()
       CALL_INSN_FUNCTION_USAGE (insn)
 	= gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_USE (VOIDmode, arg0),
 			     CALL_INSN_FUNCTION_USAGE (insn));
+
+      /* Don't allow the scheduling pass to move insns across the __alloca
+         call.  */
+      emit_insn (gen_blockage (const0_rtx));
     }
   if (use_mov)
     {
@@ -7484,12 +7635,12 @@ output_fp_compare (insn, operands, eflags_p, unordered_p)
 	if (unordered_p)
 	  return "ucomiss\t{%1, %0|%0, %1}";
 	else
-	  return "comiss\t{%1, %0|%0, %y}";
+	  return "comiss\t{%1, %0|%0, %1}";
       else
 	if (unordered_p)
 	  return "ucomisd\t{%1, %0|%0, %1}";
 	else
-	  return "comisd\t{%1, %0|%0, %y}";
+	  return "comisd\t{%1, %0|%0, %1}";
     }
 
   if (! STACK_TOP_P (cmp_op0))
@@ -7777,9 +7928,17 @@ ix86_expand_move (mode, operands)
 
 	  if (strict)
 	    ;
-	  else if (GET_CODE (op1) == CONST_DOUBLE
-		   && register_operand (op0, mode))
-	    op1 = validize_mem (force_const_mem (mode, op1));
+	  else if (GET_CODE (op1) == CONST_DOUBLE)
+	    {
+	      op1 = validize_mem (force_const_mem (mode, op1));
+	      if (!register_operand (op0, mode))
+		{
+		  rtx temp = gen_reg_rtx (mode);
+		  emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
+		  emit_move_insn (op0, temp);
+		  return;
+		}
+	    }
 	}
     }
 
@@ -7799,8 +7958,12 @@ ix86_expand_vector_move (mode, operands)
      to handle some of them more efficiently.  */
   if ((reload_in_progress | reload_completed) == 0
       && register_operand (operands[0], mode)
-      && CONSTANT_P (operands[1]))
-    operands[1] = force_const_mem (mode, operands[1]);
+      && CONSTANT_P (operands[1]) && operands[1] != CONST0_RTX (mode))
+    {
+      operands[1] = force_const_mem (mode, operands[1]);
+      emit_move_insn (operands[0], operands[1]);
+      return;
+    }
 
   /* Make operand1 a register if it isn't already.  */
   if (!no_new_pseudos
@@ -9219,11 +9382,11 @@ ix86_expand_int_movcc (operands)
 	  /* On x86_64 the lea instruction operates on Pmode, so we need
 	     to get arithmetics done in proper mode to match.  */
 	  if (diff == 1)
-	    tmp = out;
+	    tmp = copy_rtx (out);
 	  else
 	    {
 	      rtx out1;
-	      out1 = out;
+	      out1 = copy_rtx (out);
 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
 	      nops++;
 	      if (diff & 1)
@@ -9241,9 +9404,9 @@ ix86_expand_int_movcc (operands)
 	      && (GET_CODE (tmp) != SUBREG || SUBREG_REG (tmp) != out))
 	    {
 	      if (nops == 1)
-		out = force_operand (tmp, out);
+		out = force_operand (tmp, copy_rtx (out));
 	      else
-		emit_insn (gen_rtx_SET (VOIDmode, out, tmp));
+		emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
 	    }
 	  if (out != operands[0])
 	    emit_move_insn (operands[0], copy_rtx (out));
@@ -9822,15 +9985,24 @@ ix86_split_long_move (operands)
 	 Do an lea to the last part and use only one colliding move.  */
       else if (collisions > 1)
 	{
+	  rtx base;
+
 	  collisions = 1;
-	  emit_insn (gen_rtx_SET (VOIDmode, part[0][nparts - 1],
-				  XEXP (part[1][0], 0)));
-	  part[1][0] = change_address (part[1][0],
-				       TARGET_64BIT ? DImode : SImode,
-				       part[0][nparts - 1]);
-	  part[1][1] = adjust_address (part[1][0], VOIDmode, UNITS_PER_WORD);
+
+	  base = part[0][nparts - 1];
+
+	  /* Handle the case when the last part isn't valid for lea.
+	     Happens in 64-bit mode storing the 12-byte XFmode.  */
+	  if (GET_MODE (base) != Pmode)
+	    base = gen_rtx_REG (Pmode, REGNO (base));
+
+	  emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
+	  part[1][0] = replace_equiv_address (part[1][0], base);
+	  part[1][1] = replace_equiv_address (part[1][1],
+				      plus_constant (base, UNITS_PER_WORD));
 	  if (nparts == 3)
-	    part[1][2] = adjust_address (part[1][0], VOIDmode, 8);
+	    part[1][2] = replace_equiv_address (part[1][2],
+				      plus_constant (base, 8));
 	}
     }
 
@@ -10973,7 +11145,8 @@ memory_address_length (addr)
       if (disp)
 	{
 	  if (GET_CODE (disp) == CONST_INT
-	      && CONST_OK_FOR_LETTER_P (INTVAL (disp), 'K'))
+	      && CONST_OK_FOR_LETTER_P (INTVAL (disp), 'K')
+	      && base)
 	    len = 1;
 	  else
 	    len = 4;
@@ -11036,6 +11209,26 @@ ix86_attr_length_address_default (insn)
      rtx insn;
 {
   int i;
+
+  if (get_attr_type (insn) == TYPE_LEA)
+    {
+      rtx set = PATTERN (insn);
+      if (GET_CODE (set) == SET)
+	;
+      else if (GET_CODE (set) == PARALLEL
+	       && GET_CODE (XVECEXP (set, 0, 0)) == SET)
+	set = XVECEXP (set, 0, 0);
+      else
+	{
+#ifdef ENABLE_CHECKING
+	  abort ();
+#endif
+	  return 0;
+	}
+
+      return memory_address_length (SET_SRC (set));
+    }
+
   extract_insn_cached (insn);
   for (i = recog_data.n_operands - 1; i >= 0; --i)
     if (GET_CODE (recog_data.operand[i]) == MEM)
@@ -11834,7 +12027,8 @@ x86_initialize_trampoline (tramp, fnaddr, cxt)
 
 #define def_builtin(MASK, NAME, TYPE, CODE)			\
 do {								\
-  if ((MASK) & target_flags)					\
+  if ((MASK) & target_flags					\
+      && (!((MASK) & MASK_64BIT) || TARGET_64BIT))		\
     builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD,	\
 		      NULL, NULL_TREE);				\
 } while (0)
@@ -11851,6 +12045,8 @@ struct builtin_description
 
 /* Used for builtins that are enabled both by -msse and -msse2.  */
 #define MASK_SSE1 (MASK_SSE | MASK_SSE2)
+#define MASK_SSE164 (MASK_SSE | MASK_SSE2 | MASK_64BIT)
+#define MASK_SSE264 (MASK_SSE2 | MASK_64BIT)
 
 static const struct builtin_description bdesc_comi[] =
 {
@@ -11933,9 +12129,11 @@ static const struct builtin_description bdesc_2arg[] =
   { MASK_MMX, CODE_FOR_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, 0, 0 },
   { MASK_MMX, CODE_FOR_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, 0, 0 },
   { MASK_MMX, CODE_FOR_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, 0, 0 },
+  { MASK_MMX, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, 0, 0 },
   { MASK_MMX, CODE_FOR_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, 0, 0 },
   { MASK_MMX, CODE_FOR_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, 0, 0 },
   { MASK_MMX, CODE_FOR_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, 0, 0 },
+  { MASK_MMX, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, 0, 0 },
 
   { MASK_MMX, CODE_FOR_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, 0, 0 },
   { MASK_MMX, CODE_FOR_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, 0, 0 },
@@ -11984,6 +12182,7 @@ static const struct builtin_description bdesc_2arg[] =
 
   { MASK_SSE1, CODE_FOR_cvtpi2ps, 0, IX86_BUILTIN_CVTPI2PS, 0, 0 },
   { MASK_SSE1, CODE_FOR_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, 0, 0 },
+  { MASK_SSE164, CODE_FOR_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, 0, 0 },
 
   { MASK_MMX, CODE_FOR_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, 0, 0 },
   { MASK_MMX, CODE_FOR_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, 0, 0 },
@@ -12056,11 +12255,11 @@ static const struct builtin_description bdesc_2arg[] =
   { MASK_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, 0, 0 },
   { MASK_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, 0, 0 },
   { MASK_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, 0, 0 },
-  { MASK_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, 0, 0 },
+  { MASK_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, 0, 0 },
   { MASK_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, 0, 0 },
   { MASK_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, 0, 0 },
   { MASK_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, 0, 0 },
-  { MASK_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, 0, 0 },
+  { MASK_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, 0, 0 },
 
   { MASK_MMX, CODE_FOR_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, 0, 0 },
   { MASK_MMX, CODE_FOR_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, 0, 0 },
@@ -12134,6 +12333,7 @@ static const struct builtin_description bdesc_2arg[] =
   { MASK_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, 0, 0 },
 
   { MASK_SSE2, CODE_FOR_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, 0, 0 },
+  { MASK_SSE264, CODE_FOR_cvtsi2sdq, 0, IX86_BUILTIN_CVTSI642SD, 0, 0 },
   { MASK_SSE2, CODE_FOR_cvtsd2ss, 0, IX86_BUILTIN_CVTSD2SS, 0, 0 },
   { MASK_SSE2, CODE_FOR_cvtss2sd, 0, IX86_BUILTIN_CVTSS2SD, 0, 0 }
 };
@@ -12149,8 +12349,10 @@ static const struct builtin_description bdesc_1arg[] =
 
   { MASK_SSE1, CODE_FOR_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, 0, 0 },
   { MASK_SSE1, CODE_FOR_cvtss2si, 0, IX86_BUILTIN_CVTSS2SI, 0, 0 },
+  { MASK_SSE164, CODE_FOR_cvtss2siq, 0, IX86_BUILTIN_CVTSS2SI64, 0, 0 },
   { MASK_SSE1, CODE_FOR_cvttps2pi, 0, IX86_BUILTIN_CVTTPS2PI, 0, 0 },
   { MASK_SSE1, CODE_FOR_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, 0, 0 },
+  { MASK_SSE164, CODE_FOR_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, 0, 0 },
 
   { MASK_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, 0, 0 },
   { MASK_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, 0, 0 },
@@ -12172,6 +12374,8 @@ static const struct builtin_description bdesc_1arg[] =
 
   { MASK_SSE2, CODE_FOR_cvtsd2si, 0, IX86_BUILTIN_CVTSD2SI, 0, 0 },
   { MASK_SSE2, CODE_FOR_cvttsd2si, 0, IX86_BUILTIN_CVTTSD2SI, 0, 0 },
+  { MASK_SSE264, CODE_FOR_cvtsd2siq, 0, IX86_BUILTIN_CVTSD2SI64, 0, 0 },
+  { MASK_SSE264, CODE_FOR_cvttsd2siq, 0, IX86_BUILTIN_CVTTSD2SI64, 0, 0 },
 
   { MASK_SSE2, CODE_FOR_cvtps2dq, 0, IX86_BUILTIN_CVTPS2DQ, 0, 0 },
   { MASK_SSE2, CODE_FOR_cvtps2pd, 0, IX86_BUILTIN_CVTPS2PD, 0, 0 },
@@ -12197,7 +12401,11 @@ ix86_init_mmx_sse_builtins ()
   size_t i;
 
   tree pchar_type_node = build_pointer_type (char_type_node);
+  tree pcchar_type_node = build_pointer_type (
+			     build_type_variant (char_type_node, 1, 0));
   tree pfloat_type_node = build_pointer_type (float_type_node);
+  tree pcfloat_type_node = build_pointer_type (
+			     build_type_variant (float_type_node, 1, 0));
   tree pv2si_type_node = build_pointer_type (V2SI_type_node);
   tree pv2di_type_node = build_pointer_type (V2DI_type_node);
   tree pdi_type_node = build_pointer_type (long_long_unsigned_type_node);
@@ -12213,11 +12421,18 @@ ix86_init_mmx_sse_builtins ()
   tree int_ftype_v4sf
     = build_function_type_list (integer_type_node,
 				V4SF_type_node, NULL_TREE);
+  tree int64_ftype_v4sf
+    = build_function_type_list (long_long_integer_type_node,
+				V4SF_type_node, NULL_TREE);
   tree int_ftype_v8qi
     = build_function_type_list (integer_type_node, V8QI_type_node, NULL_TREE);
   tree v4sf_ftype_v4sf_int
     = build_function_type_list (V4SF_type_node,
 				V4SF_type_node, integer_type_node, NULL_TREE);
+  tree v4sf_ftype_v4sf_int64
+    = build_function_type_list (V4SF_type_node,
+				V4SF_type_node, long_long_integer_type_node,
+				NULL_TREE);
   tree v4sf_ftype_v4sf_v2si
     = build_function_type_list (V4SF_type_node,
 				V4SF_type_node, V2SI_type_node, NULL_TREE);
@@ -12270,8 +12485,8 @@ ix86_init_mmx_sse_builtins ()
     = build_function_type_list (void_type_node,
 				V8QI_type_node, V8QI_type_node,
 				pchar_type_node, NULL_TREE);
-  tree v4sf_ftype_pfloat
-    = build_function_type_list (V4SF_type_node, pfloat_type_node, NULL_TREE);
+  tree v4sf_ftype_pcfloat
+    = build_function_type_list (V4SF_type_node, pcfloat_type_node, NULL_TREE);
   /* @@@ the type is bogus */
   tree v4sf_ftype_v4sf_pv2si
     = build_function_type_list (V4SF_type_node,
@@ -12326,7 +12541,11 @@ ix86_init_mmx_sse_builtins ()
     = build_function_type_list (V2SI_type_node,
 				V2SF_type_node, V2SF_type_node, NULL_TREE);
   tree pint_type_node    = build_pointer_type (integer_type_node);
+  tree pcint_type_node = build_pointer_type (
+			     build_type_variant (integer_type_node, 1, 0));
   tree pdouble_type_node = build_pointer_type (double_type_node);
+  tree pcdouble_type_node = build_pointer_type (
+				build_type_variant (double_type_node, 1, 0));
   tree int_ftype_v2df_v2df
     = build_function_type_list (integer_type_node,
 				V2DF_type_node, V2DF_type_node, NULL_TREE);
@@ -12338,8 +12557,8 @@ ix86_init_mmx_sse_builtins ()
   tree ti_ftype_ti_ti
     = build_function_type_list (intTI_type_node,
 				intTI_type_node, intTI_type_node, NULL_TREE);
-  tree void_ftype_pvoid
-    = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
+  tree void_ftype_pcvoid
+    = build_function_type_list (void_type_node, const_ptr_type_node, NULL_TREE);
   tree v2di_ftype_di
     = build_function_type_list (V2DI_type_node,
 				long_long_unsigned_type_node, NULL_TREE);
@@ -12364,9 +12583,16 @@ ix86_init_mmx_sse_builtins ()
     = build_function_type_list (V2DF_type_node, V4SF_type_node, NULL_TREE);
   tree int_ftype_v2df
     = build_function_type_list (integer_type_node, V2DF_type_node, NULL_TREE);
+  tree int64_ftype_v2df
+    = build_function_type_list (long_long_integer_type_node,
+		    		V2DF_type_node, NULL_TREE);
   tree v2df_ftype_v2df_int
     = build_function_type_list (V2DF_type_node,
 				V2DF_type_node, integer_type_node, NULL_TREE);
+  tree v2df_ftype_v2df_int64
+    = build_function_type_list (V2DF_type_node,
+				V2DF_type_node, long_long_integer_type_node,
+				NULL_TREE);
   tree v4sf_ftype_v4sf_v2df
     = build_function_type_list (V4SF_type_node,
 				V4SF_type_node, V2DF_type_node, NULL_TREE);
@@ -12394,8 +12620,8 @@ ix86_init_mmx_sse_builtins ()
     = build_function_type_list (void_type_node,
 				V16QI_type_node, V16QI_type_node,
 				pchar_type_node, NULL_TREE);
-  tree v2df_ftype_pdouble
-    = build_function_type_list (V2DF_type_node, pdouble_type_node, NULL_TREE);
+  tree v2df_ftype_pcdouble
+    = build_function_type_list (V2DF_type_node, pcdouble_type_node, NULL_TREE);
   tree v2df_ftype_v2df_v2df
     = build_function_type_list (V2DF_type_node,
 				V2DF_type_node, V2DF_type_node, NULL_TREE);
@@ -12454,16 +12680,16 @@ ix86_init_mmx_sse_builtins ()
 				V16QI_type_node, V16QI_type_node, NULL_TREE);
   tree int_ftype_v16qi
     = build_function_type_list (integer_type_node, V16QI_type_node, NULL_TREE);
-  tree v16qi_ftype_pchar
-    = build_function_type_list (V16QI_type_node, pchar_type_node, NULL_TREE);
+  tree v16qi_ftype_pcchar
+    = build_function_type_list (V16QI_type_node, pcchar_type_node, NULL_TREE);
   tree void_ftype_pchar_v16qi
     = build_function_type_list (void_type_node,
 			        pchar_type_node, V16QI_type_node, NULL_TREE);
-  tree v4si_ftype_pchar
-    = build_function_type_list (V4SI_type_node, pchar_type_node, NULL_TREE);
-  tree void_ftype_pchar_v4si
+  tree v4si_ftype_pcint
+    = build_function_type_list (V4SI_type_node, pcint_type_node, NULL_TREE);
+  tree void_ftype_pcint_v4si
     = build_function_type_list (void_type_node,
-			        pchar_type_node, V4SI_type_node, NULL_TREE);
+			        pcint_type_node, V4SI_type_node, NULL_TREE);
   tree v2di_ftype_v2di
     = build_function_type_list (V2DI_type_node, V2DI_type_node, NULL_TREE);
 
@@ -12539,8 +12765,6 @@ ix86_init_mmx_sse_builtins ()
   /* Add the remaining MMX insns with somewhat more complicated types.  */
   def_builtin (MASK_MMX, "__builtin_ia32_mmx_zero", di_ftype_void, IX86_BUILTIN_MMX_ZERO);
   def_builtin (MASK_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS);
-  def_builtin (MASK_MMX, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR);
-  def_builtin (MASK_MMX, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR);
   def_builtin (MASK_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW);
   def_builtin (MASK_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD);
   def_builtin (MASK_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ);
@@ -12566,21 +12790,26 @@ ix86_init_mmx_sse_builtins ()
   def_builtin (MASK_MMX, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
   def_builtin (MASK_MMX, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);
 
+  def_builtin (MASK_SSE1, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR);
+  def_builtin (MASK_SSE1, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR);
   def_builtin (MASK_SSE1, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS);
   def_builtin (MASK_SSE1, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI);
   def_builtin (MASK_SSE1, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS);
+  def_builtin (MASK_SSE164, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS);
   def_builtin (MASK_SSE1, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI);
+  def_builtin (MASK_SSE164, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64);
   def_builtin (MASK_SSE1, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI);
   def_builtin (MASK_SSE1, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI);
+  def_builtin (MASK_SSE164, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64);
 
   def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pextrw", int_ftype_v4hi_int, IX86_BUILTIN_PEXTRW);
   def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pinsrw", v4hi_ftype_v4hi_int_int, IX86_BUILTIN_PINSRW);
 
   def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ);
 
-  def_builtin (MASK_SSE1, "__builtin_ia32_loadaps", v4sf_ftype_pfloat, IX86_BUILTIN_LOADAPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_loadups", v4sf_ftype_pfloat, IX86_BUILTIN_LOADUPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_loadss", v4sf_ftype_pfloat, IX86_BUILTIN_LOADSS);
+  def_builtin (MASK_SSE1, "__builtin_ia32_loadaps", v4sf_ftype_pcfloat, IX86_BUILTIN_LOADAPS);
+  def_builtin (MASK_SSE1, "__builtin_ia32_loadups", v4sf_ftype_pcfloat, IX86_BUILTIN_LOADUPS);
+  def_builtin (MASK_SSE1, "__builtin_ia32_loadss", v4sf_ftype_pcfloat, IX86_BUILTIN_LOADSS);
   def_builtin (MASK_SSE1, "__builtin_ia32_storeaps", void_ftype_pfloat_v4sf, IX86_BUILTIN_STOREAPS);
   def_builtin (MASK_SSE1, "__builtin_ia32_storeups", void_ftype_pfloat_v4sf, IX86_BUILTIN_STOREUPS);
   def_builtin (MASK_SSE1, "__builtin_ia32_storess", void_ftype_pfloat_v4sf, IX86_BUILTIN_STORESS);
@@ -12648,9 +12877,9 @@ ix86_init_mmx_sse_builtins ()
   def_builtin (MASK_SSE2, "__builtin_ia32_movq2dq", v2di_ftype_di, IX86_BUILTIN_MOVQ2DQ);
   def_builtin (MASK_SSE2, "__builtin_ia32_movdq2q", di_ftype_v2di, IX86_BUILTIN_MOVDQ2Q);
 
-  def_builtin (MASK_SSE2, "__builtin_ia32_loadapd", v2df_ftype_pdouble, IX86_BUILTIN_LOADAPD);
-  def_builtin (MASK_SSE2, "__builtin_ia32_loadupd", v2df_ftype_pdouble, IX86_BUILTIN_LOADUPD);
-  def_builtin (MASK_SSE2, "__builtin_ia32_loadsd", v2df_ftype_pdouble, IX86_BUILTIN_LOADSD);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loadapd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADAPD);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loadupd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADUPD);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loadsd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADSD);
   def_builtin (MASK_SSE2, "__builtin_ia32_storeapd", void_ftype_pdouble_v2df, IX86_BUILTIN_STOREAPD);
   def_builtin (MASK_SSE2, "__builtin_ia32_storeupd", void_ftype_pdouble_v2df, IX86_BUILTIN_STOREUPD);
   def_builtin (MASK_SSE2, "__builtin_ia32_storesd", void_ftype_pdouble_v2df, IX86_BUILTIN_STORESD);
@@ -12689,33 +12918,36 @@ ix86_init_mmx_sse_builtins ()
 
   def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI);
   def_builtin (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI);
+  def_builtin (MASK_SSE264, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64);
+  def_builtin (MASK_SSE264, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64);
 
   def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ);
   def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD);
   def_builtin (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ);
 
   def_builtin (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD);
+  def_builtin (MASK_SSE264, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD);
   def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS);
   def_builtin (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD);
 
   def_builtin (MASK_SSE2, "__builtin_ia32_setpd1", v2df_ftype_double, IX86_BUILTIN_SETPD1);
   def_builtin (MASK_SSE2, "__builtin_ia32_setpd", v2df_ftype_double_double, IX86_BUILTIN_SETPD);
   def_builtin (MASK_SSE2, "__builtin_ia32_setzeropd", ti_ftype_void, IX86_BUILTIN_CLRPD);
-  def_builtin (MASK_SSE2, "__builtin_ia32_loadpd1", v2df_ftype_pdouble, IX86_BUILTIN_LOADPD1);
-  def_builtin (MASK_SSE2, "__builtin_ia32_loadrpd", v2df_ftype_pdouble, IX86_BUILTIN_LOADRPD);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loadpd1", v2df_ftype_pcdouble, IX86_BUILTIN_LOADPD1);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loadrpd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADRPD);
   def_builtin (MASK_SSE2, "__builtin_ia32_storepd1", void_ftype_pdouble_v2df, IX86_BUILTIN_STOREPD1);
   def_builtin (MASK_SSE2, "__builtin_ia32_storerpd", void_ftype_pdouble_v2df, IX86_BUILTIN_STORERPD);
 
-  def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pvoid, IX86_BUILTIN_CLFLUSH);
+  def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH);
   def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE);
   def_builtin (MASK_SSE2, "__builtin_ia32_mfence", void_ftype_void, IX86_BUILTIN_MFENCE);
 
-  def_builtin (MASK_SSE2, "__builtin_ia32_loaddqa", v16qi_ftype_pchar, IX86_BUILTIN_LOADDQA);
-  def_builtin (MASK_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pchar, IX86_BUILTIN_LOADDQU);
-  def_builtin (MASK_SSE2, "__builtin_ia32_loadd", v4si_ftype_pchar, IX86_BUILTIN_LOADD);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loaddqa", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQA);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU);
+  def_builtin (MASK_SSE2, "__builtin_ia32_loadd", v4si_ftype_pcint, IX86_BUILTIN_LOADD);
   def_builtin (MASK_SSE2, "__builtin_ia32_storedqa", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQA);
   def_builtin (MASK_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU);
-  def_builtin (MASK_SSE2, "__builtin_ia32_stored", void_ftype_pchar_v4si, IX86_BUILTIN_STORED);
+  def_builtin (MASK_SSE2, "__builtin_ia32_stored", void_ftype_pcint_v4si, IX86_BUILTIN_STORED);
   def_builtin (MASK_SSE2, "__builtin_ia32_movq", v2di_ftype_v2di, IX86_BUILTIN_MOVQ);
 
   def_builtin (MASK_SSE1, "__builtin_ia32_setzero128", v2di_ftype_void, IX86_BUILTIN_CLRTI);
@@ -12795,6 +13027,13 @@ ix86_expand_binop_builtin (icode, arglist, target)
       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
     target = gen_reg_rtx (tmode);
 
+  if (GET_MODE (op1) == SImode && mode1 == TImode)
+    {
+      rtx x = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_loadd (x, op1));
+      op1 = gen_lowpart (TImode, x);
+    }
+
   /* In case the insn wants input operands in modes different from
      the result, abort.  */
   if (GET_MODE (op0) != mode0 || GET_MODE (op1) != mode1)
@@ -12837,9 +13076,7 @@ ix86_expand_store_builtin (icode, arglist)
     op1 = safe_vector_operand (op1, mode1);
 
   op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
-
-  if (! (*insn_data[icode].operand[1].predicate) (op1, mode1))
-    op1 = copy_to_mode_reg (mode1, op1);
+  op1 = copy_to_mode_reg (mode1, op1);
 
   pat = GEN_FCN (icode) (op0, op1);
   if (pat)
@@ -13828,9 +14065,10 @@ ix86_hard_regno_mode_ok (regno, mode)
   if (FP_REGNO_P (regno))
     return VALID_FP_MODE_P (mode);
   if (SSE_REGNO_P (regno))
-    return VALID_SSE_REG_MODE (mode);
+    return (TARGET_SSE ? VALID_SSE_REG_MODE (mode) : 0);
   if (MMX_REGNO_P (regno))
-    return VALID_MMX_REG_MODE (mode) || VALID_MMX_REG_MODE_3DNOW (mode);
+    return (TARGET_MMX
+	    ? VALID_MMX_REG_MODE (mode) || VALID_MMX_REG_MODE_3DNOW (mode) : 0);
   /* We handle both integer and floats in the general purpose registers.
      In future we should be able to handle vector modes as well.  */
   if (!VALID_INT_MODE_P (mode) && !VALID_FP_MODE_P (mode))
@@ -14299,7 +14537,7 @@ x86_function_profiler (file, labelno)
   else
     {
 #ifndef NO_PROFILE_COUNTERS
-      fprintf (file, "\tmovl\t$%sP%d,%%$s\n", LPREFIX, labelno,
+      fprintf (file, "\tmovl\t$%sP%d,%%%s\n", LPREFIX, labelno,
 	       PROFILE_COUNT_REGISTER);
 #endif
       fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 493a2b5bf9c..ffca44fd57b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -487,9 +487,12 @@ extern int x86_prefetch_sse;
       if (TARGET_64BIT)						\
 	{							\
 	  builtin_assert ("cpu=x86_64");			\
-	  builtin_assert ("machine=x86_64");			\
+	  builtin_define ("__amd64");				\
+	  builtin_define ("__amd64__");				\
 	  builtin_define ("__x86_64");				\
 	  builtin_define ("__x86_64__");			\
+	  builtin_define ("__amd64");				\
+	  builtin_define ("__amd64__");				\
 	}							\
       else							\
 	{							\
@@ -1047,7 +1050,7 @@ do {									\
 	    && (TARGET_64BIT || !TARGET_PARTIAL_REG_STALL))	\
         || ((MODE1) == DImode && TARGET_64BIT))			\
        && ((MODE2) == HImode || (MODE2) == SImode		\
-	   || ((MODE1) == QImode				\
+	   || ((MODE2) == QImode				\
 	       && (TARGET_64BIT || !TARGET_PARTIAL_REG_STALL))	\
 	   || ((MODE2) == DImode && TARGET_64BIT))))
 
@@ -1522,6 +1525,20 @@ enum reg_class
    || ((CLASS) == SIREG)						\
    || ((CLASS) == DIREG))
 
+/* Return a class of registers that cannot change FROM mode to TO mode.
+  
+   x87 registers can't do subreg as all values are reformatted to extended
+   precision.  XMM registers do not support subregs with nonzero offsets
+   of 4, 8 and 12 that are otherwise valid for integer registers.  Since we
+   can't determine these, prohibit all nonparadoxical subregs changing size.  */
+
+#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS)	\
+  (GET_MODE_SIZE (TO) < GET_MODE_SIZE (FROM)		\
+   ? reg_classes_intersect_p (FLOAT_SSE_REGS, (CLASS))	\
+     || MAYBE_MMX_CLASS_P (CLASS) 			\
+   : GET_MODE_SIZE (FROM) != GET_MODE_SIZE (TO)		\
+   ? reg_classes_intersect_p (FLOAT_REGS, (CLASS)) : 0)
+
 /* A C statement that adds to CLOBBERS any hard regs the port wishes
    to automatically clobber for all asms.
 
@@ -1716,17 +1733,28 @@ typedef struct ix86_args {
 
 #define FUNCTION_ARG_PARTIAL_NREGS(CUM, MODE, TYPE, NAMED) 0
 
+/* A C expression that indicates when an argument must be passed by
+   reference.  If nonzero for an argument, a copy of that argument is
+   made in memory and a pointer to the argument is passed instead of
+   the argument itself.  The pointer is passed in whatever way is
+   appropriate for passing a pointer to that type.  */
+
+#define FUNCTION_ARG_PASS_BY_REFERENCE(CUM, MODE, TYPE, NAMED) \
+  function_arg_pass_by_reference(&CUM, MODE, TYPE, NAMED)
+
 /* If PIC, we cannot make sibling calls to global functions
    because the PLT requires %ebx live.
-   If we are returning floats on the register stack, we cannot make
-   sibling calls to functions that return floats.  (The stack adjust
-   instruction will wind up after the sibcall jump, and not be executed.) */
+   If we are returning floats on the 80387 register stack, we cannot
+   make a sibcall from a function that doesn't return a float to a
+   function that does or, conversely, from a function that does return
+   a float to a function that doesn't; the necessary stack adjustment
+   would not be executed.  */
 #define FUNCTION_OK_FOR_SIBCALL(DECL)					\
   ((DECL)								\
    && (! flag_pic || ! TREE_PUBLIC (DECL))				\
    && (! TARGET_FLOAT_RETURNS_IN_80387					\
-       || ! FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (DECL))))	\
-       || FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (cfun->decl))))))
+       || (FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (DECL))))	\
+           == FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (cfun->decl)))))))
 
 /* Perform any needed actions needed for a function that is receiving a
    variable number of arguments.
@@ -2068,9 +2096,12 @@ enum ix86_builtins
   IX86_BUILTIN_CVTPI2PS,
   IX86_BUILTIN_CVTPS2PI,
   IX86_BUILTIN_CVTSI2SS,
+  IX86_BUILTIN_CVTSI642SS,
   IX86_BUILTIN_CVTSS2SI,
+  IX86_BUILTIN_CVTSS2SI64,
   IX86_BUILTIN_CVTTPS2PI,
   IX86_BUILTIN_CVTTSS2SI,
+  IX86_BUILTIN_CVTTSS2SI64,
 
   IX86_BUILTIN_MAXPS,
   IX86_BUILTIN_MAXSS,
@@ -2116,6 +2147,7 @@ enum ix86_builtins
   IX86_BUILTIN_PADDB,
   IX86_BUILTIN_PADDW,
   IX86_BUILTIN_PADDD,
+  IX86_BUILTIN_PADDQ,
   IX86_BUILTIN_PADDSB,
   IX86_BUILTIN_PADDSW,
   IX86_BUILTIN_PADDUSB,
@@ -2123,6 +2155,7 @@ enum ix86_builtins
   IX86_BUILTIN_PSUBB,
   IX86_BUILTIN_PSUBW,
   IX86_BUILTIN_PSUBD,
+  IX86_BUILTIN_PSUBQ,
   IX86_BUILTIN_PSUBSB,
   IX86_BUILTIN_PSUBSW,
   IX86_BUILTIN_PSUBUSB,
@@ -2327,11 +2360,14 @@ enum ix86_builtins
 
   IX86_BUILTIN_CVTPI2PD,
   IX86_BUILTIN_CVTSI2SD,
+  IX86_BUILTIN_CVTSI642SD,
 
   IX86_BUILTIN_CVTSD2SI,
+  IX86_BUILTIN_CVTSD2SI64,
   IX86_BUILTIN_CVTSD2SS,
   IX86_BUILTIN_CVTSS2SD,
   IX86_BUILTIN_CVTTSD2SI,
+  IX86_BUILTIN_CVTTSD2SI64,
 
   IX86_BUILTIN_CVTPS2DQ,
   IX86_BUILTIN_CVTPS2PD,
@@ -3286,6 +3322,7 @@ do {						\
   {"register_and_not_any_fp_reg_operand", {REG}},			\
   {"fp_register_operand", {REG}},					\
   {"register_and_not_fp_reg_operand", {REG}},				\
+  {"vector_move_operand", {CONST_VECTOR, SUBREG, REG, MEM}},		\
 
 /* A list of predicates that do special things with modes, and so
    should not elicit warnings for VOIDmode match_operand.  */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index befbfe49569..edbb7163646 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1,5 +1,6 @@
 ;; GCC machine description for IA-32 and x86-64.
-;; Copyright (C) 1988, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002
+;; Copyright (C) 1988, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+;; 2001, 2002, 2003
 ;; Free Software Foundation, Inc.
 ;; Mostly by William Schelter.
 ;; x86_64 support added by Jan Hubicka
@@ -267,6 +268,8 @@
 (define_attr "length" ""
   (cond [(eq_attr "type" "other,multi,fistp")
 	   (const_int 16)
+	 (eq_attr "type" "fcmp")
+	   (const_int 4)
 	 (eq_attr "unit" "i387")
 	   (plus (const_int 2)
 		 (plus (attr "prefix_data16")
@@ -1099,25 +1102,20 @@
    (set_attr "mode" "SI")
    (set_attr "length_immediate" "1")])
 
-; The first alternative is used only to compute proper length of instruction.
-; Reload's algorithm does not take into account the cost of spill instructions
-; needed to free register in given class, so avoid it from choosing the first
-; alternative when eax is not available.
-
 (define_insn "*movsi_1"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=*?a,r,*?a,m,!*y,!rm,!*y,!*Y,!rm,!*Y")
-	(match_operand:SI 1 "general_operand" "im,rinm,rinm,rin,rm,*y,*y,rm,*Y,*Y"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,m,!*y,!rm,!*y,!*Y,!*Y,!rm")
+	(match_operand:SI 1 "general_operand" "rinm,rin,rm,*y,*y,*Y,rm,*Y"))]
   "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
 {
   switch (get_attr_type (insn))
     {
     case TYPE_SSEMOV:
-      if (get_attr_mode (insn) == TImode)
+      if (get_attr_mode (insn) == MODE_TI)
         return "movdqa\t{%1, %0|%0, %1}";
       return "movd\t{%1, %0|%0, %1}";
 
     case TYPE_MMXMOV:
-      if (get_attr_mode (insn) == DImode)
+      if (get_attr_mode (insn) == MODE_DI)
 	return "movq\t{%1, %0|%0, %1}";
       return "movd\t{%1, %0|%0, %1}";
 
@@ -1131,17 +1129,16 @@
     }
 }
   [(set (attr "type")
-     (cond [(eq_attr "alternative" "4,5,6")
+     (cond [(eq_attr "alternative" "2,3,4")
 	      (const_string "mmxmov")
-	    (eq_attr "alternative" "7,8,9")
+	    (eq_attr "alternative" "5,6,7")
 	      (const_string "ssemov")
 	    (and (ne (symbol_ref "flag_pic") (const_int 0))
 		 (match_operand:SI 1 "symbolic_operand" ""))
 	      (const_string "lea")
 	   ]
 	   (const_string "imov")))
-   (set_attr "modrm" "0,*,0,*,*,*,*,*,*,*")
-   (set_attr "mode" "SI,SI,SI,SI,SI,SI,DI,TI,SI,SI")])
+   (set_attr "mode" "SI,SI,SI,SI,DI,TI,SI,SI")])
 
 ;; Stores and loads of ax to arbitary constant address.
 ;; We fake an second form of instruction to force reload to load address
@@ -1214,14 +1211,9 @@
   [(set_attr "type" "push")
    (set_attr "mode" "QI")])
 
-; The first alternative is used only to compute proper length of instruction.
-; Reload's algorithm does not take into account the cost of spill instructions
-; needed to free register in given class, so avoid it from choosing the first
-; alternative when eax is not available.
-
 (define_insn "*movhi_1"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=*?a,r,r,*?a,r,m")
-	(match_operand:HI 1 "general_operand" "i,r,rn,rm,rm,rn"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
+	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
   "GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
 {
   switch (get_attr_type (insn))
@@ -1238,36 +1230,35 @@
     }
 }
   [(set (attr "type")
-     (cond [(and (eq_attr "alternative" "0,1")
+     (cond [(and (eq_attr "alternative" "0")
 		 (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
 			  (const_int 0))
 		      (eq (symbol_ref "TARGET_HIMODE_MATH")
 			  (const_int 0))))
 	      (const_string "imov")
-	    (and (eq_attr "alternative" "2,3,4")
+	    (and (eq_attr "alternative" "1,2")
 		 (match_operand:HI 1 "aligned_operand" ""))
 	      (const_string "imov")
 	    (and (ne (symbol_ref "TARGET_MOVX")
 		     (const_int 0))
-		 (eq_attr "alternative" "0,1,3,4"))
+		 (eq_attr "alternative" "0,2"))
 	      (const_string "imovx")
 	   ]
 	   (const_string "imov")))
     (set (attr "mode")
       (cond [(eq_attr "type" "imovx")
 	       (const_string "SI")
-	     (and (eq_attr "alternative" "2,3,4")
+	     (and (eq_attr "alternative" "1,2")
 		  (match_operand:HI 1 "aligned_operand" ""))
 	       (const_string "SI")
-	     (and (eq_attr "alternative" "0,1")
+	     (and (eq_attr "alternative" "0")
 		  (ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
 			   (const_int 0))
 		       (eq (symbol_ref "TARGET_HIMODE_MATH")
 			   (const_int 0))))
 	       (const_string "SI")
 	    ]
-	    (const_string "HI")))
-   (set_attr "modrm" "0,*,*,0,*,*")])
+	    (const_string "HI")))])
 
 ;; Stores and loads of ax to arbitary constant address.
 ;; We fake an second form of instruction to force reload to load address
@@ -1488,7 +1479,7 @@
 (define_expand "movstrictqi"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" ""))
 	(match_operand:QI 1 "general_operand" ""))]
-  "! TARGET_PARTIAL_REG_STALL"
+  "! TARGET_PARTIAL_REG_STALL || optimize_size"
 {
   /* Don't generate memory->memory moves, go through a register.  */
   if (GET_CODE (operands[0]) == MEM && GET_CODE (operands[1]) == MEM)
@@ -1498,7 +1489,7 @@
 (define_insn "*movstrictqi_1"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q"))
 	(match_operand:QI 1 "general_operand" "*qn,m"))]
-  "! TARGET_PARTIAL_REG_STALL
+  "(! TARGET_PARTIAL_REG_STALL || optimize_size)
    && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
   "mov{b}\t{%1, %0|%0, %1}"
   [(set_attr "type" "imov")
@@ -12839,9 +12830,9 @@
    (set_attr "modrm" "0")
    (set (attr "length")
 	   (if_then_else (and (ge (minus (match_dup 0) (pc))
-				  (const_int -128))
+				  (const_int -126))
 			      (lt (minus (match_dup 0) (pc))
-				  (const_int 124)))
+				  (const_int 128)))
 	     (const_int 2)
 	     (const_int 6)))])
 
@@ -12857,9 +12848,9 @@
    (set_attr "modrm" "0")
    (set (attr "length")
 	   (if_then_else (and (ge (minus (match_dup 0) (pc))
-				  (const_int -128))
+				  (const_int -126))
 			      (lt (minus (match_dup 0) (pc))
-				  (const_int 124)))
+				  (const_int 128)))
 	     (const_int 2)
 	     (const_int 6)))])
 
@@ -13124,9 +13115,9 @@
   [(set_attr "type" "ibr")
    (set (attr "length")
 	   (if_then_else (and (ge (minus (match_dup 0) (pc))
-				  (const_int -128))
+				  (const_int -126))
 			      (lt (minus (match_dup 0) (pc))
-				  (const_int 124)))
+				  (const_int 128)))
 	     (const_int 2)
 	     (const_int 5)))
    (set_attr "modrm" "0")])
@@ -13250,9 +13241,9 @@
    (set (attr "length")
 	(if_then_else (and (eq_attr "alternative" "0")
 			   (and (ge (minus (match_dup 0) (pc))
-			            (const_int -128))
+			            (const_int -126))
 			        (lt (minus (match_dup 0) (pc))
-			            (const_int 124))))
+			            (const_int 128))))
 		      (const_int 2)
 		      (const_int 16)))
    ;; We don't know the type before shorten branches.  Optimistically expect
@@ -13616,11 +13607,10 @@
   "ix86_expand_epilogue (0); DONE;")
 
 (define_expand "eh_return"
-  [(use (match_operand 0 "register_operand" ""))
-   (use (match_operand 1 "register_operand" ""))]
+  [(use (match_operand 0 "register_operand" ""))]
   ""
 {
-  rtx tmp, sa = operands[0], ra = operands[1];
+  rtx tmp, sa = EH_RETURN_STACKADJ_RTX, ra = operands[0];
 
   /* Tricky bit: we write the address of the handler to which we will
      be returning into someone else's stack frame, one word below the
@@ -13682,7 +13672,7 @@
 
 (define_expand "ffssi2"
   [(set (match_operand:SI 0 "nonimmediate_operand" "") 
-	(ffs:SI (match_operand:SI 1 "general_operand" "")))]
+	(ffs:SI (match_operand:SI 1 "nonimmediate_operand" "")))]
   ""
 {
   rtx out = gen_reg_rtx (SImode), tmp = gen_reg_rtx (SImode);
@@ -14823,7 +14813,7 @@
 (define_insn "cosxf2"
   [(set (match_operand:XF 0 "register_operand" "=f")
 	(unspec:XF [(match_operand:XF 1 "register_operand" "0")] UNSPEC_COS))]
-  "! TARGET_NO_FANCY_MATH_387 && TARGET_80387
+  "!TARGET_64BIT && ! TARGET_NO_FANCY_MATH_387 && TARGET_80387
    && flag_unsafe_math_optimizations"
   "fcos"
   [(set_attr "type" "fpspc")
@@ -16734,7 +16724,7 @@
 (define_split
   [(set (match_operand 0 "register_operand" "")
 	(if_then_else (match_operator 1 "comparison_operator"
-			[(match_operand 4 "register_operand" "")
+			[(match_operand 4 "nonimmediate_operand" "")
 			 (match_operand 5 "nonimmediate_operand" "")])
 		      (match_operand 2 "nonmemory_operand" "")
 		      (match_operand 3 "nonmemory_operand" "")))]
@@ -16746,13 +16736,16 @@
 					    (subreg:TI (match_dup 7) 0)))]
 {
   PUT_MODE (operands[1], GET_MODE (operands[0]));
-  if (!sse_comparison_operator (operands[1], VOIDmode))
+  if (!sse_comparison_operator (operands[1], VOIDmode)
+      || !rtx_equal_p (operands[0], operands[4]))
     {
       rtx tmp = operands[5];
       operands[5] = operands[4];
       operands[4] = tmp;
       PUT_CODE (operands[1], swap_condition (GET_CODE (operands[1])));
     }
+  if (!rtx_equal_p (operands[0], operands[4]))
+    abort ();
   if (const0_operand (operands[2], GET_MODE (operands[0])))
     {
       operands[7] = operands[3];
@@ -16853,6 +16846,10 @@
      operands[2] = gen_lowpart (SImode, operands[2]);
    PUT_MODE (operands[3], SImode);")
 
+; Promote the QImode tests, as i386 has an encoding of the AND
+; instruction with a 32-bit sign-extended immediate and thus the
+; instruction size is unchanged, except in the %eax case for
+; which it is increased by one byte, hence the ! optimize_size.
 (define_split
   [(set (reg 17)
 	(compare (and (match_operand 1 "aligned_operand" "")
@@ -16861,12 +16858,11 @@
    (set (match_operand 0 "register_operand" "")
 	(and (match_dup 1) (match_dup 2)))]
   "! TARGET_PARTIAL_REG_STALL && reload_completed
-   && ix86_match_ccmode (insn, CCNOmode)
-   && (GET_MODE (operands[0]) == HImode
-       || (GET_MODE (operands[0]) == QImode 
-	   /* Ensure that the operand will remain sign extended immediate.  */
-	   && INTVAL (operands[2]) >= 0
-	   && (TARGET_PROMOTE_QImode || optimize_size)))"
+   /* Ensure that the operand will remain sign-extended immediate.  */
+   && ix86_match_ccmode (insn, INTVAL (operands[2]) >= 0 ? CCNOmode : CCZmode)
+   && ! optimize_size
+   && ((GET_MODE (operands[0]) == HImode && ! TARGET_FAST_PREFIX)
+       || (GET_MODE (operands[0]) == QImode && TARGET_PROMOTE_QImode))"
   [(parallel [(set (reg:CCNO 17)
 		   (compare:CCNO (and:SI (match_dup 1) (match_dup 2))
 			         (const_int 0)))
@@ -16879,17 +16875,20 @@
    operands[0] = gen_lowpart (SImode, operands[0]);
    operands[1] = gen_lowpart (SImode, operands[1]);")
 
-; Don't promote the QImode tests, as i386 don't have encoding of
-; the test instruction with 32bit sign extended immediate and thus
-; the code grows.
+; Don't promote the QImode tests, as i386 doesn't have encoding of
+; the TEST instruction with 32-bit sign-extended immediate and thus
+; the instruction size would at least double, which is not what we
+; want even with ! optimize_size.
 (define_split
   [(set (reg 17)
 	(compare (and (match_operand:HI 0 "aligned_operand" "")
 		      (match_operand:HI 1 "const_int_operand" ""))
 		 (const_int 0)))]
   "! TARGET_PARTIAL_REG_STALL && reload_completed
-   && ix86_match_ccmode (insn, CCNOmode)
-   && GET_MODE (operands[0]) == HImode"
+   /* Ensure that the operand will remain sign-extended immediate.  */
+   && ix86_match_ccmode (insn, INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)
+   && ! TARGET_FAST_PREFIX
+   && ! optimize_size"
   [(set (reg:CCNO 17)
 	(compare:CCNO (and:SI (match_dup 0) (match_dup 1))
 		      (const_int 0)))]
@@ -17848,67 +17847,92 @@
 ;; Moves for SSE/MMX regs.
 
 (define_insn "movv4sf_internal"
-  [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,m")
-	(match_operand:V4SF 1 "nonimmediate_operand" "xm,x"))]
+  [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m")
+	(match_operand:V4SF 1 "vector_move_operand" "C,xm,x"))]
   "TARGET_SSE"
   ;; @@@ let's try to use movaps here.
-  "movaps\t{%1, %0|%0, %1}"
+  "@
+   xorps\t%0, %0
+   movaps\t{%1, %0|%0, %1}
+   movaps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V4SF")])
 
 (define_insn "movv4si_internal"
-  [(set (match_operand:V4SI 0 "nonimmediate_operand" "=x,m")
-	(match_operand:V4SI 1 "nonimmediate_operand" "xm,x"))]
+  [(set (match_operand:V4SI 0 "nonimmediate_operand" "=x,x,m")
+	(match_operand:V4SI 1 "vector_move_operand" "C,xm,x"))]
   "TARGET_SSE"
   ;; @@@ let's try to use movaps here.
-  "movaps\t{%1, %0|%0, %1}"
+  "@
+   xorps\t%0, %0
+   movaps\t{%1, %0|%0, %1}
+   movaps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V4SF")])
 
 (define_insn "movv2di_internal"
-  [(set (match_operand:V2DI 0 "nonimmediate_operand" "=x,m")
-	(match_operand:V2DI 1 "nonimmediate_operand" "xm,x"))]
+  [(set (match_operand:V2DI 0 "nonimmediate_operand" "=x,x,m")
+	(match_operand:V2DI 1 "vector_move_operand" "C,xm,x"))]
   "TARGET_SSE"
   ;; @@@ let's try to use movaps here.
-  "movdqa\t{%1, %0|%0, %1}"
+  "@
+   pxor\t%0, %0
+   movdqa\t{%1, %0|%0, %1} 
+   movdqa\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V4SF")])
 
 (define_insn "movv8qi_internal"
-  [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,m")
-	(match_operand:V8QI 1 "nonimmediate_operand" "ym,y"))]
-  "TARGET_MMX"
-  "movq\t{%1, %0|%0, %1}"
+  [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,y,m")
+	(match_operand:V8QI 1 "vector_move_operand" "C,ym,y"))]
+  "TARGET_MMX
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+    pxor\t%0, %0
+    movq\t{%1, %0|%0, %1}
+    movq\t{%1, %0|%0, %1}"
   [(set_attr "type" "mmxmov")
    (set_attr "mode" "DI")])
 
 (define_insn "movv4hi_internal"
-  [(set (match_operand:V4HI 0 "nonimmediate_operand" "=y,m")
-	(match_operand:V4HI 1 "nonimmediate_operand" "ym,y"))]
-  "TARGET_MMX"
-  "movq\t{%1, %0|%0, %1}"
+  [(set (match_operand:V4HI 0 "nonimmediate_operand" "=y,y,m")
+	(match_operand:V4HI 1 "vector_move_operand" "C,ym,y"))]
+  "TARGET_MMX
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+    pxor\t%0, %0
+    movq\t{%1, %0|%0, %1}
+    movq\t{%1, %0|%0, %1}"
   [(set_attr "type" "mmxmov")
    (set_attr "mode" "DI")])
 
 (define_insn "movv2si_internal"
-  [(set (match_operand:V2SI 0 "nonimmediate_operand" "=y,m")
-	(match_operand:V2SI 1 "nonimmediate_operand" "ym,y"))]
-  "TARGET_MMX"
-  "movq\t{%1, %0|%0, %1}"
+  [(set (match_operand:V2SI 0 "nonimmediate_operand" "=y,y,m")
+	(match_operand:V2SI 1 "vector_move_operand" "C,ym,y"))]
+  "TARGET_MMX
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+    pxor\t%0, %0
+    movq\t{%1, %0|%0, %1}
+    movq\t{%1, %0|%0, %1}"
   [(set_attr "type" "mmxcvt")
    (set_attr "mode" "DI")])
 
 (define_insn "movv2sf_internal"
-  [(set (match_operand:V2SF 0 "nonimmediate_operand" "=y,m")
-        (match_operand:V2SF 1 "nonimmediate_operand" "ym,y"))]
-  "TARGET_3DNOW"
-  "movq\\t{%1, %0|%0, %1}"
+  [(set (match_operand:V2SF 0 "nonimmediate_operand" "=y,y,m")
+        (match_operand:V2SF 1 "vector_move_operand" "C,ym,y"))]
+  "TARGET_3DNOW
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+    pxor\t%0, %0
+    movq\t{%1, %0|%0, %1}
+    movq\t{%1, %0|%0, %1}"
   [(set_attr "type" "mmxcvt")
    (set_attr "mode" "DI")])
 
 (define_expand "movti"
-  [(set (match_operand:TI 0 "general_operand" "")
-	(match_operand:TI 1 "general_operand" ""))]
+  [(set (match_operand:TI 0 "nonimmediate_operand" "")
+	(match_operand:TI 1 "nonimmediate_operand" ""))]
   "TARGET_SSE || TARGET_64BIT"
 {
   if (TARGET_64BIT)
@@ -17919,35 +17943,44 @@
 })
 
 (define_insn "movv2df_internal"
-  [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,m")
-	(match_operand:V2DF 1 "nonimmediate_operand" "xm,x"))]
-  "TARGET_SSE2"
-  ;; @@@ let's try to use movaps here.
-  "movapd\t{%1, %0|%0, %1}"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
+	(match_operand:V2DF 1 "vector_move_operand" "C,xm,x"))]
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+   xorpd\t%0, %0
+   movapd\t{%1, %0|%0, %1}
+   movapd\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V2DF")])
 
 (define_insn "movv8hi_internal"
-  [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m")
-	(match_operand:V8HI 1 "nonimmediate_operand" "xm,x"))]
-  "TARGET_SSE2"
-  ;; @@@ let's try to use movaps here.
-  "movaps\t{%1, %0|%0, %1}"
+  [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,x,m")
+	(match_operand:V8HI 1 "vector_move_operand" "C,xm,x"))]
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+   xorps\t%0, %0
+   movaps\t{%1, %0|%0, %1}
+   movaps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V4SF")])
 
 (define_insn "movv16qi_internal"
-  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m")
-	(match_operand:V16QI 1 "nonimmediate_operand" "xm,x"))]
-  "TARGET_SSE2"
-  ;; @@@ let's try to use movaps here.
-  "movaps\t{%1, %0|%0, %1}"
+  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,x,m")
+	(match_operand:V16QI 1 "vector_move_operand" "C,xm,x"))]
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "@
+   xorps\t%0, %0
+   movaps\t{%1, %0|%0, %1}
+   movaps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V4SF")])
 
 (define_expand "movv2df"
-  [(set (match_operand:V2DF 0 "general_operand" "")
-	(match_operand:V2DF 1 "general_operand" ""))]
+  [(set (match_operand:V2DF 0 "nonimmediate_operand" "")
+	(match_operand:V2DF 1 "nonimmediate_operand" ""))]
   "TARGET_SSE2"
 {
   ix86_expand_vector_move (V2DFmode, operands);
@@ -17955,8 +17988,8 @@
 })
 
 (define_expand "movv8hi"
-  [(set (match_operand:V8HI 0 "general_operand" "")
-	(match_operand:V8HI 1 "general_operand" ""))]
+  [(set (match_operand:V8HI 0 "nonimmediate_operand" "")
+	(match_operand:V8HI 1 "nonimmediate_operand" ""))]
   "TARGET_SSE2"
 {
   ix86_expand_vector_move (V8HImode, operands);
@@ -17964,8 +17997,8 @@
 })
 
 (define_expand "movv16qi"
-  [(set (match_operand:V16QI 0 "general_operand" "")
-	(match_operand:V16QI 1 "general_operand" ""))]
+  [(set (match_operand:V16QI 0 "nonimmediate_operand" "")
+	(match_operand:V16QI 1 "nonimmediate_operand" ""))]
   "TARGET_SSE2"
 {
   ix86_expand_vector_move (V16QImode, operands);
@@ -17973,8 +18006,8 @@
 })
 
 (define_expand "movv4sf"
-  [(set (match_operand:V4SF 0 "general_operand" "")
-	(match_operand:V4SF 1 "general_operand" ""))]
+  [(set (match_operand:V4SF 0 "nonimmediate_operand" "")
+	(match_operand:V4SF 1 "nonimmediate_operand" ""))]
   "TARGET_SSE"
 {
   ix86_expand_vector_move (V4SFmode, operands);
@@ -17982,8 +18015,8 @@
 })
 
 (define_expand "movv4si"
-  [(set (match_operand:V4SI 0 "general_operand" "")
-	(match_operand:V4SI 1 "general_operand" ""))]
+  [(set (match_operand:V4SI 0 "nonimmediate_operand" "")
+	(match_operand:V4SI 1 "nonimmediate_operand" ""))]
   "TARGET_SSE"
 {
   ix86_expand_vector_move (V4SImode, operands);
@@ -17991,8 +18024,8 @@
 })
 
 (define_expand "movv2di"
-  [(set (match_operand:V2DI 0 "general_operand" "")
-	(match_operand:V2DI 1 "general_operand" ""))]
+  [(set (match_operand:V2DI 0 "nonimmediate_operand" "")
+	(match_operand:V2DI 1 "nonimmediate_operand" ""))]
   "TARGET_SSE"
 {
   ix86_expand_vector_move (V2DImode, operands);
@@ -18000,8 +18033,8 @@
 })
 
 (define_expand "movv2si"
-  [(set (match_operand:V2SI 0 "general_operand" "")
-	(match_operand:V2SI 1 "general_operand" ""))]
+  [(set (match_operand:V2SI 0 "nonimmediate_operand" "")
+	(match_operand:V2SI 1 "nonimmediate_operand" ""))]
   "TARGET_MMX"
 {
   ix86_expand_vector_move (V2SImode, operands);
@@ -18009,8 +18042,8 @@
 })
 
 (define_expand "movv4hi"
-  [(set (match_operand:V4HI 0 "general_operand" "")
-	(match_operand:V4HI 1 "general_operand" ""))]
+  [(set (match_operand:V4HI 0 "nonimmediate_operand" "")
+	(match_operand:V4HI 1 "nonimmediate_operand" ""))]
   "TARGET_MMX"
 {
   ix86_expand_vector_move (V4HImode, operands);
@@ -18018,8 +18051,8 @@
 })
 
 (define_expand "movv8qi"
-  [(set (match_operand:V8QI 0 "general_operand" "")
-	(match_operand:V8QI 1 "general_operand" ""))]
+  [(set (match_operand:V8QI 0 "nonimmediate_operand" "")
+	(match_operand:V8QI 1 "nonimmediate_operand" ""))]
   "TARGET_MMX"
 {
   ix86_expand_vector_move (V8QImode, operands);
@@ -18027,14 +18060,97 @@
 })
 
 (define_expand "movv2sf"
-  [(set (match_operand:V2SF 0 "general_operand" "")
-	(match_operand:V2SF 1 "general_operand" ""))]
+  [(set (match_operand:V2SF 0 "nonimmediate_operand" "")
+	(match_operand:V2SF 1 "nonimmediate_operand" ""))]
    "TARGET_3DNOW"
 {
   ix86_expand_vector_move (V2SFmode, operands);
   DONE;
 })
 
+(define_insn "*pushv2df"
+  [(set (match_operand:V2DF 0 "push_operand" "=<")
+	(match_operand:V2DF 1 "register_operand" "x"))]
+  "TARGET_SSE"
+  "#")
+
+(define_insn "*pushv2di"
+  [(set (match_operand:V2DI 0 "push_operand" "=<")
+	(match_operand:V2DI 1 "register_operand" "x"))]
+  "TARGET_SSE2"
+  "#")
+
+(define_insn "*pushv8hi"
+  [(set (match_operand:V8HI 0 "push_operand" "=<")
+	(match_operand:V8HI 1 "register_operand" "x"))]
+  "TARGET_SSE2"
+  "#")
+
+(define_insn "*pushv16qi"
+  [(set (match_operand:V16QI 0 "push_operand" "=<")
+	(match_operand:V16QI 1 "register_operand" "x"))]
+  "TARGET_SSE2"
+  "#")
+
+(define_insn "*pushv4sf"
+  [(set (match_operand:V4SF 0 "push_operand" "=<")
+	(match_operand:V4SF 1 "register_operand" "x"))]
+  "TARGET_SSE"
+  "#")
+
+(define_insn "*pushv4si"
+  [(set (match_operand:V4SI 0 "push_operand" "=<")
+	(match_operand:V4SI 1 "register_operand" "x"))]
+  "TARGET_SSE2"
+  "#")
+
+(define_insn "*pushv2si"
+  [(set (match_operand:V2SI 0 "push_operand" "=<")
+	(match_operand:V2SI 1 "register_operand" "y"))]
+  "TARGET_MMX"
+  "#")
+
+(define_insn "*pushv4hi"
+  [(set (match_operand:V4HI 0 "push_operand" "=<")
+	(match_operand:V4HI 1 "register_operand" "y"))]
+  "TARGET_MMX"
+  "#")
+
+(define_insn "*pushv8qi"
+  [(set (match_operand:V8QI 0 "push_operand" "=<")
+	(match_operand:V8QI 1 "register_operand" "y"))]
+  "TARGET_MMX"
+  "#")
+
+(define_insn "*pushv2sf"
+  [(set (match_operand:V2SF 0 "push_operand" "=<")
+	(match_operand:V2SF 1 "register_operand" "y"))]
+  "TARGET_3DNOW"
+  "#")
+
+(define_split
+  [(set (match_operand 0 "push_operand" "")
+	(match_operand 1 "register_operand" ""))]
+  "!TARGET_64BIT && reload_completed
+   && (SSE_REG_P (operands[1]) || MMX_REG_P (operands[1]))"
+  [(set (reg:SI 7) (plus:SI (reg:SI 7) (match_dup 3)))
+   (set (match_dup 2) (match_dup 1))]
+  "operands[2] = change_address (operands[0], GET_MODE (operands[0]),
+				 stack_pointer_rtx);
+   operands[3] = GEN_INT (-GET_MODE_SIZE (GET_MODE (operands[0])));")
+
+(define_split
+  [(set (match_operand 0 "push_operand" "")
+	(match_operand 1 "register_operand" ""))]
+  "TARGET_64BIT && reload_completed
+   && (SSE_REG_P (operands[1]) || MMX_REG_P (operands[1]))"
+  [(set (reg:DI 7) (plus:DI (reg:DI 7) (match_dup 3)))
+   (set (match_dup 2) (match_dup 1))]
+  "operands[2] = change_address (operands[0], GET_MODE (operands[0]),
+				 stack_pointer_rtx);
+   operands[3] = GEN_INT (-GET_MODE_SIZE (GET_MODE (operands[0])));")
+
+
 (define_insn_and_split "*pushti"
   [(set (match_operand:TI 0 "push_operand" "=<")
 	(match_operand:TI 1 "nonmemory_operand" "x"))]
@@ -18158,8 +18274,9 @@
 
 (define_insn "movti_internal"
   [(set (match_operand:TI 0 "nonimmediate_operand" "=x,x,m")
-	(match_operand:TI 1 "general_operand" "C,xm,x"))]
-  "TARGET_SSE && !TARGET_64BIT"
+	(match_operand:TI 1 "vector_move_operand" "C,xm,x"))]
+  "TARGET_SSE && !TARGET_64BIT
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
   "@
    xorps\t%0, %0
    movaps\t{%1, %0|%0, %1}
@@ -18169,7 +18286,7 @@
 
 (define_insn "*movti_rex64"
   [(set (match_operand:TI 0 "nonimmediate_operand" "=r,o,x,mx,x")
-	(match_operand:TI 1 "general_operand" "riFo,riF,O,x,m"))]
+	(match_operand:TI 1 "general_operand" "riFo,riF,C,x,m"))]
   "TARGET_64BIT
    && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
   "@
@@ -18191,29 +18308,56 @@
 
 ;; These two patterns are useful for specifying exactly whether to use
 ;; movaps or movups
-(define_insn "sse_movaps"
+(define_expand "sse_movaps"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "")]
+		     UNSPEC_MOVA))]
+  "TARGET_SSE"
+{
+  if (GET_CODE (operands[0]) == MEM && GET_CODE (operands[1]) == MEM)
+    {
+      rtx tmp = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_sse_movaps (tmp, operands[1]));
+      emit_move_insn (operands[0], tmp);
+      DONE;
+    }
+})
+
+(define_insn "*sse_movaps_1"
   [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,m")
 	(unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm,x")]
 		     UNSPEC_MOVA))]
-  "TARGET_SSE"
-  "@
-   movaps\t{%1, %0|%0, %1}
-   movaps\t{%1, %0|%0, %1}"
+  "TARGET_SSE
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "movaps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov,ssemov")
    (set_attr "mode" "V4SF")])
 
-(define_insn "sse_movups"
+(define_expand "sse_movups"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "")]
+		     UNSPEC_MOVU))]
+  "TARGET_SSE"
+{
+  if (GET_CODE (operands[0]) == MEM && GET_CODE (operands[1]) == MEM)
+    {
+      rtx tmp = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_sse_movups (tmp, operands[1]));
+      emit_move_insn (operands[0], tmp);
+      DONE;
+    }
+})
+
+(define_insn "*sse_movups_1"
   [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,m")
 	(unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm,x")]
 		     UNSPEC_MOVU))]
-  "TARGET_SSE"
-  "@
-   movups\t{%1, %0|%0, %1}
-   movups\t{%1, %0|%0, %1}"
+  "TARGET_SSE
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "movups\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt,ssecvt")
    (set_attr "mode" "V4SF")])
 
-
 ;; SSE Strange Moves.
 
 (define_insn "sse_movmskps"
@@ -18329,11 +18473,21 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "V4SF")])
 
-(define_insn "sse_loadss"
+(define_expand "sse_loadss"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:SF 1 "memory_operand" "")]
+  "TARGET_SSE"
+{
+  emit_insn (gen_sse_loadss_1 (operands[0], operands[1],
+			       CONST0_RTX (V4SFmode)));
+  DONE;
+})
+
+(define_insn "sse_loadss_1"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
 	(vec_merge:V4SF
-	 (match_operand:V4SF 1 "memory_operand" "m")
-	 (vec_duplicate:V4SF (float:SF (const_int 0)))
+	 (vec_duplicate:V4SF (match_operand:SF 1 "memory_operand" "m"))
+	 (match_operand:V4SF 2 "const0_operand" "X")
 	 (const_int 1)))]
   "TARGET_SSE"
   "movss\t{%1, %0|%0, %1}"
@@ -18804,7 +18958,7 @@
 
 (define_insn "sse2_nandv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
-        (and:V2DI (not:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "%0"))
+        (and:V2DI (not:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "0"))
 		  (match_operand:V2DI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2
    && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)"
@@ -18908,7 +19062,7 @@
 	 (match_operator:V4SI 3 "sse_comparison_operator"
 		[(match_operand:V4SF 1 "register_operand" "0")
 		 (match_operand:V4SF 2 "register_operand" "x")])
-	 (match_dup 1)
+	 (subreg:V4SI (match_dup 1) 0)
 	 (const_int 1)))]
   "TARGET_SSE"
   "cmp%D3ss\t{%2, %0|%0, %2}"
@@ -19093,6 +19247,19 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "SF")])
 
+(define_insn "cvtsi2ssq"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_merge:V4SF
+	 (match_operand:V4SF 1 "register_operand" "0,0")
+	 (vec_duplicate:V4SF
+	  (float:SF (match_operand:DI 2 "nonimmediate_operand" "r,rm")))
+	 (const_int 14)))]
+  "TARGET_SSE && TARGET_64BIT"
+  "cvtsi2ssq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "athlon_decode" "vector,vector")
+   (set_attr "mode" "SF")])
+
 (define_insn "cvtss2si"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(vec_select:SI
@@ -19103,6 +19270,17 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "SF")])
 
+(define_insn "cvtss2siq"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(vec_select:DI
+	 (fix:V4DI (match_operand:V4SF 1 "nonimmediate_operand" "x,m"))
+	 (parallel [(const_int 0)])))]
+  "TARGET_SSE && TARGET_64BIT"	; DI result in a GPR: 64-bit only, like cvttss2siq
+  "cvtss2siq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "athlon_decode" "vector,vector")
+   (set_attr "mode" "SF")])
+
 (define_insn "cvttss2si"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(vec_select:SI
@@ -19114,6 +19292,18 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "SF")])
 
+(define_insn "cvttss2siq"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(vec_select:DI
+	 (unspec:V4DI [(match_operand:V4SF 1 "nonimmediate_operand" "x,xm")]
+		      UNSPEC_FIX)
+	 (parallel [(const_int 0)])))]
+  "TARGET_SSE && TARGET_64BIT"
+  "cvttss2siq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "SF")
+   (set_attr "athlon_decode" "vector,vector")])
+
 
 ;; MMX insns
 
@@ -19121,7 +19311,7 @@
 
 (define_insn "addv8qi3"
   [(set (match_operand:V8QI 0 "register_operand" "=y")
-        (plus:V8QI (match_operand:V8QI 1 "register_operand" "0")
+        (plus:V8QI (match_operand:V8QI 1 "register_operand" "%0")
 	           (match_operand:V8QI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddb\t{%2, %0|%0, %2}"
@@ -19130,7 +19320,7 @@
 
 (define_insn "addv4hi3"
   [(set (match_operand:V4HI 0 "register_operand" "=y")
-        (plus:V4HI (match_operand:V4HI 1 "register_operand" "0")
+        (plus:V4HI (match_operand:V4HI 1 "register_operand" "%0")
 	           (match_operand:V4HI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddw\t{%2, %0|%0, %2}"
@@ -19139,16 +19329,27 @@
 
 (define_insn "addv2si3"
   [(set (match_operand:V2SI 0 "register_operand" "=y")
-        (plus:V2SI (match_operand:V2SI 1 "register_operand" "0")
+        (plus:V2SI (match_operand:V2SI 1 "register_operand" "%0")
 	           (match_operand:V2SI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddd\t{%2, %0|%0, %2}"
   [(set_attr "type" "mmxadd")
    (set_attr "mode" "DI")])
 
+(define_insn "mmx_adddi3"
+  [(set (match_operand:DI 0 "register_operand" "=y")
+        (unspec:DI
+	 [(plus:DI (match_operand:DI 1 "register_operand" "%0")
+		   (match_operand:DI 2 "nonimmediate_operand" "ym"))]
+	 UNSPEC_NOP))]
+  "TARGET_MMX"
+  "paddq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
 (define_insn "ssaddv8qi3"
   [(set (match_operand:V8QI 0 "register_operand" "=y")
-        (ss_plus:V8QI (match_operand:V8QI 1 "register_operand" "0")
+        (ss_plus:V8QI (match_operand:V8QI 1 "register_operand" "%0")
 		      (match_operand:V8QI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddsb\t{%2, %0|%0, %2}"
@@ -19157,7 +19358,7 @@
 
 (define_insn "ssaddv4hi3"
   [(set (match_operand:V4HI 0 "register_operand" "=y")
-        (ss_plus:V4HI (match_operand:V4HI 1 "register_operand" "0")
+        (ss_plus:V4HI (match_operand:V4HI 1 "register_operand" "%0")
 		      (match_operand:V4HI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddsw\t{%2, %0|%0, %2}"
@@ -19166,7 +19367,7 @@
 
 (define_insn "usaddv8qi3"
   [(set (match_operand:V8QI 0 "register_operand" "=y")
-        (us_plus:V8QI (match_operand:V8QI 1 "register_operand" "0")
+        (us_plus:V8QI (match_operand:V8QI 1 "register_operand" "%0")
 		      (match_operand:V8QI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddusb\t{%2, %0|%0, %2}"
@@ -19175,7 +19376,7 @@
 
 (define_insn "usaddv4hi3"
   [(set (match_operand:V4HI 0 "register_operand" "=y")
-        (us_plus:V4HI (match_operand:V4HI 1 "register_operand" "0")
+        (us_plus:V4HI (match_operand:V4HI 1 "register_operand" "%0")
 		      (match_operand:V4HI 2 "nonimmediate_operand" "ym")))]
   "TARGET_MMX"
   "paddusw\t{%2, %0|%0, %2}"
@@ -19209,6 +19410,17 @@
   [(set_attr "type" "mmxadd")
    (set_attr "mode" "DI")])
 
+(define_insn "mmx_subdi3"
+  [(set (match_operand:DI 0 "register_operand" "=y")
+        (unspec:DI
+	 [(minus:DI (match_operand:DI 1 "register_operand" "0")
+		    (match_operand:DI 2 "nonimmediate_operand" "ym"))]
+	 UNSPEC_NOP))]
+  "TARGET_MMX"
+  "psubq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
 (define_insn "sssubv8qi3"
   [(set (match_operand:V8QI 0 "register_operand" "=y")
         (ss_minus:V8QI (match_operand:V8QI 1 "register_operand" "0")
@@ -19312,7 +19524,7 @@
 (define_insn "mmx_iordi3"
   [(set (match_operand:DI 0 "register_operand" "=y")
         (unspec:DI
-	 [(ior:DI (match_operand:DI 1 "register_operand" "0")
+	 [(ior:DI (match_operand:DI 1 "register_operand" "%0")
 		  (match_operand:DI 2 "nonimmediate_operand" "ym"))]
 	 UNSPEC_NOP))]
   "TARGET_MMX"
@@ -19323,7 +19535,7 @@
 (define_insn "mmx_xordi3"
   [(set (match_operand:DI 0 "register_operand" "=y")
         (unspec:DI
-	 [(xor:DI (match_operand:DI 1 "register_operand" "0")
+	 [(xor:DI (match_operand:DI 1 "register_operand" "%0")
 		  (match_operand:DI 2 "nonimmediate_operand" "ym"))]
 	 UNSPEC_NOP))]
   "TARGET_MMX"
@@ -19346,7 +19558,7 @@
 (define_insn "mmx_anddi3"
   [(set (match_operand:DI 0 "register_operand" "=y")
         (unspec:DI
-	 [(and:DI (match_operand:DI 1 "register_operand" "0")
+	 [(and:DI (match_operand:DI 1 "register_operand" "%0")
 		  (match_operand:DI 2 "nonimmediate_operand" "ym"))]
 	 UNSPEC_NOP))]
   "TARGET_MMX"
@@ -19805,17 +20017,17 @@
 (define_insn "ldmxcsr"
   [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")]
 		    UNSPECV_LDMXCSR)]
-  "TARGET_MMX"
+  "TARGET_SSE"
   "ldmxcsr\t%0"
-  [(set_attr "type" "mmx")
+  [(set_attr "type" "sse")
    (set_attr "memory" "load")])
 
 (define_insn "stmxcsr"
   [(set (match_operand:SI 0 "memory_operand" "=m")
 	(unspec_volatile:SI [(const_int 0)] UNSPECV_STMXCSR))]
-  "TARGET_MMX"
+  "TARGET_SSE"
   "stmxcsr\t%0"
-  [(set_attr "type" "mmx")
+  [(set_attr "type" "sse")
    (set_attr "memory" "store")])
 
 (define_expand "sfence"
@@ -20471,7 +20683,7 @@
 	 (match_operator:V2DI 3 "sse_comparison_operator"
 			      [(match_operand:V2DF 1 "register_operand" "0")
 			       (match_operand:V2DF 2 "nonimmediate_operand" "x")])
-	 (match_dup 1)
+	 (subreg:V2DI (match_dup 1) 0)
 	 (const_int 1)))]
   "TARGET_SSE2"
   "cmp%D3sd\t{%2, %0|%0, %2}"
@@ -20692,6 +20904,15 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "SI")])
 
+(define_insn "cvtsd2siq"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(fix:DI (vec_select:DF (match_operand:V2DF 1 "register_operand" "xm")
+			       (parallel [(const_int 0)]))))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "cvtsd2siq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "DI")])	; DI result, consistent with cvttsd2siq
+
 (define_insn "cvttsd2si"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(unspec:SI [(vec_select:DF (match_operand:V2DF 1 "register_operand" "xm")
@@ -20701,6 +20922,16 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "SI")])
 
+(define_insn "cvttsd2siq"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(unspec:DI [(vec_select:DF (match_operand:V2DF 1 "register_operand" "x,xm")
+				   (parallel [(const_int 0)]))] UNSPEC_FIX))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "cvttsd2siq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "DI")
+   (set_attr "athlon_decode" "vector,vector")])
+
 (define_insn "cvtsi2sd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
 	(vec_merge:V2DF (match_operand:V2DF 1 "register_operand" "0")
@@ -20713,6 +20944,19 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "DF")])
 
+(define_insn "cvtsi2sdq"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(vec_merge:V2DF (match_operand:V2DF 1 "register_operand" "0,0")
+	 		(vec_duplicate:V2DF
+			  (float:DF
+			    (match_operand:DI 2 "nonimmediate_operand" "r,rm")))
+			(const_int 2)))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "cvtsi2sdq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "DF")
+   (set_attr "athlon_decode" "vector,direct")])
+
 ;; Conversions between SF and DF
 
 (define_insn "cvtsd2ss"
@@ -20770,7 +21014,7 @@
 
 (define_insn "addv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "=x")
-        (plus:V16QI (match_operand:V16QI 1 "register_operand" "0")
+        (plus:V16QI (match_operand:V16QI 1 "register_operand" "%0")
 		    (match_operand:V16QI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddb\t{%2, %0|%0, %2}"
@@ -20779,7 +21023,7 @@
 
 (define_insn "addv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
-        (plus:V8HI (match_operand:V8HI 1 "register_operand" "0")
+        (plus:V8HI (match_operand:V8HI 1 "register_operand" "%0")
 	           (match_operand:V8HI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddw\t{%2, %0|%0, %2}"
@@ -20788,7 +21032,7 @@
 
 (define_insn "addv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
-        (plus:V4SI (match_operand:V4SI 1 "register_operand" "0")
+        (plus:V4SI (match_operand:V4SI 1 "register_operand" "%0")
 	           (match_operand:V4SI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddd\t{%2, %0|%0, %2}"
@@ -20797,7 +21041,7 @@
 
 (define_insn "addv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
-        (plus:V2DI (match_operand:V2DI 1 "register_operand" "0")
+        (plus:V2DI (match_operand:V2DI 1 "register_operand" "%0")
 	           (match_operand:V2DI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddq\t{%2, %0|%0, %2}"
@@ -20806,7 +21050,7 @@
 
 (define_insn "ssaddv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "=x")
-        (ss_plus:V16QI (match_operand:V16QI 1 "register_operand" "0")
+        (ss_plus:V16QI (match_operand:V16QI 1 "register_operand" "%0")
 		       (match_operand:V16QI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddsb\t{%2, %0|%0, %2}"
@@ -20815,7 +21059,7 @@
 
 (define_insn "ssaddv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
-        (ss_plus:V8HI (match_operand:V8HI 1 "register_operand" "0")
+        (ss_plus:V8HI (match_operand:V8HI 1 "register_operand" "%0")
 		      (match_operand:V8HI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddsw\t{%2, %0|%0, %2}"
@@ -20824,7 +21068,7 @@
 
 (define_insn "usaddv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "=x")
-        (us_plus:V16QI (match_operand:V16QI 1 "register_operand" "0")
+        (us_plus:V16QI (match_operand:V16QI 1 "register_operand" "%0")
 		       (match_operand:V16QI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddusb\t{%2, %0|%0, %2}"
@@ -20833,7 +21077,7 @@
 
 (define_insn "usaddv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
-        (us_plus:V8HI (match_operand:V8HI 1 "register_operand" "0")
+        (us_plus:V8HI (match_operand:V8HI 1 "register_operand" "%0")
 		      (match_operand:V8HI 2 "nonimmediate_operand" "xm")))]
   "TARGET_SSE2"
   "paddusw\t{%2, %0|%0, %2}"
@@ -21069,7 +21313,8 @@
   [(set (match_operand:V8HI 0 "register_operand" "=x")
         (vec_merge:V8HI (match_operand:V8HI 1 "register_operand" "0")
 			(vec_duplicate:V8HI
-			 (match_operand:SI 2 "nonimmediate_operand" "rm"))
+			 (truncate:HI
+			   (match_operand:SI 2 "nonimmediate_operand" "rm")))
 			(match_operand:SI 3 "immediate_operand" "i")))]
   "TARGET_SSE2"
   "pinsrw\t{%3, %2, %0|%0, %2, %3}"
@@ -21218,7 +21463,7 @@
 (define_insn "ashrv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
         (ashiftrt:V8HI (match_operand:V8HI 1 "register_operand" "0")
-		       (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		       (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psraw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21227,7 +21472,7 @@
 (define_insn "ashrv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
         (ashiftrt:V4SI (match_operand:V4SI 1 "register_operand" "0")
-		       (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		       (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psrad\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21236,7 +21481,7 @@
 (define_insn "lshrv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
         (lshiftrt:V8HI (match_operand:V8HI 1 "register_operand" "0")
-		       (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		       (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psrlw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21245,7 +21490,7 @@
 (define_insn "lshrv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
         (lshiftrt:V4SI (match_operand:V4SI 1 "register_operand" "0")
-		       (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		       (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psrld\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21254,7 +21499,7 @@
 (define_insn "lshrv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
         (lshiftrt:V2DI (match_operand:V2DI 1 "register_operand" "0")
-		       (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		       (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psrlq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21263,7 +21508,7 @@
 (define_insn "ashlv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
         (ashift:V8HI (match_operand:V8HI 1 "register_operand" "0")
-		     (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		     (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psllw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21272,7 +21517,7 @@
 (define_insn "ashlv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
         (ashift:V4SI (match_operand:V4SI 1 "register_operand" "0")
-		     (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		     (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "pslld\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21281,7 +21526,7 @@
 (define_insn "ashlv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
         (ashift:V2DI (match_operand:V2DI 1 "register_operand" "0")
-		     (match_operand:SI 2 "nonmemory_operand" "xi")))]
+		     (match_operand:TI 2 "nonmemory_operand" "xi")))]
   "TARGET_SSE2"
   "psllq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
@@ -21595,45 +21840,41 @@
 
 (define_insn "sse2_movapd"
   [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,m")
-	(unspec:V2DF [(match_operand:V2DF 1 "general_operand" "xm,x")]
+	(unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")]
 		     UNSPEC_MOVA))]
-  "TARGET_SSE2"
-  "@
-   movapd\t{%1, %0|%0, %1}
-   movapd\t{%1, %0|%0, %1}"
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "movapd\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "V2DF")])
 
 (define_insn "sse2_movupd"
   [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,m")
-	(unspec:V2DF [(match_operand:V2DF 1 "general_operand" "xm,x")]
+	(unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")]
 		     UNSPEC_MOVU))]
-  "TARGET_SSE2"
-  "@
-   movupd\t{%1, %0|%0, %1}
-   movupd\t{%1, %0|%0, %1}"
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "movupd\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "V2DF")])
 
 (define_insn "sse2_movdqa"
   [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m")
-	(unspec:V16QI [(match_operand:V16QI 1 "general_operand" "xm,x")]
+	(unspec:V16QI [(match_operand:V16QI 1 "nonimmediate_operand" "xm,x")]
 		       UNSPEC_MOVA))]
-  "TARGET_SSE2"
-  "@
-   movdqa\t{%1, %0|%0, %1}
-   movdqa\t{%1, %0|%0, %1}"
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "movdqa\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_movdqu"
   [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m")
-	(unspec:V16QI [(match_operand:V16QI 1 "general_operand" "xm,x")]
+	(unspec:V16QI [(match_operand:V16QI 1 "nonimmediate_operand" "xm,x")]
 		       UNSPEC_MOVU))]
-  "TARGET_SSE2"
-  "@
-   movdqu\t{%1, %0|%0, %1}
-   movdqu\t{%1, %0|%0, %1}"
+  "TARGET_SSE2
+   && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
+  "movdqu\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "TI")])
 
@@ -21641,24 +21882,48 @@
   [(set (match_operand:DI 0 "nonimmediate_operand" "=m,y")
 	(vec_select:DI (match_operand:V2DI 1 "register_operand" "x,x")
 		       (parallel [(const_int 0)])))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !TARGET_64BIT"
   "@
    movq\t{%1, %0|%0, %1}
    movdq2q\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "TI")])
 
+(define_insn "sse2_movdq2q_rex64"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=m,y,r")
+	(vec_select:DI (match_operand:V2DI 1 "register_operand" "x,x,x")
+		       (parallel [(const_int 0)])))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "@
+   movq\t{%1, %0|%0, %1}
+   movdq2q\t{%1, %0|%0, %1}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "TI")])
+
 (define_insn "sse2_movq2dq"
   [(set (match_operand:V2DI 0 "register_operand" "=x,?x")
 	(vec_concat:V2DI (match_operand:DI 1 "nonimmediate_operand" "m,y")
 			 (const_int 0)))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !TARGET_64BIT"
   "@
    movq\t{%1, %0|%0, %1}
    movq2dq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt,ssemov")
    (set_attr "mode" "TI")])
 
+(define_insn "sse2_movq2dq_rex64"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,?x,?x")
+	(vec_concat:V2DI (match_operand:DI 1 "nonimmediate_operand" "m,y,r")
+			 (const_int 0)))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "@
+   movq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt,ssemov,ssecvt")
+   (set_attr "mode" "TI")])
+
 (define_insn "sse2_movq"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
 	(vec_concat:V2DI (vec_select:DI
@@ -21673,7 +21938,7 @@
 (define_insn "sse2_loadd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
 	(vec_merge:V4SI
-	 (vec_duplicate:V4HI (match_operand:SI 1 "nonimmediate_operand" "mr"))
+	 (vec_duplicate:V4SI (match_operand:SI 1 "nonimmediate_operand" "mr"))
 	 (const_vector:V4SI [(const_int 0)
 			     (const_int 0)
 			     (const_int 0)
@@ -21716,11 +21981,21 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "V2DF")])
 
-(define_insn "sse2_loadsd"
+(define_expand "sse2_loadsd"
+  [(match_operand:V2DF 0 "register_operand" "")
+   (match_operand:DF 1 "memory_operand" "")]
+  "TARGET_SSE2"
+{
+  emit_insn (gen_sse2_loadsd_1 (operands[0], operands[1],
+			        CONST0_RTX (V2DFmode)));
+  DONE;
+})
+
+(define_insn "sse2_loadsd_1"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
 	(vec_merge:V2DF
-	 (match_operand:DF 1 "memory_operand" "m")
-	 (vec_duplicate:DF (float:DF (const_int 0)))
+	 (vec_duplicate:V2DF (match_operand:DF 1 "memory_operand" "m"))
+	 (match_operand:V2DF 2 "const0_operand" "X")
 	 (const_int 1)))]
   "TARGET_SSE2"
   "movsd\t{%1, %0|%0, %1}"
diff --git a/gcc/config/i386/linux64.h b/gcc/config/i386/linux64.h
index ae346e6d518..7a9e0ba989f 100644
--- a/gcc/config/i386/linux64.h
+++ b/gcc/config/i386/linux64.h
@@ -1,5 +1,5 @@
 /* Definitions for AMD x86-64 running Linux-based GNU systems with ELF format.
-   Copyright (C) 2001, 2002 Free Software Foundation, Inc.
+   Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
    Contributed by Jan Hubicka <jh@suse.cz>, based on linux.h.
 
 This file is part of GNU CC.
@@ -36,6 +36,11 @@ Boston, MA 02111-1307, USA.  */
 	    builtin_define ("__PIC__");				\
 	    builtin_define ("__pic__");				\
 	  }							\
+	if (TARGET_64BIT)					\
+	  {							\
+	    builtin_define ("__LP64__");			\
+	    builtin_define ("_LP64");				\
+	  }							\
     }								\
   while (0)
 
@@ -116,17 +121,17 @@ Boston, MA 02111-1307, USA.  */
     (FS)->regs.reg[0].how = REG_SAVED_OFFSET;				\
     (FS)->regs.reg[0].loc.offset = (long)&sc_->rax - new_cfa_;		\
     (FS)->regs.reg[1].how = REG_SAVED_OFFSET;				\
-    (FS)->regs.reg[1].loc.offset = (long)&sc_->rbx - new_cfa_;		\
+    (FS)->regs.reg[1].loc.offset = (long)&sc_->rdx - new_cfa_;		\
     (FS)->regs.reg[2].how = REG_SAVED_OFFSET;				\
     (FS)->regs.reg[2].loc.offset = (long)&sc_->rcx - new_cfa_;		\
     (FS)->regs.reg[3].how = REG_SAVED_OFFSET;				\
-    (FS)->regs.reg[3].loc.offset = (long)&sc_->rdx - new_cfa_;		\
+    (FS)->regs.reg[3].loc.offset = (long)&sc_->rbx - new_cfa_;		\
     (FS)->regs.reg[4].how = REG_SAVED_OFFSET;				\
-    (FS)->regs.reg[4].loc.offset = (long)&sc_->rbp - new_cfa_;		\
+    (FS)->regs.reg[4].loc.offset = (long)&sc_->rsi - new_cfa_;		\
     (FS)->regs.reg[5].how = REG_SAVED_OFFSET;				\
-    (FS)->regs.reg[5].loc.offset = (long)&sc_->rsi - new_cfa_;		\
+    (FS)->regs.reg[5].loc.offset = (long)&sc_->rdi - new_cfa_;		\
     (FS)->regs.reg[6].how = REG_SAVED_OFFSET;				\
-    (FS)->regs.reg[6].loc.offset = (long)&sc_->rdi - new_cfa_;		\
+    (FS)->regs.reg[6].loc.offset = (long)&sc_->rbp - new_cfa_;		\
     (FS)->regs.reg[8].how = REG_SAVED_OFFSET;				\
     (FS)->regs.reg[8].loc.offset = (long)&sc_->r8 - new_cfa_;		\
     (FS)->regs.reg[9].how = REG_SAVED_OFFSET;				\
@@ -143,6 +148,8 @@ Boston, MA 02111-1307, USA.  */
     (FS)->regs.reg[14].loc.offset = (long)&sc_->r14 - new_cfa_;		\
     (FS)->regs.reg[15].how = REG_SAVED_OFFSET;				\
     (FS)->regs.reg[15].loc.offset = (long)&sc_->r15 - new_cfa_;		\
+    (FS)->regs.reg[16].how = REG_SAVED_OFFSET;				\
+    (FS)->regs.reg[16].loc.offset = (long)&sc_->rip - new_cfa_;		\
     (FS)->retaddr_column = 16;						\
     goto SUCCESS;							\
   } while (0)
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index e7c5e8b6bcc..7f62fbd5624 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -1,6 +1,7 @@
 /* Operating system specific defines to be used when targeting GCC for
    hosting on Windows32, using GNU tools and the Windows32 API Library.
-   Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003
+   Free Software Foundation, Inc.
 
 This file is part of GNU CC.
 
@@ -89,7 +90,7 @@ Boston, MA 02111-1307, USA.  */
 /* Include in the mingw32 libraries with libgcc */
 #undef LIBGCC_SPEC
 #define LIBGCC_SPEC \
-  "%{mthreads:-lmingwthrd} -lmingw32 -lgcc -lmoldname -lmsvcrt"
+  "%{mthreads:-lmingwthrd} -lmingw32 -lgcc -lmoldname -lmingwex -lmsvcrt"
 
 #undef STARTFILE_SPEC
 #define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \
diff --git a/gcc/config/i386/mmintrin.h b/gcc/config/i386/mmintrin.h
index 52e5195fbaf..7b4aa014645 100644
--- a/gcc/config/i386/mmintrin.h
+++ b/gcc/config/i386/mmintrin.h
@@ -56,6 +56,22 @@ _mm_cvtsi32_si64 (int __i)
   return (__m64) __tmp;
 }
 
+#ifdef __x86_64__
+/* Convert I to a __m64 object.  */
+static __inline __m64 
+_mm_cvtsi64x_si64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+/* Convert I to a __m64 object.  */
+static __inline __m64 
+_mm_set_pi64x (long long __i)
+{
+  return (__m64) __i;
+}
+#endif
+
 /* Convert the lower 32 bits of the __m64 object into an integer.  */
 static __inline int
 _mm_cvtsi64_si32 (__m64 __i)
@@ -64,6 +80,15 @@ _mm_cvtsi64_si32 (__m64 __i)
   return __tmp;
 }
 
+#ifdef __x86_64__
+/* Convert the __m64 object to a 64-bit integer.  */
+static __inline long long
+_mm_cvtsi64_si64x (__m64 __i)
+{
+  return (long long)__i;
+}
+#endif
+
 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
    the result, and the four 16-bit values from M2 into the upper four 8-bit
    values of the result, all with signed saturation.  */
@@ -160,6 +185,13 @@ _mm_add_pi32 (__m64 __m1, __m64 __m2)
   return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
 }
 
+/* Add the 64-bit value in M1 to the 64-bit value in M2.  */
+static __inline __m64
+_mm_add_si64 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
+}
+
 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
    saturated arithmetic.  */
 static __inline __m64
@@ -213,6 +245,13 @@ _mm_sub_pi32 (__m64 __m1, __m64 __m2)
   return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
 }
 
+/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
+static __inline __m64
+_mm_sub_si64 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
+}
+
 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
    saturating arithmetic.  */
 static __inline __m64
diff --git a/gcc/config/i386/scodbx.h b/gcc/config/i386/scodbx.h
deleted file mode 100644
index 7da93053256..00000000000
--- a/gcc/config/i386/scodbx.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Definitions for Intel 386 running SCO Unix System V,
-   using dbx-in-coff encapsulation.
-   Copyright (C) 1992, 1995, 1996, 1999 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-#include "i386/svr3dbx.h"
-
-/* Overridden defines for SCO systems from sco.h. */
-
-/* By default, target has a 80387, uses IEEE compatible arithmetic,
-   and returns float values in the 387, ie,
-   (TARGET_80387 | TARGET_FLOAT_RETURNS_IN_80387)
-
-   SCO's software emulation of a 387 fails to handle the `fucomp'
-   opcode.  fucomp is only used when generating IEEE compliant code.
-   So don't make TARGET_IEEE_FP default for SCO. */
-
-#undef TARGET_SUBTARGET_DEFAULT
-#define TARGET_SUBTARGET_DEFAULT (MASK_80387 | MASK_FLOAT_RETURNS)
-
-/* Use crt1.o as a startup file and crtn.o as a closing file.  */
-
-#undef STARTFILE_SPEC
-#define STARTFILE_SPEC \
-  "%{!r:%{!z:svr3.ifile%s}%{z:svr3z.ifile%s}}\
-   %{pg:gcrt1.o%s}%{!pg:%{p:mcrt1.o%s}%{!p:crt1.o%s}}"
-
-/* Library spec, including SCO international language support. */
-
-#undef LIB_SPEC
-#define LIB_SPEC \
- "%{p:-L/usr/lib/libp}%{pg:-L/usr/lib/libp} %{scointl:libintl.a%s} -lc"
-
-/* Specify predefined symbols in preprocessor.  */
-
-#undef CPP_PREDEFINES
-#define CPP_PREDEFINES "-Dunix -DM_UNIX -DM_I386 -DM_COFF -DM_WORDSWAP -Asystem=svr3"
-
-#undef CPP_SPEC
-#define CPP_SPEC "%(cpp_cpu) %{scointl:-DM_INTERNAT}"
-
-/* This spec is used for telling cpp whether char is signed or not.  */
-
-#undef SIGNED_CHAR_SPEC
-#if DEFAULT_SIGNED_CHAR
-#define SIGNED_CHAR_SPEC \
- "%{funsigned-char:-D__CHAR_UNSIGNED__ -D_CHAR_UNSIGNED}"
-#else
-#define SIGNED_CHAR_SPEC \
- "%{!fsigned-char:-D__CHAR_UNSIGNED__ -D_CHAR_UNSIGNED}"
-#endif
-
-/* caller has to pop the extra argument passed to functions that return
-   structures. */
-
-#undef RETURN_POPS_ARGS
-#define RETURN_POPS_ARGS(FUNDECL,FUNTYPE,SIZE)   \
-  ((FUNDECL) && TREE_CODE (FUNDECL) == IDENTIFIER_NODE ? 0	\
-   : (TARGET_RTD						\
-      && (TYPE_ARG_TYPES (FUNTYPE) == 0				\
-	  || (TREE_VALUE (tree_last (TYPE_ARG_TYPES (FUNTYPE)))	\
-	      == void_type_node))) ? (SIZE)			\
-   : 0)
-/* On other 386 systems, the last line looks like this:
-   : (aggregate_value_p (TREE_TYPE (FUNTYPE))) ? GET_MODE_SIZE (Pmode) : 0)  */
-
-/* Handle #pragma pack. */
-#define HANDLE_SYSV_PRAGMA
diff --git a/gcc/config/i386/t-sco5gas b/gcc/config/i386/t-sco5gas
index 2d0b48a6292..edeb554eea0 100644
--- a/gcc/config/i386/t-sco5gas
+++ b/gcc/config/i386/t-sco5gas
@@ -1,6 +1,6 @@
 # The pushl in CTOR initialization interferes with frame pointer elimination.
 CRTSTUFF_T_CFLAGS   = -fPIC -fno-omit-frame-pointer
-CRTSTUFF_T_CFLAGS_S = -mcoff -fno-omit-frame-pointer
+CRTSTUFF_T_CFLAGS_S = -fno-omit-frame-pointer
 
 #
 # I am still a little unsure of the multilib architecture. The following
diff --git a/gcc/config/i386/xm-dgux.h b/gcc/config/i386/xm-dgux.h
deleted file mode 100644
index 881c5c7be9d..00000000000
--- a/gcc/config/i386/xm-dgux.h
+++ /dev/null
@@ -1,4 +0,0 @@
-/* Configuration for GCC for Intel i386 running DG/ux */
-
-/* looks just like sysv4 for now */
-#include "xm-svr4.h"
diff --git a/gcc/config/i386/xm-sun.h b/gcc/config/i386/xm-sun.h
deleted file mode 100644
index 6c0f0a25630..00000000000
--- a/gcc/config/i386/xm-sun.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Configuration for GNU C-compiler for Intel 80386 running SunOS 4.0.
-   Copyright (C) 1988, 1997 Free Software Foundation, Inc.
-
-This file is part of GNU CC.
-
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-#define USG
diff --git a/gcc/config/i386/xm-sysv3.h b/gcc/config/i386/xm-sysv3.h
deleted file mode 100644
index 9a655443ff5..00000000000
--- a/gcc/config/i386/xm-sysv3.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* Configuration for GCC for Intel i386 running System V Release 3.  */
-
-#include "xm-svr3.h"
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 4136e901795..43a05c1a6ee 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -475,6 +475,16 @@ _mm_cvtss_si32 (__m128 __A)
   return __builtin_ia32_cvtss2si ((__v4sf) __A);
 }
 
+#ifdef __x86_64__
+/* Convert the lower SPFP value to a 64-bit integer according to the current
+   rounding mode.  */
+static __inline long long
+_mm_cvtss_si64x (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+#endif
+
 /* Convert the two lower SPFP values to 32-bit integers according to the
    current rounding mode.  Return the integers in packed form.  */
 static __inline __m64
@@ -490,6 +500,15 @@ _mm_cvttss_si32 (__m128 __A)
   return __builtin_ia32_cvttss2si ((__v4sf) __A);
 }
 
+#ifdef __x86_64__
+/* Truncate the lower SPFP value to a 64-bit integer.  */
+static __inline long long
+_mm_cvttss_si64x (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+#endif
+
 /* Truncate the two lower SPFP values to 32-bit integers.  Return the
    integers in packed form.  */
 static __inline __m64
@@ -505,6 +524,15 @@ _mm_cvtsi32_ss (__m128 __A, int __B)
   return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
 }
 
+#ifdef __x86_64__
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+static __inline __m128
+_mm_cvtsi64x_ss (__m128 __A, long long __B)
+{
+  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+#endif
+
 /* Convert the two 32-bit values in B to SPFP form and insert them
    as the two lower elements in A.  */
 static __inline __m128
@@ -1586,13 +1614,13 @@ _mm_ucomineq_sd (__m128d __A, __m128d __B)
 static __inline __m128i
 _mm_load_si128 (__m128i const *__P)
 {
-  return (__m128i) __builtin_ia32_loaddqa (__P);
+  return (__m128i) __builtin_ia32_loaddqa ((char const *)__P);
 }
 
 static __inline __m128i
 _mm_loadu_si128 (__m128i const *__P)
 {
-  return (__m128i) __builtin_ia32_loaddqu (__P);
+  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
 }
 
 static __inline __m128i
@@ -1604,13 +1632,13 @@ _mm_loadl_epi64 (__m128i const *__P)
 static __inline void
 _mm_store_si128 (__m128i *__P, __m128i __B)
 {
-  __builtin_ia32_storedqa (__P, (__v16qi)__B);
+  __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B);
 }
 
 static __inline void
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
-  __builtin_ia32_storedqu (__P, (__v16qi)__B);
+  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
 static __inline void
@@ -1619,6 +1647,12 @@ _mm_storel_epi64 (__m128i *__P, __m128i __B)
   *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B);
 }
 
+static __inline __m64
+_mm_movepi64_pi64 (__m128i __B)
+{
+  return (__m64) __builtin_ia32_movdq2q ((__v2di)__B);
+}
+
 static __inline __m128i
 _mm_move_epi64 (__m128i __A)
 {
@@ -1656,6 +1690,24 @@ _mm_set_epi32 (int __Z, int __Y, int __X, int __W)
 
   return __u.__v;
 }
+
+#ifdef __x86_64__
+/* Create the vector [Z Y]: Y goes in element 0, Z in element 1.  */
+static __inline __m128i
+_mm_set_epi64x (long long __Z, long long __Y)
+{
+  union {
+    long long __a[2];	/* Same width as the parameters; long may be 32 bits.  */
+    __m128i __v;
+  } __u;
+
+  __u.__a[0] = __Y;
+  __u.__a[1] = __Z;
+
+  return __u.__v;
+}
+#endif
+
 /* Create the vector [S T U V Z Y X W].  */
 static __inline __m128i
 _mm_set_epi16 (short __Z, short __Y, short __X, short __W,
@@ -1724,6 +1776,15 @@ _mm_set1_epi32 (int __A)
   return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
 }
 
+#ifdef __x86_64__
+static __inline __m128i
+_mm_set1_epi64x (long long __A)
+{
+  __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
+  return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0));
+}
+#endif
+
 static __inline __m128i
 _mm_set1_epi16 (short __A)
 {
@@ -1893,12 +1954,28 @@ _mm_cvtsd_si32 (__m128d __A)
   return __builtin_ia32_cvtsd2si ((__v2df) __A);
 }
 
+#ifdef __x86_64__
+static __inline long long
+_mm_cvtsd_si64x (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+#endif
+
 static __inline int
 _mm_cvttsd_si32 (__m128d __A)
 {
   return __builtin_ia32_cvttsd2si ((__v2df) __A);
 }
 
+#ifdef __x86_64__
+static __inline long long
+_mm_cvttsd_si64x (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+#endif
+
 static __inline __m128
 _mm_cvtsd_ss (__m128 __A, __m128d __B)
 {
@@ -1911,6 +1988,14 @@ _mm_cvtsi32_sd (__m128d __A, int __B)
   return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
 }
 
+#ifdef __x86_64__
+static __inline __m128d
+_mm_cvtsi64x_sd (__m128d __A, long long __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+#endif
+
 static __inline __m128d
 _mm_cvtss_sd (__m128d __A, __m128 __B)
 {
@@ -2048,7 +2133,7 @@ _mm_add_epi32 (__m128i __A, __m128i __B)
 static __inline __m128i
 _mm_add_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddq128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
 }
 
 static __inline __m128i
@@ -2096,7 +2181,7 @@ _mm_sub_epi32 (__m128i __A, __m128i __B)
 static __inline __m128i
 _mm_sub_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_psubq128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
 }
 
 static __inline __m128i
@@ -2142,7 +2227,7 @@ _mm_mullo_epi16 (__m128i __A, __m128i __B)
 }
 
 static __inline __m64
-_mm_mul_pu16 (__m64 __A, __m64 __B)
+_mm_mul_su32 (__m64 __A, __m64 __B)
 {
   return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
 }
@@ -2459,6 +2544,14 @@ _mm_cvtsi32_si128 (int __A)
   return (__m128i) __builtin_ia32_loadd (&__A);
 }
 
+#ifdef __x86_64__
+static __inline __m128i
+_mm_cvtsi64x_si128 (long long __A)
+{
+  return (__m128i) __builtin_ia32_movq2dq (__A);
+}
+#endif
+
 static __inline int
 _mm_cvtsi128_si32 (__m128i __A)
 {
@@ -2467,6 +2560,14 @@ _mm_cvtsi128_si32 (__m128i __A)
   return __tmp;
 }
 
+#ifdef __x86_64__
+static __inline long long
+_mm_cvtsi128_si64x (__m128i __A)
+{
+  return __builtin_ia32_movdq2q ((__v2di)__A);
+}
+#endif
+
 #endif /* __SSE2__  */
 
 #endif /* __SSE__ */