Added unaligned load and dot product to the ARM NEON backendst/cli-fe-vect

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/st/cli-fe-vect@164473 138bc75d-0d04-0410-961f-82ee72b054a4
author: dyuste <dyuste@138bc75d-0d04-0410-961f-82ee72b054a4> 2010-09-21 09:13:34 +0000
committer: dyuste <dyuste@138bc75d-0d04-0410-961f-82ee72b054a4> 2010-09-21 09:13:34 +0000
commit: 5cb58c2b8c38824c192f1c381a31ca4fed20b57d (patch)
tree: 7d0466eb01245a79cacd05fd3622e799e69580de
parent: 502a37341441ecd0e45097df1845bbc7610934ae (diff)
15 files changed, 223 insertions, 11 deletions
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index ed569e2df0c..d3402407ba8 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -224,6 +224,11 @@ static bool arm_can_eliminate (const int, const int);
 static void arm_asm_trampoline_template (FILE *);
 static void arm_trampoline_init (rtx, tree, rtx);
 static rtx arm_trampoline_adjust_address (rtx);
+static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
+static bool arm_builtin_support_vector_misalignment (enum machine_mode mode,
+                                                    const_tree type,
+                                                    int misalignment,
+                                                    bool is_packed);
 
 
 /* Table of machine attributes.  */
@@ -507,6 +512,14 @@ static const struct attribute_spec arm_attribute_table[] =
 #undef TARGET_CAN_ELIMINATE
 #define TARGET_CAN_ELIMINATE arm_can_eliminate
 
+#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
+#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
+  arm_vector_alignment_reachable
+
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+  arm_builtin_support_vector_misalignment
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 /* Obstack for minipool constant handling.  */
@@ -8436,7 +8449,8 @@ neon_vector_mem_operand (rtx op, int type)
     return arm_address_register_rtx_p (ind, 0);
 
   /* Allow post-increment with Neon registers.  */
-  if (type != 1 && (GET_CODE (ind) == POST_INC || GET_CODE (ind) == PRE_DEC))
+  if ((type != 1 && GET_CODE (ind) == POST_INC)
+      || (type == 0 && GET_CODE (ind) == PRE_DEC))
     return arm_address_register_rtx_p (XEXP (ind, 0), 0);
 
   /* FIXME: vld1 allows register post-modify.  */
@@ -15404,6 +15418,8 @@ arm_print_operand (FILE *stream, rtx x, int code)
       {
 	rtx addr;
 	bool postinc = FALSE;
+	unsigned align, modesize, align_bits;
+
 	gcc_assert (GET_CODE (x) == MEM);
 	addr = XEXP (x, 0);
 	if (GET_CODE (addr) == POST_INC)
@@ -15411,7 +15427,29 @@ arm_print_operand (FILE *stream, rtx x, int code)
 	    postinc = 1;
 	    addr = XEXP (addr, 0);
 	  }
-	asm_fprintf (stream, "[%r]", REGNO (addr));
+	asm_fprintf (stream, "[%r", REGNO (addr));
+
+	/* We know the alignment of this access, so we can emit a hint in the
+	   instruction (for some alignments) as an aid to the memory subsystem
+	   of the target.  */
+	align = MEM_ALIGN (x) >> 3;
+	modesize = GET_MODE_SIZE (GET_MODE (x));
+	
+	/* Only certain alignment specifiers are supported by the hardware.  */
+	if (modesize == 16 && (align % 32) == 0)
+	  align_bits = 256;
+	else if ((modesize == 8 || modesize == 16) && (align % 16) == 0)
+	  align_bits = 128;
+	else if ((align % 8) == 0)
+	  align_bits = 64;
+	else
+	  align_bits = 0;
+	
+	if (align_bits != 0)
+	  asm_fprintf (stream, ", :%d", align_bits);
+
+	asm_fprintf (stream, "]");
+
 	if (postinc)
 	  fputs("!", stream);
       }
@@ -21457,4 +21495,43 @@ arm_have_conditional_execution (void)
   return !TARGET_THUMB1;
 }
 
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
+{
+  /* Vectors which aren't in packed structures will not be less aligned than
+     the natural alignment of their element type, so this is safe.  */
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    return !is_packed;
+
+  return default_builtin_vector_alignment_reachable (type, is_packed);
+}
+
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+					 const_tree type, int misalignment,
+					 bool is_packed)
+{
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    {
+      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+
+      if (is_packed)
+        return align == 1;
+
+      /* If the misalignment is unknown, we should be able to handle the access
+	 so long as it is not to a member of a packed data structure.  */
+      if (misalignment == -1)
+        return true;
+
+      /* Return true if the misalignment is a multiple of the natural alignment
+         of the vector's element type.  This is probably always going to be
+	 true in practice, since we've already established that this isn't a
+	 packed access.  */
+      return ((misalignment % align) == 0);
+    }
+  
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+						      is_packed);
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 43b3805c7ba..c22bf56f19e 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -159,7 +159,8 @@
    (UNSPEC_VUZP1		201)
    (UNSPEC_VUZP2		202)
    (UNSPEC_VZIP1		203)
-   (UNSPEC_VZIP2		204)])
+   (UNSPEC_VZIP2		204)
+   (UNSPEC_MISALIGNED_ACCESS	205)])
 
 ;; Double-width vector modes.
 (define_mode_iterator VD [V8QI V4HI V2SI V2SF])
@@ -674,6 +675,52 @@
   neon_disambiguate_copy (operands, dest, src, 4);
 })
 
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:VDQX 0 "nonimmediate_operand"	      "")
+	(unspec:VDQX [(match_operand:VDQX 1 "general_operand" "")]
+		     UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+{
+  /* This pattern is not permitted to fail during expansion: if both arguments
+     are non-registers (e.g. memory := constant, which can be created by the
+     auto-vectorizer), force operand 1 into a register.  */
+  if (!s_register_operand (operands[0], <MODE>mode)
+      && !s_register_operand (operands[1], <MODE>mode))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*movmisalign<mode>_neon_store"
+  [(set (match_operand:VDX 0 "memory_operand"		       "=Um")
+	(unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vst1.<V_sz_elem>\t{%P1}, %A0"
+  [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")])
+
+(define_insn "*movmisalign<mode>_neon_load"
+  [(set (match_operand:VDX 0 "s_register_operand"	   "=w")
+	(unspec:VDX [(match_operand:VDX 1 "memory_operand" " Um")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vld1.<V_sz_elem>\t{%P0}, %A1"
+  [(set_attr "neon_type" "neon_vld1_1_2_regs")])
+
+(define_insn "*movmisalign<mode>_neon_store"
+  [(set (match_operand:VQX 0 "memory_operand"		       "=Um")
+	(unspec:VQX [(match_operand:VQX 1 "s_register_operand" " w")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vst1.<V_sz_elem>\t{%q1}, %A0"
+  [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")])
+
+(define_insn "*movmisalign<mode>_neon_load"
+  [(set (match_operand:VQX 0 "s_register_operand"	   "=w")
+	(unspec:VQX [(match_operand:VQX 1 "memory_operand" " Um")]
+		    UNSPEC_MISALIGNED_ACCESS))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vld1.<V_sz_elem>\t{%q0}, %A1"
+  [(set_attr "neon_type" "neon_vld1_1_2_regs")])
+
 (define_insn "vec_set<mode>_internal"
   [(set (match_operand:VD 0 "s_register_operand" "=w")
         (vec_merge:VD
@@ -5051,3 +5098,38 @@
   emit_insn (gen_orn<mode>3_neon (operands[0], operands[1], operands[2]));
   DONE;
 })
+
+
+;; dot product
+
+(define_expand "udot_prodv4hi"
+  [(match_operand:V2SI 0 "register_operand" "=v")
+   (match_operand:V4HI 1 "register_operand" "v")
+   (match_operand:V4HI 2 "register_operand" "v")
+   (match_operand:V2SI 3 "register_operand" "v")]  
+ "TARGET_NEON"
+
+{
+ /*rtx tmp = gen_rtx_REG (V4HImode);*/
+ emit_insn (gen_neon_vmulv4hi (operands[1], operands[1], operands[2], const0_rtx));
+ emit_insn (gen_neon_vpadalv4hi (operands[3], operands[3], operands[1], const0_rtx));
+ emit_move_insn (operands[0], operands[3]);
+ DONE;
+})
+
+(define_expand "sdot_prodv4hi"
+  [(match_operand:V2SI 0 "register_operand" "=v")
+   (match_operand:V4HI 1 "register_operand" "v")
+   (match_operand:V4HI 2 "register_operand" "v")
+   (match_operand:V2SI 3 "register_operand" "v")]  
+ "TARGET_NEON"
+
+{
+ /* FIXME: operands[1] gets clobbered and it shouldn't. A free register must be used as temporary */
+ emit_insn (gen_neon_vmulv4hi (operands[1], operands[1], operands[2], const0_rtx));
+ emit_insn (gen_neon_vpadalv4hi (operands[3], operands[3], operands[1], const0_rtx));
+ emit_move_insn (operands[0], operands[3]);
+ DONE;
+})
+
+
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 6fbb7cdcdac..9971cedbfb5 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -2750,3 +2750,4 @@
   emit_insn (gen_altivec_vcfux (operands[0], tmp, const0_rtx));
   DONE;
 }")
+
diff --git a/gcc/expand-vect-cli.c b/gcc/expand-vect-cli.c
index aacdbfb749b..81f8219916a 100644
--- a/gcc/expand-vect-cli.c
+++ b/gcc/expand-vect-cli.c
@@ -219,12 +219,12 @@ get_vectype (tree scalar_type, bool *ignore_stmt)
       return NULL_TREE;
     }
 
-  if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD (inner_mode))
+  if (nbytes == 0 || nbytes >= simd_width)
     return NULL_TREE;
 
   /* FORNOW: Only a single vector size per mode (UNITS_PER_SIMD_WORD)
      is expected.  */
-  nunits = UNITS_PER_SIMD_WORD (inner_mode) / nbytes;
+  nunits = simd_width / nbytes;
 
   vectype = build_vector_type (scalar_type, nunits);
 
diff --git a/gcc/expr.c b/gcc/expr.c
index ad66d934d2f..667dd76e76d 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -4351,7 +4351,10 @@ expand_assignment (tree to, tree from, bool nontemporal)
            && op_mode1 != VOIDmode)
          reg = copy_to_mode_reg (op_mode1, reg);
 
-      insn = GEN_FCN (icode) (mem, reg);
+       insn = GEN_FCN (icode) (mem, reg);
+       /* The movmisalign<mode> pattern cannot fail, else the assignment would
+          silently be omitted.  */
+       gcc_assert (insn != NULL_RTX);
        emit_insn (insn);
        return;
      }
@@ -8718,6 +8721,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode,
 
 	    /* Nor can the insn generator.  */
 	    insn = GEN_FCN (icode) (reg, temp);
+	    gcc_assert (insn != NULL_RTX);
 	    emit_insn (insn);
 
 	    return reg;
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c
index ea67946e505..afa5b3d241e 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-8.c
@@ -46,5 +46,5 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail { ! { vect_hw_misalign } } } } } */
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail { ! { vect_element_align } } } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c b/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c
index 954fe25df04..954474eb925 100644
--- a/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c
+++ b/gcc/testsuite/gcc.dg/vect/no-vfa-pr29145.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/slp-25.c b/gcc/testsuite/gcc.dg/vect/slp-25.c
index b660508a79c..ebeebd8c10b 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-25.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-25.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/slp-3.c b/gcc/testsuite/gcc.dg/vect/slp-3.c
index 070715371bb..18614cdfb2e 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-3.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-3.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include <stdio.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-109.c b/gcc/testsuite/gcc.dg/vect/vect-109.c
index dd9f8ea77a3..420d0421584 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-109.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-109.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-95.c b/gcc/testsuite/gcc.dg/vect/vect-95.c
index c1d5926e67d..c03d1965df1 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-95.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-95.c
@@ -56,14 +56,14 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail {vect_hw_misalign} } } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail {vect_element_align} } } } */
 
 /* For targets that support unaligned loads we version for the two unaligned 
    stores and generate misaligned accesses for the loads. For targets that 
    don't support unaligned loads we version for all four accesses.  */
 
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign} } } }  */
-/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_element_align} } } }  */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 2 "vect" { xfail { vect_no_align || vect_element_align } } } } */
 /*  { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 0 "vect" { target vect_no_align } } } */
 /*  { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 4 "vect" { target vect_no_align } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
index e8fe027f5f3..cf3dcca359f 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
index 274fb025319..eb16e8354ae 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-5.c b/gcc/testsuite/gcc.dg/vect/vect-outer-5.c
index 2fc421e9680..01094d343e8 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-outer-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-outer-5.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_float } */
+/* { dg-add-options quad_vectors } */
 
 #include <stdio.h>
 #include <stdarg.h>
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 126ae380fe7..33c747141f6 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1575,6 +1575,18 @@ proc check_effective_target_arm32 { } {
     }]
 }
 
+# Return 1 if this is an ARM target that only supports aligned vector accesses
+proc check_effective_target_arm_vect_no_misalign { } {
+    return [check_no_compiler_messages arm_vect_no_misalign assembly {
+	#if !defined(__arm__) \
+	    || (defined(__ARMEL__) \
+	        && (!defined(__thumb__) || defined(__thumb2__)))
+	#error FOO
+	#endif
+    }]
+}
+
+
 # Return 1 if this is an ARM target supporting -mfpu=vfp
 # -mfloat-abi=softfp.  Some multilibs may be incompatible with these
 # options.
@@ -2392,7 +2404,7 @@ proc check_effective_target_vect_no_align { } {
 	if { [istarget mipsisa64*-*-*]
 	     || [istarget sparc*-*-*]
 	     || [istarget ia64-*-*]
-	     || [check_effective_target_arm32] } { 
+	     || [check_effective_target_arm_vect_no_misalign] } { 
 	    set et_vect_no_align_saved 1
 	}
     }
@@ -2527,6 +2539,25 @@ proc check_effective_target_vector_alignment_reachable_for_64bit { } {
     return $et_vector_alignment_reachable_for_64bit_saved
 }
 
+# Return 1 if the target only requires element alignment for vector accesses
+
+proc check_effective_target_vect_element_align { } {
+    global et_vect_element_align
+
+    if [info exists et_vect_element_align] {
+	verbose "check_effective_target_vect_element_align: using cached result" 2
+    } else {
+	set et_vect_element_align 0
+	if { [istarget arm*-*-*]
+	     || [check_effective_target_vect_hw_misalign] } {
+	   set et_vect_element_align 1
+	}
+    }
+
+    verbose "check_effective_target_vect_element_align: returning $et_vect_element_align" 2
+    return $et_vect_element_align
+}
+
 # Return 1 if the target supports vector conditional operations, 0 otherwise.
 
 proc check_effective_target_vect_condition { } {
@@ -3084,6 +3115,16 @@ proc add_options_for_bind_pic_locally { flags } {
     return $flags
 }
 
+# Add to FLAGS the flags needed to enable 128-bit vectors.
+
+proc add_options_for_quad_vectors { flags } {
+    if [is-effective-target arm_neon_ok] {
+	return "$flags -mvectorize-with-neon-quad"
+    }
+
+    return $flags
+}
+
 # Return 1 if the target provides a full C99 runtime.
 
 proc check_effective_target_c99_runtime { } {
author	dyuste <dyuste@138bc75d-0d04-0410-961f-82ee72b054a4>	2010-09-21 09:13:34 +0000
committer	dyuste <dyuste@138bc75d-0d04-0410-961f-82ee72b054a4>	2010-09-21 09:13:34 +0000
commit	5cb58c2b8c38824c192f1c381a31ca4fed20b57d (patch)
tree	7d0466eb01245a79cacd05fd3622e799e69580de
parent	502a37341441ecd0e45097df1845bbc7610934ae (diff)