aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBill Schmidt <wschmidt@linux.vnet.ibm.com>2015-04-10 22:06:46 +0000
committerSharad Singhai <singhai@google.com>2015-04-10 22:06:46 +0000
commit9bff13466c28d597fc55236626ab6f3f09e58b0f (patch)
treea07f56000146aade9d786896494d2e3c60687f29
parent5314e561e12c664564a05f8acd8f805ba599a907 (diff)
2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
Backport of r214242, r214254, and bug fix patches from mainline * config/rs6000/rs6000.c (context.h): New #include. (tree-pass.h): Likewise. (make_pass_analyze_swaps): New declaration. (rs6000_option_override): Register swap-optimization pass. (swap_web_entry): New class. (special_handling_values): New enum. (union_defs): New function. (union_uses): Likewise. (insn_is_load_p): Likewise. (insn_is_store_p): Likewise. (insn_is_swap_p): Likewise. (rtx_is_swappable_p): Likewise. (insn_is_swappable_p): Likewise. (chain_purpose): New enum. (chain_contains_only_swaps): New function. (mark_swaps_for_removal): Likewise. (swap_const_vector_halves): Likewise. (adjust_subreg_index): Likewise. (permute_load): Likewise. (permute_store): Likewise. (adjust_extract): Likewise. (adjust_splat): Likewise. (handle_special_swappables): Likewise. (replace_swap_with_copy): Likewise. (dump_swap_insn_table): Likewise. (rs6000_analyze_swaps): Likewise. (pass_data_analyze_swaps): New pass_data. (pass_analyze_swaps): New class. (pass_analyze_swaps::gate): New method. (pass_analyze_swaps::execute): New method. (make_pass_analyze_swaps): New function. * config/rs6000/rs6000.opt (moptimize-swaps): New option. * df.h (web_entry_base): New class, replacing struct web_entry. (web_entry_base::pred): New method. (web_entry_base::set_pred): Likewise. (web_entry_base::unionfind_root): Likewise. (web_entry_base::unionfind_union): Likewise. (unionfind_root): Delete external reference. (unionfind_union): Likewise. (union_defs): Likewise. * web.c (web_entry_base::unionfind_root): Convert to method. (web_entry_base::unionfind_union): Likewise. (web_entry): New class. (union_match_dups): Convert to use class structure. (union_defs): Likewise. (entry_register): Likewise. (web_main): Likewise. git-svn-id: https://gcc.gnu.org/svn/gcc/branches/google/gcc-4_9@221994 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/ChangeLog51
-rw-r--r--gcc/config/rs6000/rs6000.c1194
-rw-r--r--gcc/config/rs6000/rs6000.opt4
-rw-r--r--gcc/df.h28
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c35
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c42
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c53
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c56
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c54
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c43
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c51
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c57
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c15
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c41
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c43
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c45
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c45
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c32
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c38
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c40
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c42
-rw-r--r--gcc/web.c65
22 files changed, 2033 insertions, 41 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3d828baa47e..629f3e77c83 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,54 @@
+2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ Backport of r214242, r214254, and bug fix patches from mainline
+ * config/rs6000/rs6000.c (context.h): New #include.
+ (tree-pass.h): Likewise.
+ (make_pass_analyze_swaps): New declaration.
+ (rs6000_option_override): Register swap-optimization pass.
+ (swap_web_entry): New class.
+ (special_handling_values): New enum.
+ (union_defs): New function.
+ (union_uses): Likewise.
+ (insn_is_load_p): Likewise.
+ (insn_is_store_p): Likewise.
+ (insn_is_swap_p): Likewise.
+ (rtx_is_swappable_p): Likewise.
+ (insn_is_swappable_p): Likewise.
+ (chain_purpose): New enum.
+ (chain_contains_only_swaps): New function.
+ (mark_swaps_for_removal): Likewise.
+ (swap_const_vector_halves): Likewise.
+ (adjust_subreg_index): Likewise.
+ (permute_load): Likewise.
+ (permute_store): Likewise.
+ (adjust_extract): Likewise.
+ (adjust_splat): Likewise.
+ (handle_special_swappables): Likewise.
+ (replace_swap_with_copy): Likewise.
+ (dump_swap_insn_table): Likewise.
+ (rs6000_analyze_swaps): Likewise.
+ (pass_data_analyze_swaps): New pass_data.
+ (pass_analyze_swaps): New class.
+ (pass_analyze_swaps::gate): New method.
+ (pass_analyze_swaps::execute): New method.
+ (make_pass_analyze_swaps): New function.
+ * config/rs6000/rs6000.opt (moptimize-swaps): New option.
+ * df.h (web_entry_base): New class, replacing struct web_entry.
+ (web_entry_base::pred): New method.
+ (web_entry_base::set_pred): Likewise.
+ (web_entry_base::unionfind_root): Likewise.
+ (web_entry_base::unionfind_union): Likewise.
+ (unionfind_root): Delete external reference.
+ (unionfind_union): Likewise.
+ (union_defs): Likewise.
+ * web.c (web_entry_base::unionfind_root): Convert to method.
+ (web_entry_base::unionfind_union): Likewise.
+ (web_entry): New class.
+ (union_match_dups): Convert to use class structure.
+ (union_defs): Likewise.
+ (entry_register): Likewise.
+ (web_main): Likewise.
+
2015-01-23 Jakub Jelinek <jakub@redhat.com>
PR middle-end/64734
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 9b3022d60a0..b6e64d15594 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -80,6 +80,8 @@
#include "cgraph.h"
#include "target-globals.h"
#include "real.h"
+#include "context.h"
+#include "tree-pass.h"
#if TARGET_XCOFF
#include "xcoffout.h" /* get declarations of xcoff_*_section_name */
#endif
@@ -1172,6 +1174,7 @@ static bool rs6000_secondary_reload_move (enum rs6000_reg_type,
enum machine_mode,
secondary_reload_info *,
bool);
+rtl_opt_pass *make_pass_analyze_swaps (gcc::context*);
/* Hash table stuff for keeping track of TOC entries. */
@@ -4085,6 +4088,15 @@ static void
rs6000_option_override (void)
{
(void) rs6000_option_override_internal (true);
+
+ /* Register machine-specific passes. This needs to be done at start-up.
+ It's convenient to do it here (like i386 does). */
+ opt_pass *pass_analyze_swaps = make_pass_analyze_swaps (g);
+
+ static struct register_pass_info analyze_swaps_info
+ = { pass_analyze_swaps, "cse1", 1, PASS_POS_INSERT_BEFORE };
+
+ register_pass (&analyze_swaps_info);
}
@@ -33096,7 +33108,1189 @@ emit_fusion_gpr_load (rtx target, rtx mem)
return "";
}
+
+/* Analyze vector computations and remove unnecessary doubleword
+ swaps (xxswapdi instructions). This pass is performed only
+ for little-endian VSX code generation.
+
+ For this specific case, loads and stores of 4x32 and 2x64 vectors
+ are inefficient. These are implemented using the lvx2dx and
+ stvx2dx instructions, which invert the order of doublewords in
+ a vector register. Thus the code generation inserts an xxswapdi
+ after each such load, and prior to each such store. (For spill
+ code after register assignment, an additional xxswapdi is inserted
+ following each store in order to return a hard register to its
+ unpermuted value.)
+
+ The extra xxswapdi instructions reduce performance. This can be
+ particularly bad for vectorized code. The purpose of this pass
+ is to reduce the number of xxswapdi instructions required for
+ correctness.
+
+ The primary insight is that much code that operates on vectors
+ does not care about the relative order of elements in a register,
+ so long as the correct memory order is preserved. If we have
+ a computation where all input values are provided by lvxd2x/xxswapdi
+ sequences, all outputs are stored using xxswapdi/stvxd2x sequences,
+ and all intermediate computations are pure SIMD (independent of
+ element order), then all the xxswapdi's associated with the loads
+ and stores may be removed.
+
+ This pass uses some of the infrastructure and logical ideas from
+ the "web" pass in web.c. We create maximal webs of computations
+ fitting the description above using union-find. Each such web is
+ then optimized by removing its unnecessary xxswapdi instructions.
+
+ The pass is placed prior to global optimization so that we can
+ perform the optimization in the safest and simplest way possible;
+ that is, by replacing each xxswapdi insn with a register copy insn.
+ Subsequent forward propagation will remove copies where possible.
+
+ There are some operations sensitive to element order for which we
+ can still allow the operation, provided we modify those operations.
+ These include CONST_VECTORs, for which we must swap the first and
+ second halves of the constant vector; and SUBREGs, for which we
+ must adjust the byte offset to account for the swapped doublewords.
+ A remaining opportunity would be non-immediate-form splats, for
+ which we should adjust the selected lane of the input. We should
+ also make code generation adjustments for sum-across operations,
+ since this is a common vectorizer reduction.
+
+ Because we run prior to the first split, we can see loads and stores
+ here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla
+ vector loads and stores that have not yet been split into a permuting
+ load/store and a swap. (One way this can happen is with a builtin
+ call to vec_vsx_{ld,st}.) We can handle these as well, but rather
+ than deleting a swap, we convert the load/store into a permuting
+ load/store (which effectively removes the swap). */
+
+/* Notes on Permutes
+
+ We do not currently handle computations that contain permutes. There
+ is a general transformation that can be performed correctly, but it
+ may introduce more expensive code than it replaces. To handle these
+ would require a cost model to determine when to perform the optimization.
+ This commentary records how this could be done if desired.
+
+ The most general permute is something like this (example for V16QI):
+
+ (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
+ (parallel [(const_int a0) (const_int a1)
+ ...
+ (const_int a14) (const_int a15)]))
+
+ where a0,...,a15 are in [0,31] and select elements from op1 and op2
+ to produce in the result.
+
+ Regardless of mode, we can convert the PARALLEL to a mask of 16
+ byte-element selectors. Let's call this M, with M[i] representing
+ the ith byte-element selector value. Then if we swap doublewords
+ throughout the computation, we can get correct behavior by replacing
+ M with M' as follows:
+
+ { M[i+8]+8 : i < 8, M[i+8] in [0,7] U [16,23]
+ M'[i] = { M[i+8]-8 : i < 8, M[i+8] in [8,15] U [24,31]
+ { M[i-8]+8 : i >= 8, M[i-8] in [0,7] U [16,23]
+ { M[i-8]-8 : i >= 8, M[i-8] in [8,15] U [24,31]
+
+ This seems promising at first, since we are just replacing one mask
+ with another. But certain masks are preferable to others. If M
+ is a mask that matches a vmrghh pattern, for example, M' certainly
+ will not. Instead of a single vmrghh, we would generate a load of
+ M' and a vperm. So we would need to know how many xxswapd's we can
+ remove as a result of this transformation to determine if it's
+ profitable; and preferably the logic would need to be aware of all
+ the special preferable masks.
+
+ Another form of permute is an UNSPEC_VPERM, in which the mask is
+ already in a register. In some cases, this mask may be a constant
+ that we can discover with ud-chains, in which case the above
+ transformation is ok. However, the common usage here is for the
+ mask to be produced by an UNSPEC_LVSL, in which case the mask
+ cannot be known at compile time. In such a case we would have to
+ generate several instructions to compute M' as above at run time,
+ and a cost model is needed again. */
+
+/* This is based on the union-find logic in web.c. web_entry_base is
+ defined in df.h. */
+class swap_web_entry : public web_entry_base
+{
+ public:
+ /* Pointer to the insn. */
+ rtx insn;
+ /* Set if insn contains a mention of a vector register. All other
+ fields are undefined if this field is unset. */
+ unsigned int is_relevant : 1;
+ /* Set if insn is a load. */
+ unsigned int is_load : 1;
+ /* Set if insn is a store. */
+ unsigned int is_store : 1;
+ /* Set if insn is a doubleword swap. This can either be a register swap
+ or a permuting load or store (test is_load and is_store for this). */
+ unsigned int is_swap : 1;
+ /* Set if the insn has a live-in use of a parameter register. */
+ unsigned int is_live_in : 1;
+ /* Set if the insn has a live-out def of a return register. */
+ unsigned int is_live_out : 1;
+ /* Set if the insn contains a subreg reference of a vector register. */
+ unsigned int contains_subreg : 1;
+ /* Set if the insn contains a 128-bit integer operand. */
+ unsigned int is_128_int : 1;
+ /* Set if this is a call-insn. */
+ unsigned int is_call : 1;
+ /* Set if this insn does not perform a vector operation for which
+ element order matters, or if we know how to fix it up if it does.
+ Undefined if is_swap is set. */
+ unsigned int is_swappable : 1;
+ /* A nonzero value indicates what kind of special handling for this
+ insn is required if doublewords are swapped. Undefined if
+ is_swappable is not set. */
+ unsigned int special_handling : 3;
+ /* Set if the web represented by this entry cannot be optimized. */
+ unsigned int web_not_optimizable : 1;
+ /* Set if this insn should be deleted. */
+ unsigned int will_delete : 1;
+};
+
+enum special_handling_values {
+ SH_NONE = 0,
+ SH_CONST_VECTOR,
+ SH_SUBREG,
+ SH_NOSWAP_LD,
+ SH_NOSWAP_ST,
+ SH_EXTRACT,
+ SH_SPLAT
+};
+
+/* Union INSN with all insns containing definitions that reach USE.
+ Detect whether USE is live-in to the current function. */
+static void
+union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
+{
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ if (!link)
+ insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+ while (link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+ if (DF_REF_INSN_INFO (link->ref))
+ {
+ rtx def_insn = DF_REF_INSN (link->ref);
+ (void)unionfind_union (insn_entry + INSN_UID (insn),
+ insn_entry + INSN_UID (def_insn));
+ }
+
+ link = link->next;
+ }
+}
+
+/* Union INSN with all insns containing uses reached from DEF.
+ Detect whether DEF is live-out from the current function. */
+static void
+union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
+{
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ if (!link)
+ insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+ while (link)
+ {
+ /* This could be an eh use or some other artificial use;
+ we treat these all the same (killing the optimization). */
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+ if (DF_REF_INSN_INFO (link->ref))
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ (void)unionfind_union (insn_entry + INSN_UID (insn),
+ insn_entry + INSN_UID (use_insn));
+ }
+
+ link = link->next;
+ }
+}
+
+/* Return 1 iff INSN is a load insn, including permuting loads that
+ represent an lvxd2x instruction; else return 0. */
+static unsigned int
+insn_is_load_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+
+ if (GET_CODE (body) == SET)
+ {
+ if (GET_CODE (SET_SRC (body)) == MEM)
+ return 1;
+
+ if (GET_CODE (SET_SRC (body)) == VEC_SELECT
+ && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
+ return 1;
+
+ return 0;
+ }
+
+ if (GET_CODE (body) != PARALLEL)
+ return 0;
+
+ rtx set = XVECEXP (body, 0, 0);
+
+ if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
+ return 1;
+
+ return 0;
+}
+
+/* Return 1 iff INSN is a store insn, including permuting stores that
+ represent an stvxd2x instruction; else return 0. */
+static unsigned int
+insn_is_store_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
+ return 1;
+ if (GET_CODE (body) != PARALLEL)
+ return 0;
+ rtx set = XVECEXP (body, 0, 0);
+ if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
+ return 1;
+ return 0;
+}
+
+/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap,
+ a permuting load, or a permuting store. */
+static unsigned int
+insn_is_swap_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ if (GET_CODE (body) != SET)
+ return 0;
+ rtx rhs = SET_SRC (body);
+ if (GET_CODE (rhs) != VEC_SELECT)
+ return 0;
+ rtx parallel = XEXP (rhs, 1);
+ if (GET_CODE (parallel) != PARALLEL)
+ return 0;
+ unsigned int len = XVECLEN (parallel, 0);
+ if (len != 2 && len != 4 && len != 8 && len != 16)
+ return 0;
+ for (unsigned int i = 0; i < len / 2; ++i)
+ {
+ rtx op = XVECEXP (parallel, 0, i);
+ if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
+ return 0;
+ }
+ for (unsigned int i = len / 2; i < len; ++i)
+ {
+ rtx op = XVECEXP (parallel, 0, i);
+ if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
+ return 0;
+ }
+ return 1;
+}
+
+/* Return 1 iff OP is an operand that will not be affected by having
+ vector doublewords swapped in memory. */
+static unsigned int
+rtx_is_swappable_p (rtx op, unsigned int *special)
+{
+ enum rtx_code code = GET_CODE (op);
+ int i, j;
+ rtx parallel;
+
+ switch (code)
+ {
+ case LABEL_REF:
+ case SYMBOL_REF:
+ case CLOBBER:
+ case REG:
+ return 1;
+
+ case VEC_CONCAT:
+ case ASM_INPUT:
+ case ASM_OPERANDS:
+ return 0;
+
+ case CONST_VECTOR:
+ {
+ *special = SH_CONST_VECTOR;
+ return 1;
+ }
+
+ case VEC_DUPLICATE:
+ /* Opportunity: If XEXP (op, 0) has the same mode as the result,
+ and XEXP (op, 1) is a PARALLEL with a single QImode const int,
+ it represents a vector splat for which we can do special
+ handling. */
+ if (GET_CODE (XEXP (op, 0)) == CONST_INT)
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+ /* This catches V2DF and V2DI splat, at a minimum. */
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+ /* If the duplicated item is from a select, defer to the select
+ processing to see if we can change the lane for the splat. */
+ return rtx_is_swappable_p (XEXP (op, 0), special);
+ else
+ return 0;
+
+ case VEC_SELECT:
+ /* A vec_extract operation is ok if we change the lane. */
+ if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+ && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+ && XVECLEN (parallel, 0) == 1
+ && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+ {
+ *special = SH_EXTRACT;
+ return 1;
+ }
+ else
+ return 0;
+
+ case UNSPEC:
+ {
+ /* Various operations are unsafe for this optimization, at least
+ without significant additional work. Permutes are obviously
+ problematic, as both the permute control vector and the ordering
+ of the target values are invalidated by doubleword swapping.
+ Vector pack and unpack modify the number of vector lanes.
+ Merge-high/low will not operate correctly on swapped operands.
+ Vector shifts across element boundaries are clearly uncool,
+ as are vector select and concatenate operations. Vector
+ sum-across instructions define one operand with a specific
+ order-dependent element, so additional fixup code would be
+ needed to make those work. Vector set and non-immediate-form
+ vector splat are element-order sensitive. A few of these
+ cases might be workable with special handling if required. */
+ int val = XINT (op, 1);
+ switch (val)
+ {
+ default:
+ break;
+ case UNSPEC_VMRGH_DIRECT:
+ case UNSPEC_VMRGL_DIRECT:
+ case UNSPEC_VPACK_SIGN_SIGN_SAT:
+ case UNSPEC_VPACK_SIGN_UNS_SAT:
+ case UNSPEC_VPACK_UNS_UNS_MOD:
+ case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
+ case UNSPEC_VPACK_UNS_UNS_SAT:
+ case UNSPEC_VPERM:
+ case UNSPEC_VPERM_UNS:
+ case UNSPEC_VPERMHI:
+ case UNSPEC_VPERMSI:
+ case UNSPEC_VPKPX:
+ case UNSPEC_VSLDOI:
+ case UNSPEC_VSLO:
+ case UNSPEC_VSRO:
+ case UNSPEC_VSUM2SWS:
+ case UNSPEC_VSUM4S:
+ case UNSPEC_VSUM4UBS:
+ case UNSPEC_VSUMSWS:
+ case UNSPEC_VSUMSWS_DIRECT:
+ case UNSPEC_VSX_CONCAT:
+ case UNSPEC_VSX_SET:
+ case UNSPEC_VSX_SLDWI:
+ case UNSPEC_VUNPACK_HI_SIGN:
+ case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
+ case UNSPEC_VUNPACK_LO_SIGN:
+ case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
+ case UNSPEC_VUPKHPX:
+ case UNSPEC_VUPKHS_V4SF:
+ case UNSPEC_VUPKHU_V4SF:
+ case UNSPEC_VUPKLPX:
+ case UNSPEC_VUPKLS_V4SF:
+ case UNSPEC_VUPKLU_V4SF:
+ /* The following could be handled as an idiom with XXSPLTW.
+ These place a scalar in BE element zero, but the XXSPLTW
+ will currently expect it in BE element 2 in a swapped
+ region. When one of these feeds an XXSPLTW with no other
+ defs/uses either way, we can avoid the lane change for
+ XXSPLTW and things will be correct. TBD. */
+ case UNSPEC_VSX_CVDPSPN:
+ case UNSPEC_VSX_CVSPDP:
+ case UNSPEC_VSX_CVSPDPN:
+ return 0;
+ case UNSPEC_VSPLT_DIRECT:
+ *special = SH_SPLAT;
+ return 1;
+ }
+ }
+
+ default:
+ break;
+ }
+
+ const char *fmt = GET_RTX_FORMAT (code);
+ int ok = 1;
+
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ {
+ unsigned int special_op = SH_NONE;
+ ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
+ /* Ensure we never have two kinds of special handling
+ for the same insn. */
+ if (*special != SH_NONE && special_op != SH_NONE
+ && *special != special_op)
+ return 0;
+ *special = special_op;
+ }
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ {
+ unsigned int special_op = SH_NONE;
+ ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
+ /* Ensure we never have two kinds of special handling
+ for the same insn. */
+ if (*special != SH_NONE && special_op != SH_NONE
+ && *special != special_op)
+ return 0;
+ *special = special_op;
+ }
+
+ return ok;
+}
+
+/* Return 1 iff INSN is an operand that will not be affected by
+ having vector doublewords swapped in memory (in which case
+ *SPECIAL is unchanged), or that can be modified to be correct
+ if vector doublewords are swapped in memory (in which case
+ *SPECIAL is changed to a value indicating how). */
+static unsigned int
+insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
+ unsigned int *special)
+{
+ /* Calls are always bad. */
+ if (GET_CODE (insn) == CALL_INSN)
+ return 0;
+
+ /* Loads and stores seen here are not permuting, but we can still
+ fix them up by converting them to permuting ones. Exceptions:
+ UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
+ body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
+ for the SET source. */
+ rtx body = PATTERN (insn);
+ int i = INSN_UID (insn);
+
+ if (insn_entry[i].is_load)
+ {
+ if (GET_CODE (body) == SET)
+ {
+ *special = SH_NOSWAP_LD;
+ return 1;
+ }
+ else
+ return 0;
+ }
+
+ if (insn_entry[i].is_store)
+ {
+ if (GET_CODE (body) == SET && GET_CODE (SET_SRC (body)) != UNSPEC)
+ {
+ *special = SH_NOSWAP_ST;
+ return 1;
+ }
+ else
+ return 0;
+ }
+
+ /* Otherwise check the operands for vector lane violations. */
+ return rtx_is_swappable_p (body, special);
+}
+
+enum chain_purpose { FOR_LOADS, FOR_STORES };
+
+/* Return true if the UD or DU chain headed by LINK is non-empty,
+ and every entry on the chain references an insn that is a
+ register swap. Furthermore, if PURPOSE is FOR_LOADS, each such
+ register swap must have only permuting loads as reaching defs.
+ If PURPOSE is FOR_STORES, each such register swap must have only
+ register swaps or permuting stores as reached uses. */
+static bool
+chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
+ enum chain_purpose purpose)
+{
+ if (!link)
+ return false;
+
+ for (; link; link = link->next)
+ {
+ if (!VECTOR_MODE_P (GET_MODE (DF_REF_REG (link->ref))))
+ continue;
+
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx reached_insn = DF_REF_INSN (link->ref);
+ unsigned uid = INSN_UID (reached_insn);
+
+ if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
+ || insn_entry[uid].is_store)
+ return false;
+
+ if (purpose == FOR_LOADS)
+ {
+ df_ref *use_rec;
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref use = *use_rec;
+ struct df_link *swap_link = DF_REF_CHAIN (use);
+
+ while (swap_link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
+ unsigned uid2 = INSN_UID (swap_def_insn);
+
+ /* Only permuting loads are allowed. */
+ if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
+ return false;
+
+ swap_link = swap_link->next;
+ }
+ }
+ }
+ else if (purpose == FOR_STORES)
+ {
+ df_ref *def_rec;
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ struct df_link *swap_link = DF_REF_CHAIN (def);
+
+ while (swap_link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
+ unsigned uid2 = INSN_UID (swap_use_insn);
+
+ /* Permuting stores or register swaps are allowed. */
+ if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
+ return false;
+
+ swap_link = swap_link->next;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/* Mark the xxswapdi instructions associated with permuting loads and
+ stores for removal. Note that we only flag them for deletion here,
+ as there is a possibility of a swap being reached from multiple
+ loads, etc. */
+static void
+mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
+{
+ rtx insn = insn_entry[i].insn;
+ unsigned uid = INSN_UID (insn);
+
+ if (insn_entry[i].is_load)
+ {
+ df_ref *def_rec;
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ /* We know by now that these are swaps, so we can delete
+ them confidently. */
+ while (link)
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ insn_entry[INSN_UID (use_insn)].will_delete = 1;
+ link = link->next;
+ }
+ }
+ }
+ else if (insn_entry[i].is_store)
+ {
+ df_ref *use_rec;
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref use = *use_rec;
+ /* Ignore uses for addressability. */
+ machine_mode mode = GET_MODE (DF_REF_REG (use));
+ if (!VECTOR_MODE_P (mode))
+ continue;
+
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ /* We know by now that these are swaps, so we can delete
+ them confidently. */
+ while (link)
+ {
+ rtx def_insn = DF_REF_INSN (link->ref);
+ insn_entry[INSN_UID (def_insn)].will_delete = 1;
+ link = link->next;
+ }
+ }
+ }
+}
+
+/* OP is either a CONST_VECTOR or an expression containing one.
+ Swap the first half of the vector with the second in the first
+ case. Recurse to find it in the second. */
+static void
+swap_const_vector_halves (rtx op)
+{
+ int i;
+ enum rtx_code code = GET_CODE (op);
+ if (GET_CODE (op) == CONST_VECTOR)
+ {
+ int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
+ for (i = 0; i < half_units; ++i)
+ {
+ rtx temp = CONST_VECTOR_ELT (op, i);
+ CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
+ CONST_VECTOR_ELT (op, i + half_units) = temp;
+ }
+ }
+ else
+ {
+ int j;
+ const char *fmt = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ swap_const_vector_halves (XEXP (op, i));
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ swap_const_vector_halves (XVECEXP (op, i, j));
+ }
+}
+
+/* Find all subregs of a vector expression that perform a narrowing,
+ and adjust the subreg index to account for doubleword swapping. */
+static void
+adjust_subreg_index (rtx op)
+{
+ enum rtx_code code = GET_CODE (op);
+ if (code == SUBREG
+ && (GET_MODE_SIZE (GET_MODE (op))
+ < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
+ {
+ unsigned int index = SUBREG_BYTE (op);
+ if (index < 8)
+ index += 8;
+ else
+ index -= 8;
+ SUBREG_BYTE (op) = index;
+ }
+
+ const char *fmt = GET_RTX_FORMAT (code);
+ int i,j;
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ adjust_subreg_index (XEXP (op, i));
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ adjust_subreg_index (XVECEXP (op, i, j));
+}
+/* Convert the non-permuting load INSN to a permuting one. */
+static void
+permute_load (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ rtx mem_op = SET_SRC (body);
+ rtx tgt_reg = SET_DEST (body);
+ machine_mode mode = GET_MODE (tgt_reg);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int half_elts = n_elts / 2;
+ rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+ int i, j;
+ for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
+ SET_SRC (body) = sel;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Replacing load %d with permuted load\n",
+ INSN_UID (insn));
+}
+
+/* Convert the non-permuting store INSN to a permuting one. */
+static void
+permute_store (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ rtx src_reg = SET_SRC (body);
+ machine_mode mode = GET_MODE (src_reg);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int half_elts = n_elts / 2;
+ rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+ int i, j;
+ for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
+ SET_SRC (body) = sel;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Replacing store %d with permuted store\n",
+ INSN_UID (insn));
+}
+
+/* Given OP that contains a vector extract operation, adjust the index
+ of the extracted lane to account for the doubleword swap. */
+static void
+adjust_extract (rtx insn)
+{
+ rtx src = SET_SRC (PATTERN (insn));
+ /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+ account for that. */
+ rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
+ rtx par = XEXP (sel, 1);
+ int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
+ int lane = INTVAL (XVECEXP (par, 0, 0));
+ lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+ XVECEXP (par, 0, 0) = GEN_INT (lane);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
+/* Given OP that contains a vector direct-splat operation, adjust the index
+ of the source lane to account for the doubleword swap. */
+static void
+adjust_splat (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ rtx unspec = XEXP (body, 1);
+ int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
+ int lane = INTVAL (XVECEXP (unspec, 0, 1));
+ lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+ XVECEXP (unspec, 0, 1) = GEN_INT (lane);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
+}
+
+/* The insn described by INSN_ENTRY[I] can be swapped, but only
+ with special handling. Take care of that here. */
+static void
+handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
+{
+ rtx insn = insn_entry[i].insn;
+ rtx body = PATTERN (insn);
+
+ switch (insn_entry[i].special_handling)
+ {
+ default:
+ gcc_unreachable ();
+ case SH_CONST_VECTOR:
+ {
+ /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */
+ gcc_assert (GET_CODE (body) == SET);
+ rtx rhs = SET_SRC (body);
+ swap_const_vector_halves (rhs);
+ if (dump_file)
+ fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
+ break;
+ }
+ case SH_SUBREG:
+ /* A subreg of the same size is already safe. For subregs that
+ select a smaller portion of a reg, adjust the index for
+ swapped doublewords. */
+ adjust_subreg_index (body);
+ if (dump_file)
+ fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
+ break;
+ case SH_NOSWAP_LD:
+ /* Convert a non-permuting load to a permuting one. */
+ permute_load (insn);
+ break;
+ case SH_NOSWAP_ST:
+ /* Convert a non-permuting store to a permuting one. */
+ permute_store (insn);
+ break;
+ case SH_EXTRACT:
+ /* Change the lane on an extract operation. */
+ adjust_extract (insn);
+ break;
+ case SH_SPLAT:
+ /* Change the lane on a direct-splat operation. */
+ adjust_splat (insn);
+ break;
+ }
+}
+
+/* Find the insn from the Ith table entry, which is known to be a
+ register swap Y = SWAP(X). Replace it with a copy Y = X. */
+static void
+replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
+{
+ rtx insn = insn_entry[i].insn;
+ rtx body = PATTERN (insn);
+ rtx src_reg = XEXP (SET_SRC (body), 0);
+ rtx copy = gen_rtx_SET (VOIDmode, SET_DEST (body), src_reg);
+ rtx new_insn = emit_insn_before (copy, insn);
+ set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+ df_insn_rescan (new_insn);
+
+ if (dump_file)
+ {
+ unsigned int new_uid = INSN_UID (new_insn);
+ fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
+ }
+
+ df_insn_delete (insn);
+ remove_insn (insn);
+ INSN_DELETED_P (insn) = 1;
+}
+
+/* Dump the swap table to DUMP_FILE. */
+static void
+dump_swap_insn_table (swap_web_entry *insn_entry)
+{
+ int e = get_max_uid ();
+ fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
+
+ for (int i = 0; i < e; ++i)
+ if (insn_entry[i].is_relevant)
+ {
+ swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
+ fprintf (dump_file, "%6d %6d ", i,
+ pred_entry && pred_entry->insn
+ ? INSN_UID (pred_entry->insn) : 0);
+ if (insn_entry[i].is_load)
+ fputs ("load ", dump_file);
+ if (insn_entry[i].is_store)
+ fputs ("store ", dump_file);
+ if (insn_entry[i].is_swap)
+ fputs ("swap ", dump_file);
+ if (insn_entry[i].is_live_in)
+ fputs ("live-in ", dump_file);
+ if (insn_entry[i].is_live_out)
+ fputs ("live-out ", dump_file);
+ if (insn_entry[i].contains_subreg)
+ fputs ("subreg ", dump_file);
+ if (insn_entry[i].is_128_int)
+ fputs ("int128 ", dump_file);
+ if (insn_entry[i].is_call)
+ fputs ("call ", dump_file);
+ if (insn_entry[i].is_swappable)
+ {
+ fputs ("swappable ", dump_file);
+ if (insn_entry[i].special_handling == SH_CONST_VECTOR)
+ fputs ("special:constvec ", dump_file);
+ else if (insn_entry[i].special_handling == SH_SUBREG)
+ fputs ("special:subreg ", dump_file);
+ else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
+ fputs ("special:load ", dump_file);
+ else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
+ fputs ("special:store ", dump_file);
+ else if (insn_entry[i].special_handling == SH_EXTRACT)
+ fputs ("special:extract ", dump_file);
+ else if (insn_entry[i].special_handling == SH_SPLAT)
+ fputs ("special:splat ", dump_file);
+ }
+ if (insn_entry[i].web_not_optimizable)
+ fputs ("unoptimizable ", dump_file);
+ if (insn_entry[i].will_delete)
+ fputs ("delete ", dump_file);
+ fputs ("\n", dump_file);
+ }
+ fputs ("\n", dump_file);
+}
+
+/* Main entry point for this pass. */
+unsigned int
+rs6000_analyze_swaps (function *fun)
+{
+ swap_web_entry *insn_entry;
+ basic_block bb;
+ rtx insn;
+
+ /* Dataflow analysis for use-def chains. */
+ df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_analyze ();
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+
+ /* Allocate structure to represent webs of insns. */
+ insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
+
+ /* Walk the insns to gather basic data. */
+ FOR_ALL_BB_FN (bb, fun)
+ FOR_BB_INSNS (bb, insn)
+ {
+ unsigned int uid = INSN_UID (insn);
+ if (NONDEBUG_INSN_P (insn))
+ {
+ insn_entry[uid].insn = insn;
+
+ if (GET_CODE (insn) == CALL_INSN)
+ insn_entry[uid].is_call = 1;
+
+ /* Walk the uses and defs to see if we mention vector regs.
+ Record any constraints on optimization of such mentions. */
+ df_ref *use_rec;
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref mention = *use_rec;
+ /* We use DF_REF_REAL_REG here to get inside any subregs. */
+ machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+ /* If a use gets its value from a call insn, it will be
+ a hard register and will look like (reg:V4SI 3 3).
+ The df analysis creates two mentions for GPR3 and GPR4,
+ both DImode. We must recognize this and treat it as a
+ vector mention to ensure the call is unioned with this
+ use. */
+ if (mode == DImode && DF_REF_INSN_INFO (mention))
+ {
+ rtx feeder = DF_REF_INSN (mention);
+ /* FIXME: It is pretty hard to get from the df mention
+ to the mode of the use in the insn. We arbitrarily
+ pick a vector mode here, even though the use might
+ be a real DImode. We can be too conservative
+ (create a web larger than necessary) because of
+ this, so consider eventually fixing this. */
+ if (GET_CODE (feeder) == CALL_INSN)
+ mode = V4SImode;
+ }
+
+ if (VECTOR_MODE_P (mode) || mode == TImode)
+ {
+ insn_entry[uid].is_relevant = 1;
+ if (mode == TImode || mode == V1TImode)
+ insn_entry[uid].is_128_int = 1;
+ if (DF_REF_INSN_INFO (mention))
+ insn_entry[uid].contains_subreg
+ = !rtx_equal_p (DF_REF_REG (mention),
+ DF_REF_REAL_REG (mention));
+ union_defs (insn_entry, insn, mention);
+ }
+ }
+ df_ref *def_rec;
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref mention = *def_rec;
+ /* We use DF_REF_REAL_REG here to get inside any subregs. */
+ machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+ /* If we're loading up a hard vector register for a call,
+ it looks like (set (reg:V4SI 9 9) (...)). The df
+ analysis creates two mentions for GPR9 and GPR10, both
+ DImode. So relying on the mode from the mentions
+ isn't sufficient to ensure we union the call into the
+ web with the parameter setup code. */
+ if (mode == DImode && GET_CODE (insn) == SET
+ && VECTOR_MODE_P (GET_MODE (SET_DEST (insn))))
+ mode = GET_MODE (SET_DEST (insn));
+
+ if (VECTOR_MODE_P (mode) || mode == TImode)
+ {
+ insn_entry[uid].is_relevant = 1;
+ if (mode == TImode || mode == V1TImode)
+ insn_entry[uid].is_128_int = 1;
+ if (DF_REF_INSN_INFO (mention))
+ insn_entry[uid].contains_subreg
+ = !rtx_equal_p (DF_REF_REG (mention),
+ DF_REF_REAL_REG (mention));
+ /* REG_FUNCTION_VALUE_P is not valid for subregs. */
+ else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
+ insn_entry[uid].is_live_out = 1;
+ union_uses (insn_entry, insn, mention);
+ }
+ }
+
+ if (insn_entry[uid].is_relevant)
+ {
+ /* Determine if this is a load or store. */
+ insn_entry[uid].is_load = insn_is_load_p (insn);
+ insn_entry[uid].is_store = insn_is_store_p (insn);
+
+ /* Determine if this is a doubleword swap. If not,
+ determine whether it can legally be swapped. */
+ if (insn_is_swap_p (insn))
+ insn_entry[uid].is_swap = 1;
+ else
+ {
+ unsigned int special = SH_NONE;
+ insn_entry[uid].is_swappable
+ = insn_is_swappable_p (insn_entry, insn, &special);
+ if (special != SH_NONE && insn_entry[uid].contains_subreg)
+ insn_entry[uid].is_swappable = 0;
+ else if (special != SH_NONE)
+ insn_entry[uid].special_handling = special;
+ else if (insn_entry[uid].contains_subreg)
+ insn_entry[uid].special_handling = SH_SUBREG;
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nSwap insn entry table when first built\n");
+ dump_swap_insn_table (insn_entry);
+ }
+
+ /* Record unoptimizable webs. */
+ unsigned e = get_max_uid (), i;
+ for (i = 0; i < e; ++i)
+ {
+ if (!insn_entry[i].is_relevant)
+ continue;
+
+ swap_web_entry *root
+ = (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
+ unsigned uid = INSN_UID (insn_entry[i].insn);
+
+ if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
+ || (insn_entry[i].contains_subreg
+ && insn_entry[i].special_handling != SH_SUBREG)
+ || insn_entry[i].is_128_int || insn_entry[i].is_call
+ || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
+ root->web_not_optimizable = 1;
+
+ /* If we have loads or stores that aren't permuting then the
+ optimization isn't appropriate. */
+ else if ((insn_entry[i].is_load || insn_entry[i].is_store)
+ && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
+ root->web_not_optimizable = 1;
+
+ /* If we have permuting loads or stores that are not accompanied
+ by a register swap, the optimization isn't appropriate. */
+ else if (insn_entry[i].is_load && insn_entry[i].is_swap)
+ {
+ df_ref *def_rec;
+
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
+ {
+ root->web_not_optimizable = 1;
+ break;
+ }
+ }
+ }
+ else if (insn_entry[i].is_store && insn_entry[i].is_swap)
+ {
+ df_ref *use_rec;
+
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref use = *use_rec;
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
+ {
+ root->web_not_optimizable = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
+ dump_swap_insn_table (insn_entry);
+ }
+
+ /* For each load and store in an optimizable web (which implies
+ the loads and stores are permuting), find the associated
+ register swaps and mark them for removal. Due to various
+ optimizations we may mark the same swap more than once. Also
+ perform special handling for swappable insns that require it. */
+ for (i = 0; i < e; ++i)
+ if ((insn_entry[i].is_load || insn_entry[i].is_store)
+ && insn_entry[i].is_swap)
+ {
+ swap_web_entry* root_entry
+ = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+ if (!root_entry->web_not_optimizable)
+ mark_swaps_for_removal (insn_entry, i);
+ }
+ else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
+ {
+ swap_web_entry* root_entry
+ = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+ if (!root_entry->web_not_optimizable)
+ handle_special_swappables (insn_entry, i);
+ }
+
+ /* Now delete the swaps marked for removal. */
+ for (i = 0; i < e; ++i)
+ if (insn_entry[i].will_delete)
+ replace_swap_with_copy (insn_entry, i);
+
+ /* Clean up. */
+ free (insn_entry);
+ return 0;
+}
+
+const pass_data pass_data_analyze_swaps =
+{
+ RTL_PASS, /* type */
+ "swaps", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ true, /* has_gate */
+ true, /* has_execute */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_analyze_swaps : public rtl_opt_pass
+{
+public:
+ pass_analyze_swaps(gcc::context *ctxt)
+ : rtl_opt_pass(pass_data_analyze_swaps, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate ()
+ {
+ return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
+ && rs6000_optimize_swaps);
+ }
+
+ unsigned int execute ()
+ {
+ return rs6000_analyze_swaps (cfun);
+ }
+
+}; // class pass_analyze_swaps
+
+rtl_opt_pass *
+make_pass_analyze_swaps (gcc::context *ctxt)
+{
+ return new pass_analyze_swaps (ctxt);
+}
struct gcc_target targetm = TARGET_INITIALIZER;
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 4c1a02a524a..4d0d5e73db1 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -588,3 +588,7 @@ Allow double variables in upper registers with -mcpu=power7 or -mvsx
mupper-regs-sf
Target Undocumented Mask(UPPER_REGS_SF) Var(rs6000_isa_flags)
Allow float variables in upper registers with -mcpu=power8 or -mp8-vector
+
+moptimize-swaps
+Target Undocumented Var(rs6000_optimize_swaps) Init(1) Save
+Analyze and remove doubleword swaps from VSX computations.
diff --git a/gcc/df.h b/gcc/df.h
index 2de800c0938..3020955c198 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -1134,21 +1134,23 @@ df_get_artificial_uses (unsigned int bb_index)
/* web */
-/* This entry is allocated for each reference in the insn stream. */
-struct web_entry
+class web_entry_base
{
- /* Pointer to the parent in the union/find tree. */
- struct web_entry *pred;
- /* Newly assigned register to the entry. Set only for roots. */
- rtx reg;
- void* extra_info;
-};
+ private:
+ /* Reference to the parent in the union/find tree. */
+ web_entry_base *pred_pvt;
+
+ public:
+ /* Accessors. */
+ web_entry_base *pred () { return pred_pvt; }
+ void set_pred (web_entry_base *p) { pred_pvt = p; }
-extern struct web_entry *unionfind_root (struct web_entry *);
-extern bool unionfind_union (struct web_entry *, struct web_entry *);
-extern void union_defs (df_ref, struct web_entry *,
- unsigned int *used, struct web_entry *,
- bool (*fun) (struct web_entry *, struct web_entry *));
+ /* Find representative in union-find tree. */
+ web_entry_base *unionfind_root ();
+
+ /* Union with another set, returning TRUE if they are already unioned. */
+ friend bool unionfind_union (web_entry_base *first, web_entry_base *second);
+};
extern bool df_check_ud_du_memory_usage (void);
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c
new file mode 100644
index 00000000000..ab85e9160a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c
@@ -0,0 +1,35 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort();
+
+#define N 16
+
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[] __attribute__((aligned(16)))
+ = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7};
+signed char cc[] __attribute__((aligned(16)))
+ = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+int main ()
+{
+ signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
+ int i;
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != cd[i])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c
new file mode 100644
index 00000000000..170649df608
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c
@@ -0,0 +1,42 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c
new file mode 100644
index 00000000000..699b5baf404
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+int hey;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector int va, vb, vc, vd, tmp;
+ vector unsigned int threes = vec_splat_u32(3);
+ for (i = 0; i < N; i+=4) {
+ vb = vec_vsx_ld (0, &cb[i]);
+ vc = vec_vsx_ld (0, &cc[i]);
+ vd = vec_vsx_ld (0, &cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ hey = tmp[3];
+ vec_vsx_st (tmp, 0, &ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+ if (hey != ca[N-1])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c
new file mode 100644
index 00000000000..529d03e64c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c
@@ -0,0 +1,56 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include "altivec.h"
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+int hey;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector int va, vb, vc, vd, tmp;
+ vector unsigned int threes = vec_splat_u32(3);
+ for (i = 0; i < N; i+=4) {
+ vb = vec_vsx_ld (0, &cb[i]);
+ vc = vec_vsx_ld (0, &cc[i]);
+ vd = vec_vsx_ld (0, &cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ hey = tmp[3];
+ vec_vsx_st (tmp, 0, &ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+ if (hey != ca[N-1])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
new file mode 100644
index 00000000000..787b02e6427
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector long long va, vb, vc, vd, tmp;
+ volatile unsigned long long three = 3;
+ vector unsigned long long threes = vec_splats (three);
+ for (i = 0; i < N; i+=2) {
+ vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+ vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+ vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ x = vec_extract (tmp, 0);
+ vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+ if (x != ca[N-1])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
new file mode 100644
index 00000000000..7ca6ad5ccaf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "stxsdx" } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 1 } } */
+
+/* The only xxpermdi expected is for the vec_splats. */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector long long va, vb, vc, vd, tmp;
+ volatile unsigned long long three = 3;
+ vector unsigned long long threes = vec_splats (three);
+ for (i = 0; i < N; i+=2) {
+ vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+ vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+ vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ x = vec_extract (tmp, 0);
+ vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
new file mode 100644
index 00000000000..172e4bd4cb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
@@ -0,0 +1,51 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "xxspltw" } } */
+
+/* Currently the analyze_swaps phase cannot optimize this loop because
+ of the presence of an UNSPEC_VSX_CVDPSPN. At such time as this is
+ handled, we need to add a 'scan-assembler-not "xxpermdi"' directive to
+ this test. */
+#include <altivec.h>
+void abort();
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
+
+ cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]);
+
+ cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]);
+
+ cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c
new file mode 100644
index 00000000000..2b7f73c3715
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c
@@ -0,0 +1,57 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "vspltw" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include <altivec.h>
+void abort();
+
+typedef struct xx {vector double l; vector double h;} xx;
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector float brow;
+
+ for (i = 0; i < N; i++) {
+
+ brow = cb[i][0];
+ cc[i][0] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+ brow = cb[i][1];
+ cc[i][1] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+ brow = cb[i][2];
+ cc[i][2] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+ brow = cb[i][3];
+ cc[i][3] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c
new file mode 100644
index 00000000000..7a9cfbf954e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O1" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "xxpermdi" } } */
+
+/* Verify that we don't try to do permute removal in the presence of
+ vec_ste. This used to ICE. */
+#include <altivec.h>
+
+void f (void *p)
+{
+ vector unsigned int u32 = vec_vsx_ld (1, (const unsigned int *)p);
+ vec_ste (u32, 1, (unsigned int *)p);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c
new file mode 100644
index 00000000000..6ce041ab519
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c
@@ -0,0 +1,41 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 256
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = i - 128;
+ cc[i] = i/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != i - i/2 - 64)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c
new file mode 100644
index 00000000000..35dacd4b578
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 4096
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i, ii;
+ for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
+ cb[i] = ii - 128;
+ cc[i] = ii/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i, ii;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i) {
+ ii = i % 128;
+ if (ca[i] != ii - ii/2 - 64)
+ abort ();
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c
new file mode 100644
index 00000000000..61fe99b357b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = (cb[i] + cc[i]) * cd[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != -2 * i - 1955)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c
new file mode 100644
index 00000000000..b367fb6b514
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c
new file mode 100644
index 00000000000..f7084529ce8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c
@@ -0,0 +1,32 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort();
+
+#define N 16
+
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[] __attribute__((aligned(16)))
+ = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7};
+signed char cc[] __attribute__((aligned(16)))
+ = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+int main ()
+{
+ signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
+ int i;
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != cd[i])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c
new file mode 100644
index 00000000000..27a31b711ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 256
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = i - 128;
+ cc[i] = i/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != i - i/2 - 64)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c
new file mode 100644
index 00000000000..7264d2586b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c
@@ -0,0 +1,40 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 4096
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i, ii;
+ for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
+ cb[i] = ii - 128;
+ cc[i] = ii/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i, ii;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i) {
+ ii = i % 128;
+ if (ca[i] != ii - ii/2 - 64)
+ abort ();
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c
new file mode 100644
index 00000000000..cdca070e3d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c
@@ -0,0 +1,42 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = (cb[i] + cc[i]) * cd[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != -2 * i - 1955)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
+ abort ();
+ return 0;
+}
diff --git a/gcc/web.c b/gcc/web.c
index d67151c7139..7ee39a19737 100644
--- a/gcc/web.c
+++ b/gcc/web.c
@@ -53,17 +53,17 @@ along with GCC; see the file COPYING3. If not see
/* Find the root of unionfind tree (the representative of set). */
-struct web_entry *
-unionfind_root (struct web_entry *element)
+web_entry_base *
+web_entry_base::unionfind_root ()
{
- struct web_entry *element1 = element, *element2;
+ web_entry_base *element = this, *element1 = this, *element2;
- while (element->pred)
- element = element->pred;
- while (element1->pred)
+ while (element->pred ())
+ element = element->pred ();
+ while (element1->pred ())
{
- element2 = element1->pred;
- element1->pred = element;
+ element2 = element1->pred ();
+ element1->set_pred (element);
element1 = element2;
}
return element;
@@ -74,23 +74,32 @@ unionfind_root (struct web_entry *element)
nothing is done. Otherwise, return false. */
bool
-unionfind_union (struct web_entry *first, struct web_entry *second)
+unionfind_union (web_entry_base *first, web_entry_base *second)
{
- first = unionfind_root (first);
- second = unionfind_root (second);
+ first = first->unionfind_root ();
+ second = second->unionfind_root ();
if (first == second)
return true;
- second->pred = first;
+ second->set_pred (first);
return false;
}
+class web_entry : public web_entry_base
+{
+ private:
+ rtx reg_pvt;
+
+ public:
+ rtx reg () { return reg_pvt; }
+ void set_reg (rtx r) { reg_pvt = r; }
+};
+
/* For INSN, union all defs and uses that are linked by match_dup.
FUN is the function that does the union. */
static void
-union_match_dups (rtx insn, struct web_entry *def_entry,
- struct web_entry *use_entry,
- bool (*fun) (struct web_entry *, struct web_entry *))
+union_match_dups (rtx insn, web_entry *def_entry, web_entry *use_entry,
+ bool (*fun) (web_entry_base *, web_entry_base *))
{
struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
df_ref *use_link = DF_INSN_INFO_USES (insn_info);
@@ -167,9 +176,9 @@ union_match_dups (rtx insn, struct web_entry *def_entry,
the values 0 and 1 are reserved for use by entry_register. */
void
-union_defs (df_ref use, struct web_entry *def_entry,
- unsigned int *used, struct web_entry *use_entry,
- bool (*fun) (struct web_entry *, struct web_entry *))
+union_defs (df_ref use, web_entry *def_entry,
+ unsigned int *used, web_entry *use_entry,
+ bool (*fun) (web_entry_base *, web_entry_base *))
{
struct df_insn_info *insn_info = DF_REF_INSN_INFO (use);
struct df_link *link = DF_REF_CHAIN (use);
@@ -270,15 +279,15 @@ union_defs (df_ref use, struct web_entry *def_entry,
/* Find the corresponding register for the given entry. */
static rtx
-entry_register (struct web_entry *entry, df_ref ref, unsigned int *used)
+entry_register (web_entry *entry, df_ref ref, unsigned int *used)
{
- struct web_entry *root;
+ web_entry *root;
rtx reg, newreg;
/* Find the corresponding web and see if it has been visited. */
- root = unionfind_root (entry);
- if (root->reg)
- return root->reg;
+ root = (web_entry *)entry->unionfind_root ();
+ if (root->reg ())
+ return root->reg ();
/* We are seeing this web for the first time, do the assignment. */
reg = DF_REF_REAL_REG (ref);
@@ -302,7 +311,7 @@ entry_register (struct web_entry *entry, df_ref ref, unsigned int *used)
REGNO (newreg));
}
- root->reg = newreg;
+ root->set_reg (newreg);
return newreg;
}
@@ -336,8 +345,8 @@ gate_handle_web (void)
static unsigned int
web_main (void)
{
- struct web_entry *def_entry;
- struct web_entry *use_entry;
+ web_entry *def_entry;
+ web_entry *use_entry;
unsigned int max = max_reg_num ();
unsigned int *used;
basic_block bb;
@@ -374,9 +383,9 @@ web_main (void)
}
/* Record the number of uses and defs at the beginning of the optimization. */
- def_entry = XCNEWVEC (struct web_entry, DF_DEFS_TABLE_SIZE ());
+ def_entry = XCNEWVEC (web_entry, DF_DEFS_TABLE_SIZE ());
used = XCNEWVEC (unsigned, max);
- use_entry = XCNEWVEC (struct web_entry, uses_num);
+ use_entry = XCNEWVEC (web_entry, uses_num);
/* Produce the web. */
FOR_ALL_BB_FN (bb, cfun)