Diffstat (limited to 'gcc/tree-loop-distribution.c')
 gcc/tree-loop-distribution.c | 827
 1 file changed, 518 insertions(+), 309 deletions(-)
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 26b8b9a3751..5e835be779d 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -83,8 +83,8 @@ along with GCC; see the file COPYING3. If not see
loops and recover to the original one.
TODO:
- 1) We only distribute innermost loops now. This pass should handle loop
- nests in the future.
+ 1) We only distribute the innermost two-level loop nest for now. We
+ should extend it to arbitrary loop nests in the future.
2) We only fuse partitions in SCC now. A better fusion algorithm is
desired to minimize loop overhead, maximize parallelism and maximize
data reuse. */
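
Illustration (not part of the patch): a contiguous two-level nest of the kind
the extended pass targets, assuming the usual single-exit and known-niter
requirements hold, is

    int a[100][100];

    void
    clear (void)
    {
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 100; j++)
          a[i][j] = 0;
    }

which the nest-aware classification below can turn into a single builtin,
roughly memset (a, 0, sizeof (a)).
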
@@ -118,6 +118,11 @@ along with GCC; see the file COPYING3. If not see
#define MAX_DATAREFS_NUM \
((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
+/* Threshold controlling the number of distributed partitions. Since it
+ may become unnecessary once a memory stream cost model is introduced in
+ the future, we define it as a temporary macro rather than a parameter. */
+#define NUM_PARTITION_THRESHOLD (4)
+
/* Hashtable helpers. */
struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
@@ -588,27 +593,32 @@ enum partition_type {
PTYPE_SEQUENTIAL
};
+/* Builtin info for loop distribution. */
+struct builtin_info
+{
+ /* Data references a kind != PKIND_NORMAL partition is about. */
+ data_reference_p dst_dr;
+ data_reference_p src_dr;
+ /* Base address and size of memory objects operated on by the builtin.
+ Note both destination and source memory objects must have the same
+ size. */
+ tree dst_base;
+ tree src_base;
+ tree size;
+};
+
/* Partition for loop distribution. */
struct partition
{
/* Statements of the partition. */
bitmap stmts;
- /* Loops of the partition. */
- bitmap loops;
/* True if the partition defines variable which is used outside of loop. */
bool reduction_p;
- /* For builtin partition, true if it executes one iteration more than
- number of loop (latch) iterations. */
- bool plus_one;
enum partition_kind kind;
enum partition_type type;
- /* data-references a kind != PKIND_NORMAL partition is about. */
- data_reference_p main_dr;
- data_reference_p secondary_dr;
- /* Number of loop (latch) iterations. */
- tree niter;
/* Data references in the partition. */
bitmap datarefs;
+ /* Information about the builtin partition. */
+ struct builtin_info *builtin;
};
@@ -619,7 +629,6 @@ partition_alloc (void)
{
partition *partition = XCNEW (struct partition);
partition->stmts = BITMAP_ALLOC (NULL);
- partition->loops = BITMAP_ALLOC (NULL);
partition->reduction_p = false;
partition->kind = PKIND_NORMAL;
partition->datarefs = BITMAP_ALLOC (NULL);
@@ -632,8 +641,10 @@ static void
partition_free (partition *partition)
{
BITMAP_FREE (partition->stmts);
- BITMAP_FREE (partition->loops);
BITMAP_FREE (partition->datarefs);
+ if (partition->builtin)
+ free (partition->builtin);
+
free (partition);
}
@@ -718,9 +729,11 @@ ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
{
- gimple *use_stmt = USE_STMT (use_p);
- if (!is_gimple_debug (use_stmt)
- && loop != loop_containing_stmt (use_stmt))
+ if (is_gimple_debug (USE_STMT (use_p)))
+ continue;
+
+ basic_block use_bb = gimple_bb (USE_STMT (use_p));
+ if (!flow_bb_inside_loop_p (loop, use_bb))
return true;
}
@@ -834,6 +847,10 @@ generate_loops_for_partition (struct loop *loop, partition *partition,
for (i = 0; i < loop->num_nodes; i++)
{
basic_block bb = bbs[i];
+ edge inner_exit = NULL;
+
+ if (loop != bb->loop_father)
+ inner_exit = single_exit (bb->loop_father);
for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
{
@@ -852,11 +869,17 @@ generate_loops_for_partition (struct loop *loop, partition *partition,
&& !is_gimple_debug (stmt)
&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
{
- /* Choose an arbitrary path through the empty CFG part
- that this unnecessary control stmt controls. */
+ /* When distributing a loop nest, if bb is the inner loop's exit_bb,
+ we choose its exit edge/path in order to avoid generating an
+ infinite loop. In all other cases, we choose an arbitrary
+ path through the empty CFG part that this unnecessary
+ control stmt controls. */
if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
{
- gimple_cond_make_false (cond_stmt);
+ if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
+ gimple_cond_make_true (cond_stmt);
+ else
+ gimple_cond_make_false (cond_stmt);
update_stmt (stmt);
}
else if (gimple_code (stmt) == GIMPLE_SWITCH)
@@ -881,43 +904,6 @@ generate_loops_for_partition (struct loop *loop, partition *partition,
free (bbs);
}
-/* Build the size argument for a memory operation call. */
-
-static tree
-build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter,
- bool plus_one)
-{
- tree size = fold_convert_loc (loc, sizetype, nb_iter);
- if (plus_one)
- size = size_binop (PLUS_EXPR, size, size_one_node);
- size = fold_build2_loc (loc, MULT_EXPR, sizetype, size,
- TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
- size = fold_convert_loc (loc, size_type_node, size);
- return size;
-}
-
-/* Build an address argument for a memory operation call. */
-
-static tree
-build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
-{
- tree addr_base;
-
- addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
- addr_base = fold_convert_loc (loc, sizetype, addr_base);
-
- /* Test for a negative stride, iterating over every element. */
- if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
- {
- addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
- fold_convert_loc (loc, sizetype, nb_bytes));
- addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
- TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
- }
-
- return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
-}
-
/* If VAL memory representation contains the same value in all bytes,
return that value, otherwise return -1.
E.g. for 0x24242424 return 0x24, for IEEE double
@@ -982,27 +968,23 @@ static void
generate_memset_builtin (struct loop *loop, partition *partition)
{
gimple_stmt_iterator gsi;
- gimple *stmt, *fn_call;
tree mem, fn, nb_bytes;
- location_t loc;
tree val;
-
- stmt = DR_STMT (partition->main_dr);
- loc = gimple_location (stmt);
+ struct builtin_info *builtin = partition->builtin;
+ gimple *fn_call;
/* The new statements will be placed before LOOP. */
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
- nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
- partition->plus_one);
+ nb_bytes = builtin->size;
nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
false, GSI_CONTINUE_LINKING);
- mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
+ mem = builtin->dst_base;
mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
false, GSI_CONTINUE_LINKING);
/* This exactly matches the pattern recognition in classify_partition. */
- val = gimple_assign_rhs1 (stmt);
+ val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
/* Handle constants like 0x15151515 and similarly
floating point constants etc. where all bytes are the same. */
int bytev = const_with_all_bytes_same (val);
@@ -1038,23 +1020,19 @@ static void
generate_memcpy_builtin (struct loop *loop, partition *partition)
{
gimple_stmt_iterator gsi;
- gimple *stmt, *fn_call;
+ gimple *fn_call;
tree dest, src, fn, nb_bytes;
- location_t loc;
enum built_in_function kind;
-
- stmt = DR_STMT (partition->main_dr);
- loc = gimple_location (stmt);
+ struct builtin_info *builtin = partition->builtin;
/* The new statements will be placed before LOOP. */
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
- nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
- partition->plus_one);
+ nb_bytes = builtin->size;
nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
false, GSI_CONTINUE_LINKING);
- dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
- src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
+ dest = builtin->dst_base;
+ src = builtin->src_base;
if (partition->kind == PKIND_MEMCPY
|| ! ptr_derefs_may_alias_p (dest, src))
kind = BUILT_IN_MEMCPY;
@@ -1279,8 +1257,6 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)
FOR_EACH_VEC_ELT (nodes, i, x)
{
bitmap_set_bit (partition->stmts, x);
- bitmap_set_bit (partition->loops,
- loop_containing_stmt (RDG_STMT (rdg, x))->num);
for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
{
@@ -1307,69 +1283,22 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)
return partition;
}
-/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
- For the moment we detect memset, memcpy and memmove patterns. Bitmap
- STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
+/* Given PARTITION of RDG, record the single load/store data references
+ of a builtin partition in SRC_DR/DST_DR. Return false if there are no
+ such data references. */
-static void
-classify_partition (loop_p loop, struct graph *rdg, partition *partition,
- bitmap stmt_in_all_partitions)
+static bool
+find_single_drs (struct graph *rdg, partition *partition,
+ data_reference_p *dst_dr, data_reference_p *src_dr)
{
- bitmap_iterator bi;
unsigned i;
- tree nb_iter;
- data_reference_p single_load, single_store;
- bool volatiles_p = false, plus_one = false, has_reduction = false;
-
- partition->kind = PKIND_NORMAL;
- partition->main_dr = NULL;
- partition->secondary_dr = NULL;
- partition->niter = NULL_TREE;
- partition->plus_one = false;
-
- EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
- {
- gimple *stmt = RDG_STMT (rdg, i);
-
- if (gimple_has_volatile_ops (stmt))
- volatiles_p = true;
-
- /* If the stmt is not included by all partitions and there is uses
- outside of the loop, then mark the partition as reduction. */
- if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
- {
- /* Due to limitation in the transform phase we have to fuse all
- reduction partitions. As a result, this could cancel valid
- loop distribution especially for loop that induction variable
- is used outside of loop. To workaround this issue, we skip
- marking partition as reudction if the reduction stmt belongs
- to all partitions. In such case, reduction will be computed
- correctly no matter how partitions are fused/distributed. */
- if (!bitmap_bit_p (stmt_in_all_partitions, i))
- {
- partition->reduction_p = true;
- return;
- }
- has_reduction = true;
- }
- }
-
- /* Perform general partition disqualification for builtins. */
- if (volatiles_p
- /* Simple workaround to prevent classifying the partition as builtin
- if it contains any use outside of loop. */
- || has_reduction
- || !flag_tree_loop_distribute_patterns)
- return;
+ data_reference_p single_ld = NULL, single_st = NULL;
+ bitmap_iterator bi;
- /* Detect memset and memcpy. */
- single_load = NULL;
- single_store = NULL;
EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
{
gimple *stmt = RDG_STMT (rdg, i);
data_reference_p dr;
- unsigned j;
if (gimple_code (stmt) == GIMPLE_PHI)
continue;
@@ -1380,107 +1309,316 @@ classify_partition (loop_p loop, struct graph *rdg, partition *partition,
/* Otherwise just regular loads/stores. */
if (!gimple_assign_single_p (stmt))
- return;
+ return false;
/* But exactly one store and/or load. */
- for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
+ for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
{
tree type = TREE_TYPE (DR_REF (dr));
/* The memset, memcpy and memmove library calls are only
able to deal with generic address space. */
if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
- return;
+ return false;
if (DR_IS_READ (dr))
{
- if (single_load != NULL)
- return;
- single_load = dr;
+ if (single_ld != NULL)
+ return false;
+ single_ld = dr;
}
else
{
- if (single_store != NULL)
- return;
- single_store = dr;
+ if (single_st != NULL)
+ return false;
+ single_st = dr;
}
}
}
- if (!single_store)
- return;
+ if (!single_st)
+ return false;
- nb_iter = number_of_latch_executions (loop);
- gcc_assert (nb_iter && nb_iter != chrec_dont_know);
- if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
- gimple_bb (DR_STMT (single_store))))
- plus_one = true;
+ /* Bail out if this is a bitfield memory reference. */
+ if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
+ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
+ return false;
- if (single_store && !single_load)
- {
- gimple *stmt = DR_STMT (single_store);
- tree rhs = gimple_assign_rhs1 (stmt);
- if (const_with_all_bytes_same (rhs) == -1
- && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
- || (TYPE_MODE (TREE_TYPE (rhs))
- != TYPE_MODE (unsigned_char_type_node))))
- return;
- if (TREE_CODE (rhs) == SSA_NAME
- && !SSA_NAME_IS_DEFAULT_DEF (rhs)
- && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
- return;
- if (!adjacent_dr_p (single_store)
- || !dominated_by_p (CDI_DOMINATORS,
- loop->latch, gimple_bb (stmt)))
- return;
- partition->kind = PKIND_MEMSET;
- partition->main_dr = single_store;
- partition->niter = nb_iter;
- partition->plus_one = plus_one;
- }
- else if (single_store && single_load)
+ /* Data reference must be executed exactly once per iteration. */
+ basic_block bb_st = gimple_bb (DR_STMT (single_st));
+ struct loop *inner = bb_st->loop_father;
+ if (!dominated_by_p (CDI_DOMINATORS, inner->latch, bb_st))
+ return false;
+
+ if (single_ld)
{
- gimple *store = DR_STMT (single_store);
- gimple *load = DR_STMT (single_load);
+ gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
/* Direct aggregate copy or via an SSA name temporary. */
if (load != store
&& gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
- return;
- if (!adjacent_dr_p (single_store)
- || !adjacent_dr_p (single_load)
- || !operand_equal_p (DR_STEP (single_store),
- DR_STEP (single_load), 0)
- || !dominated_by_p (CDI_DOMINATORS,
- loop->latch, gimple_bb (store)))
- return;
- /* Now check that if there is a dependence this dependence is
- of a suitable form for memmove. */
- ddr_p ddr = get_data_dependence (rdg, single_load, single_store);
- if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
- return;
+ return false;
+
+ /* Bail out if this is a bitfield memory reference. */
+ if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
+ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
+ return false;
+
+ /* Load and store must be in the same loop nest. */
+ basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
+ if (inner != bb_ld->loop_father)
+ return false;
+
+ /* Data reference must be executed exactly once per iteration. */
+ if (!dominated_by_p (CDI_DOMINATORS, inner->latch, bb_ld))
+ return false;
+
+ edge e = single_exit (inner);
+ bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
+ bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
+ if (dom_ld != dom_st)
+ return false;
+ }
+
+ *src_dr = single_ld;
+ *dst_dr = single_st;
+ return true;
+}
+
+/* Given data reference DR in LOOP_NEST, this function checks the enclosing
+ loops from inner to outer to see whether the loop's step equals the access
+ size at each level of the nest. Return true if so; record the access base
+ and size in BASE and SIZE; save the loop's step at each level in STEPS if
+ it is not null. For example:
+
+ int arr[100][100][100];
+ for (i = 0; i < 100; i++) ;steps[2] = 40000
+ for (j = 100; j > 0; j--) ;steps[1] = -400
+ for (k = 0; k < 100; k++) ;steps[0] = 4
+ arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */
+
+static bool
+compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
+ tree *size, vec<tree> *steps = NULL)
+{
+ location_t loc = gimple_location (DR_STMT (dr));
+ basic_block bb = gimple_bb (DR_STMT (dr));
+ struct loop *loop = bb->loop_father;
+ tree ref = DR_REF (dr);
+ tree access_base = build_fold_addr_expr (ref);
+ tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+
+ do {
+ tree scev_fn = analyze_scalar_evolution (loop, access_base);
+ if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
+ return false;
+
+ access_base = CHREC_LEFT (scev_fn);
+ if (tree_contains_chrecs (access_base, NULL))
+ return false;
+
+ tree scev_step = CHREC_RIGHT (scev_fn);
+ /* Only support constant steps. */
+ if (TREE_CODE (scev_step) != INTEGER_CST)
+ return false;
+
+ enum ev_direction access_dir = scev_direction (scev_fn);
+ if (access_dir == EV_DIR_UNKNOWN)
+ return false;
+
+ if (steps != NULL)
+ steps->safe_push (scev_step);
+
+ scev_step = fold_convert_loc (loc, sizetype, scev_step);
+ /* Compute absolute value of scev step. */
+ if (access_dir == EV_DIR_DECREASES)
+ scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
- if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
+ /* At each level of the loop nest, the scev step must equal the access
+ size. In other words, DR must access consecutive memory between loop
+ iterations. */
+ if (!operand_equal_p (scev_step, access_size, 0))
+ return false;
+
+ /* Compute the number of times DR is executed in the loop. */
+ tree niters = number_of_latch_executions (loop);
+ niters = fold_convert_loc (loc, sizetype, niters);
+ if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
+ niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
+
+ /* Compute DR's overall access size in loop. */
+ access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
+ niters, scev_step);
+ /* Adjust base address in case of negative step. */
+ if (access_dir == EV_DIR_DECREASES)
{
- if (DDR_NUM_DIST_VECTS (ddr) == 0)
- return;
+ tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
+ scev_step, access_size);
+ access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
+ }
+ } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
+
+ *base = access_base;
+ *size = access_size;
+ return true;
+}
+
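Illustration (not part of the patch): for the arr[100][100][100] example in
the comment above, and assuming the store's block dominates each loop exit so
niters comes out to 99 latch executions plus one at every level, the walk
computes

    level k: |step| = 4     == access_size;  access_size = 100 * 4     = 400
    level j: |step| = 400   == access_size;  access_size = 100 * 400   = 40000;
             negative step, so the base is moved by 400 - 40000 = -39600 bytes,
             from &arr[i][99][0] back to &arr[i][0][0]
    level i: |step| = 40000 == access_size;  access_size = 100 * 40000 = 4000000

matching the base (&arr) and size (4000000) stated in the comment.
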
+/* Allocate and return a builtin_info struct. Record information like
+ DST_DR, SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
+
+static struct builtin_info *
+alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
+ tree dst_base, tree src_base, tree size)
+{
+ struct builtin_info *builtin = XNEW (struct builtin_info);
+ builtin->dst_dr = dst_dr;
+ builtin->src_dr = src_dr;
+ builtin->dst_base = dst_base;
+ builtin->src_base = src_base;
+ builtin->size = size;
+ return builtin;
+}
+
+/* Given data reference DR in loop nest LOOP, classify whether it forms a
+ builtin memset call. */
+
+static void
+classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
+{
+ gimple *stmt = DR_STMT (dr);
+ tree base, size, rhs = gimple_assign_rhs1 (stmt);
+
+ if (const_with_all_bytes_same (rhs) == -1
+ && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+ || (TYPE_MODE (TREE_TYPE (rhs))
+ != TYPE_MODE (unsigned_char_type_node))))
+ return;
+
+ if (TREE_CODE (rhs) == SSA_NAME
+ && !SSA_NAME_IS_DEFAULT_DEF (rhs)
+ && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
+ return;
+
+ if (!compute_access_range (loop, dr, &base, &size))
+ return;
+
+ partition->builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
+ partition->kind = PKIND_MEMSET;
+}
+
+/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
+ classify whether they form a builtin memcpy or memmove call. */
+
+static void
+classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
+ data_reference_p dst_dr, data_reference_p src_dr)
+{
+ tree base, size, src_base, src_size;
+ auto_vec<tree> dst_steps, src_steps;
+
+ /* Compute the access range of both load and store. They must have the
+ same access size. */
+ if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
+ || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
+ || !operand_equal_p (size, src_size, 0))
+ return;
+
+ /* Load and store in the loop nest must access memory in the same way,
+ i.e., they must have the same steps in each loop of the nest. */
+ if (dst_steps.length () != src_steps.length ())
+ return;
+ for (unsigned i = 0; i < dst_steps.length (); ++i)
+ if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
+ return;
+
+ /* Now check whether there is a dependence between the load and store. */
+ ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
+
+ /* Classify as memcpy if no dependence between load and store. */
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+ {
+ partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
+ partition->kind = PKIND_MEMCPY;
+ return;
+ }
+
+ /* Can't do memmove in case of an unknown dependence or a dependence
+ without a classical distance vector. */
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
+ || DDR_NUM_DIST_VECTS (ddr) == 0)
+ return;
+
+ unsigned i;
+ lambda_vector dist_v;
+ int num_lev = (DDR_LOOP_NEST (ddr)).length ();
+ FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+ {
+ unsigned dep_lev = dependence_level (dist_v, num_lev);
+ /* Can't do memmove if load depends on store. */
+ if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
+ return;
+ }
+
+ partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
+ partition->kind = PKIND_MEMMOVE;
+ return;
+}
+
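Illustration (not part of the patch): a sketch of two familiar copy loops and
how the dependence check above treats them, assuming int elements:

    void
    copy_down (int *a, int n)
    {
      for (int i = 0; i < n; i++)
        a[i] = a[i + 100];   /* Reads stay ahead of writes and no stored value
                                is re-read, so the loop can become memmove, or
                                memcpy when the regions are proven disjoint.  */
    }

    void
    propagate (int *a, int n)
    {
      for (int i = 0; i < n; i++)
        a[i + 1] = a[i];     /* Each iteration re-reads the previous store;
                                the distance check above rejects this and the
                                loop is kept as a loop.  */
    }
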
+/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
+ For the moment we detect memset, memcpy and memmove patterns. Bitmap
+ STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
+
+static void
+classify_partition (loop_p loop, struct graph *rdg, partition *partition,
+ bitmap stmt_in_all_partitions)
+{
+ bitmap_iterator bi;
+ unsigned i;
+ data_reference_p single_ld = NULL, single_st = NULL;
+ bool volatiles_p = false, has_reduction = false;
+
+ EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
+ {
+ gimple *stmt = RDG_STMT (rdg, i);
+
+ if (gimple_has_volatile_ops (stmt))
+ volatiles_p = true;
- lambda_vector dist_v;
- FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+ /* If the stmt is not included in all partitions and there are uses
+ outside of the loop, then mark the partition as a reduction. */
+ if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
+ {
+ /* Due to a limitation in the transform phase we have to fuse all
+ reduction partitions. As a result, this could cancel valid
+ loop distribution, especially for loops whose induction variable
+ is used outside of the loop. To work around this issue, we skip
+ marking the partition as a reduction if the reduction stmt belongs
+ to all partitions. In that case, the reduction will be computed
+ correctly no matter how partitions are fused/distributed. */
+ if (!bitmap_bit_p (stmt_in_all_partitions, i))
{
- int dist = dist_v[index_in_loop_nest (loop->num,
- DDR_LOOP_NEST (ddr))];
- if (dist > 0 && !DDR_REVERSED_P (ddr))
- return;
+ partition->reduction_p = true;
+ return;
}
- partition->kind = PKIND_MEMMOVE;
+ has_reduction = true;
}
- else
- partition->kind = PKIND_MEMCPY;
- partition->main_dr = single_store;
- partition->secondary_dr = single_load;
- partition->niter = nb_iter;
- partition->plus_one = plus_one;
}
+
+ /* Perform general partition disqualification for builtins. */
+ if (volatiles_p
+ /* Simple workaround to prevent classifying the partition as a builtin
+ if it contains any use outside of the loop. */
+ || has_reduction
+ || !flag_tree_loop_distribute_patterns)
+ return;
+
+ /* Find single load/store data references for builtin partition. */
+ if (!find_single_drs (rdg, partition, &single_st, &single_ld))
+ return;
+
+ /* Classify the builtin kind. */
+ if (single_ld == NULL)
+ classify_builtin_st (loop, partition, single_st);
+ else
+ classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
}
/* Returns true when PARTITION1 and PARTITION2 access the same memory
@@ -1939,7 +2077,8 @@ build_partition_graph (struct graph *rdg,
return pg;
}
-/* Sort partitions in PG by post order and store them in PARTITIONS. */
+/* Sort partitions in PG in descending post order and store them in
+ PARTITIONS. */
static void
sort_partitions_by_post_order (struct graph *pg,
@@ -1948,7 +2087,7 @@ sort_partitions_by_post_order (struct graph *pg,
int i;
struct pg_vdata *data;
- /* Now order the remaining nodes in postorder. */
+ /* Now order the remaining nodes in descending postorder. */
qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
partitions->truncate (0);
for (i = 0; i < pg->n_vertices; ++i)
@@ -1960,16 +2099,18 @@ sort_partitions_by_post_order (struct graph *pg,
}
/* Given reduced dependence graph RDG merge strong connected components
- of PARTITIONS. In this function, data dependence caused by possible
- alias between references is ignored, as if it doesn't exist at all. */
+ of PARTITIONS. If IGNORE_ALIAS_P is true, data dependences caused by
+ possible aliasing between references are ignored, as if they don't
+ exist at all; otherwise all dependences are considered. */
static void
merge_dep_scc_partitions (struct graph *rdg,
- vec<struct partition *> *partitions)
+ vec<struct partition *> *partitions,
+ bool ignore_alias_p)
{
struct partition *partition1, *partition2;
struct pg_vdata *data;
- graph *pg = build_partition_graph (rdg, partitions, true);
+ graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
int i, j, num_sccs = graphds_scc (pg, NULL);
/* Strong connected compoenent means dependence cycle, we cannot distribute
@@ -2044,7 +2185,7 @@ break_alias_scc_partitions (struct graph *rdg,
vec<struct partition *> *partitions,
vec<ddr_p> *alias_ddrs)
{
- int i, j, num_sccs, num_sccs_no_alias;
+ int i, j, k, num_sccs, num_sccs_no_alias;
/* Build partition dependence graph. */
graph *pg = build_partition_graph (rdg, partitions, false);
@@ -2061,7 +2202,7 @@ break_alias_scc_partitions (struct graph *rdg,
auto_vec<enum partition_type> scc_types;
struct partition *partition, *first;
- /* If all paritions in a SCC has the same type, we can simply merge the
+ /* If all partitions in a SCC have the same type, we can simply merge the
SCC. This loop finds out such SCCS and record them in bitmap. */
bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
for (i = 0; i < num_sccs; ++i)
@@ -2074,6 +2215,10 @@ break_alias_scc_partitions (struct graph *rdg,
if (pg->vertices[j].component != i)
continue;
+ /* Note we merge partitions of parallel type on purpose, even though
+ the resulting partition is sequential. The reason is that the
+ vectorizer can do a more accurate runtime alias check in this
+ case. It also results in more conservative distribution. */
if (first->type != partition->type)
{
bitmap_clear_bit (sccs_to_merge, i);
@@ -2095,7 +2240,7 @@ break_alias_scc_partitions (struct graph *rdg,
if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
{
/* Run SCC finding algorithm again, with alias dependence edges
- skipped. This is to topologically sort paritions according to
+ skipped. This is to topologically sort partitions according to
compilation time known dependence. Note the topological order
is stored in the form of pg's post order number. */
num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
@@ -2117,19 +2262,29 @@ break_alias_scc_partitions (struct graph *rdg,
for (j = 0; partitions->iterate (j, &first); ++j)
if (cbdata.vertices_component[j] == i)
break;
- for (++j; partitions->iterate (j, &partition); ++j)
+ for (k = j + 1; partitions->iterate (k, &partition); ++k)
{
struct pg_vdata *data;
- if (cbdata.vertices_component[j] != i)
+ if (cbdata.vertices_component[k] != i)
continue;
+ /* Update postorder number so that merged reduction partition is
+ sorted after other partitions. */
+ if (!partition_reduction_p (first)
+ && partition_reduction_p (partition))
+ {
+ gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
+ pg->vertices[j].post = pg->vertices[k].post;
+ }
partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
- (*partitions)[j] = NULL;
+ (*partitions)[k] = NULL;
partition_free (partition);
- data = (struct pg_vdata *)pg->vertices[j].data;
- gcc_assert (data->id == j);
+ data = (struct pg_vdata *)pg->vertices[k].data;
+ gcc_assert (data->id == k);
data->partition = NULL;
+ /* The result partition of merged SCC must be sequential. */
+ first->type = PTYPE_SEQUENTIAL;
}
}
}
@@ -2321,38 +2476,49 @@ version_for_distribution_p (vec<struct partition *> *partitions,
return (alias_ddrs->length () > 0);
}
-/* Fuse all partitions if necessary before finalizing distribution. */
+/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
+ ALIAS_DDRS contains ddrs which need runtime alias check. */
static void
-finalize_partitions (vec<struct partition *> *partitions,
+finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
vec<ddr_p> *alias_ddrs)
{
unsigned i;
- struct partition *a, *partition;
+ struct partition *partition, *a;
if (partitions->length () == 1
|| alias_ddrs->length () > 0)
return;
- a = (*partitions)[0];
- if (a->kind != PKIND_NORMAL)
- return;
-
- for (i = 1; partitions->iterate (i, &partition); ++i)
+ unsigned num_builtin = 0, num_normal = 0;
+ bool same_type_p = true;
+ enum partition_type type = ((*partitions)[0])->type;
+ for (i = 0; partitions->iterate (i, &partition); ++i)
{
- /* Don't fuse if partition has different type or it is a builtin. */
- if (partition->type != a->type
- || partition->kind != PKIND_NORMAL)
- return;
+ same_type_p &= (type == partition->type);
+ if (partition->kind != PKIND_NORMAL)
+ num_builtin++;
+ else
+ num_normal++;
}
- /* Fuse all partitions. */
- for (i = 1; partitions->iterate (i, &partition); ++i)
+ /* Don't distribute the current loop into too many loops given we don't
+ have a memory stream cost model. Be even more conservative in the
+ case of loop nest distribution. */
+ if ((same_type_p && num_builtin == 0)
+ || (loop->inner != NULL
+ && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
+ || (loop->inner == NULL
+ && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
{
- partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
- partition_free (partition);
+ a = (*partitions)[0];
+ for (i = 1; partitions->iterate (i, &partition); ++i)
+ {
+ partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
+ partition_free (partition);
+ }
+ partitions->truncate (1);
}
- partitions->truncate (1);
}
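
Illustration (not part of the patch): with the 4-partition threshold above, a
loop nest that would split into, say, three memset partitions plus two normal
ones (5 >= 4 and more than one normal partition) is fused back into a single
partition, while a split into one normal partition plus several builtins is
still distributed.
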
/* Distributes the code from LOOP in such a way that producer statements
@@ -2505,16 +2671,23 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
i--;
}
- /* Build the partition dependency graph. */
+ /* Build the partition dependency graph and fuse partitions in strongly
+ connected components. */
if (partitions.length () > 1)
{
- merge_dep_scc_partitions (rdg, &partitions);
- alias_ddrs.truncate (0);
- if (partitions.length () > 1)
- break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
+ /* Don't support loop nest distribution under runtime alias check
+ since it's not likely to enable many vectorization opportunities. */
+ if (loop->inner)
+ merge_dep_scc_partitions (rdg, &partitions, false);
+ else
+ {
+ merge_dep_scc_partitions (rdg, &partitions, true);
+ if (partitions.length () > 1)
+ break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
+ }
}
- finalize_partitions (&partitions, &alias_ddrs);
+ finalize_partitions (loop, &partitions, &alias_ddrs);
nbp = partitions.length ();
if (nbp == 0
@@ -2595,6 +2768,86 @@ public:
}; // class pass_loop_distribution
+
+/* Given LOOP, this function records seed statements for distribution in
+ WORK_LIST. Return false if there is nothing for distribution. */
+
+static bool
+find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
+{
+ basic_block *bbs = get_loop_body_in_dom_order (loop);
+
+ /* Initialize the worklist with stmts we seed the partitions with. */
+ for (unsigned i = 0; i < loop->num_nodes; ++i)
+ {
+ for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gphi *phi = gsi.phi ();
+ if (virtual_operand_p (gimple_phi_result (phi)))
+ continue;
+ /* Distribute stmts which have defs that are used outside of
+ the loop. */
+ if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
+ continue;
+ work_list->safe_push (phi);
+ }
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ /* If there is a stmt with side-effects bail out - we
+ cannot and should not distribute this loop. */
+ if (gimple_has_side_effects (stmt))
+ {
+ free (bbs);
+ return false;
+ }
+
+ /* Distribute stmts which have defs that are used outside of
+ the loop. */
+ if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
+ ;
+ /* Otherwise only distribute stores for now. */
+ else if (!gimple_vdef (stmt))
+ continue;
+
+ work_list->safe_push (stmt);
+ }
+ }
+ free (bbs);
+ return work_list->length () > 0;
+}
+
+/* Given innermost LOOP, return the outermost enclosing loop that forms a
+ perfect loop nest. */
+
+static struct loop *
+prepare_perfect_loop_nest (struct loop *loop)
+{
+ struct loop *outer = loop_outer (loop);
+ tree niters = number_of_latch_executions (loop);
+
+ /* TODO: We only support innermost 2-level loop nest distribution for
+ now, because of compile time concerns. This should be relaxed in
+ the future. */
+ while (loop->inner == NULL
+ && loop_outer (outer)
+ && outer->inner == loop && loop->next == NULL
+ && single_exit (outer)
+ && optimize_loop_for_speed_p (outer)
+ && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
+ && (niters = number_of_latch_executions (outer)) != NULL_TREE
+ && niters != chrec_dont_know)
+ {
+ loop = outer;
+ outer = loop_outer (loop);
+ }
+
+ return loop;
+}
+
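Illustration (not part of the patch): assuming single exits, known iteration
counts and hot loops, a sketch of when the walk above moves out a level:

    void
    nest_ok (int a[100][100])
    {
      /* The outer loop's only child is the inner loop, so the walk returns
         the outer loop and the whole two-level nest is distributed.  */
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 100; j++)
          a[i][j] = 0;
    }

    void
    nest_stops (int a[100][100], int b[100][100])
    {
      /* The outer loop has two children (loop->next != NULL), so the walk
         stops and each innermost loop is considered on its own.  */
      for (int i = 0; i < 100; i++)
        {
          for (int j = 0; j < 100; j++)
            a[i][j] = 0;
          for (int j = 0; j < 100; j++)
            b[i][j] = 0;
        }
    }
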
unsigned int
pass_loop_distribution::execute (function *fun)
{
@@ -2637,18 +2890,9 @@ pass_loop_distribution::execute (function *fun)
walking to innermost loops. */
FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
{
- auto_vec<gimple *> work_list;
- basic_block *bbs;
- int num = loop->num;
- unsigned int i;
-
- /* If the loop doesn't have a single exit we will fail anyway,
- so do that early. */
- if (!single_exit (loop))
- continue;
-
- /* Only optimize hot loops. */
- if (!optimize_loop_for_speed_p (loop))
+ /* Don't distribute loops with multiple exit edges, or cold loops. */
+ if (!single_exit (loop)
+ || !optimize_loop_for_speed_p (loop))
continue;
/* Don't distribute loop if niters is unknown. */
@@ -2656,56 +2900,16 @@ pass_loop_distribution::execute (function *fun)
if (niters == NULL_TREE || niters == chrec_dont_know)
continue;
- /* Initialize the worklist with stmts we seed the partitions with. */
- bbs = get_loop_body_in_dom_order (loop);
- for (i = 0; i < loop->num_nodes; ++i)
+ /* Get the perfect loop nest for distribution. */
+ loop = prepare_perfect_loop_nest (loop);
+ for (; loop; loop = loop->inner)
{
- for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
- !gsi_end_p (gsi);
- gsi_next (&gsi))
- {
- gphi *phi = gsi.phi ();
- if (virtual_operand_p (gimple_phi_result (phi)))
- continue;
- /* Distribute stmts which have defs that are used outside of
- the loop. */
- if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
- continue;
- work_list.safe_push (phi);
- }
- for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
- !gsi_end_p (gsi);
- gsi_next (&gsi))
- {
- gimple *stmt = gsi_stmt (gsi);
+ auto_vec<gimple *> work_list;
+ if (!find_seed_stmts_for_distribution (loop, &work_list))
+ break;
- /* If there is a stmt with side-effects bail out - we
- cannot and should not distribute this loop. */
- if (gimple_has_side_effects (stmt))
- {
- work_list.truncate (0);
- goto out;
- }
-
- /* Distribute stmts which have defs that are used outside of
- the loop. */
- if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
- ;
- /* Otherwise only distribute stores for now. */
- else if (!gimple_vdef (stmt))
- continue;
-
- work_list.safe_push (stmt);
- }
- }
-out:
- free (bbs);
-
- int nb_generated_loops = 0;
- int nb_generated_calls = 0;
- location_t loc = find_loop_location (loop);
- if (work_list.length () > 0)
- {
+ const char *str = loop->inner ? " nest" : "";
+ location_t loc = find_loop_location (loop);
if (!cd)
{
calculate_dominance_info (CDI_DOMINATORS);
@@ -2713,24 +2917,29 @@ out:
cd = new control_dependences ();
free_dominance_info (CDI_POST_DOMINATORS);
}
+
bool destroy_p;
+ int nb_generated_loops, nb_generated_calls;
nb_generated_loops = distribute_loop (loop, work_list, cd,
&nb_generated_calls,
&destroy_p);
if (destroy_p)
loops_to_be_destroyed.safe_push (loop);
- }
- if (nb_generated_loops + nb_generated_calls > 0)
- {
- changed = true;
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
- loc, "Loop %d distributed: split to %d loops "
- "and %d library calls.\n",
- num, nb_generated_loops, nb_generated_calls);
+ if (nb_generated_loops + nb_generated_calls > 0)
+ {
+ changed = true;
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
+ loc, "Loop%s %d distributed: split to %d loops "
+ "and %d library calls.\n", str, loop->num,
+ nb_generated_loops, nb_generated_calls);
+
+ break;
+ }
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
}
- else if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Loop %d is the same.\n", num);
}
if (cd)