Diffstat (limited to 'gcc/tree-loop-distribution.c')
-rw-r--r-- | gcc/tree-loop-distribution.c | 827
1 file changed, 518 insertions, 309 deletions
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 26b8b9a3751..5e835be779d 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -83,8 +83,8 @@ along with GCC; see the file COPYING3.  If not see
    loops and recover to the original one.

    TODO:
-     1) We only distribute innermost loops now.  This pass should handle loop
-	nests in the future.
+     1) We only distribute the innermost two-level loop nest for now.  We
+	should extend it to arbitrary loop nests in the future.
      2) We only fuse partitions in SCC now.  A better fusion algorithm is
 	desired to minimize loop overhead, maximize parallelism and maximize
 	data reuse.  */
@@ -118,6 +118,11 @@ along with GCC; see the file COPYING3.  If not see
 #define MAX_DATAREFS_NUM \
 	((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))

+/* Threshold controlling the number of distributed partitions.  Given it may
+   be unnecessary if a memory stream cost model is invented in the future,
+   we define it as a temporary macro rather than a parameter.  */
+#define NUM_PARTITION_THRESHOLD (4)
+
 /* Hashtable helpers.  */

 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
@@ -588,27 +593,32 @@ enum partition_type {
   PTYPE_SEQUENTIAL
 };

+/* Builtin info for loop distribution.  */
+struct builtin_info
+{
+  /* data-references a kind != PKIND_NORMAL partition is about.  */
+  data_reference_p dst_dr;
+  data_reference_p src_dr;
+  /* Base address and size of memory objects operated by the builtin.  Note
+     both dest and source memory objects must have the same size.  */
+  tree dst_base;
+  tree src_base;
+  tree size;
+};
+
 /* Partition for loop distribution.  */
 struct partition
 {
   /* Statements of the partition.  */
   bitmap stmts;
-  /* Loops of the partition.  */
-  bitmap loops;
   /* True if the partition defines variable which is used outside of loop.  */
   bool reduction_p;
-  /* For builtin partition, true if it executes one iteration more than
-     number of loop (latch) iterations.  */
-  bool plus_one;
   enum partition_kind kind;
   enum partition_type type;
-  /* data-references a kind != PKIND_NORMAL partition is about.  */
-  data_reference_p main_dr;
-  data_reference_p secondary_dr;
-  /* Number of loop (latch) iterations.  */
-  tree niter;
   /* Data references in the partition.  */
   bitmap datarefs;
+  /* Information about the builtin partition.  */
+  struct builtin_info *builtin;
 };
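For a nest the patch can now turn into a single library call, the new builtin_info fields describe the whole access rather than per-iteration data; a hypothetical illustration (the function name and sizes are invented, not from the patch):

    /* Distributable to one memset: dst_dr is the write to m[i][j], dst_base
       would be the start of m and size 100 * 100 * sizeof (int) = 40000.  */
    void
    clear_matrix (int m[100][100])
    {
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 100; j++)
          m[i][j] = 0;
    }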
@@ -619,7 +629,6 @@ partition_alloc (void)
 {
   partition *partition = XCNEW (struct partition);
   partition->stmts = BITMAP_ALLOC (NULL);
-  partition->loops = BITMAP_ALLOC (NULL);
   partition->reduction_p = false;
   partition->kind = PKIND_NORMAL;
   partition->datarefs = BITMAP_ALLOC (NULL);
@@ -632,8 +641,10 @@ static void
 partition_free (partition *partition)
 {
   BITMAP_FREE (partition->stmts);
-  BITMAP_FREE (partition->loops);
   BITMAP_FREE (partition->datarefs);
+  if (partition->builtin)
+    free (partition->builtin);
+
   free (partition);
 }
@@ -718,9 +729,11 @@ ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)

   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
     {
-      gimple *use_stmt = USE_STMT (use_p);
-      if (!is_gimple_debug (use_stmt)
-	  && loop != loop_containing_stmt (use_stmt))
+      if (is_gimple_debug (USE_STMT (use_p)))
+	continue;
+
+      basic_block use_bb = gimple_bb (USE_STMT (use_p));
+      if (!flow_bb_inside_loop_p (loop, use_bb))
 	return true;
     }

@@ -834,6 +847,10 @@ generate_loops_for_partition (struct loop *loop, partition *partition,
   for (i = 0; i < loop->num_nodes; i++)
     {
       basic_block bb = bbs[i];
+      edge inner_exit = NULL;
+
+      if (loop != bb->loop_father)
+	inner_exit = single_exit (bb->loop_father);

       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
 	{
@@ -852,11 +869,17 @@ generate_loops_for_partition (struct loop *loop, partition *partition,
 	      && !is_gimple_debug (stmt)
 	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
 	    {
-	      /* Choose an arbitrary path through the empty CFG part
-		 that this unnecessary control stmt controls.  */
+	      /* When distributing a loop nest, if bb is the inner loop's
+		 exit_bb, we choose its exit edge/path in order to avoid
+		 generating an infinite loop.  For all other cases, we choose
+		 an arbitrary path through the empty CFG part that this
+		 unnecessary control stmt controls.  */
 	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
 		{
-		  gimple_cond_make_false (cond_stmt);
+		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
+		    gimple_cond_make_true (cond_stmt);
+		  else
+		    gimple_cond_make_false (cond_stmt);
 		  update_stmt (stmt);
 		}
 	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
@@ -881,43 +904,6 @@ generate_loops_for_partition (struct loop *loop, partition *partition,
   free (bbs);
 }

-/* Build the size argument for a memory operation call.  */
-
-static tree
-build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter,
-		    bool plus_one)
-{
-  tree size = fold_convert_loc (loc, sizetype, nb_iter);
-  if (plus_one)
-    size = size_binop (PLUS_EXPR, size, size_one_node);
-  size = fold_build2_loc (loc, MULT_EXPR, sizetype, size,
-			  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
-  size = fold_convert_loc (loc, size_type_node, size);
-  return size;
-}
-
-/* Build an address argument for a memory operation call.  */
-
-static tree
-build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
-{
-  tree addr_base;
-
-  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
-  addr_base = fold_convert_loc (loc, sizetype, addr_base);
-
-  /* Test for a negative stride, iterating over every element.  */
-  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
-    {
-      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
-				  fold_convert_loc (loc, sizetype, nb_bytes));
-      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
-				  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
-    }
-
-  return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
-}
-
 /* If VAL memory representation contains the same value in all bytes,
    return that value, otherwise return -1.
    E.g. for 0x24242424 return 0x24, for IEEE double
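The const_with_all_bytes_same check named in the surviving context is what decides whether a stored value can be emitted as a memset byte; a sketch of the kinds of loops it accepts and rejects (illustrative only, not test cases from the patch):

    void
    fill (unsigned char *p, int *q, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        p[i] = 0x24;          /* memset candidate: value is one byte.  */
      for (i = 0; i < n; i++)
        q[i] = 0x24242424;    /* memset candidate: all four bytes are 0x24.  */
      for (i = 0; i < n; i++)
        q[i] = 0x12345678;    /* stays a loop: the bytes differ.  */
    }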
@@ -982,27 +968,23 @@ static void
 generate_memset_builtin (struct loop *loop, partition *partition)
 {
   gimple_stmt_iterator gsi;
-  gimple *stmt, *fn_call;
   tree mem, fn, nb_bytes;
-  location_t loc;
   tree val;
-
-  stmt = DR_STMT (partition->main_dr);
-  loc = gimple_location (stmt);
+  struct builtin_info *builtin = partition->builtin;
+  gimple *fn_call;

   /* The new statements will be placed before LOOP.  */
   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

-  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
-				 partition->plus_one);
+  nb_bytes = builtin->size;
   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
 				       false, GSI_CONTINUE_LINKING);
-  mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
+  mem = builtin->dst_base;
   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
 				  false, GSI_CONTINUE_LINKING);

   /* This exactly matches the pattern recognition in classify_partition.  */
-  val = gimple_assign_rhs1 (stmt);
+  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
   /* Handle constants like 0x15151515 and similarly
      floating point constants etc. where all bytes are the same.  */
   int bytev = const_with_all_bytes_same (val);
@@ -1038,23 +1020,19 @@ static void
 generate_memcpy_builtin (struct loop *loop, partition *partition)
 {
   gimple_stmt_iterator gsi;
-  gimple *stmt, *fn_call;
+  gimple *fn_call;
   tree dest, src, fn, nb_bytes;
-  location_t loc;
   enum built_in_function kind;
-
-  stmt = DR_STMT (partition->main_dr);
-  loc = gimple_location (stmt);
+  struct builtin_info *builtin = partition->builtin;

   /* The new statements will be placed before LOOP.  */
   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

-  nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
-				 partition->plus_one);
+  nb_bytes = builtin->size;
   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
 				       false, GSI_CONTINUE_LINKING);
-  dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
-  src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
+  dest = builtin->dst_base;
+  src = builtin->src_base;
   if (partition->kind == PKIND_MEMCPY
       || ! ptr_derefs_may_alias_p (dest, src))
     kind = BUILT_IN_MEMCPY;
@@ -1279,8 +1257,6 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)
   FOR_EACH_VEC_ELT (nodes, i, x)
     {
      bitmap_set_bit (partition->stmts, x);
-      bitmap_set_bit (partition->loops,
-		      loop_containing_stmt (RDG_STMT (rdg, x))->num);

       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
 	{
@@ -1307,69 +1283,22 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)
   return partition;
 }

-/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
-   For the moment we detect memset, memcpy and memmove patterns.  Bitmap
-   STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */
+/* Given PARTITION of RDG, record single load/store data references for
+   builtin partition in SRC_DR/DST_DR, return false if there are no such
+   data references.  */

-static void
-classify_partition (loop_p loop, struct graph *rdg, partition *partition,
-		    bitmap stmt_in_all_partitions)
+static bool
+find_single_drs (struct graph *rdg, partition *partition,
+		 data_reference_p *dst_dr, data_reference_p *src_dr)
 {
-  bitmap_iterator bi;
   unsigned i;
-  tree nb_iter;
-  data_reference_p single_load, single_store;
-  bool volatiles_p = false, plus_one = false, has_reduction = false;
-
-  partition->kind = PKIND_NORMAL;
-  partition->main_dr = NULL;
-  partition->secondary_dr = NULL;
-  partition->niter = NULL_TREE;
-  partition->plus_one = false;
-
-  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
-    {
-      gimple *stmt = RDG_STMT (rdg, i);
-
-      if (gimple_has_volatile_ops (stmt))
-	volatiles_p = true;
-
-      /* If the stmt is not included by all partitions and there is uses
-	 outside of the loop, then mark the partition as reduction.  */
-      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
-	{
-	  /* Due to limitation in the transform phase we have to fuse all
-	     reduction partitions.  As a result, this could cancel valid
-	     loop distribution especially for loop that induction variable
-	     is used outside of loop.  To workaround this issue, we skip
-	     marking partition as reudction if the reduction stmt belongs
-	     to all partitions.  In such case, reduction will be computed
-	     correctly no matter how partitions are fused/distributed.  */
-	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
-	    {
-	      partition->reduction_p = true;
-	      return;
-	    }
-	  has_reduction = true;
-	}
-    }
-
-  /* Perform general partition disqualification for builtins.  */
-  if (volatiles_p
-      /* Simple workaround to prevent classifying the partition as builtin
-	 if it contains any use outside of loop.  */
-      || has_reduction
-      || !flag_tree_loop_distribute_patterns)
-    return;
+  data_reference_p single_ld = NULL, single_st = NULL;
+  bitmap_iterator bi;

-  /* Detect memset and memcpy.  */
-  single_load = NULL;
-  single_store = NULL;
   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
     {
       gimple *stmt = RDG_STMT (rdg, i);
       data_reference_p dr;
-      unsigned j;

       if (gimple_code (stmt) == GIMPLE_PHI)
 	continue;
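A builtin partition must boil down to exactly one store and at most one load. Loops with several independent stores are not rejected outright; distribution first splits them, and each single-store partition is then classified on its own. A sketch (mine, not from the patch):

    void
    f (int *a, int *b, int n)
    {
      /* One store, no load: a memset candidate as is.  */
      for (int i = 0; i < n; i++)
        a[i] = 0;

      /* Two stores in one loop: first split into two partitions, each of
         which is then a single-store memset candidate.  */
      for (int i = 0; i < n; i++)
        {
          a[i] = 0;
          b[i] = 0;
        }
    }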
@@ -1380,107 +1309,316 @@ classify_partition (loop_p loop, struct graph *rdg, partition *partition,
       /* Otherwise just regular loads/stores.  */
       if (!gimple_assign_single_p (stmt))
-	return;
+	return false;

       /* But exactly one store and/or load.  */
-      for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
+      for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
 	{
 	  tree type = TREE_TYPE (DR_REF (dr));

 	  /* The memset, memcpy and memmove library calls are only able
 	     to deal with generic address space.  */
 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
-	    return;
+	    return false;

 	  if (DR_IS_READ (dr))
 	    {
-	      if (single_load != NULL)
-		return;
-	      single_load = dr;
+	      if (single_ld != NULL)
+		return false;
+	      single_ld = dr;
 	    }
 	  else
 	    {
-	      if (single_store != NULL)
-		return;
-	      single_store = dr;
+	      if (single_st != NULL)
+		return false;
+	      single_st = dr;
 	    }
 	}
     }

-  if (!single_store)
-    return;
+  if (!single_st)
+    return false;

-  nb_iter = number_of_latch_executions (loop);
-  gcc_assert (nb_iter && nb_iter != chrec_dont_know);
-  if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
-		      gimple_bb (DR_STMT (single_store))))
-    plus_one = true;
+  /* Bail out if this is a bitfield memory reference.  */
+  if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
+      && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
+    return false;

-  if (single_store && !single_load)
-    {
-      gimple *stmt = DR_STMT (single_store);
-      tree rhs = gimple_assign_rhs1 (stmt);
-      if (const_with_all_bytes_same (rhs) == -1
-	  && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-	      || (TYPE_MODE (TREE_TYPE (rhs))
-		  != TYPE_MODE (unsigned_char_type_node))))
-	return;
-      if (TREE_CODE (rhs) == SSA_NAME
-	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
-	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
-	return;
-      if (!adjacent_dr_p (single_store)
-	  || !dominated_by_p (CDI_DOMINATORS,
-			      loop->latch, gimple_bb (stmt)))
-	return;
-      partition->kind = PKIND_MEMSET;
-      partition->main_dr = single_store;
-      partition->niter = nb_iter;
-      partition->plus_one = plus_one;
-    }
-  else if (single_store && single_load)
+  /* Data reference must be executed exactly once per iteration.  */
+  basic_block bb_st = gimple_bb (DR_STMT (single_st));
+  struct loop *inner = bb_st->loop_father;
+  if (!dominated_by_p (CDI_DOMINATORS, inner->latch, bb_st))
+    return false;
+
+  if (single_ld)
     {
-      gimple *store = DR_STMT (single_store);
-      gimple *load = DR_STMT (single_load);
+      gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
       /* Direct aggregate copy or via an SSA name temporary.  */
       if (load != store
 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
-	return;
-      if (!adjacent_dr_p (single_store)
-	  || !adjacent_dr_p (single_load)
-	  || !operand_equal_p (DR_STEP (single_store),
-			       DR_STEP (single_load), 0)
-	  || !dominated_by_p (CDI_DOMINATORS,
-			      loop->latch, gimple_bb (store)))
-	return;
-      /* Now check that if there is a dependence this dependence is
-         of a suitable form for memmove.  */
-      ddr_p ddr = get_data_dependence (rdg, single_load, single_store);
-      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
-	return;
+	return false;
+
+      /* Bail out if this is a bitfield memory reference.  */
+      if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
+	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
+	return false;
+
+      /* Load and store must be in the same loop nest.  */
+      basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
+      if (inner != bb_ld->loop_father)
+	return false;
+
+      /* Data reference must be executed exactly once per iteration.  */
+      if (!dominated_by_p (CDI_DOMINATORS, inner->latch, bb_ld))
+	return false;
+
+      edge e = single_exit (inner);
+      bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
+      bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
+      if (dom_ld != dom_st)
+	return false;
+    }
+
+  *src_dr = single_ld;
+  *dst_dr = single_st;
+  return true;
+}
+
+/* Given data reference DR in LOOP_NEST, this function checks the enclosing
+   loops from inner to outer to see if the loop's step equals the access size
+   at each level of loop.  Return true if yes; record access base and size in
+   BASE and SIZE; save the loop's step at each level of loop in STEPS if it
+   is not null.  For example:
+
+     int arr[100][100][100];
+     for (i = 0; i < 100; i++)        ;steps[2] = 40000
+       for (j = 100; j > 0; j--)      ;steps[1] = -400
+	 for (k = 0; k < 100; k++)    ;steps[0] = 4
+	   arr[i][j - 1][k] = 0;      ;base = &arr, size = 4000000.  */
+
+static bool
+compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
+		      tree *size, vec<tree> *steps = NULL)
+{
+  location_t loc = gimple_location (DR_STMT (dr));
+  basic_block bb = gimple_bb (DR_STMT (dr));
+  struct loop *loop = bb->loop_father;
+  tree ref = DR_REF (dr);
+  tree access_base = build_fold_addr_expr (ref);
+  tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+
+  do
+    {
+      tree scev_fn = analyze_scalar_evolution (loop, access_base);
+      if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
+	return false;
+
+      access_base = CHREC_LEFT (scev_fn);
+      if (tree_contains_chrecs (access_base, NULL))
+	return false;
+
+      tree scev_step = CHREC_RIGHT (scev_fn);
+      /* Only support constant steps.  */
+      if (TREE_CODE (scev_step) != INTEGER_CST)
+	return false;
+
+      enum ev_direction access_dir = scev_direction (scev_fn);
+      if (access_dir == EV_DIR_UNKNOWN)
+	return false;
+
+      if (steps != NULL)
+	steps->safe_push (scev_step);
+
+      scev_step = fold_convert_loc (loc, sizetype, scev_step);
+      /* Compute absolute value of scev step.  */
+      if (access_dir == EV_DIR_DECREASES)
+	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
+
+      /* At each level of loop, scev step must equal to access size.  In other
+	 words, DR must access consecutive memory between loop iterations.  */
+      if (!operand_equal_p (scev_step, access_size, 0))
+	return false;
+
+      /* Compute DR's execution times in loop.  */
+      tree niters = number_of_latch_executions (loop);
+      niters = fold_convert_loc (loc, sizetype, niters);
+      if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
+	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
+
+      /* Compute DR's overall access size in loop.  */
+      access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
+				     niters, scev_step);
+      /* Adjust base address in case of negative step.  */
+      if (access_dir == EV_DIR_DECREASES)
+	{
+	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
+				      scev_step, access_size);
+	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
+	}
+    }
+  while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
+
+  *base = access_base;
+  *size = access_size;
+  return true;
+}
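Taken end to end, the nest documented in the comment above would let the pass replace three loops with one call; a self-contained sketch of the intended outcome (the memset line is my reading, not compiler output):

    int arr[100][100][100];

    void
    clear (void)
    {
      for (int i = 0; i < 100; i++)        /* step 40000 == 100*100*sizeof (int) */
        for (int j = 100; j > 0; j--)      /* |step| 400 == 100*sizeof (int) */
          for (int k = 0; k < 100; k++)    /* step 4 == sizeof (int) */
            arr[i][j - 1][k] = 0;
      /* Equivalent, once classified: memset (arr, 0, 4000000);  */
    }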
+
+/* Allocate and return builtin struct.  Record information like DST_DR,
+   SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
+
+static struct builtin_info *
+alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
+	       tree dst_base, tree src_base, tree size)
+{
+  struct builtin_info *builtin = XNEW (struct builtin_info);
+  builtin->dst_dr = dst_dr;
+  builtin->src_dr = src_dr;
+  builtin->dst_base = dst_base;
+  builtin->src_base = src_base;
+  builtin->size = size;
+  return builtin;
+}
+
+/* Given data reference DR in loop nest LOOP, classify if it forms builtin
+   memset call.  */
+
+static void
+classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
+{
+  gimple *stmt = DR_STMT (dr);
+  tree base, size, rhs = gimple_assign_rhs1 (stmt);
+
+  if (const_with_all_bytes_same (rhs) == -1
+      && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+	  || (TYPE_MODE (TREE_TYPE (rhs))
+	      != TYPE_MODE (unsigned_char_type_node))))
+    return;
+
+  if (TREE_CODE (rhs) == SSA_NAME
+      && !SSA_NAME_IS_DEFAULT_DEF (rhs)
+      && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
+    return;
+
+  if (!compute_access_range (loop, dr, &base, &size))
+    return;
+
+  partition->builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
+  partition->kind = PKIND_MEMSET;
+}
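classify_builtin_st rejects a store whose right-hand side is defined inside the loop, since the stored value could then vary per iteration; a small sketch of the two cases (illustrative, function names invented):

    void
    f (char *p, char c, int n)
    {
      for (int i = 0; i < n; i++)
        p[i] = c;           /* OK: c is loop invariant.  */
    }

    void
    g (char *p, int n)
    {
      for (int i = 0; i < n; i++)
        p[i] = (char) i;    /* Rejected: the rhs is defined in the loop.  */
    }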
+
+/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
+   if it forms builtin memcpy or memmove call.  */
+
+static void
+classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
+		       data_reference_p dst_dr, data_reference_p src_dr)
+{
+  tree base, size, src_base, src_size;
+  auto_vec<tree> dst_steps, src_steps;
+
+  /* Compute access range of both load and store.  They must have the same
+     access size.  */
+  if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
+      || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
+      || !operand_equal_p (size, src_size, 0))
+    return;
+
+  /* Load and store in loop nest must access memory in the same way, i.e.,
+     they must have the same steps in each loop of the nest.  */
+  if (dst_steps.length () != src_steps.length ())
+    return;
+  for (unsigned i = 0; i < dst_steps.length (); ++i)
+    if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
+      return;
+
+  /* Now check if there is a dependence.  */
+  ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
+
+  /* Classify as memcpy if no dependence between load and store.  */
+  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+    {
+      partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
+      partition->kind = PKIND_MEMCPY;
+      return;
+    }
+
+  /* Can't do memmove in case of unknown dependence or dependence without
+     classical distance vector.  */
+  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
+      || DDR_NUM_DIST_VECTS (ddr) == 0)
+    return;
+
+  unsigned i;
+  lambda_vector dist_v;
+  int num_lev = (DDR_LOOP_NEST (ddr)).length ();
+  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+    {
+      unsigned dep_lev = dependence_level (dist_v, num_lev);
+      /* Can't do memmove if load depends on store.  */
+      if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
+	return;
+    }
+
+  partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
+  partition->kind = PKIND_MEMMOVE;
+  return;
+}
+
+/* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
+   For the moment we detect memset, memcpy and memmove patterns.  Bitmap
+   STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */
+
+static void
+classify_partition (loop_p loop, struct graph *rdg, partition *partition,
+		    bitmap stmt_in_all_partitions)
+{
+  bitmap_iterator bi;
+  unsigned i;
+  data_reference_p single_ld = NULL, single_st = NULL;
+  bool volatiles_p = false, has_reduction = false;
+
+  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
+    {
+      gimple *stmt = RDG_STMT (rdg, i);
+
+      if (gimple_has_volatile_ops (stmt))
+	volatiles_p = true;

-      lambda_vector dist_v;
-      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+      /* If the stmt is not included by all partitions and there are uses
+	 outside of the loop, then mark the partition as reduction.  */
+      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
+	{
+	  /* Due to limitation in the transform phase we have to fuse all
+	     reduction partitions.  As a result, this could cancel valid
+	     loop distribution especially for loops whose induction variable
+	     is used outside of the loop.  To work around this issue, we skip
+	     marking the partition as reduction if the reduction stmt belongs
+	     to all partitions.  In such cases, the reduction will be computed
+	     correctly no matter how partitions are fused/distributed.  */
+	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
 	    {
-	      int dist = dist_v[index_in_loop_nest (loop->num,
-						    DDR_LOOP_NEST (ddr))];
-	      if (dist > 0 && !DDR_REVERSED_P (ddr))
-		return;
+	      partition->reduction_p = true;
+	      return;
 	    }
-	  partition->kind = PKIND_MEMMOVE;
+	  has_reduction = true;
 	}
-      else
-	partition->kind = PKIND_MEMCPY;
-      partition->main_dr = single_store;
-      partition->secondary_dr = single_load;
-      partition->niter = nb_iter;
-      partition->plus_one = plus_one;
     }
+
+  /* Perform general partition disqualification for builtins.  */
+  if (volatiles_p
+      /* Simple workaround to prevent classifying the partition as builtin
+	 if it contains any use outside of loop.  */
+      || has_reduction
+      || !flag_tree_loop_distribute_patterns)
+    return;
+
+  /* Find single load/store data references for builtin partition.  */
+  if (!find_single_drs (rdg, partition, &single_st, &single_ld))
+    return;
+
+  /* Classify the builtin kind.  */
+  if (single_ld == NULL)
+    classify_builtin_st (loop, partition, single_st);
+  else
+    classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
 }

 /* Returns true when PARTITION1 and PARTITION2 access the same memory
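classify_builtin_ldst's dependence test is what separates plain copies from overlapping ones; a sketch of the three possible outcomes (illustrative only, not from the patch):

    void
    f (int *restrict a, int *restrict b, int *c, int n)
    {
      for (int i = 0; i < n; i++)
        a[i] = b[i];        /* No dependence: memcpy.  */

      for (int i = 0; i < n; i++)
        c[i] = c[i + 1];    /* Load runs ahead of the store: memmove is a
                               faithful replacement.  */

      for (int i = 0; i < n - 1; i++)
        c[i + 1] = c[i];    /* Each iteration reads the previous store, so
                               neither memcpy nor memmove is equivalent.  */
    }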
@@ -1939,7 +2077,8 @@ build_partition_graph (struct graph *rdg,
   return pg;
 }

-/* Sort partitions in PG by post order and store them in PARTITIONS.  */
+/* Sort partitions in PG in descending post order and store them in
+   PARTITIONS.  */

 static void
 sort_partitions_by_post_order (struct graph *pg,
@@ -1948,7 +2087,7 @@ sort_partitions_by_post_order (struct graph *pg,
   int i;
   struct pg_vdata *data;

-  /* Now order the remaining nodes in postorder.  */
+  /* Now order the remaining nodes in descending postorder.  */
   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
   partitions->truncate (0);
   for (i = 0; i < pg->n_vertices; ++i)
@@ -1960,16 +2099,18 @@ sort_partitions_by_post_order (struct graph *pg,
 }

 /* Given reduced dependence graph RDG merge strong connected components
-   of PARTITIONS.  In this function, data dependence caused by possible
-   alias between references is ignored, as if it doesn't exist at all.  */
+   of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
+   possible alias between references is ignored, as if it doesn't exist
+   at all; otherwise all dependences are considered.  */

 static void
 merge_dep_scc_partitions (struct graph *rdg,
-			  vec<struct partition *> *partitions)
+			  vec<struct partition *> *partitions,
+			  bool ignore_alias_p)
 {
   struct partition *partition1, *partition2;
   struct pg_vdata *data;
-  graph *pg = build_partition_graph (rdg, partitions, true);
+  graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
   int i, j, num_sccs = graphds_scc (pg, NULL);

   /* Strong connected component means dependence cycle, we cannot distribute
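Partitions forming a dependence cycle can never be split apart, which is what the SCC merging above enforces; a minimal sketch of such a cycle (mine, not a test from the patch):

    void
    f (int *a, int *b, int n)
    {
      /* a[i] and b[i] each depend on the other's previous iteration, so
         the two statements form an SCC in the partition graph and must
         stay together in one loop.  */
      for (int i = 1; i < n; i++)
        {
          a[i] = b[i - 1] + 1;
          b[i] = a[i - 1] + 1;
        }
    }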
@@ -2044,7 +2185,7 @@ break_alias_scc_partitions (struct graph *rdg,
 			    vec<struct partition *> *partitions,
 			    vec<ddr_p> *alias_ddrs)
 {
-  int i, j, num_sccs, num_sccs_no_alias;
+  int i, j, k, num_sccs, num_sccs_no_alias;
   /* Build partition dependence graph.  */
   graph *pg = build_partition_graph (rdg, partitions, false);

@@ -2061,7 +2202,7 @@ break_alias_scc_partitions (struct graph *rdg,
       auto_vec<enum partition_type> scc_types;
       struct partition *partition, *first;

-      /* If all paritions in a SCC has the same type, we can simply merge the
+      /* If all partitions in a SCC have the same type, we can simply merge the
 	 SCC.  This loop finds out such SCCS and record them in bitmap.  */
       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
       for (i = 0; i < num_sccs; ++i)
@@ -2074,6 +2215,10 @@ break_alias_scc_partitions (struct graph *rdg,
 	  if (pg->vertices[j].component != i)
 	    continue;

+	  /* Note we merge partitions of parallel type on purpose, though
+	     the result partition is sequential.  The reason is the vectorizer
+	     can do more accurate runtime alias check in this case.  Also
+	     it results in more conservative distribution.  */
 	  if (first->type != partition->type)
 	    {
 	      bitmap_clear_bit (sccs_to_merge, i);
@@ -2095,7 +2240,7 @@ break_alias_scc_partitions (struct graph *rdg,
   if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
     {
       /* Run SCC finding algorithm again, with alias dependence edges
-	 skipped.  This is to topologically sort paritions according to
+	 skipped.  This is to topologically sort partitions according to
 	 compilation time known dependence.  Note the topological order
 	 is stored in the form of pg's post order number.  */
       num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
@@ -2117,19 +2262,29 @@ break_alias_scc_partitions (struct graph *rdg,
 	  for (j = 0; partitions->iterate (j, &first); ++j)
 	    if (cbdata.vertices_component[j] == i)
 	      break;
-	  for (++j; partitions->iterate (j, &partition); ++j)
+	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
 	    {
 	      struct pg_vdata *data;

-	      if (cbdata.vertices_component[j] != i)
+	      if (cbdata.vertices_component[k] != i)
 		continue;

+	      /* Update the postorder number so that the merged reduction
+		 partition is sorted after other partitions.  */
+	      if (!partition_reduction_p (first)
+		  && partition_reduction_p (partition))
+		{
+		  gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
+		  pg->vertices[j].post = pg->vertices[k].post;
+		}
 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
-	      (*partitions)[j] = NULL;
+	      (*partitions)[k] = NULL;
 	      partition_free (partition);
-	      data = (struct pg_vdata *)pg->vertices[j].data;
-	      gcc_assert (data->id == j);
+	      data = (struct pg_vdata *)pg->vertices[k].data;
+	      gcc_assert (data->id == k);
 	      data->partition = NULL;
+	      /* The result partition of merged SCC must be sequential.  */
+	      first->type = PTYPE_SEQUENTIAL;
 	    }
 	}
     }
@@ -2321,38 +2476,49 @@ version_for_distribution_p (vec<struct partition *> *partitions,
   return (alias_ddrs->length () > 0);
 }

-/* Fuse all partitions if necessary before finalizing distribution.  */
+/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
+   ALIAS_DDRS contains ddrs which need runtime alias check.  */

 static void
-finalize_partitions (vec<struct partition *> *partitions,
+finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
 		     vec<ddr_p> *alias_ddrs)
 {
   unsigned i;
-  struct partition *a, *partition;
+  struct partition *partition, *a;

   if (partitions->length () == 1
       || alias_ddrs->length () > 0)
     return;

-  a = (*partitions)[0];
-  if (a->kind != PKIND_NORMAL)
-    return;
-
-  for (i = 1; partitions->iterate (i, &partition); ++i)
+  unsigned num_builtin = 0, num_normal = 0;
+  bool same_type_p = true;
+  enum partition_type type = ((*partitions)[0])->type;
+  for (i = 0; partitions->iterate (i, &partition); ++i)
     {
-      /* Don't fuse if partition has different type or it is a builtin.  */
-      if (partition->type != a->type
-	  || partition->kind != PKIND_NORMAL)
-	return;
+      same_type_p &= (type == partition->type);
+      if (partition->kind != PKIND_NORMAL)
+	num_builtin++;
+      else
+	num_normal++;
     }

-  /* Fuse all partitions.  */
-  for (i = 1; partitions->iterate (i, &partition); ++i)
+  /* Don't distribute current loop into too many loops given we don't have
+     memory stream cost model.  Be even more conservative in case of loop
+     nest distribution.  */
+  if ((same_type_p && num_builtin == 0)
+      || (loop->inner != NULL
+	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
+      || (loop->inner == NULL
+	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
     {
-      partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
-      partition_free (partition);
+      a = (*partitions)[0];
+      for (i = 1; partitions->iterate (i, &partition); ++i)
+	{
+	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
+	  partition_free (partition);
+	}
+      partitions->truncate (1);
     }
-  partitions->truncate (1);
 }

 /* Distributes the code from LOOP in such a way that producer statements
@@ -2505,16 +2671,23 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
 	  i--;
 	}

-  /* Build the partition dependency graph.  */
+  /* Build the partition dependency graph and fuse partitions in strongly
+     connected components.  */
   if (partitions.length () > 1)
     {
-      merge_dep_scc_partitions (rdg, &partitions);
-      alias_ddrs.truncate (0);
-      if (partitions.length () > 1)
-	break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
+      /* Don't support loop nest distribution under runtime alias check
+	 since it's not likely to enable many vectorization opportunities.  */
+      if (loop->inner)
+	merge_dep_scc_partitions (rdg, &partitions, false);
+      else
+	{
+	  merge_dep_scc_partitions (rdg, &partitions, true);
+	  if (partitions.length () > 1)
+	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
+	}
     }

-  finalize_partitions (&partitions, &alias_ddrs);
+  finalize_partitions (loop, &partitions, &alias_ddrs);

   nbp = partitions.length ();
   if (nbp == 0
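With NUM_PARTITION_THRESHOLD at 4, the new finalize_partitions heuristic fuses everything back when a split would produce many partitions dominated by normal loops; a hypothetical illustration of the counting (how the seeds actually partition is my assumption):

    void
    f (int *a, int *b, int *c, int *d, int *x, int *y, int *z, int n)
    {
      /* Suppose this splits into four partitions: one builtin (memset)
         and three normal loops.  Then num_normal (3) > num_builtin (1)
         and 4 >= NUM_PARTITION_THRESHOLD, so for an innermost loop all
         partitions are fused back into a single loop.  */
      for (int i = 0; i < n; i++)
        {
          a[i] = 0;
          b[i] = x[i] + 1;
          c[i] = y[i] - 1;
          d[i] = z[i] * 2;
        }
    }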
@@ -2595,6 +2768,86 @@ public:

 }; // class pass_loop_distribution

+
+/* Given LOOP, this function records seed statements for distribution in
+   WORK_LIST.  Return false if there is nothing for distribution.  */
+
+static bool
+find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
+{
+  basic_block *bbs = get_loop_body_in_dom_order (loop);
+
+  /* Initialize the worklist with stmts we seed the partitions with.  */
+  for (unsigned i = 0; i < loop->num_nodes; ++i)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  if (virtual_operand_p (gimple_phi_result (phi)))
+	    continue;
+	  /* Distribute stmts which have defs that are used outside of
+	     the loop.  */
+	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
+	    continue;
+	  work_list->safe_push (phi);
+	}
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
+	   !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+
+	  /* If there is a stmt with side-effects bail out - we
+	     cannot and should not distribute this loop.  */
+	  if (gimple_has_side_effects (stmt))
+	    {
+	      free (bbs);
+	      return false;
+	    }
+
+	  /* Distribute stmts which have defs that are used outside of
+	     the loop.  */
+	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
+	    ;
+	  /* Otherwise only distribute stores for now.  */
+	  else if (!gimple_vdef (stmt))
+	    continue;
+
+	  work_list->safe_push (stmt);
+	}
+    }
+  free (bbs);
+  return work_list->length () > 0;
+}
+
+/* Given innermost LOOP, return the outermost enclosing loop that forms a
+   perfect loop nest.  */
+
+static struct loop *
+prepare_perfect_loop_nest (struct loop *loop)
+{
+  struct loop *outer = loop_outer (loop);
+  tree niters = number_of_latch_executions (loop);
+
+  /* TODO: We only support the innermost 2-level loop nest distribution
+     because of compilation time issues for now.  This should be relaxed
+     in the future.  */
+  while (loop->inner == NULL
+	 && loop_outer (outer)
+	 && outer->inner == loop && loop->next == NULL
+	 && single_exit (outer)
+	 && optimize_loop_for_speed_p (outer)
+	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
+	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
+	 && niters != chrec_dont_know)
+    {
+      loop = outer;
+      outer = loop_outer (loop);
+    }
+
+  return loop;
+}
+
 unsigned int
 pass_loop_distribution::execute (function *fun)
 {
@@ -2637,18 +2890,9 @@ pass_loop_distribution::execute (function *fun)
      walking to innermost loops.  */
   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
     {
-      auto_vec<gimple *> work_list;
-      basic_block *bbs;
-      int num = loop->num;
-      unsigned int i;
-
-      /* If the loop doesn't have a single exit we will fail anyway,
-	 so do that early.  */
-      if (!single_exit (loop))
-	continue;
-
-      /* Only optimize hot loops.  */
-      if (!optimize_loop_for_speed_p (loop))
+      /* Don't distribute loops with multiple exit edges, or cold loops.  */
+      if (!single_exit (loop)
+	  || !optimize_loop_for_speed_p (loop))
 	continue;

       /* Don't distribute loop if niters is unknown.  */
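prepare_perfect_loop_nest only walks outward while the outer loop's body consists of nothing but the inner loop; a sketch of nests it does and does not extend to (illustrative only):

    int a[100][100];
    long s;

    void
    perfect (void)
    {
      /* Perfect two-level nest: distribution may start at the outer loop.  */
      for (int i = 0; i < 100; i++)
        for (int j = 0; j < 100; j++)
          a[i][j] = 0;
    }

    void
    imperfect (void)
    {
      /* Imperfect nest: the extra statement in the outer body keeps
         distribution at the innermost loop only.  */
      for (int i = 0; i < 100; i++)
        {
          s += i;
          for (int j = 0; j < 100; j++)
            a[i][j] = 0;
        }
    }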
@@ -2656,56 +2900,16 @@ pass_loop_distribution::execute (function *fun)
 	continue;

-      /* Initialize the worklist with stmts we seed the partitions with.  */
-      bbs = get_loop_body_in_dom_order (loop);
-      for (i = 0; i < loop->num_nodes; ++i)
+      /* Get the perfect loop nest for distribution.  */
+      loop = prepare_perfect_loop_nest (loop);
+      for (; loop; loop = loop->inner)
 	{
-	  for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
-	       !gsi_end_p (gsi);
-	       gsi_next (&gsi))
-	    {
-	      gphi *phi = gsi.phi ();
-	      if (virtual_operand_p (gimple_phi_result (phi)))
-		continue;
-	      /* Distribute stmts which have defs that are used outside of
-		 the loop.  */
-	      if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
-		continue;
-	      work_list.safe_push (phi);
-	    }
-	  for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
-	       !gsi_end_p (gsi);
-	       gsi_next (&gsi))
-	    {
-	      gimple *stmt = gsi_stmt (gsi);
+	  auto_vec<gimple *> work_list;
+	  if (!find_seed_stmts_for_distribution (loop, &work_list))
+	    break;

-	      /* If there is a stmt with side-effects bail out - we
-		 cannot and should not distribute this loop.  */
-	      if (gimple_has_side_effects (stmt))
-		{
-		  work_list.truncate (0);
-		  goto out;
-		}
-
-	      /* Distribute stmts which have defs that are used outside of
-		 the loop.  */
-	      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
-		;
-	      /* Otherwise only distribute stores for now.  */
-	      else if (!gimple_vdef (stmt))
-		continue;
-
-	      work_list.safe_push (stmt);
-	    }
-	}
-out:
-      free (bbs);
-
-      int nb_generated_loops = 0;
-      int nb_generated_calls = 0;
-      location_t loc = find_loop_location (loop);
-      if (work_list.length () > 0)
-	{
+	  const char *str = loop->inner ? " nest" : "";
+	  location_t loc = find_loop_location (loop);
 	  if (!cd)
 	    {
 	      calculate_dominance_info (CDI_DOMINATORS);
@@ -2713,24 +2917,29 @@ out:
 	      cd = new control_dependences ();
 	      free_dominance_info (CDI_POST_DOMINATORS);
 	    }
+
+	  bool destroy_p;
+	  int nb_generated_loops, nb_generated_calls;
 	  nb_generated_loops = distribute_loop (loop, work_list, cd,
 						&nb_generated_calls,
 						&destroy_p);
 	  if (destroy_p)
 	    loops_to_be_destroyed.safe_push (loop);
-	}

-      if (nb_generated_loops + nb_generated_calls > 0)
-	{
-	  changed = true;
-	  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
-			   loc, "Loop %d distributed: split to %d loops "
-			   "and %d library calls.\n",
-			   num, nb_generated_loops, nb_generated_calls);
+	  if (nb_generated_loops + nb_generated_calls > 0)
+	    {
+	      changed = true;
+	      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
			       loc, "Loop%s %d distributed: split to %d loops "
+			       "and %d library calls.\n", str, loop->num,
+			       nb_generated_loops, nb_generated_calls);
+
+	      break;
+	    }
+
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
 	}
-      else if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file, "Loop %d is the same.\n", num);
     }

   if (cd)
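Putting the pieces together, a minimal before/after sketch of what the reworked pass aims at (illustrative; the dump line mimics the format string of the new dump_printf_loc call, it is not captured compiler output):

    void
    f (int *a, int *b, int *c, int n)
    {
      for (int i = 0; i < n; i++)
        {
          a[i] = 0;           /* becomes memset (a, 0, n * sizeof (int)) */
          c[i] = b[i] + 1;    /* stays a loop, now easier to vectorize */
        }
    }
    /* -fdump-tree-ldist would then report something like:
       "Loop 1 distributed: split to 1 loops and 1 library calls."  */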