author     Edward Smith-Rowland <3dw4rd@verizon.net>  2017-07-20 14:54:44 +0000
committer  Edward Smith-Rowland <3dw4rd@verizon.net>  2017-07-20 14:54:44 +0000
commit     3acaf2e51caf356a9afc763cfd70b91d1ab094b5 (patch)
tree       f13b1087143457ae5c053b6ec3b664c2aaeab169 /gcc/config/aarch64/aarch64.c
parent     c4d46197c5fe4461da59ce027bc68306c43186b0 (diff)
Merged revisions r232323 through r250392 to the branch
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/tr29124@250393 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/aarch64/aarch64.c')
-rw-r--r--  gcc/config/aarch64/aarch64.c  |  778
1 file changed, 589 insertions(+), 189 deletions(-)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c43e96..b8a4160d9de 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -193,10 +193,10 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
static const struct cpu_addrcost_table generic_addrcost_table =
{
{
- 0, /* hi */
+ 1, /* hi */
0, /* si */
0, /* di */
- 0, /* ti */
+ 1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
@@ -526,6 +526,61 @@ static const cpu_approx_modes xgene1_approx_modes =
AARCH64_APPROX_ALL /* recip_sqrt */
};
+/* Generic prefetch settings (which disable prefetch). */
+static const cpu_prefetch_tune generic_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ -1, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ -1 /* default_opt_level */
+};
+
+static const cpu_prefetch_tune exynosm1_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ -1 /* default_opt_level */
+};
+
+static const cpu_prefetch_tune qdf24xx_prefetch_tune =
+{
+ 4, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 1024, /* l2_cache_size */
+ 3 /* default_opt_level */
+};
+
+static const cpu_prefetch_tune thunderxt88_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 128, /* l1_cache_line_size */
+ 16*1024, /* l2_cache_size */
+ 3 /* default_opt_level */
+};
+
+static const cpu_prefetch_tune thunderx_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 128, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ -1 /* default_opt_level */
+};
+
+static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
+{
+ 8, /* num_slots */
+ 32, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ 256, /* l2_cache_size */
+ -1 /* default_opt_level */
+};
+
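The cpu_prefetch_tune type used by the new tables above is declared outside this file (presumably in aarch64-protos.h as part of the same patch series) and is not shown in this diff. As a rough sketch only, with field names and units inferred from the initializers and their comments, the declaration plausibly looks like:

/* Sketch, not taken from the patch: inferred from the initializers above;
   negative values appear to mean "keep the generic default".  */
struct cpu_prefetch_tune
{
  const int num_slots;
  const int l1_cache_size;       /* In kilobytes.  */
  const int l1_cache_line_size;  /* In bytes.  */
  const int l2_cache_size;       /* In kilobytes.  */
  const int default_opt_level;   /* Optimization level at which to enable
                                    -fprefetch-loop-arrays; -1 leaves it off.  */
};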
static const struct tune_params generic_tunings =
{
&cortexa57_extra_costs,
@@ -538,17 +593,17 @@ static const struct tune_params generic_tunings =
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC), /* fusible_ops */
8, /* function_align. */
- 8, /* jump_align. */
- 4, /* loop_align. */
+ 4, /* jump_align. */
+ 8, /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
};
static const struct tune_params cortexa35_tunings =
@@ -564,7 +619,7 @@ static const struct tune_params cortexa35_tunings =
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
16, /* function_align. */
- 8, /* jump_align. */
+ 4, /* jump_align. */
8, /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
@@ -572,9 +627,9 @@ static const struct tune_params cortexa35_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
};
static const struct tune_params cortexa53_tunings =
@@ -590,7 +645,7 @@ static const struct tune_params cortexa53_tunings =
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
16, /* function_align. */
- 8, /* jump_align. */
+ 4, /* jump_align. */
8, /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
@@ -598,9 +653,9 @@ static const struct tune_params cortexa53_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
@@ -616,7 +671,7 @@ static const struct tune_params cortexa57_tunings =
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
16, /* function_align. */
- 8, /* jump_align. */
+ 4, /* jump_align. */
8, /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
@@ -624,9 +679,9 @@ static const struct tune_params cortexa57_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ &generic_prefetch_tune
};
static const struct tune_params cortexa72_tunings =
@@ -642,7 +697,7 @@ static const struct tune_params cortexa72_tunings =
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
16, /* function_align. */
- 8, /* jump_align. */
+ 4, /* jump_align. */
8, /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
@@ -650,9 +705,9 @@ static const struct tune_params cortexa72_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
};
static const struct tune_params cortexa73_tunings =
@@ -668,7 +723,7 @@ static const struct tune_params cortexa73_tunings =
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
16, /* function_align. */
- 8, /* jump_align. */
+ 4, /* jump_align. */
8, /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
@@ -676,11 +731,13 @@ static const struct tune_params cortexa73_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
};
+
+
static const struct tune_params exynosm1_tunings =
{
&exynosm1_extra_costs,
@@ -701,9 +758,34 @@ static const struct tune_params exynosm1_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
48, /* max_case_values. */
- 64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &exynosm1_prefetch_tune
+};
+
+static const struct tune_params thunderxt88_tunings =
+{
+ &thunderx_extra_costs,
+ &generic_addrcost_table,
+ &thunderx_regmove_cost,
+ &thunderx_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ 6, /* memmov_cost */
+ 2, /* issue_rate */
+ AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
+ 8, /* function_align. */
+ 8, /* jump_align. */
+ 8, /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
+ &thunderxt88_prefetch_tune
};
static const struct tune_params thunderx_tunings =
@@ -726,9 +808,10 @@ static const struct tune_params thunderx_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
+ | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ &thunderx_prefetch_tune
};
static const struct tune_params xgene1_tunings =
@@ -751,9 +834,9 @@ static const struct tune_params xgene1_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &generic_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
@@ -777,9 +860,9 @@ static const struct tune_params qdf24xx_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &qdf24xx_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
@@ -792,7 +875,8 @@ static const struct tune_params thunderx2t99_tunings =
&generic_approx_modes,
4, /* memmov_cost. */
4, /* issue_rate. */
- (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops */
+ (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+ | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
16, /* function_align. */
8, /* jump_align. */
16, /* loop_align. */
@@ -802,9 +886,9 @@ static const struct tune_params thunderx2t99_tunings =
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
- 64, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ &thunderx2t99_prefetch_tune
};
/* Support for fine-grained override of the tuning structures. */
@@ -948,7 +1032,7 @@ static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
reg_class_t best_class)
{
- enum machine_mode mode;
+ machine_mode mode;
if (allocno_class != ALL_REGS)
return allocno_class;
@@ -961,7 +1045,7 @@ aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
}
static unsigned int
-aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
+aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
if (GET_MODE_UNIT_SIZE (mode) == 4)
return aarch64_tune_params.min_div_recip_mul_sf;
@@ -970,7 +1054,7 @@ aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
static int
aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
- enum machine_mode mode)
+ machine_mode mode)
{
if (VECTOR_MODE_P (mode))
return aarch64_tune_params.vec_reassoc_width;
@@ -1649,41 +1733,41 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
machine_mode dst_mode = GET_MODE (dst);
gcc_assert (VECTOR_MODE_P (dst_mode));
+ gcc_assert (register_operand (dst, dst_mode)
+ && register_operand (src1, src_mode)
+ && register_operand (src2, src_mode));
- if (REG_P (dst) && REG_P (src1) && REG_P (src2))
- {
- rtx (*gen) (rtx, rtx, rtx);
-
- switch (src_mode)
- {
- case V8QImode:
- gen = gen_aarch64_simd_combinev8qi;
- break;
- case V4HImode:
- gen = gen_aarch64_simd_combinev4hi;
- break;
- case V2SImode:
- gen = gen_aarch64_simd_combinev2si;
- break;
- case V4HFmode:
- gen = gen_aarch64_simd_combinev4hf;
- break;
- case V2SFmode:
- gen = gen_aarch64_simd_combinev2sf;
- break;
- case DImode:
- gen = gen_aarch64_simd_combinedi;
- break;
- case DFmode:
- gen = gen_aarch64_simd_combinedf;
- break;
- default:
- gcc_unreachable ();
- }
+ rtx (*gen) (rtx, rtx, rtx);
- emit_insn (gen (dst, src1, src2));
- return;
+ switch (src_mode)
+ {
+ case V8QImode:
+ gen = gen_aarch64_simd_combinev8qi;
+ break;
+ case V4HImode:
+ gen = gen_aarch64_simd_combinev4hi;
+ break;
+ case V2SImode:
+ gen = gen_aarch64_simd_combinev2si;
+ break;
+ case V4HFmode:
+ gen = gen_aarch64_simd_combinev4hf;
+ break;
+ case V2SFmode:
+ gen = gen_aarch64_simd_combinev2sf;
+ break;
+ case DImode:
+ gen = gen_aarch64_simd_combinedi;
+ break;
+ case DFmode:
+ gen = gen_aarch64_simd_combinedf;
+ break;
+ default:
+ gcc_unreachable ();
}
+
+ emit_insn (gen (dst, src1, src2));
+ return;
}
/* Split a complex SIMD move. */
@@ -1919,6 +2003,8 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
gcc_assert (can_create_pseudo_p ());
base = gen_reg_rtx (ptr_mode);
aarch64_expand_mov_immediate (base, XEXP (mem, 0));
+ if (ptr_mode != Pmode)
+ base = convert_memory_address (Pmode, base);
mem = gen_rtx_MEM (ptr_mode, base);
}
@@ -2266,6 +2352,7 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type)
{
if (!type)
return GET_MODE_ALIGNMENT (mode);
+
if (integer_zerop (TYPE_SIZE (type)))
return 0;
@@ -2278,9 +2365,9 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type)
return TYPE_ALIGN (TREE_TYPE (type));
unsigned int alignment = 0;
-
for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
- alignment = std::max (alignment, DECL_ALIGN (field));
+ if (TREE_CODE (field) == FIELD_DECL)
+ alignment = std::max (alignment, DECL_ALIGN (field));
return alignment;
}
@@ -2369,24 +2456,28 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
entirely general registers. */
if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
{
- unsigned int alignment = aarch64_function_arg_alignment (mode, type);
gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
/* C.8 if the argument has an alignment of 16 then the NGRN is
rounded up to the next even number. */
- if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
+ if (nregs == 2
+ && ncrn % 2
+ /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
+ comparison is there because for > 16 * BITS_PER_UNIT
+ alignment nregs should be > 2 and therefore it should be
+ passed by reference rather than value. */
+ && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
{
++ncrn;
gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
}
+
/* NREGS can be 0 when e.g. an empty structure is to be passed.
A reg is still generated for it, but the caller should be smart
enough not to use it. */
if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
- {
- pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
- }
+ pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
else
{
rtx par;
@@ -2414,6 +2505,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
this argument and align the total size if necessary. */
on_stack:
pcum->aapcs_stack_words = size / UNITS_PER_WORD;
+
if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
16 / UNITS_PER_WORD);
@@ -2506,12 +2598,7 @@ static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
unsigned int alignment = aarch64_function_arg_alignment (mode, type);
-
- if (alignment < PARM_BOUNDARY)
- alignment = PARM_BOUNDARY;
- if (alignment > STACK_BOUNDARY)
- alignment = STACK_BOUNDARY;
- return alignment;
+ return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
@@ -2682,11 +2769,19 @@ aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
plus_constant (Pmode, stack_pointer_rtx, -first));
/* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
- emit_set_insn (reg2,
- plus_constant (Pmode, stack_pointer_rtx,
- -(first + rounded_size)));
-
-
+ HOST_WIDE_INT adjustment = - (first + rounded_size);
+ if (! aarch64_uimm12_shift (adjustment))
+ {
+ aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
+ true, Pmode);
+ emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
+ }
+ else
+ {
+ emit_set_insn (reg2,
+ plus_constant (Pmode, stack_pointer_rtx, adjustment));
+ }
+
/* Step 3: the loop
do
@@ -4548,6 +4643,24 @@ aarch64_classify_address (struct aarch64_address_info *info,
}
}
+/* Return true if the address X is valid for a PRFM instruction.
+ STRICT_P is true if we should do strict checking with
+ aarch64_classify_address. */
+
+bool
+aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
+{
+ struct aarch64_address_info addr;
+
+ /* PRFM accepts the same addresses as DImode... */
+ bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
+ if (!res)
+ return false;
+
+ /* ... except writeback forms. */
+ return addr.type != ADDRESS_REG_WB;
+}
+
bool
aarch64_symbolic_address_p (rtx x)
{
@@ -4632,6 +4745,50 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
return true;
}
+/* This function is used by the call expanders of the machine description.
+ RESULT is the register in which the result is returned. It's NULL for
+ "call" and "sibcall".
+ MEM is the location of the function call.
+ SIBCALL indicates whether this function call is a normal call or a sibling call.
+ It will generate a different pattern accordingly. */
+
+void
+aarch64_expand_call (rtx result, rtx mem, bool sibcall)
+{
+ rtx call, callee, tmp;
+ rtvec vec;
+ machine_mode mode;
+
+ gcc_assert (MEM_P (mem));
+ callee = XEXP (mem, 0);
+ mode = GET_MODE (callee);
+ gcc_assert (mode == Pmode);
+
+ /* Decide if we should generate indirect calls by loading the
+ address of the callee into a register before performing
+ the branch-and-link. */
+ if (SYMBOL_REF_P (callee)
+ ? (aarch64_is_long_call_p (callee)
+ || aarch64_is_noplt_call_p (callee))
+ : !REG_P (callee))
+ XEXP (mem, 0) = force_reg (mode, callee);
+
+ call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
+
+ if (result != NULL_RTX)
+ call = gen_rtx_SET (result, call);
+
+ if (sibcall)
+ tmp = ret_rtx;
+ else
+ tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
+
+ vec = gen_rtvec (2, call, tmp);
+ call = gen_rtx_PARALLEL (VOIDmode, vec);
+
+ aarch64_emit_call_insn (call);
+}
+
/* Emit call insn with PAT and do aarch64-specific handling. */
void
@@ -4704,7 +4861,7 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
the comparison will have to be swapped when we emit the assembly
code. */
if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
- && (REG_P (y) || GET_CODE (y) == SUBREG)
+ && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
&& (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
|| GET_CODE (x) == LSHIFTRT
|| GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
@@ -4730,7 +4887,7 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
}
static int
-aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
+aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
int
aarch64_get_condition_code (rtx x)
@@ -4744,7 +4901,7 @@ aarch64_get_condition_code (rtx x)
}
static int
-aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
+aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
switch (mode)
{
@@ -4896,12 +5053,43 @@ static const int aarch64_nzcv_codes[] =
0 /* NV, Any. */
};
+/* Print operand X to file F in a target specific manner according to CODE.
+ The acceptable formatting commands given by CODE are:
+ 'c': An integer or symbol address without a preceding #
+ sign.
+ 'e': Print the sign/zero-extend size as a character 8->b,
+ 16->h, 32->w.
+ 'p': Prints N such that 2^N == X (X must be power of 2 and
+ const int).
+ 'P': Print the number of non-zero bits in X (a const_int).
+ 'H': Print the higher numbered register of a pair (TImode)
+ of regs.
+ 'm': Print a condition (eq, ne, etc).
+ 'M': Same as 'm', but invert condition.
+ 'b/h/s/d/q': Print a scalar FP/SIMD register name.
+ 'S/T/U/V': Print a FP/SIMD register name for a register list.
+ The register printed is the FP/SIMD register name
+ of X + 0/1/2/3 for S/T/U/V.
+ 'R': Print a scalar FP/SIMD register name + 1.
+ 'X': Print bottom 16 bits of integer constant in hex.
+ 'w/x': Print a general register name or the zero register
+ (32-bit or 64-bit).
+ '0': Print a normal operand, if it's a general register,
+ then we assume DImode.
+ 'k': Print NZCV for conditional compare instructions.
+ 'A': Output address constant representing the first
+ argument of X, specifying a relocation offset
+ if appropriate.
+ 'L': Output constant address specified by X
+ with a relocation offset if appropriate.
+ 'G': Prints address of X, specifying a PC relative
+ relocation mode if appropriate. */
+
static void
aarch64_print_operand (FILE *f, rtx x, int code)
{
switch (code)
{
- /* An integer or symbol address without a preceding # sign. */
case 'c':
switch (GET_CODE (x))
{
@@ -4928,7 +5116,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
break;
case 'e':
- /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
{
int n;
@@ -4961,7 +5148,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
{
int n;
- /* Print N such that 2^N == X. */
if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
{
output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -4973,7 +5159,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
break;
case 'P':
- /* Print the number of non-zero bits in X (a const_int). */
if (!CONST_INT_P (x))
{
output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -4984,7 +5169,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
break;
case 'H':
- /* Print the higher numbered register of a pair (TImode) of regs. */
if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
{
output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -4998,8 +5182,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case 'm':
{
int cond_code;
- /* Print a condition (eq, ne, etc) or its inverse. */
-
/* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
if (x == const_true_rtx)
{
@@ -5027,7 +5209,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case 's':
case 'd':
case 'q':
- /* Print a scalar FP/SIMD register name. */
if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
{
output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -5040,7 +5221,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case 'T':
case 'U':
case 'V':
- /* Print the first FP/SIMD register name in a list. */
if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
{
output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -5050,7 +5230,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
break;
case 'R':
- /* Print a scalar FP/SIMD register name + 1. */
if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
{
output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -5060,7 +5239,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
break;
case 'X':
- /* Print bottom 16 bits of integer constant in hex. */
if (!CONST_INT_P (x))
{
output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -5071,8 +5249,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case 'w':
case 'x':
- /* Print a general register name or the zero register (32-bit or
- 64-bit). */
if (x == const0_rtx
|| (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
{
@@ -5095,8 +5271,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
/* Fall through */
case 0:
- /* Print a normal operand, if it's a general register, then we
- assume DImode. */
if (x == NULL)
{
output_operand_lossage ("missing operand");
@@ -5111,6 +5285,8 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case MEM:
output_address (GET_MODE (x), XEXP (x, 0));
+ /* Check all memory references are Pmode - even with ILP32. */
+ gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
break;
case CONST:
@@ -5247,7 +5423,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
break;
case 'G':
-
switch (aarch64_classify_symbolic_expression (x))
{
case SYMBOL_TLSLE24:
@@ -5262,7 +5437,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case 'k':
{
HOST_WIDE_INT cond_code;
- /* Print nzcv. */
if (!CONST_INT_P (x))
{
@@ -5975,9 +6149,10 @@ aarch64_strip_shift (rtx x)
/* Helper function for rtx cost calculation. Strip an extend
expression from X. Returns the inner operand if successful, or the
original expression on failure. We deal with a number of possible
- canonicalization variations here. */
+ canonicalization variations here. If STRIP_SHIFT is true, then
+ we can strip off a shift also. */
static rtx
-aarch64_strip_extend (rtx x)
+aarch64_strip_extend (rtx x, bool strip_shift)
{
rtx op = x;
@@ -6001,7 +6176,8 @@ aarch64_strip_extend (rtx x)
/* Now handle extended register, as this may also have an optional
left shift by 1..4. */
- if (GET_CODE (op) == ASHIFT
+ if (strip_shift
+ && GET_CODE (op) == ASHIFT
&& CONST_INT_P (XEXP (op, 1))
&& ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
op = XEXP (op, 0);
@@ -6025,6 +6201,39 @@ aarch64_shift_p (enum rtx_code code)
return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
+
+/* Return true iff X is a cheap shift without a sign extend. */
+
+static bool
+aarch64_cheap_mult_shift_p (rtx x)
+{
+ rtx op0, op1;
+
+ op0 = XEXP (x, 0);
+ op1 = XEXP (x, 1);
+
+ if (!(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
+ return false;
+
+ if (GET_CODE (op0) == SIGN_EXTEND)
+ return false;
+
+ if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
+ && UINTVAL (op1) <= 4)
+ return true;
+
+ if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
+ return false;
+
+ HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
+
+ if (l2 > 0 && l2 <= 4)
+ return true;
+
+ return false;
+}
+
/* Helper function for rtx cost calculation. Calculate the cost of
a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
Return the calculated cost of the expression, recursing manually in to
@@ -6062,7 +6271,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
if (compound_p)
{
- if (REG_P (op1))
+ /* If the shift is considered cheap,
+ then don't add any cost. */
+ if (aarch64_cheap_mult_shift_p (x))
+ ;
+ else if (REG_P (op1))
/* ARITH + shift-by-register. */
cost += extra_cost->alu.arith_shift_reg;
else if (is_extend)
@@ -6080,7 +6293,7 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
}
/* Strip extends as we will have costed them in the case above. */
if (is_extend)
- op0 = aarch64_strip_extend (op0);
+ op0 = aarch64_strip_extend (op0, true);
cost += rtx_cost (op0, VOIDmode, code, 0, speed);
@@ -6924,13 +7137,13 @@ cost_minus:
if (speed)
*cost += extra_cost->alu.extend_arith;
- op1 = aarch64_strip_extend (op1);
+ op1 = aarch64_strip_extend (op1, true);
*cost += rtx_cost (op1, VOIDmode,
(enum rtx_code) GET_CODE (op1), 0, speed);
return true;
}
- rtx new_op1 = aarch64_strip_extend (op1);
+ rtx new_op1 = aarch64_strip_extend (op1, false);
/* Cost this as an FMA-alike operation. */
if ((GET_CODE (new_op1) == MULT
@@ -7003,7 +7216,7 @@ cost_plus:
if (speed)
*cost += extra_cost->alu.extend_arith;
- op0 = aarch64_strip_extend (op0);
+ op0 = aarch64_strip_extend (op0, true);
*cost += rtx_cost (op0, VOIDmode,
(enum rtx_code) GET_CODE (op0), 0, speed);
return true;
@@ -7011,7 +7224,7 @@ cost_plus:
/* Strip any extend, leave shifts behind as we will
cost them through mult_cost. */
- new_op0 = aarch64_strip_extend (op0);
+ new_op0 = aarch64_strip_extend (op0, false);
if (GET_CODE (new_op0) == MULT
|| aarch64_shift_p (GET_CODE (new_op0)))
@@ -7344,17 +7557,26 @@ cost_plus:
}
else
{
- if (speed)
+ if (VECTOR_MODE_P (mode))
{
- if (VECTOR_MODE_P (mode))
- {
- /* Vector shift (register). */
- *cost += extra_cost->vect.alu;
- }
- else
+ if (speed)
+ /* Vector shift (register). */
+ *cost += extra_cost->vect.alu;
+ }
+ else
+ {
+ if (speed)
+ /* LSLV. */
+ *cost += extra_cost->alu.shift_reg;
+
+ if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+ && CONST_INT_P (XEXP (op1, 1))
+ && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
{
- /* LSLV. */
- *cost += extra_cost->alu.shift_reg;
+ *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+ /* We already demanded XEXP (op1, 0) to be REG_P, so
+ don't recurse into it. */
+ return true;
}
}
return false; /* All arguments need to be in registers. */
@@ -7383,14 +7605,27 @@ cost_plus:
}
else
{
-
- /* ASR (register) and friends. */
- if (speed)
+ if (VECTOR_MODE_P (mode))
{
- if (VECTOR_MODE_P (mode))
+ if (speed)
+ /* Vector shift (register). */
*cost += extra_cost->vect.alu;
- else
+ }
+ else
+ {
+ if (speed)
+ /* ASR (register) and friends. */
*cost += extra_cost->alu.shift_reg;
+
+ if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+ && CONST_INT_P (XEXP (op1, 1))
+ && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+ {
+ *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+ /* We already demanded XEXP (op1, 0) to be REG_P, so
+ don't recurse into it. */
+ return true;
+ }
}
return false; /* All arguments need to be in registers. */
}
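The new AND special case in the two shift-cost blocks above matches a shift count that has already been masked to the mode's bit width minus one. Since the AArch64 variable-shift instructions only use the low bits of the count register, such a mask needs no extra instruction, which is presumably why only OP0 is costed and the recursion stops there. A hypothetical source-level pattern (not from the patch) that produces this RTL shape is the classic rotate idiom:

unsigned int
rotate_left (unsigned int x, unsigned int n)
{
  /* "n & 31" becomes (and:SI (reg) (const_int 31)) on the shift count,
     the form the cost code above now treats as free.  */
  return (x << (n & 31)) | (x >> (-n & 31));
}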
@@ -7481,17 +7716,13 @@ cost_plus:
case UMOD:
if (speed)
{
+ /* Slightly prefer UMOD over SMOD. */
if (VECTOR_MODE_P (mode))
*cost += extra_cost->vect.alu;
else if (GET_MODE_CLASS (mode) == MODE_INT)
*cost += (extra_cost->mult[mode == DImode].add
- + extra_cost->mult[mode == DImode].idiv);
- else if (mode == DFmode)
- *cost += (extra_cost->fp[1].mult
- + extra_cost->fp[1].div);
- else if (mode == SFmode)
- *cost += (extra_cost->fp[0].mult
- + extra_cost->fp[0].div);
+ + extra_cost->mult[mode == DImode].idiv
+ + (code == MOD ? 1 : 0));
}
return false; /* All arguments need to be in registers. */
@@ -7505,7 +7736,9 @@ cost_plus:
else if (GET_MODE_CLASS (mode) == MODE_INT)
/* There is no integer SQRT, so only DIV and UDIV can get
here. */
- *cost += extra_cost->mult[mode == DImode].idiv;
+ *cost += (extra_cost->mult[mode == DImode].idiv
+ /* Slightly prefer UDIV over SDIV. */
+ + (code == DIV ? 1 : 0));
else
*cost += extra_cost->fp[mode == DFmode].div;
}
@@ -7924,33 +8157,40 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
machine_mode mode = GET_MODE (dst);
if (GET_MODE_INNER (mode) == HFmode)
- return false;
+ {
+ gcc_assert (!recp);
+ return false;
+ }
- machine_mode mmsk = mode_for_vector
- (int_mode_for_mode (GET_MODE_INNER (mode)),
- GET_MODE_NUNITS (mode));
- bool use_approx_sqrt_p = (!recp
- && (flag_mlow_precision_sqrt
- || (aarch64_tune_params.approx_modes->sqrt
- & AARCH64_APPROX_MODE (mode))));
- bool use_approx_rsqrt_p = (recp
- && (flag_mrecip_low_precision_sqrt
- || (aarch64_tune_params.approx_modes->recip_sqrt
- & AARCH64_APPROX_MODE (mode))));
+ machine_mode mmsk
+ = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
+ if (!recp)
+ {
+ if (!(flag_mlow_precision_sqrt
+ || (aarch64_tune_params.approx_modes->sqrt
+ & AARCH64_APPROX_MODE (mode))))
+ return false;
+
+ if (flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun))
+ return false;
+ }
+ else
+ /* Caller assumes we cannot fail. */
+ gcc_assert (use_rsqrt_p (mode));
- if (!flag_finite_math_only
- || flag_trapping_math
- || !flag_unsafe_math_optimizations
- || !(use_approx_sqrt_p || use_approx_rsqrt_p)
- || optimize_function_for_size_p (cfun))
- return false;
rtx xmsk = gen_reg_rtx (mmsk);
if (!recp)
- /* When calculating the approximate square root, compare the argument with
- 0.0 and create a mask. */
- emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
- CONST0_RTX (mode)))));
+ /* When calculating the approximate square root, compare the
+ argument with 0.0 and create a mask. */
+ emit_insn (gen_rtx_SET (xmsk,
+ gen_rtx_NEG (mmsk,
+ gen_rtx_EQ (mmsk, src,
+ CONST0_RTX (mode)))));
/* Estimate the approximate reciprocal square root. */
rtx xdst = gen_reg_rtx (mode);
@@ -8679,12 +8919,38 @@ aarch64_override_options_internal (struct gcc_options *opts)
opts->x_param_values,
global_options_set.x_param_values);
- /* Set the L1 cache line size. */
- if (selected_cpu->tune->cache_line_size != 0)
+ /* Set up parameters to be used in prefetching algorithm. Do not
+ override the defaults unless we are tuning for a core we have
+ researched values for. */
+ if (aarch64_tune_params.prefetch->num_slots > 0)
+ maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+ aarch64_tune_params.prefetch->num_slots,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+ if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
+ maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+ aarch64_tune_params.prefetch->l1_cache_size,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+ if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
- selected_cpu->tune->cache_line_size,
+ aarch64_tune_params.prefetch->l1_cache_line_size,
opts->x_param_values,
global_options_set.x_param_values);
+ if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
+ maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+ aarch64_tune_params.prefetch->l2_cache_size,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
+ /* Enable sw prefetching at specified optimization level for
+ CPUS that have prefetch. Lower optimization level threshold by 1
+ when profiling is enabled. */
+ if (opts->x_flag_prefetch_loop_arrays < 0
+ && !opts->x_optimize_size
+ && aarch64_tune_params.prefetch->default_opt_level >= 0
+ && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
+ opts->x_flag_prefetch_loop_arrays = 1;
aarch64_override_options_after_change_1 (opts);
}
@@ -9996,6 +10262,11 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x)
&& aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
return true;
+ /* Treat symbols as constants. Avoid TLS symbols as they are complex,
+ so spilling them is better than rematerialization. */
+ if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
+ return true;
+
return aarch64_constant_address_p (x);
}
@@ -11437,7 +11708,7 @@ aarch64_simd_mem_operand_p (rtx op)
COUNT is the number of components into which the copy needs to be
decomposed. */
void
-aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
+aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
unsigned int count)
{
unsigned int i;
@@ -11458,7 +11729,7 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
one of VSTRUCT modes: OI, CI, or XI. */
int
-aarch64_simd_attr_length_rglist (enum machine_mode mode)
+aarch64_simd_attr_length_rglist (machine_mode mode)
{
return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
}
@@ -11639,6 +11910,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
return;
}
+ enum insn_code icode = optab_handler (vec_set_optab, mode);
+ gcc_assert (icode != CODE_FOR_nothing);
+
+ /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions. */
+
+ /* The algorithm will fill matches[*][0] with the earliest matching element,
+ and matches[X][1] with the count of duplicate elements (if X is the
+ earliest element which has duplicates). */
+
+ if (n_var == n_elts && n_elts <= 16)
+ {
+ int matches[16][2] = {0};
+ for (int i = 0; i < n_elts; i++)
+ {
+ for (int j = 0; j <= i; j++)
+ {
+ if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+ {
+ matches[i][0] = j;
+ matches[j][1]++;
+ break;
+ }
+ }
+ }
+ int maxelement = 0;
+ int maxv = 0;
+ for (int i = 0; i < n_elts; i++)
+ if (matches[i][1] > maxv)
+ {
+ maxelement = i;
+ maxv = matches[i][1];
+ }
+
+ /* Create a duplicate of the most common element. */
+ rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+ aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+ /* Insert the rest. */
+ for (int i = 0; i < n_elts; i++)
+ {
+ rtx x = XVECEXP (vals, 0, i);
+ if (matches[i][0] == maxelement)
+ continue;
+ x = copy_to_mode_reg (inner_mode, x);
+ emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+ }
+ return;
+ }
+
/* Initialise a vector which is part-variable. We want to first try
to build those lanes which are constant in the most efficient way we
can. */
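To illustrate the duplicate-counting scheme added above, here is a self-contained sketch (not part of the patch; plain C with hypothetical sample values) that runs the same matches[][] bookkeeping over a four-element vector and reports which lane value would be broadcast with DUP and how many lanes still need individual inserts:

/* Stand-alone sketch of the algorithm described in the comments above:
   matches[i][0] records the earliest element equal to element i, and
   matches[j][1] counts how many elements match element j.  */
#include <stdio.h>

int
main (void)
{
  /* Hypothetical 4-lane vector of variable elements; 7 is the most
     common value, so it is the one that would be broadcast with DUP.  */
  int vals[4] = { 7, 3, 7, 7 };
  int n_elts = 4;
  int matches[16][2] = { { 0 } };

  for (int i = 0; i < n_elts; i++)
    for (int j = 0; j <= i; j++)
      if (vals[i] == vals[j])
	{
	  matches[i][0] = j;
	  matches[j][1]++;
	  break;
	}

  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n_elts; i++)
    if (matches[i][1] > maxv)
      {
	maxelement = i;
	maxv = matches[i][1];
      }

  printf ("dup lane value %d, then insert the %d remaining lane(s)\n",
	  vals[maxelement], n_elts - maxv);
  return 0;
}

For { 7, 3, 7, 7 } this reports value 7 as the DUP candidate with one remaining insert, matching the dup-then-insert behaviour the new code emits for all-variable vectors.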
@@ -11672,10 +11994,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
}
/* Insert the variable lanes directly. */
-
- enum insn_code icode = optab_handler (vec_set_optab, mode);
- gcc_assert (icode != CODE_FOR_nothing);
-
for (int i = 0; i < n_elts; i++)
{
rtx x = XVECEXP (vals, 0, i);
@@ -11843,10 +12161,8 @@ aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
static void
aarch64_emit_unlikely_jump (rtx insn)
{
- int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
-
rtx_insn *jump = emit_jump_insn (insn);
- add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
+ add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
/* Expand a compare and swap pattern. */
@@ -12041,6 +12357,17 @@ aarch64_split_compare_and_swap (rtx operands[])
mode = GET_MODE (mem);
model = memmodel_from_int (INTVAL (model_rtx));
+ /* When OLDVAL is zero and we want the strong version we can emit a tighter
+ loop:
+ .label1:
+ LD[A]XR rval, [mem]
+ CBNZ rval, .label2
+ ST[L]XR scratch, newval, [mem]
+ CBNZ scratch, .label1
+ .label2:
+ CMP rval, 0. */
+ bool strong_zero_p = !is_weak && oldval == const0_rtx;
+
label1 = NULL;
if (!is_weak)
{
@@ -12057,11 +12384,21 @@ aarch64_split_compare_and_swap (rtx operands[])
else
aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
- cond = aarch64_gen_compare_reg (NE, rval, oldval);
- x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
- gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
- aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+ if (strong_zero_p)
+ {
+ x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+ }
+ else
+ {
+ cond = aarch64_gen_compare_reg (NE, rval, oldval);
+ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+ }
aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
@@ -12080,7 +12417,15 @@ aarch64_split_compare_and_swap (rtx operands[])
}
emit_label (label2);
-
+ /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
+ to set the condition flags. If this is not used it will be removed by
+ later passes. */
+ if (strong_zero_p)
+ {
+ cond = gen_rtx_REG (CCmode, CC_REGNUM);
+ x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
+ emit_insn (gen_rtx_SET (cond, x));
+ }
/* Emit any final barrier needed for a __sync operation. */
if (is_mm_sync (model))
aarch64_emit_post_barrier (model);
@@ -13373,7 +13718,7 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
}
rtx
-aarch64_reverse_mask (enum machine_mode mode)
+aarch64_reverse_mask (machine_mode mode)
{
/* We have to reverse each vector because we don't have
a permuted load that can reverse-load according to ABI rules. */
@@ -13973,13 +14318,68 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
enum attr_type prev_type = get_attr_type (prev);
- /* FIXME: this misses some which is considered simple arthematic
- instructions for ThunderX. Simple shifts are missed here. */
- if (prev_type == TYPE_ALUS_SREG
- || prev_type == TYPE_ALUS_IMM
- || prev_type == TYPE_LOGICS_REG
- || prev_type == TYPE_LOGICS_IMM)
- return true;
+ unsigned int condreg1, condreg2;
+ rtx cc_reg_1;
+ aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+ cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+
+ if (reg_referenced_p (cc_reg_1, PATTERN (curr))
+ && prev
+ && modified_in_p (cc_reg_1, prev))
+ {
+ /* FIXME: this misses some which are considered simple arithmetic
+ instructions for ThunderX. Simple shifts are missed here. */
+ if (prev_type == TYPE_ALUS_SREG
+ || prev_type == TYPE_ALUS_IMM
+ || prev_type == TYPE_LOGICS_REG
+ || prev_type == TYPE_LOGICS_IMM)
+ return true;
+ }
+ }
+
+ if (prev_set
+ && curr_set
+ && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+ && any_condjump_p (curr))
+ {
+ /* We're trying to match:
+ prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
+ curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
+ (const_int 0))
+ (label_ref ("SYM"))
+ (pc)) */
+ if (SET_DEST (curr_set) == (pc_rtx)
+ && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
+ && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
+ && REG_P (SET_DEST (prev_set))
+ && REGNO (SET_DEST (prev_set))
+ == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
+ {
+ /* Fuse ALU operations followed by conditional branch instruction. */
+ switch (get_attr_type (prev))
+ {
+ case TYPE_ALU_IMM:
+ case TYPE_ALU_SREG:
+ case TYPE_ADC_REG:
+ case TYPE_ADC_IMM:
+ case TYPE_ADCS_REG:
+ case TYPE_ADCS_IMM:
+ case TYPE_LOGIC_REG:
+ case TYPE_LOGIC_IMM:
+ case TYPE_CSEL:
+ case TYPE_ADR:
+ case TYPE_MOV_IMM:
+ case TYPE_SHIFT_REG:
+ case TYPE_SHIFT_IMM:
+ case TYPE_BFM:
+ case TYPE_RBIT:
+ case TYPE_REV:
+ case TYPE_EXTEND:
+ return true;
+
+ default:;
+ }
+ }
}
return false;
@@ -14163,7 +14563,7 @@ aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
- enum machine_mode mode)
+ machine_mode mode)
{
HOST_WIDE_INT offval_1, offval_2, msize;
enum reg_class rclass_1, rclass_2;
@@ -14270,7 +14670,7 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
- enum machine_mode mode)
+ machine_mode mode)
{
enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
@@ -14404,7 +14804,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
- enum machine_mode mode, RTX_CODE code)
+ machine_mode mode, RTX_CODE code)
{
rtx base, offset, t1, t2;
rtx mem_1, mem_2, mem_3, mem_4;