| author | Edward Smith-Rowland <3dw4rd@verizon.net> | 2017-07-20 14:54:44 +0000 |
|---|---|---|
| committer | Edward Smith-Rowland <3dw4rd@verizon.net> | 2017-07-20 14:54:44 +0000 |
| commit | 3acaf2e51caf356a9afc763cfd70b91d1ab094b5 (patch) | |
| tree | f13b1087143457ae5c053b6ec3b664c2aaeab169 /gcc/config/aarch64/aarch64.c | |
| parent | c4d46197c5fe4461da59ce027bc68306c43186b0 (diff) | |
Merged revisions r232323 through r250392 to the branch
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/tr29124@250393 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/aarch64/aarch64.c')
-rw-r--r--  gcc/config/aarch64/aarch64.c  778
1 file changed, 589 insertions(+), 189 deletions(-)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c43e96..b8a4160d9de 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -193,10 +193,10 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 static const struct cpu_addrcost_table generic_addrcost_table =
 {
     {
-      0, /* hi  */
+      1, /* hi  */
       0, /* si  */
       0, /* di  */
-      0, /* ti  */
+      1, /* ti  */
     },
   0, /* pre_modify  */
   0, /* post_modify  */
@@ -526,6 +526,61 @@ static const cpu_approx_modes xgene1_approx_modes =
   AARCH64_APPROX_ALL /* recip_sqrt  */
 };

+/* Generic prefetch settings (which disable prefetch).  */
+static const cpu_prefetch_tune generic_prefetch_tune =
+{
+  0,    /* num_slots  */
+  -1,   /* l1_cache_size  */
+  -1,   /* l1_cache_line_size  */
+  -1,   /* l2_cache_size  */
+  -1    /* default_opt_level  */
+};
+
+static const cpu_prefetch_tune exynosm1_prefetch_tune =
+{
+  0,    /* num_slots  */
+  -1,   /* l1_cache_size  */
+  64,   /* l1_cache_line_size  */
+  -1,   /* l2_cache_size  */
+  -1    /* default_opt_level  */
+};
+
+static const cpu_prefetch_tune qdf24xx_prefetch_tune =
+{
+  4,    /* num_slots  */
+  32,   /* l1_cache_size  */
+  64,   /* l1_cache_line_size  */
+  1024, /* l2_cache_size  */
+  3     /* default_opt_level  */
+};
+
+static const cpu_prefetch_tune thunderxt88_prefetch_tune =
+{
+  8,    /* num_slots  */
+  32,   /* l1_cache_size  */
+  128,  /* l1_cache_line_size  */
+  16*1024, /* l2_cache_size  */
+  3     /* default_opt_level  */
+};
+
+static const cpu_prefetch_tune thunderx_prefetch_tune =
+{
+  8,    /* num_slots  */
+  32,   /* l1_cache_size  */
+  128,  /* l1_cache_line_size  */
+  -1,   /* l2_cache_size  */
+  -1    /* default_opt_level  */
+};
+
+static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
+{
+  8,    /* num_slots  */
+  32,   /* l1_cache_size  */
+  64,   /* l1_cache_line_size  */
+  256,  /* l2_cache_size  */
+  -1    /* default_opt_level  */
+};
+
 static const struct tune_params generic_tunings =
 {
   &cortexa57_extra_costs,
@@ -538,17 +593,17 @@ static const struct tune_params generic_tunings =
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
   8,  /* function_align.  */
-  8,  /* jump_align.  */
-  4,  /* loop_align.  */
+  4,  /* jump_align.  */
+  8,  /* loop_align.  */
   2,  /* int_reassoc_width.  */
   4,  /* fp_reassoc_width.  */
   1,  /* vec_reassoc_width.  */
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
-  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &generic_prefetch_tune
 };

 static const struct tune_params cortexa35_tunings =
@@ -564,7 +619,7 @@
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
   16, /* function_align.  */
-  8,  /* jump_align.  */
+  4,  /* jump_align.  */
   8,  /* loop_align.  */
   2,  /* int_reassoc_width.  */
   4,  /* fp_reassoc_width.  */
@@ -572,9 +627,9 @@ static const struct tune_params cortexa35_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &generic_prefetch_tune
 };

 static const struct tune_params cortexa53_tunings =
@@ -590,7 +645,7 @@ static const struct tune_params cortexa53_tunings =
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
   16, /* function_align.  */
-  8,  /* jump_align.  */
+  4,  /* jump_align.  */
   8,  /* loop_align.  */
   2,  /* int_reassoc_width.  */
   4,  /* fp_reassoc_width.  */
@@ -598,9 +653,9 @@ static const struct tune_params cortexa53_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &generic_prefetch_tune
 };

 static const struct tune_params cortexa57_tunings =
@@ -616,7 +671,7 @@ static const struct tune_params cortexa57_tunings =
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
   16, /* function_align.  */
-  8,  /* jump_align.  */
+  4,  /* jump_align.  */
   8,  /* loop_align.  */
   2,  /* int_reassoc_width.  */
   4,  /* fp_reassoc_width.  */
@@ -624,9 +679,9 @@ static const struct tune_params cortexa57_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
+  &generic_prefetch_tune
 };

 static const struct tune_params cortexa72_tunings =
@@ -642,7 +697,7 @@ static const struct tune_params cortexa72_tunings =
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
   16, /* function_align.  */
-  8,  /* jump_align.  */
+  4,  /* jump_align.  */
   8,  /* loop_align.  */
   2,  /* int_reassoc_width.  */
   4,  /* fp_reassoc_width.  */
@@ -650,9 +705,9 @@ static const struct tune_params cortexa72_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &generic_prefetch_tune
 };

 static const struct tune_params cortexa73_tunings =
@@ -668,7 +723,7 @@ static const struct tune_params cortexa73_tunings =
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
   16, /* function_align.  */
-  8,  /* jump_align.  */
+  4,  /* jump_align.  */
   8,  /* loop_align.  */
   2,  /* int_reassoc_width.  */
   4,  /* fp_reassoc_width.  */
@@ -676,11 +731,13 @@ static const struct tune_params cortexa73_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &generic_prefetch_tune
 };

+
+
 static const struct tune_params exynosm1_tunings =
 {
   &exynosm1_extra_costs,
@@ -701,9 +758,34 @@ static const struct tune_params exynosm1_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   48, /* max_case_values.  */
-  64, /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &exynosm1_prefetch_tune
+};
+
+static const struct tune_params thunderxt88_tunings =
+{
+  &thunderx_extra_costs,
+  &generic_addrcost_table,
+  &thunderx_regmove_cost,
+  &thunderx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  6,  /* memmov_cost  */
+  2,  /* issue_rate  */
+  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
+  8,  /* function_align.  */
+  8,  /* jump_align.  */
+  8,  /* loop_align.  */
+  2,  /* int_reassoc_width.  */
+  4,  /* fp_reassoc_width.  */
+  1,  /* vec_reassoc_width.  */
+  2,  /* min_div_recip_mul_sf.  */
+  2,  /* min_div_recip_mul_df.  */
+  0,  /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags.  */
+  &thunderxt88_prefetch_tune
 };

 static const struct tune_params thunderx_tunings =
@@ -726,9 +808,10 @@ static const struct tune_params thunderx_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
+  &thunderx_prefetch_tune
 };

 static const struct tune_params xgene1_tunings =
@@ -751,9 +834,9 @@ static const struct tune_params xgene1_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  0,  /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &generic_prefetch_tune
 };

 static const struct tune_params qdf24xx_tunings =
@@ -777,9 +860,9 @@ static const struct tune_params qdf24xx_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  64, /* cache_line_size.  */
   tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &qdf24xx_prefetch_tune
 };

 static const struct tune_params thunderx2t99_tunings =
@@ -792,7 +875,8 @@ static const struct tune_params thunderx2t99_tunings =
   &generic_approx_modes,
   4,  /* memmov_cost.  */
   4,  /* issue_rate.  */
-  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
   16, /* function_align.  */
   8,  /* jump_align.  */
   16, /* loop_align.  */
@@ -802,9 +886,9 @@ static const struct tune_params thunderx2t99_tunings =
   2,  /* min_div_recip_mul_sf.  */
   2,  /* min_div_recip_mul_df.  */
   0,  /* max_case_values.  */
-  64, /* cache_line_size.  */
-  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &thunderx2t99_prefetch_tune
 };

 /* Support for fine-grained override of the tuning structures.  */
@@ -948,7 +1032,7 @@ static reg_class_t
 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					  reg_class_t best_class)
 {
-  enum machine_mode mode;
+  machine_mode mode;

   if (allocno_class != ALL_REGS)
     return allocno_class;
@@ -961,7 +1045,7 @@ aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 }

 static unsigned int
-aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
+aarch64_min_divisions_for_recip_mul (machine_mode mode)
 {
   if (GET_MODE_UNIT_SIZE (mode) == 4)
     return aarch64_tune_params.min_div_recip_mul_sf;
@@ -970,7 +1054,7 @@ static int
 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
-			     enum machine_mode mode)
+			     machine_mode mode)
 {
   if (VECTOR_MODE_P (mode))
     return aarch64_tune_params.vec_reassoc_width;
@@ -1649,41 +1733,41 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
   machine_mode dst_mode = GET_MODE (dst);

   gcc_assert (VECTOR_MODE_P (dst_mode));
+  gcc_assert (register_operand (dst, dst_mode)
+	      && register_operand (src1, src_mode)
+	      && register_operand (src2, src_mode));

-  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
-    {
-      rtx (*gen) (rtx, rtx, rtx);
-
-      switch (src_mode)
-	{
-	case V8QImode:
-	  gen = gen_aarch64_simd_combinev8qi;
-	  break;
-	case V4HImode:
-	  gen = gen_aarch64_simd_combinev4hi;
-	  break;
-	case V2SImode:
-	  gen = gen_aarch64_simd_combinev2si;
-	  break;
-	case V4HFmode:
-	  gen = gen_aarch64_simd_combinev4hf;
-	  break;
-	case V2SFmode:
-	  gen = gen_aarch64_simd_combinev2sf;
-	  break;
-	case DImode:
-	  gen = gen_aarch64_simd_combinedi;
-	  break;
-	case DFmode:
-	  gen = gen_aarch64_simd_combinedf;
-	  break;
-	default:
-	  gcc_unreachable ();
-	}
+  rtx (*gen) (rtx, rtx, rtx);

-      emit_insn (gen (dst, src1, src2));
-      return;
+  switch (src_mode)
+    {
+    case V8QImode:
+      gen = gen_aarch64_simd_combinev8qi;
+      break;
+    case V4HImode:
+      gen = gen_aarch64_simd_combinev4hi;
+      break;
+    case V2SImode:
+      gen = gen_aarch64_simd_combinev2si;
+      break;
+    case V4HFmode:
+      gen = gen_aarch64_simd_combinev4hf;
+      break;
+    case V2SFmode:
+      gen = gen_aarch64_simd_combinev2sf;
+      break;
+    case DImode:
+      gen = gen_aarch64_simd_combinedi;
+      break;
+    case DFmode:
+      gen = gen_aarch64_simd_combinedf;
+      break;
+    default:
+      gcc_unreachable ();
     }
+
+  emit_insn (gen (dst, src1, src2));
+  return;
 }

 /* Split a complex SIMD move.  */
@@ -1919,6 +2003,8 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
	  gcc_assert (can_create_pseudo_p ());
	  base = gen_reg_rtx (ptr_mode);
	  aarch64_expand_mov_immediate (base, XEXP (mem, 0));
+	  if (ptr_mode != Pmode)
+	    base = convert_memory_address (Pmode, base);
	  mem = gen_rtx_MEM (ptr_mode, base);
	}
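For context on the `convert_memory_address` call added above: under the ILP32 ABI (`-mabi=ilp32`), `ptr_mode` is SImode while `Pmode` is DImode, so an address materialized in `ptr_mode` has to be extended before it can act as a base register. The `gcc_assert` added to the `MEM` case of `aarch64_print_operand` further down ("Check all memory references are Pmode - even with ILP32") checks the same invariant when operands are printed.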
@@ -2266,6 +2352,7 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type)
 {
   if (!type)
     return GET_MODE_ALIGNMENT (mode);
+
   if (integer_zerop (TYPE_SIZE (type)))
     return 0;
@@ -2278,9 +2365,9 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type)
     return TYPE_ALIGN (TREE_TYPE (type));

   unsigned int alignment = 0;
   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
-    alignment = std::max (alignment, DECL_ALIGN (field));
+    if (TREE_CODE (field) == FIELD_DECL)
+      alignment = std::max (alignment, DECL_ALIGN (field));

   return alignment;
 }
@@ -2369,24 +2456,28 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
      entirely general registers.  */
   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
     {
-      unsigned int alignment = aarch64_function_arg_alignment (mode, type);
-
       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

       /* C.8 if the argument has an alignment of 16 then the NGRN is
	  rounded up to the next even number.  */
-      if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
+      if (nregs == 2
+	  && ncrn % 2
+	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
+	     comparison is there because for > 16 * BITS_PER_UNIT
+	     alignment nregs should be > 2 and therefore it should be
+	     passed by reference rather than value.  */
+	  && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
	{
	  ++ncrn;
	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
	}
+
       /* NREGS can be 0 when e.g. an empty structure is to be passed.
	  A reg is still generated for it, but the caller should be smart
	  enough not to use it.  */
       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
-	{
-	  pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
-	}
+	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
       else
	{
	  rtx par;
@@ -2414,6 +2505,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
      this argument and align the total size if necessary.  */
 on_stack:
   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
+
   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
				        16 / UNITS_PER_WORD);
@@ -2506,12 +2598,7 @@ static unsigned int
 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
 {
   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
-
-  if (alignment < PARM_BOUNDARY)
-    alignment = PARM_BOUNDARY;
-  if (alignment > STACK_BOUNDARY)
-    alignment = STACK_BOUNDARY;
-  return alignment;
+  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
 }
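The C.8 rounding above is easier to see with a worked example (an illustration added here for clarity, not part of the patch; the register assignment follows the AAPCS64 rules the comments cite):

```c
/* An argument with 16-byte alignment that is passed in general
   registers must start at an even-numbered one (AAPCS64 rule C.8).  */
extern void f (int a, __int128 b);

/* a -> w0.
   b -> x2/x3: the next core register number is rounded up from 1 to 2,
   leaving x1 unused.  Alignment greater than 16 bytes never reaches
   this path, because such an argument would need more than two
   registers and is passed by reference instead; that is why the new
   comment insists on `== 16 * BITS_PER_UNIT' rather than `>='.  */
```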

 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
@@ -2682,11 +2769,19 @@ aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
		      plus_constant (Pmode, stack_pointer_rtx, -first));

       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
-      emit_set_insn (reg2,
-		     plus_constant (Pmode, stack_pointer_rtx,
-				    -(first + rounded_size)));
-
-
+      HOST_WIDE_INT adjustment = - (first + rounded_size);
+      if (! aarch64_uimm12_shift (adjustment))
+	{
+	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
+					  true, Pmode);
+	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
+	}
+      else
+	{
+	  emit_set_insn (reg2,
+			 plus_constant (Pmode, stack_pointer_rtx, adjustment));
+	}
+
       /* Step 3: the loop

	  do
@@ -4548,6 +4643,24 @@ aarch64_classify_address (struct aarch64_address_info *info,
     }
 }

+/* Return true if the address X is valid for a PRFM instruction.
+   STRICT_P is true if we should do strict checking with
+   aarch64_classify_address.  */
+
+bool
+aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
+{
+  struct aarch64_address_info addr;
+
+  /* PRFM accepts the same addresses as DImode...  */
+  bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
+  if (!res)
+    return false;
+
+  /* ... except writeback forms.  */
+  return addr.type != ADDRESS_REG_WB;
+}
+
 bool
 aarch64_symbolic_address_p (rtx x)
 {
@@ -4632,6 +4745,50 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
   return true;
 }

+/* This function is used by the call expanders of the machine description.
+   RESULT is the register in which the result is returned.  It's NULL for
+   "call" and "sibcall".
+   MEM is the location of the function call.
+   SIBCALL indicates whether this function call is normal call or sibling call.
+   It will generate different pattern accordingly.  */
+
+void
+aarch64_expand_call (rtx result, rtx mem, bool sibcall)
+{
+  rtx call, callee, tmp;
+  rtvec vec;
+  machine_mode mode;
+
+  gcc_assert (MEM_P (mem));
+  callee = XEXP (mem, 0);
+  mode = GET_MODE (callee);
+  gcc_assert (mode == Pmode);
+
+  /* Decide if we should generate indirect calls by loading the
+     address of the callee into a register before performing
+     the branch-and-link.  */
+  if (SYMBOL_REF_P (callee)
+      ? (aarch64_is_long_call_p (callee)
+	 || aarch64_is_noplt_call_p (callee))
+      : !REG_P (callee))
+    XEXP (mem, 0) = force_reg (mode, callee);
+
+  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
+
+  if (result != NULL_RTX)
+    call = gen_rtx_SET (result, call);
+
+  if (sibcall)
+    tmp = ret_rtx;
+  else
+    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
+
+  vec = gen_rtvec (2, call, tmp);
+  call = gen_rtx_PARALLEL (VOIDmode, vec);
+
+  aarch64_emit_call_insn (call);
+}
+
 /* Emit call insn with PAT and do aarch64-specific handling.  */

 void
@@ -4704,7 +4861,7 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
      the comparison will have to be swapped when we emit the assembly
      code.  */
   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
-      && (REG_P (y) || GET_CODE (y) == SUBREG)
+      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
	   || GET_CODE (x) == LSHIFTRT
	   || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
@@ -4730,7 +4887,7 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
 }

 static int
-aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
+aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

 int
 aarch64_get_condition_code (rtx x)
@@ -4744,7 +4901,7 @@ aarch64_get_condition_code (rtx x)
 }

 static int
-aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
+aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
 {
   switch (mode)
     {
@@ -4896,12 +5053,43 @@ static const int aarch64_nzcv_codes[] =
   0	/* NV, Any.  */
 };

+/* Print operand X to file F in a target specific manner according to CODE.
+   The acceptable formatting commands given by CODE are:
+     'c':	An integer or symbol address without a preceding #
+		sign.
+     'e':	Print the sign/zero-extend size as a character 8->b,
+		16->h, 32->w.
+     'p':	Prints N such that 2^N == X (X must be power of 2 and
+		const int).
+     'P':	Print the number of non-zero bits in X (a const_int).
+     'H':	Print the higher numbered register of a pair (TImode)
+		of regs.
+     'm':	Print a condition (eq, ne, etc).
+     'M':	Same as 'm', but invert condition.
+     'b/h/s/d/q': Print a scalar FP/SIMD register name.
+     'S/T/U/V':	Print a FP/SIMD register name for a register list.
+		The register printed is the FP/SIMD register name
+		of X + 0/1/2/3 for S/T/U/V.
+     'R':	Print a scalar FP/SIMD register name + 1.
+     'X':	Print bottom 16 bits of integer constant in hex.
+     'w/x':	Print a general register name or the zero register
+		(32-bit or 64-bit).
+     '0':	Print a normal operand, if it's a general register,
+		then we assume DImode.
+     'k':	Print NZCV for conditional compare instructions.
+     'A':	Output address constant representing the first
+		argument of X, specifying a relocation offset
+		if appropriate.
+     'L':	Output constant address specified by X
+		with a relocation offset if appropriate.
+     'G':	Prints address of X, specifying a PC relative
+		relocation mode if appropriate.  */
+
 static void
 aarch64_print_operand (FILE *f, rtx x, int code)
 {
   switch (code)
     {
-      /* An integer or symbol address without a preceding # sign.  */
     case 'c':
       switch (GET_CODE (x))
	 {
@@ -4928,7 +5116,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;

     case 'e':
-      /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
       {
	 int n;

@@ -4961,7 +5148,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       {
	 int n;

-	/* Print N such that 2^N == X.  */
	 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
	   {
	     output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -4973,7 +5159,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;

     case 'P':
-      /* Print the number of non-zero bits in X (a const_int).  */
       if (!CONST_INT_P (x))
	 {
	   output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -4984,7 +5169,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;

     case 'H':
-      /* Print the higher numbered register of a pair (TImode) of regs.  */
       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
	 {
	   output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -4998,8 +5182,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
     case 'm':
       {
	 int cond_code;
-	/* Print a condition (eq, ne, etc) or its inverse.  */
-
	 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
	 if (x == const_true_rtx)
	   {
@@ -5027,7 +5209,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
     case 's':
     case 'd':
     case 'q':
-      /* Print a scalar FP/SIMD register name.  */
       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	 {
	   output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -5040,7 +5221,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
     case 'T':
     case 'U':
     case 'V':
-      /* Print the first FP/SIMD register name in a list.  */
       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	 {
	   output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -5050,7 +5230,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;

     case 'R':
-      /* Print a scalar FP/SIMD register name + 1.  */
       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	 {
	   output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -5060,7 +5239,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;

     case 'X':
-      /* Print bottom 16 bits of integer constant in hex.  */
       if (!CONST_INT_P (x))
	 {
	   output_operand_lossage ("invalid operand for '%%%c'", code);
@@ -5071,8 +5249,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)

     case 'w':
     case 'x':
-      /* Print a general register name or the zero register (32-bit or
-	 64-bit).  */
       if (x == const0_rtx
	   || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
	 {
@@ -5095,8 +5271,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       /* Fall through */

     case 0:
-      /* Print a normal operand, if it's a general register, then we
-	 assume DImode.  */
       if (x == NULL)
	 {
	   output_operand_lossage ("missing operand");
@@ -5111,6 +5285,8 @@ aarch64_print_operand (FILE *f, rtx x, int code)

	 case MEM:
	   output_address (GET_MODE (x), XEXP (x, 0));
+	  /* Check all memory references are Pmode - even with ILP32.  */
+	  gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
	   break;

	 case CONST:
@@ -5247,7 +5423,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;

     case 'G':
-
       switch (aarch64_classify_symbolic_expression (x))
	 {
	 case SYMBOL_TLSLE24:
@@ -5262,7 +5437,6 @@ aarch64_print_operand (FILE *f, rtx x, int code)
     case 'k':
       {
	 HOST_WIDE_INT cond_code;
-	/* Print nzcv.  */

	 if (!CONST_INT_P (x))
	   {
@@ -5975,9 +6149,10 @@ aarch64_strip_shift (rtx x)
 /* Helper function for rtx cost calculation.  Strip an extend
    expression from X.  Returns the inner operand if successful, or the
    original expression on failure.  We deal with a number of possible
-   canonicalization variations here.  */
+   canonicalization variations here.  If STRIP_SHIFT is true, then
+   we can strip off a shift also.  */
 static rtx
-aarch64_strip_extend (rtx x)
+aarch64_strip_extend (rtx x, bool strip_shift)
 {
   rtx op = x;
@@ -6001,7 +6176,8 @@ aarch64_strip_extend (rtx x)

   /* Now handle extended register, as this may also have an optional
      left shift by 1..4.  */
-  if (GET_CODE (op) == ASHIFT
+  if (strip_shift
+      && GET_CODE (op) == ASHIFT
       && CONST_INT_P (XEXP (op, 1))
       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
     op = XEXP (op, 0);
@@ -6025,6 +6201,39 @@ aarch64_shift_p (enum rtx_code code)
   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
 }

+
+/* Return true iff X is a cheap shift without a sign extend.  */
+
+static bool
+aarch64_cheap_mult_shift_p (rtx x)
+{
+  rtx op0, op1;
+
+  op0 = XEXP (x, 0);
+  op1 = XEXP (x, 1);
+
+  if (!(aarch64_tune_params.extra_tuning_flags
+	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
+    return false;
+
+  if (GET_CODE (op0) == SIGN_EXTEND)
+    return false;
+
+  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
+      && UINTVAL (op1) <= 4)
+    return true;
+
+  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
+    return false;
+
+  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
+
+  if (l2 > 0 && l2 <= 4)
+    return true;
+
+  return false;
+}
+
 /* Helper function for rtx cost calculation.  Calculate the cost of
    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
    Return the calculated cost of the expression, recursing manually in to
@@ -6062,7 +6271,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
	 {
	   if (compound_p)
	     {
-	      if (REG_P (op1))
+	      /* If the shift is considered cheap,
+		 then don't add any cost. */
+	      if (aarch64_cheap_mult_shift_p (x))
+		;
+	      else if (REG_P (op1))
		 /* ARITH + shift-by-register.  */
		 cost += extra_cost->alu.arith_shift_reg;
	       else if (is_extend)
@@ -6080,7 +6293,7 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
	 }
       /* Strip extends as we will have costed them in the case above.  */
       if (is_extend)
-	op0 = aarch64_strip_extend (op0);
+	op0 = aarch64_strip_extend (op0, true);

       cost += rtx_cost (op0, VOIDmode, code, 0, speed);

@@ -6924,13 +7137,13 @@ cost_minus:
	   if (speed)
	     *cost += extra_cost->alu.extend_arith;

-	  op1 = aarch64_strip_extend (op1);
+	  op1 = aarch64_strip_extend (op1, true);
	   *cost += rtx_cost (op1, VOIDmode,
			      (enum rtx_code) GET_CODE (op1), 0, speed);
	   return true;
	 }

-      rtx new_op1 = aarch64_strip_extend (op1);
+      rtx new_op1 = aarch64_strip_extend (op1, false);

       /* Cost this as an FMA-alike operation.  */
       if ((GET_CODE (new_op1) == MULT
@@ -7003,7 +7216,7 @@ cost_plus:
	   if (speed)
	     *cost += extra_cost->alu.extend_arith;

-	  op0 = aarch64_strip_extend (op0);
+	  op0 = aarch64_strip_extend (op0, true);
	   *cost += rtx_cost (op0, VOIDmode,
			      (enum rtx_code) GET_CODE (op0), 0, speed);
	   return true;
	 }

@@ -7011,7 +7224,7 @@ cost_plus:
       /* Strip any extend, leave shifts behind as we will cost them
	  through mult_cost.  */
-      new_op0 = aarch64_strip_extend (op0);
+      new_op0 = aarch64_strip_extend (op0, false);

       if (GET_CODE (new_op0) == MULT
	   || aarch64_shift_p (GET_CODE (new_op0)))
@@ -7344,17 +7557,26 @@ cost_plus:
	 }
       else
	 {
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
	     {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
		 {
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
		 }
	     }
	   return false;  /* All arguments need to be in registers.  */
@@ -7383,14 +7605,27 @@ cost_plus:
	 }
       else
	 {
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
	     {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
		 *cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
		 *cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
	     }
	   return false;  /* All arguments need to be in registers.  */
	 }
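The `GET_CODE (op1) == AND` special case added to both shift-cost paths above targets a shift amount that is explicitly masked to the operand width. A minimal C illustration, assuming a 32-bit operand so that `GET_MODE_BITSIZE (mode) - 1` is 31:

```c
/* AArch64's variable-shift instructions (LSLV, LSRV, ASRV, RORV) use
   only the low bits of the shift-amount register, so the mask below
   is redundant on this target and a single LSLV suffices.  The cost
   code above now charges nothing extra for the AND in this shape.  */
unsigned int
shift_masked (unsigned int x, unsigned int n)
{
  return x << (n & 31);
}
```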
@@ -7481,17 +7716,13 @@ cost_plus:
     case UMOD:
       if (speed)
	 {
+	  /* Slighly prefer UMOD over SMOD.  */
	   if (VECTOR_MODE_P (mode))
	     *cost += extra_cost->vect.alu;
	   else if (GET_MODE_CLASS (mode) == MODE_INT)
	     *cost += (extra_cost->mult[mode == DImode].add
-		      + extra_cost->mult[mode == DImode].idiv);
-	  else if (mode == DFmode)
-	    *cost += (extra_cost->fp[1].mult
-		      + extra_cost->fp[1].div);
-	  else if (mode == SFmode)
-	    *cost += (extra_cost->fp[0].mult
-		      + extra_cost->fp[0].div);
+		      + extra_cost->mult[mode == DImode].idiv
+		      + (code == MOD ? 1 : 0));
	 }
       return false;  /* All arguments need to be in registers.  */

@@ -7505,7 +7736,9 @@ cost_plus:
       else if (GET_MODE_CLASS (mode) == MODE_INT)
	 /* There is no integer SQRT, so only DIV and UDIV can get
	    here.  */
-	*cost += extra_cost->mult[mode == DImode].idiv;
+	*cost += (extra_cost->mult[mode == DImode].idiv
+		  /* Slighly prefer UDIV over SDIV.  */
+		  + (code == DIV ? 1 : 0));
       else
	 *cost += extra_cost->fp[mode == DFmode].div;
     }
@@ -7924,33 +8157,40 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
   machine_mode mode = GET_MODE (dst);

   if (GET_MODE_INNER (mode) == HFmode)
-    return false;
+    {
+      gcc_assert (!recp);
+      return false;
+    }

-  machine_mode mmsk = mode_for_vector
-		      (int_mode_for_mode (GET_MODE_INNER (mode)),
-		       GET_MODE_NUNITS (mode));
-  bool use_approx_sqrt_p = (!recp
-			    && (flag_mlow_precision_sqrt
-				|| (aarch64_tune_params.approx_modes->sqrt
-				    & AARCH64_APPROX_MODE (mode))));
-  bool use_approx_rsqrt_p = (recp
-			     && (flag_mrecip_low_precision_sqrt
-				 || (aarch64_tune_params.approx_modes->recip_sqrt
-				     & AARCH64_APPROX_MODE (mode))));
+  machine_mode mmsk
+    = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+		       GET_MODE_NUNITS (mode));
+  if (!recp)
+    {
+      if (!(flag_mlow_precision_sqrt
+	    || (aarch64_tune_params.approx_modes->sqrt
+		& AARCH64_APPROX_MODE (mode))))
+	return false;
+
+      if (flag_finite_math_only
+	  || flag_trapping_math
+	  || !flag_unsafe_math_optimizations
+	  || optimize_function_for_size_p (cfun))
+	return false;
+    }
+  else
+    /* Caller assumes we cannot fail.  */
+    gcc_assert (use_rsqrt_p (mode));

-  if (!flag_finite_math_only
-      || flag_trapping_math
-      || !flag_unsafe_math_optimizations
-      || !(use_approx_sqrt_p || use_approx_rsqrt_p)
-      || optimize_function_for_size_p (cfun))
-    return false;

   rtx xmsk = gen_reg_rtx (mmsk);
   if (!recp)
-    /* When calculating the approximate square root, compare the argument with
-       0.0 and create a mask.  */
-    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
-							     CONST0_RTX (mode)))));
+    /* When calculating the approximate square root, compare the
+       argument with 0.0 and create a mask.  */
+    emit_insn (gen_rtx_SET (xmsk,
+			    gen_rtx_NEG (mmsk,
+					 gen_rtx_EQ (mmsk, src,
+						     CONST0_RTX (mode)))));

   /* Estimate the approximate reciprocal square root.  */
   rtx xdst = gen_reg_rtx (mode);
@@ -8679,12 +8919,38 @@ aarch64_override_options_internal (struct gcc_options *opts)
			  opts->x_param_values,
			  global_options_set.x_param_values);

-  /* Set the L1 cache line size.  */
-  if (selected_cpu->tune->cache_line_size != 0)
+  /* Set up parameters to be used in prefetching algorithm.  Do not
+     override the defaults unless we are tuning for a core we have
+     researched values for.  */
+  if (aarch64_tune_params.prefetch->num_slots > 0)
+    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+			   aarch64_tune_params.prefetch->num_slots,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
+    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+			   aarch64_tune_params.prefetch->l1_cache_size,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
-			   selected_cpu->tune->cache_line_size,
+			   aarch64_tune_params.prefetch->l1_cache_line_size,
			    opts->x_param_values,
			    global_options_set.x_param_values);
+  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
+    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+			   aarch64_tune_params.prefetch->l2_cache_size,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+
+  /* Enable sw prefetching at specified optimization level for
+     CPUS that have prefetch.  Lower optimization level threshold by 1
+     when profiling is enabled.  */
+  if (opts->x_flag_prefetch_loop_arrays < 0
+      && !opts->x_optimize_size
+      && aarch64_tune_params.prefetch->default_opt_level >= 0
+      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
+    opts->x_flag_prefetch_loop_arrays = 1;

   aarch64_override_options_after_change_1 (opts);
 }
@@ -9996,6 +10262,11 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x)
       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
     return true;

+  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
+     so spilling them is better than rematerialization.  */
+  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
+    return true;
+
   return aarch64_constant_address_p (x);
 }

@@ -11437,7 +11708,7 @@ aarch64_simd_mem_operand_p (rtx op)
    COUNT is the number of components into which the copy needs to be
    decomposed.  */
 void
-aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
+aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
				 unsigned int count)
 {
   unsigned int i;
@@ -11458,7 +11729,7 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
    one of VSTRUCT modes: OI, CI, or XI.  */
 int
-aarch64_simd_attr_length_rglist (enum machine_mode mode)
+aarch64_simd_attr_length_rglist (machine_mode mode)
 {
   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
 }
@@ -11639,6 +11910,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }

+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  /* If there are only variable elements, try to optimize
+     the insertion using dup for the most common element
+     followed by insertions.  */
+
+  /* The algorithm will fill matches[*][0] with the earliest matching element,
+     and matches[X][1] with the count of duplicate elements (if X is the
+     earliest element which has duplicates).  */
+
+  if (n_var == n_elts && n_elts <= 16)
+    {
+      int matches[16][2] = {0};
+      for (int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	    {
+	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  break;
+		}
+	    }
+	}
+      int maxelement = 0;
+      int maxv = 0;
+      for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  {
+	    maxelement = i;
+	    maxv = matches[i][1];
+	  }
+
+      /* Create a duplicate of the most common element.  */
+      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+      aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+      /* Insert the rest.  */
+      for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	    continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+      return;
+    }
+
   /* Initialise a vector which is part-variable.  We want to first try
      to build those lanes which are constant in the most efficient way we
      can.  */
@@ -11672,10 +11994,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }

   /* Insert the variable lanes directly.  */
-
-  enum insn_code icode = optab_handler (vec_set_optab, mode);
-  gcc_assert (icode != CODE_FOR_nothing);
-
   for (int i = 0; i < n_elts; i++)
     {
       rtx x = XVECEXP (vals, 0, i);
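The effect of the new all-variable path is easiest to see at the intrinsics level (an illustrative sketch; the exact instructions chosen depend on the surrounding code):

```c
#include <arm_neon.h>

/* All four lanes are variable, but three share one value.  With the
   change above the expander emits one DUP for the repeated element
   and a single lane insert for the odd one out, instead of four
   lane-by-lane insertions.  Roughly:

       dup  v0.4s, w0
       ins  v0.s[3], w1  */
int32x4_t
make_vec (int32_t a, int32_t b)
{
  return (int32x4_t) { a, a, a, b };
}
```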
@@ -11843,10 +12161,8 @@ aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
 static void
 aarch64_emit_unlikely_jump (rtx insn)
 {
-  int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
-
   rtx_insn *jump = emit_jump_insn (insn);
-  add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
+  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }

 /* Expand a compare and swap pattern.  */
@@ -12041,6 +12357,17 @@ aarch64_split_compare_and_swap (rtx operands[])
   mode = GET_MODE (mem);
   model = memmodel_from_int (INTVAL (model_rtx));

+  /* When OLDVAL is zero and we want the strong version we can emit a tighter
+    loop:
+    .label1:
+	LD[A]XR	rval, [mem]
+	CBNZ	rval, .label2
+	ST[L]XR	scratch, newval, [mem]
+	CBNZ	scratch, .label1
+    .label2:
+	CMP	rval, 0.  */
+  bool strong_zero_p = !is_weak && oldval == const0_rtx;
+
   label1 = NULL;
   if (!is_weak)
     {
@@ -12057,11 +12384,21 @@ aarch64_split_compare_and_swap (rtx operands[])
   else
     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

-  cond = aarch64_gen_compare_reg (NE, rval, oldval);
-  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
-  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+  if (strong_zero_p)
+    {
+      x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+    }
+  else
+    {
+      cond = aarch64_gen_compare_reg (NE, rval, oldval);
+      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+    }

   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

@@ -12080,7 +12417,15 @@ aarch64_split_compare_and_swap (rtx operands[])
     }

   emit_label (label2);
-
+  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
+     to set the condition flags.  If this is not used it will be removed by
+     later passes.  */
+  if (strong_zero_p)
+    {
+      cond = gen_rtx_REG (CCmode, CC_REGNUM);
+      x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
+      emit_insn (gen_rtx_SET (cond, x));
+    }
   /* Emit any final barrier needed for a __sync operation.  */
   if (is_mm_sync (model))
     aarch64_emit_post_barrier (model);
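C code along the following lines can reach the new `strong_zero_p` path (a sketch: the expander only takes this path when the expected value is literally the constant 0 at expand time, so this relies on the compiler propagating `expected = 0` into the builtin):

```c
/* A strong compare-and-swap against zero, e.g. a simple lock acquire.
   With the patch the retry loop branches on the loaded value directly
   (CBNZ) instead of materializing a separate comparison with OLDVAL,
   matching the tighter loop shown in the comment above.  */
int
try_lock (long *lock)
{
  long expected = 0;
  return __atomic_compare_exchange_n (lock, &expected, 1, /* weak */ 0,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}
```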
@@ -13373,7 +13718,7 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 }

 rtx
-aarch64_reverse_mask (enum machine_mode mode)
+aarch64_reverse_mask (machine_mode mode)
 {
   /* We have to reverse each vector because we dont have
      a permuted load that can reverse-load according to ABI rules.  */
@@ -13973,13 +14318,68 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
     {
       enum attr_type prev_type = get_attr_type (prev);

-      /* FIXME: this misses some which is considered simple arthematic
-         instructions for ThunderX.  Simple shifts are missed here.  */
-      if (prev_type == TYPE_ALUS_SREG
-	  || prev_type == TYPE_ALUS_IMM
-	  || prev_type == TYPE_LOGICS_REG
-	  || prev_type == TYPE_LOGICS_IMM)
-	return true;
+      unsigned int condreg1, condreg2;
+      rtx cc_reg_1;
+      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
+      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+
+      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
+	  && prev
+	  && modified_in_p (cc_reg_1, prev))
+	{
+	  /* FIXME: this misses some which is considered simple arthematic
+	     instructions for ThunderX.  Simple shifts are missed here.  */
+	  if (prev_type == TYPE_ALUS_SREG
+	      || prev_type == TYPE_ALUS_IMM
+	      || prev_type == TYPE_LOGICS_REG
+	      || prev_type == TYPE_LOGICS_IMM)
+	    return true;
+	}
+    }
+
+  if (prev_set
+      && curr_set
+      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
+      && any_condjump_p (curr))
+    {
+      /* We're trying to match:
+	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
+	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
+							 (const_int 0))
+				   (label_ref ("SYM"))
+				   (pc))  */
+      if (SET_DEST (curr_set) == (pc_rtx)
+	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
+	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
+	  && REG_P (SET_DEST (prev_set))
+	  && REGNO (SET_DEST (prev_set))
+	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
+	{
+	  /* Fuse ALU operations followed by conditional branch instruction.  */
+	  switch (get_attr_type (prev))
+	    {
+	    case TYPE_ALU_IMM:
+	    case TYPE_ALU_SREG:
+	    case TYPE_ADC_REG:
+	    case TYPE_ADC_IMM:
+	    case TYPE_ADCS_REG:
+	    case TYPE_ADCS_IMM:
+	    case TYPE_LOGIC_REG:
+	    case TYPE_LOGIC_IMM:
+	    case TYPE_CSEL:
+	    case TYPE_ADR:
+	    case TYPE_MOV_IMM:
+	    case TYPE_SHIFT_REG:
+	    case TYPE_SHIFT_IMM:
+	    case TYPE_BFM:
+	    case TYPE_RBIT:
+	    case TYPE_REV:
+	    case TYPE_EXTEND:
+	      return true;
+
+	    default:;
+	    }
+	}
     }

   return false;
@@ -14163,7 +14563,7 @@ aarch64_sched_adjust_priority (rtx_insn *insn, int priority)

 bool
 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
-				enum machine_mode mode)
+				machine_mode mode)
 {
   HOST_WIDE_INT offval_1, offval_2, msize;
   enum reg_class rclass_1, rclass_2;
@@ -14270,7 +14670,7 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,

 bool
 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
-				       enum machine_mode mode)
+				       machine_mode mode)
 {
   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
@@ -14404,7 +14804,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,

 bool
 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
-			     enum machine_mode mode, RTX_CODE code)
+			     machine_mode mode, RTX_CODE code)
 {
   rtx base, offset, t1, t2;
   rtx mem_1, mem_2, mem_3, mem_4;