diff options
Diffstat (limited to 'gcc/config/aarch64')
30 files changed, 892 insertions, 3671 deletions
diff --git a/gcc/config/aarch64/aarch64-abi-ms.h b/gcc/config/aarch64/aarch64-abi-ms.h new file mode 100644 index 00000000000..15dc33d0474 --- /dev/null +++ b/gcc/config/aarch64/aarch64-abi-ms.h @@ -0,0 +1,34 @@ +/* Machine description for AArch64 MS ABI. + Copyright (C) 2024 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_ABI_MS_H +#define GCC_AARCH64_ABI_MS_H + +/* X18 reserved for the TEB on Windows. */ + +#undef FIXED_X18 +#define FIXED_X18 1 + +#undef CALL_USED_X18 +#define CALL_USED_X18 0 + +#undef STATIC_CHAIN_REGNUM +#define STATIC_CHAIN_REGNUM R17_REGNUM + +#endif /* GCC_AARCH64_ABI_MS_H. */ diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 75d21de1401..d589e59defc 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -534,6 +534,22 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0) BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0) BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0) +/* The builtins below should be expanded through the standard optabs + CODE_FOR_extend<mode><Vwide>2. */ +#undef VAR1 +#define VAR1(F,T,N,M) \ + constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T##N##M##2; + +VAR1 (float_extend_lo_, extend, v2sf, v2df) +VAR1 (float_extend_lo_, extend, v4hf, v4sf) + +/* __builtin_aarch64_float_truncate_lo_<mode> should be expanded through the + standard optabs CODE_FOR_trunc<Vwide><mode>2. */ +constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf + = CODE_FOR_truncv4sfv4hf2; +constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf + = CODE_FOR_truncv2dfv2sf2; + #undef VAR1 #define VAR1(T, N, MAP, FLAG, A) \ {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG}, @@ -658,6 +674,40 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VREINTERPRET_BUILTINS \ VREINTERPRETQ_BUILTINS +#define AARCH64_SIMD_VGET_LOW_BUILTINS \ + VGET_LOW_BUILTIN(f16) \ + VGET_LOW_BUILTIN(f32) \ + VGET_LOW_BUILTIN(f64) \ + VGET_LOW_BUILTIN(p8) \ + VGET_LOW_BUILTIN(p16) \ + VGET_LOW_BUILTIN(p64) \ + VGET_LOW_BUILTIN(s8) \ + VGET_LOW_BUILTIN(s16) \ + VGET_LOW_BUILTIN(s32) \ + VGET_LOW_BUILTIN(s64) \ + VGET_LOW_BUILTIN(u8) \ + VGET_LOW_BUILTIN(u16) \ + VGET_LOW_BUILTIN(u32) \ + VGET_LOW_BUILTIN(u64) \ + VGET_LOW_BUILTIN(bf16) + +#define AARCH64_SIMD_VGET_HIGH_BUILTINS \ + VGET_HIGH_BUILTIN(f16) \ + VGET_HIGH_BUILTIN(f32) \ + VGET_HIGH_BUILTIN(f64) \ + VGET_HIGH_BUILTIN(p8) \ + VGET_HIGH_BUILTIN(p16) \ + VGET_HIGH_BUILTIN(p64) \ + VGET_HIGH_BUILTIN(s8) \ + VGET_HIGH_BUILTIN(s16) \ + VGET_HIGH_BUILTIN(s32) \ + VGET_HIGH_BUILTIN(s64) \ + VGET_HIGH_BUILTIN(u8) \ + VGET_HIGH_BUILTIN(u16) \ + VGET_HIGH_BUILTIN(u32) \ + VGET_HIGH_BUILTIN(u64) \ + VGET_HIGH_BUILTIN(bf16) + typedef struct { const char *name; @@ -697,6 +747,12 @@ typedef struct #define VREINTERPRET_BUILTIN(A, B, L) \ AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B, +#define VGET_LOW_BUILTIN(A) \ + AARCH64_SIMD_BUILTIN_VGET_LOW_##A, + +#define VGET_HIGH_BUILTIN(A) \ + AARCH64_SIMD_BUILTIN_VGET_HIGH_##A, + #undef VAR1 #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, @@ -732,6 +788,8 @@ enum aarch64_builtins AARCH64_CRC32_BUILTIN_MAX, /* SIMD intrinsic builtins. */ AARCH64_SIMD_VREINTERPRET_BUILTINS + AARCH64_SIMD_VGET_LOW_BUILTINS + AARCH64_SIMD_VGET_HIGH_BUILTINS /* ARMv8.3-A Pointer Authentication Builtins. */ AARCH64_PAUTH_BUILTIN_AUTIA1716, AARCH64_PAUTH_BUILTIN_PACIA1716, @@ -823,8 +881,32 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = { && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \ }, +#undef VGET_LOW_BUILTIN +#define VGET_LOW_BUILTIN(A) \ + {"vget_low_" #A, \ + AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \ + 2, \ + { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \ + { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \ + FLAG_AUTO_FP, \ + false \ + }, + +#undef VGET_HIGH_BUILTIN +#define VGET_HIGH_BUILTIN(A) \ + {"vget_high_" #A, \ + AARCH64_SIMD_BUILTIN_VGET_HIGH_##A, \ + 2, \ + { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \ + { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \ + FLAG_AUTO_FP, \ + false \ + }, + static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = { AARCH64_SIMD_VREINTERPRET_BUILTINS + AARCH64_SIMD_VGET_LOW_BUILTINS + AARCH64_SIMD_VGET_HIGH_BUILTINS }; @@ -3216,6 +3298,13 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2) #define VREINTERPRET_BUILTIN(A, B, L) \ case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B: +#undef VGET_LOW_BUILTIN +#define VGET_LOW_BUILTIN(A) \ + case AARCH64_SIMD_BUILTIN_VGET_LOW_##A: + +#undef VGET_HIGH_BUILTIN +#define VGET_HIGH_BUILTIN(A) \ + case AARCH64_SIMD_BUILTIN_VGET_HIGH_##A: /* Try to fold a call to the built-in function with subcode FCODE. The function is passed the N_ARGS arguments in ARGS and it returns a value @@ -3235,6 +3324,20 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type, return fold_build1 (FLOAT_EXPR, type, args[0]); AARCH64_SIMD_VREINTERPRET_BUILTINS return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]); + AARCH64_SIMD_VGET_LOW_BUILTINS + { + auto pos = BYTES_BIG_ENDIAN ? 64 : 0; + + return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64), + bitsize_int (pos)); + } + AARCH64_SIMD_VGET_HIGH_BUILTINS + { + auto pos = BYTES_BIG_ENDIAN ? 0 : 64; + + return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64), + bitsize_int (pos)); + } case AARCH64_SIMD_BUILTIN_LANE_CHECK: gcc_assert (n_args == 3); if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2])) diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index fe1a20e4e54..d042e5fbd8c 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -75,6 +75,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile) builtin_define ("__ARM_STATE_ZA"); builtin_define ("__ARM_STATE_ZT0"); + builtin_define ("__ARM_NEON_SVE_BRIDGE"); /* Define keyword attributes like __arm_streaming as macros that expand to the associated [[...]] attribute. Use __extension__ in the attribute diff --git a/gcc/config/aarch64/aarch64-coff.h b/gcc/config/aarch64/aarch64-coff.h new file mode 100644 index 00000000000..81fd9954f75 --- /dev/null +++ b/gcc/config/aarch64/aarch64-coff.h @@ -0,0 +1,91 @@ +/* Machine description for AArch64 architecture. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_COFF_H +#define GCC_AARCH64_COFF_H + +#ifndef LOCAL_LABEL_PREFIX +# define LOCAL_LABEL_PREFIX "" +#endif + +/* Using long long breaks -ansi and -std=c90, so these will need to be + made conditional for an LLP64 ABI. */ +#undef SIZE_TYPE +#define SIZE_TYPE "long long unsigned int" + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE "long long int" + +#undef LONG_TYPE_SIZE +#define LONG_TYPE_SIZE 32 + +#ifndef ASM_GENERATE_INTERNAL_LABEL +# define ASM_GENERATE_INTERNAL_LABEL(STRING, PREFIX, NUM) \ + sprintf (STRING, "*%s%s%u", LOCAL_LABEL_PREFIX, PREFIX, (unsigned int)(NUM)) +#endif + +#define ASM_OUTPUT_ALIGN(STREAM, POWER) \ + fprintf (STREAM, "\t.align\t%d\n", (int)POWER) + +/* Output a common block. */ +#ifndef ASM_OUTPUT_COMMON +# define ASM_OUTPUT_COMMON(STREAM, NAME, SIZE, ROUNDED) \ + { \ + fprintf (STREAM, "\t.comm\t"); \ + assemble_name (STREAM, NAME); \ + asm_fprintf (STREAM, ", %d, %d\n", \ + (int)(ROUNDED), (int)(SIZE)); \ + } +#endif + +/* Output a local common block. /bin/as can't do this, so hack a + `.space' into the bss segment. Note that this is *bad* practice, + which is guaranteed NOT to work since it doesn't define STATIC + COMMON space but merely STATIC BSS space. */ +#ifndef ASM_OUTPUT_ALIGNED_LOCAL +# define ASM_OUTPUT_ALIGNED_LOCAL(STREAM, NAME, SIZE, ALIGN) \ + { \ + switch_to_section (bss_section); \ + ASM_OUTPUT_ALIGN (STREAM, floor_log2 (ALIGN / BITS_PER_UNIT)); \ + ASM_OUTPUT_LABEL (STREAM, NAME); \ + fprintf (STREAM, "\t.space\t%d\n", (int)(SIZE)); \ + } +#endif + +#define ASM_OUTPUT_SKIP(STREAM, NBYTES) \ + fprintf (STREAM, "\t.space\t%d // skip\n", (int) (NBYTES)) + +/* Definitions that are not yet supported by binutils for the + aarch64-w64-mingw32 target. */ +#define ASM_OUTPUT_TYPE_DIRECTIVE(STREAM, NAME, TYPE) +#define ASM_DECLARE_FUNCTION_SIZE(FILE, FNAME, DECL) + +#define TEXT_SECTION_ASM_OP "\t.text" +#define DATA_SECTION_ASM_OP "\t.data" +#define BSS_SECTION_ASM_OP "\t.bss" + +#define CTORS_SECTION_ASM_OP "\t.section\t.ctors, \"aw\"" +#define DTORS_SECTION_ASM_OP "\t.section\t.dtors, \"aw\"" + +#define GLOBAL_ASM_OP "\t.global\t" + +#undef SUPPORTS_INIT_PRIORITY +#define SUPPORTS_INIT_PRIORITY 0 + +#endif diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index f69fc212d56..be60929e400 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -151,6 +151,11 @@ AARCH64_CORE("neoverse-512tvb", neoverse512tvb, cortexa57, V8_4A, (SVE, I8MM, B /* Qualcomm ('Q') cores. */ AARCH64_CORE("saphira", saphira, saphira, V8_4A, (CRYPTO), saphira, 0x51, 0xC01, -1) +/* ARMv8.6-A Architecture Processors. */ + +/* Qualcomm ('Q') cores. */ +AARCH64_CORE("oryon-1", oryon1, cortexa57, V8_6A, (CRYPTO, SM4, SHA3, F16), cortexa72, 0x51, 0x001, -1) + /* ARMv8-A big.LITTLE implementations. */ AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, V8A, (CRC), cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1) diff --git a/gcc/config/aarch64/aarch64-early-ra.cc b/gcc/config/aarch64/aarch64-early-ra.cc index 1e2c823cb2e..99324423ee5 100644 --- a/gcc/config/aarch64/aarch64-early-ra.cc +++ b/gcc/config/aarch64/aarch64-early-ra.cc @@ -3446,7 +3446,7 @@ early_ra::process_block (basic_block bb, bool is_isolated) fprintf (dump_file, "\nBlock %d:\n", bb->index); fprintf (dump_file, "%6d:", m_current_point); pretty_printer rtl_slim_pp; - rtl_slim_pp.buffer->stream = dump_file; + rtl_slim_pp.set_output_stream (dump_file); print_insn (&rtl_slim_pp, insn, 1); pp_flush (&rtl_slim_pp); fprintf (dump_file, "\n"); diff --git a/gcc/config/aarch64/aarch64-freebsd.h b/gcc/config/aarch64/aarch64-freebsd.h index 53cc17a1caf..e26d69ce46c 100644 --- a/gcc/config/aarch64/aarch64-freebsd.h +++ b/gcc/config/aarch64/aarch64-freebsd.h @@ -35,6 +35,7 @@ #undef FBSD_TARGET_LINK_SPEC #define FBSD_TARGET_LINK_SPEC " \ %{p:%nconsider using `-pg' instead of `-p' with gprof (1)} \ + " FBSD_LINK_PG_NOTE " \ %{v:-V} \ %{assert*} %{R*} %{rpath*} %{defsym*} \ %{shared:-Bshareable %{h*} %{soname*}} \ diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc index 365dcf48b22..b255dcbe73c 100644 --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc @@ -17,997 +17,148 @@ // along with GCC; see the file COPYING3. If not see // <http://www.gnu.org/licenses/>. -#define INCLUDE_ALGORITHM -#define INCLUDE_FUNCTIONAL -#define INCLUDE_LIST -#define INCLUDE_TYPE_TRAITS #include "config.h" #include "system.h" #include "coretypes.h" #include "backend.h" #include "rtl.h" -#include "df.h" +#include "memmodel.h" +#include "emit-rtl.h" +#include "tm_p.h" #include "rtl-iter.h" -#include "rtl-ssa.h" -#include "cfgcleanup.h" #include "tree-pass.h" -#include "ordered-hash-map.h" -#include "tree-dfa.h" -#include "fold-const.h" -#include "tree-hash-traits.h" -#include "print-tree.h" #include "insn-attr.h" - -using namespace rtl_ssa; +#include "pair-fusion.h" static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7; static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1)); static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1; static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1; -// We pack these fields (load_p, fpsimd_p, and size) into an integer -// (LFS) which we use as part of the key into the main hash tables. -// -// The idea is that we group candidates together only if they agree on -// the fields below. Candidates that disagree on any of these -// properties shouldn't be merged together. -struct lfs_fields -{ - bool load_p; - bool fpsimd_p; - unsigned size; -}; - -using insn_list_t = std::list<insn_info *>; -using insn_iter_t = insn_list_t::iterator; - -// Information about the accesses at a given offset from a particular -// base. Stored in an access_group, see below. -struct access_record -{ - poly_int64 offset; - std::list<insn_info *> cand_insns; - std::list<access_record>::iterator place; - - access_record (poly_int64 off) : offset (off) {} -}; - -// A group of accesses where adjacent accesses could be ldp/stp -// candidates. The splay tree supports efficient insertion, -// while the list supports efficient iteration. -struct access_group -{ - splay_tree<access_record *> tree; - std::list<access_record> list; - - template<typename Alloc> - inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn); -}; - -// Information about a potential base candidate, used in try_fuse_pair. -// There may be zero, one, or two viable RTL bases for a given pair. -struct base_cand +struct aarch64_pair_fusion : public pair_fusion { - // DEF is the def of the base register to be used by the pair. - def_info *def; - - // FROM_INSN is -1 if the base candidate is already shared by both - // candidate insns. Otherwise it holds the index of the insn from - // which the base originated. - // - // In the case that the base is shared, either DEF is already used - // by both candidate accesses, or both accesses see different versions - // of the same regno, in which case DEF is the def consumed by the - // first candidate access. - int from_insn; - - // To form a pair, we do so by moving the first access down and the second - // access up. To determine where to form the pair, and whether or not - // it is safe to form the pair, we track instructions which cannot be - // re-ordered past due to either dataflow or alias hazards. - // - // Since we allow changing the base used by an access, the choice of - // base can change which instructions act as re-ordering hazards for - // this pair (due to different dataflow). We store the initial - // dataflow hazards for this choice of base candidate in HAZARDS. - // - // These hazards act as re-ordering barriers to each candidate insn - // respectively, in program order. - // - // Later on, when we take alias analysis into account, we narrow - // HAZARDS accordingly. - insn_info *hazards[2]; - - base_cand (def_info *def, int insn) - : def (def), from_insn (insn), hazards {nullptr, nullptr} {} - - base_cand (def_info *def) : base_cand (def, -1) {} - - // Test if this base candidate is viable according to HAZARDS. - bool viable () const + bool fpsimd_op_p (rtx reg_op, machine_mode mem_mode, + bool load_p) override final { - return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]); + // Before RA, we use the modes, noting that stores of constant zero + // operands use GPRs (even in non-integer modes). After RA, we use + // the hard register numbers. + return reload_completed + ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op))) + : (GET_MODE_CLASS (mem_mode) != MODE_INT + && (load_p || !aarch64_const_zero_rtx_p (reg_op))); } -}; -// Information about an alternate base. For a def_info D, it may -// instead be expressed as D = BASE + OFFSET. -struct alt_base -{ - def_info *base; - poly_int64 offset; -}; - -// State used by the pass for a given basic block. -struct ldp_bb_info -{ - using def_hash = nofree_ptr_hash<def_info>; - using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>; - using def_key_t = pair_hash<def_hash, int_hash<int, -1, -2>>; + bool pair_mem_insn_p (rtx_insn *rti, bool &load_p) override final; - // Map of <tree base, LFS> -> access_group. - ordered_hash_map<expr_key_t, access_group> expr_map; - - // Map of <RTL-SSA def_info *, LFS> -> access_group. - ordered_hash_map<def_key_t, access_group> def_map; + bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) override final + { + return aarch64_mem_ok_with_ldpstp_policy_model (base_mem, + load_p, + GET_MODE (base_mem)); + } - // Given the def_info for an RTL base register, express it as an offset from - // some canonical base instead. - // - // Canonicalizing bases in this way allows us to identify adjacent accesses - // even if they see different base register defs. - hash_map<def_hash, alt_base> canon_base_map; + bool pair_operand_mode_ok_p (machine_mode mode) override final; - static const size_t obstack_alignment = sizeof (void *); - bb_info *m_bb; + rtx gen_pair (rtx *pats, rtx writeback, bool load_p) override final; - ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false) + bool pair_reg_operand_ok_p (bool load_p, rtx reg_op, + machine_mode mode) override final { - obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE, - obstack_alignment, obstack_chunk_alloc, - obstack_chunk_free); + return (load_p + ? aarch64_ldp_reg_operand (reg_op, mode) + : aarch64_stp_reg_operand (reg_op, mode)); } - ~ldp_bb_info () - { - obstack_free (&m_obstack, nullptr); - if (m_emitted_tombstone) - { - bitmap_release (&m_tombstone_bitmap); - bitmap_obstack_release (&m_bitmap_obstack); - } + int pair_mem_alias_check_limit () override final + { + return aarch64_ldp_alias_check_limit; } - inline void track_access (insn_info *, bool load, rtx mem); - inline void transform (); - inline void cleanup_tombstones (); - -private: - obstack m_obstack; - - // State for keeping track of tombstone insns emitted for this BB. - bitmap_obstack m_bitmap_obstack; - bitmap_head m_tombstone_bitmap; - bool m_emitted_tombstone; - - inline splay_tree_node<access_record *> *node_alloc (access_record *); - - template<typename Map> - inline void traverse_base_map (Map &map); - inline void transform_for_base (int load_size, access_group &group); + bool should_handle_writeback (writeback_type which) override final + { + if (which == writeback_type::ALL) + return aarch64_ldp_writeback > 1; + else + return aarch64_ldp_writeback; + } - inline void merge_pairs (insn_list_t &, insn_list_t &, - bool load_p, unsigned access_size); + bool track_loads_p () override final + { + return aarch64_tune_params.ldp_policy_model + != AARCH64_LDP_STP_POLICY_NEVER; + } - inline bool try_fuse_pair (bool load_p, unsigned access_size, - insn_info *i1, insn_info *i2); + bool track_stores_p () override final + { + return aarch64_tune_params.stp_policy_model + != AARCH64_LDP_STP_POLICY_NEVER; + } - inline bool fuse_pair (bool load_p, unsigned access_size, - int writeback, - insn_info *i1, insn_info *i2, - base_cand &base, - const insn_range_info &move_range); + bool pair_mem_in_range_p (HOST_WIDE_INT offset) override final + { + return (offset >= LDP_MIN_IMM && offset <= LDP_MAX_IMM); + } - inline void track_tombstone (int uid); + rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, rtx regs[2], + bool load_p) override final; - inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs); + rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) override final; }; -splay_tree_node<access_record *> * -ldp_bb_info::node_alloc (access_record *access) -{ - using T = splay_tree_node<access_record *>; - void *addr = obstack_alloc (&m_obstack, sizeof (T)); - return new (addr) T (access); -} - -// Given a mem MEM, if the address has side effects, return a MEM that accesses -// the same address but without the side effects. Otherwise, return -// MEM unchanged. -static rtx -drop_writeback (rtx mem) +bool +aarch64_pair_fusion::pair_mem_insn_p (rtx_insn *rti, bool &load_p) { - rtx addr = XEXP (mem, 0); - - if (!side_effects_p (addr)) - return mem; - - switch (GET_CODE (addr)) - { - case PRE_MODIFY: - addr = XEXP (addr, 1); - break; - case POST_MODIFY: - case POST_INC: - case POST_DEC: - addr = XEXP (addr, 0); - break; - case PRE_INC: - case PRE_DEC: + rtx pat = PATTERN (rti); + if (GET_CODE (pat) == PARALLEL + && XVECLEN (pat, 0) == 2) { - poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem)); - if (GET_CODE (addr) == PRE_DEC) - adjustment *= -1; - addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment); - break; - } - default: - gcc_unreachable (); - } + const auto attr = get_attr_ldpstp (rti); + if (attr == LDPSTP_NONE) + return false; - return change_address (mem, GET_MODE (mem), addr); + load_p = (attr == LDPSTP_LDP); + gcc_checking_assert (load_p || attr == LDPSTP_STP); + return true; + } + return false; } -// Convenience wrapper around strip_offset that can also look through -// RTX_AUTOINC addresses. The interface is like strip_offset except we take a -// MEM so that we know the mode of the access. -static rtx -ldp_strip_offset (rtx mem, poly_int64 *offset) +rtx +aarch64_pair_fusion::gen_pair (rtx *pats, rtx writeback, bool load_p) { - rtx addr = XEXP (mem, 0); + rtx pair_pat; - switch (GET_CODE (addr)) + if (writeback) { - case PRE_MODIFY: - case POST_MODIFY: - addr = strip_offset (XEXP (addr, 1), offset); - gcc_checking_assert (REG_P (addr)); - gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr)); - break; - case PRE_INC: - case POST_INC: - addr = XEXP (addr, 0); - *offset = GET_MODE_SIZE (GET_MODE (mem)); - gcc_checking_assert (REG_P (addr)); - break; - case PRE_DEC: - case POST_DEC: - addr = XEXP (addr, 0); - *offset = -GET_MODE_SIZE (GET_MODE (mem)); - gcc_checking_assert (REG_P (addr)); - break; - - default: - addr = strip_offset (addr, offset); + auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]); + return gen_rtx_PARALLEL (VOIDmode, patvec); } - - return addr; -} - -// Return true if X is a PRE_{INC,DEC,MODIFY} rtx. -static bool -any_pre_modify_p (rtx x) -{ - const auto code = GET_CODE (x); - return code == PRE_INC || code == PRE_DEC || code == PRE_MODIFY; -} - -// Return true if X is a POST_{INC,DEC,MODIFY} rtx. -static bool -any_post_modify_p (rtx x) -{ - const auto code = GET_CODE (x); - return code == POST_INC || code == POST_DEC || code == POST_MODIFY; + else if (load_p) + return aarch64_gen_load_pair (XEXP (pats[0], 0), + XEXP (pats[1], 0), + XEXP (pats[0], 1)); + else + return aarch64_gen_store_pair (XEXP (pats[0], 0), + XEXP (pats[0], 1), + XEXP (pats[1], 1)); + return pair_pat; } // Return true if we should consider forming ldp/stp insns from memory // accesses with operand mode MODE at this stage in compilation. -static bool -ldp_operand_mode_ok_p (machine_mode mode) +bool +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode) { - const bool allow_qregs - = !(aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS); - if (!aarch64_ldpstp_operand_mode_p (mode)) return false; - const auto size = GET_MODE_SIZE (mode).to_constant (); - if (size == 16 && !allow_qregs) - return false; - // We don't pair up TImode accesses before RA because TImode is // special in that it can be allocated to a pair of GPRs or a single // FPR, and the RA is best placed to make that decision. return reload_completed || mode != TImode; } -// Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these -// into an integer for use as a hash table key. -static int -encode_lfs (lfs_fields fields) -{ - int size_log2 = exact_log2 (fields.size); - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); - return ((int)fields.load_p << 3) - | ((int)fields.fpsimd_p << 2) - | (size_log2 - 2); -} - -// Inverse of encode_lfs. -static lfs_fields -decode_lfs (int lfs) -{ - bool load_p = (lfs & (1 << 3)); - bool fpsimd_p = (lfs & (1 << 2)); - unsigned size = 1U << ((lfs & 3) + 2); - return { load_p, fpsimd_p, size }; -} - -// Track the access INSN at offset OFFSET in this access group. -// ALLOC_NODE is used to allocate splay tree nodes. -template<typename Alloc> -void -access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn) -{ - auto insert_before = [&](std::list<access_record>::iterator after) - { - auto it = list.emplace (after, offset); - it->cand_insns.push_back (insn); - it->place = it; - return &*it; - }; - - if (!list.size ()) - { - auto access = insert_before (list.end ()); - tree.insert_max_node (alloc_node (access)); - return; - } - - auto compare = [&](splay_tree_node<access_record *> *node) - { - return compare_sizes_for_sort (offset, node->value ()->offset); - }; - auto result = tree.lookup (compare); - splay_tree_node<access_record *> *node = tree.root (); - if (result == 0) - node->value ()->cand_insns.push_back (insn); - else - { - auto it = node->value ()->place; - auto after = (result > 0) ? std::next (it) : it; - auto access = insert_before (after); - tree.insert_child (node, result > 0, alloc_node (access)); - } -} - -// Given a candidate access INSN (with mem MEM), see if it has a suitable -// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access. -// LFS is used as part of the key to the hash table, see track_access. -bool -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs) -{ - if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem)) - return false; - - poly_int64 offset; - tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem), - &offset); - if (!base_expr || !DECL_P (base_expr)) - return false; - - offset += MEM_OFFSET (mem); - - const machine_mode mem_mode = GET_MODE (mem); - const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant (); - - // Punt on misaligned offsets. LDP/STP instructions require offsets to be a - // multiple of the access size, and we believe that misaligned offsets on - // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases. - if (!multiple_p (offset, mem_size)) - return false; - - const auto key = std::make_pair (base_expr, encode_lfs (lfs)); - access_group &group = expr_map.get_or_insert (key, NULL); - auto alloc = [&](access_record *access) { return node_alloc (access); }; - group.track (alloc, offset, insn); - - if (dump_file) - { - fprintf (dump_file, "[bb %u] tracking insn %d via ", - m_bb->index (), insn->uid ()); - print_node_brief (dump_file, "mem expr", base_expr, 0); - fprintf (dump_file, " [L=%d FP=%d, %smode, off=", - lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]); - print_dec (offset, dump_file); - fprintf (dump_file, "]\n"); - } - - return true; -} - -// Main function to begin pair discovery. Given a memory access INSN, -// determine whether it could be a candidate for fusing into an ldp/stp, -// and if so, track it in the appropriate data structure for this basic -// block. LOAD_P is true if the access is a load, and MEM is the mem -// rtx that occurs in INSN. -void -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem) -{ - // We can't combine volatile MEMs, so punt on these. - if (MEM_VOLATILE_P (mem)) - return; - - // Ignore writeback accesses if the param says to do so. - if (!aarch64_ldp_writeback - && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) - return; - - const machine_mode mem_mode = GET_MODE (mem); - if (!ldp_operand_mode_ok_p (mem_mode)) - return; - - rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p); - - // Ignore the access if the register operand isn't suitable for ldp/stp. - if (load_p - ? !aarch64_ldp_reg_operand (reg_op, mem_mode) - : !aarch64_stp_reg_operand (reg_op, mem_mode)) - return; - - // We want to segregate FP/SIMD accesses from GPR accesses. - // - // Before RA, we use the modes, noting that stores of constant zero - // operands use GPRs (even in non-integer modes). After RA, we use - // the hard register numbers. - const bool fpsimd_op_p - = reload_completed - ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op))) - : (GET_MODE_CLASS (mem_mode) != MODE_INT - && (load_p || !aarch64_const_zero_rtx_p (reg_op))); - - // Note ldp_operand_mode_ok_p already rejected VL modes. - const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant (); - const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size }; - - if (track_via_mem_expr (insn, mem, lfs)) - return; - - poly_int64 mem_off; - rtx addr = XEXP (mem, 0); - const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC; - rtx base = ldp_strip_offset (mem, &mem_off); - if (!REG_P (base)) - return; - - // Need to calculate two (possibly different) offsets: - // - Offset at which the access occurs. - // - Offset of the new base def. - poly_int64 access_off; - if (autoinc_p && any_post_modify_p (addr)) - access_off = 0; - else - access_off = mem_off; - - poly_int64 new_def_off = mem_off; - - // Punt on accesses relative to eliminable regs. Since we don't know the - // elimination offset pre-RA, we should postpone forming pairs on such - // accesses until after RA. - // - // As it stands, addresses with offsets in range for LDR but not - // in range for LDP/STP are currently reloaded inefficiently, - // ending up with a separate base register for each pair. - // - // In theory LRA should make use of - // targetm.legitimize_address_displacement to promote sharing of - // bases among multiple (nearby) address reloads, but the current - // LRA code returns early from process_address_1 for operands that - // satisfy "m", even if they don't satisfy the real (relaxed) address - // constraint; this early return means we never get to the code - // that calls targetm.legitimize_address_displacement. - // - // So for now, it's better to punt when we can't be sure that the - // offset is in range for LDP/STP. Out-of-range cases can then be - // handled after RA by the out-of-range LDP/STP peepholes. Eventually, it - // would be nice to handle known out-of-range opportunities in the - // pass itself (for stack accesses, this would be in the post-RA pass). - if (!reload_completed - && (REGNO (base) == FRAME_POINTER_REGNUM - || REGNO (base) == ARG_POINTER_REGNUM)) - return; - - // Now need to find def of base register. - use_info *base_use = find_access (insn->uses (), REGNO (base)); - gcc_assert (base_use); - def_info *base_def = base_use->def (); - if (!base_def) - { - if (dump_file) - fprintf (dump_file, - "base register (regno %d) of insn %d is undefined", - REGNO (base), insn->uid ()); - return; - } - - alt_base *canon_base = canon_base_map.get (base_def); - if (canon_base) - { - // Express this as the combined offset from the canonical base. - base_def = canon_base->base; - new_def_off += canon_base->offset; - access_off += canon_base->offset; - } - - if (autoinc_p) - { - auto def = find_access (insn->defs (), REGNO (base)); - gcc_assert (def); - - // Record that DEF = BASE_DEF + MEM_OFF. - if (dump_file) - { - pretty_printer pp; - pp_access (&pp, def, 0); - pp_string (&pp, " = "); - pp_access (&pp, base_def, 0); - fprintf (dump_file, "[bb %u] recording %s + ", - m_bb->index (), pp_formatted_text (&pp)); - print_dec (new_def_off, dump_file); - fprintf (dump_file, "\n"); - } - - alt_base base_rec { base_def, new_def_off }; - if (canon_base_map.put (def, base_rec)) - gcc_unreachable (); // Base defs should be unique. - } - - // Punt on misaligned offsets. LDP/STP require offsets to be a multiple of - // the access size. - if (!multiple_p (mem_off, mem_size)) - return; - - const auto key = std::make_pair (base_def, encode_lfs (lfs)); - access_group &group = def_map.get_or_insert (key, NULL); - auto alloc = [&](access_record *access) { return node_alloc (access); }; - group.track (alloc, access_off, insn); - - if (dump_file) - { - pretty_printer pp; - pp_access (&pp, base_def, 0); - - fprintf (dump_file, "[bb %u] tracking insn %d via %s", - m_bb->index (), insn->uid (), pp_formatted_text (&pp)); - fprintf (dump_file, - " [L=%d, WB=%d, FP=%d, %smode, off=", - lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]); - print_dec (access_off, dump_file); - fprintf (dump_file, "]\n"); - } -} - -// Dummy predicate that never ignores any insns. -static bool no_ignore (insn_info *) { return false; } - -// Return the latest dataflow hazard before INSN. -// -// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for -// dataflow purposes. This is needed when considering changing the RTL base of -// an access discovered through a MEM_EXPR base. -// -// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising -// from that insn. -// -// N.B. we ignore any defs/uses of memory here as we deal with that separately, -// making use of alias disambiguation. -static insn_info * -latest_hazard_before (insn_info *insn, rtx *ignore, - insn_info *ignore_insn = nullptr) -{ - insn_info *result = nullptr; - - // If the insn can throw then it is at the end of a BB and we can't - // move it, model this by recording a hazard in the previous insn - // which will prevent moving the insn up. - if (cfun->can_throw_non_call_exceptions - && find_reg_note (insn->rtl (), REG_EH_REGION, NULL_RTX)) - return insn->prev_nondebug_insn (); - - // Return true if we registered the hazard. - auto hazard = [&](insn_info *h) -> bool - { - gcc_checking_assert (*h < *insn); - if (h == ignore_insn) - return false; - - if (!result || *h > *result) - result = h; - - return true; - }; - - rtx pat = PATTERN (insn->rtl ()); - auto ignore_use = [&](use_info *u) - { - if (u->is_mem ()) - return true; - - return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore); - }; - - // Find defs of uses in INSN (RaW). - for (auto use : insn->uses ()) - if (!ignore_use (use) && use->def ()) - hazard (use->def ()->insn ()); - - // Find previous defs (WaW) or previous uses (WaR) of defs in INSN. - for (auto def : insn->defs ()) - { - if (def->is_mem ()) - continue; - - if (def->prev_def ()) - { - hazard (def->prev_def ()->insn ()); // WaW - - auto set = dyn_cast<set_info *> (def->prev_def ()); - if (set && set->has_nondebug_insn_uses ()) - for (auto use : set->reverse_nondebug_insn_uses ()) - if (use->insn () != insn && hazard (use->insn ())) // WaR - break; - } - - if (!HARD_REGISTER_NUM_P (def->regno ())) - continue; - - // Also need to check backwards for call clobbers (WaW). - for (auto call_group : def->ebb ()->call_clobbers ()) - { - if (!call_group->clobbers (def->resource ())) - continue; - - auto clobber_insn = prev_call_clobbers_ignoring (*call_group, - def->insn (), - no_ignore); - if (clobber_insn) - hazard (clobber_insn); - } - - } - - return result; -} - -// Return the first dataflow hazard after INSN. -// -// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for -// dataflow purposes. This is needed when considering changing the RTL base of -// an access discovered through a MEM_EXPR base. -// -// N.B. we ignore any defs/uses of memory here as we deal with that separately, -// making use of alias disambiguation. -static insn_info * -first_hazard_after (insn_info *insn, rtx *ignore) -{ - insn_info *result = nullptr; - auto hazard = [insn, &result](insn_info *h) - { - gcc_checking_assert (*h > *insn); - if (!result || *h < *result) - result = h; - }; - - rtx pat = PATTERN (insn->rtl ()); - auto ignore_use = [&](use_info *u) - { - if (u->is_mem ()) - return true; - - return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore); - }; - - for (auto def : insn->defs ()) - { - if (def->is_mem ()) - continue; - - if (def->next_def ()) - hazard (def->next_def ()->insn ()); // WaW - - auto set = dyn_cast<set_info *> (def); - if (set && set->has_nondebug_insn_uses ()) - hazard (set->first_nondebug_insn_use ()->insn ()); // RaW - - if (!HARD_REGISTER_NUM_P (def->regno ())) - continue; - - // Also check for call clobbers of this def (WaW). - for (auto call_group : def->ebb ()->call_clobbers ()) - { - if (!call_group->clobbers (def->resource ())) - continue; - - auto clobber_insn = next_call_clobbers_ignoring (*call_group, - def->insn (), - no_ignore); - if (clobber_insn) - hazard (clobber_insn); - } - } - - // Find any subsequent defs of uses in INSN (WaR). - for (auto use : insn->uses ()) - { - if (ignore_use (use)) - continue; - - if (use->def ()) - { - auto def = use->def ()->next_def (); - if (def && def->insn () == insn) - def = def->next_def (); - - if (def) - hazard (def->insn ()); - } - - if (!HARD_REGISTER_NUM_P (use->regno ())) - continue; - - // Also need to handle call clobbers of our uses (again WaR). - // - // See restrict_movement_for_uses_ignoring for why we don't - // need to check backwards for call clobbers. - for (auto call_group : use->ebb ()->call_clobbers ()) - { - if (!call_group->clobbers (use->resource ())) - continue; - - auto clobber_insn = next_call_clobbers_ignoring (*call_group, - use->insn (), - no_ignore); - if (clobber_insn) - hazard (clobber_insn); - } - } - - return result; -} - -// Return true iff R1 and R2 overlap. -static bool -ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2) -{ - // If either range is empty, then their intersection is empty. - if (!r1 || !r2) - return false; - - // When do they not overlap? When one range finishes before the other - // starts, i.e. (*r1.last < *r2.first || *r2.last < *r1.first). - // Inverting this, we get the below. - return *r1.last >= *r2.first && *r2.last >= *r1.first; -} - -// Get the range of insns that def feeds. -static insn_range_info get_def_range (def_info *def) -{ - insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn (); - return { def->insn (), last }; -} - -// Given a def (of memory), return the downwards range within which we -// can safely move this def. -static insn_range_info -def_downwards_move_range (def_info *def) -{ - auto range = get_def_range (def); - - auto set = dyn_cast<set_info *> (def); - if (!set || !set->has_any_uses ()) - return range; - - auto use = set->first_nondebug_insn_use (); - if (use) - range = move_earlier_than (range, use->insn ()); - - return range; -} - -// Given a def (of memory), return the upwards range within which we can -// safely move this def. -static insn_range_info -def_upwards_move_range (def_info *def) -{ - def_info *prev = def->prev_def (); - insn_range_info range { prev->insn (), def->insn () }; - - auto set = dyn_cast<set_info *> (prev); - if (!set || !set->has_any_uses ()) - return range; - - auto use = set->last_nondebug_insn_use (); - if (use) - range = move_later_than (range, use->insn ()); - - return range; -} - -// Class that implements a state machine for building the changes needed to form -// a store pair instruction. This allows us to easily build the changes in -// program order, as required by rtl-ssa. -struct stp_change_builder -{ - enum class state - { - FIRST, - INSERT, - FIXUP_USE, - LAST, - DONE - }; - - enum class action - { - TOMBSTONE, - CHANGE, - INSERT, - FIXUP_USE - }; - - struct change - { - action type; - insn_info *insn; - }; - - bool done () const { return m_state == state::DONE; } - - stp_change_builder (insn_info *insns[2], - insn_info *repurpose, - insn_info *dest) - : m_state (state::FIRST), m_insns { insns[0], insns[1] }, - m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {} - - change get_change () const - { - switch (m_state) - { - case state::FIRST: - return { - m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE, - m_insns[0] - }; - case state::LAST: - return { - m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE, - m_insns[1] - }; - case state::INSERT: - return { action::INSERT, m_dest }; - case state::FIXUP_USE: - return { action::FIXUP_USE, m_use->insn () }; - case state::DONE: - break; - } - - gcc_unreachable (); - } - - // Transition to the next state. - void advance () - { - switch (m_state) - { - case state::FIRST: - if (m_repurpose) - m_state = state::LAST; - else - m_state = state::INSERT; - break; - case state::INSERT: - { - def_info *def = memory_access (m_insns[0]->defs ()); - while (*def->next_def ()->insn () <= *m_dest) - def = def->next_def (); - - // Now we know DEF feeds the insertion point for the new stp. - // Look for any uses of DEF that will consume the new stp. - gcc_assert (*def->insn () <= *m_dest - && *def->next_def ()->insn () > *m_dest); - - auto set = as_a<set_info *> (def); - for (auto use : set->nondebug_insn_uses ()) - if (*use->insn () > *m_dest) - { - m_use = use; - break; - } - - if (m_use) - m_state = state::FIXUP_USE; - else - m_state = state::LAST; - break; - } - case state::FIXUP_USE: - m_use = m_use->next_nondebug_insn_use (); - if (!m_use) - m_state = state::LAST; - break; - case state::LAST: - m_state = state::DONE; - break; - case state::DONE: - gcc_unreachable (); - } - } - -private: - state m_state; - - // Original candidate stores. - insn_info *m_insns[2]; - - // If non-null, this is a candidate insn to change into an stp. Otherwise we - // are deleting both original insns and inserting a new insn for the stp. - insn_info *m_repurpose; - - // Destionation of the stp, it will be placed immediately after m_dest. - insn_info *m_dest; - - // Current nondebug use that needs updating due to stp insertion. - use_info *m_use; -}; - -// Given candidate store insns FIRST and SECOND, see if we can re-purpose one -// of them (together with its def of memory) for the stp insn. If so, return -// that insn. Otherwise, return null. -static insn_info * -try_repurpose_store (insn_info *first, - insn_info *second, - const insn_range_info &move_range) -{ - def_info * const defs[2] = { - memory_access (first->defs ()), - memory_access (second->defs ()) - }; - - if (move_range.includes (first) - || ranges_overlap_p (move_range, def_downwards_move_range (defs[0]))) - return first; - - if (move_range.includes (second) - || ranges_overlap_p (move_range, def_upwards_move_range (defs[1]))) - return second; - - return nullptr; -} - -// Generate the RTL pattern for a "tombstone"; used temporarily during this pass -// to replace stores that are marked for deletion where we can't immediately -// delete the store (since there are uses of mem hanging off the store). -// -// These are deleted at the end of the pass and uses re-parented appropriately -// at this point. -static rtx -gen_tombstone (void) -{ - return gen_rtx_CLOBBER (VOIDmode, - gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode))); -} - // Given a pair mode MODE, return a canonical mode to be used for a single // operand of such a pair. Currently we only use this when promoting a // non-writeback pair into a writeback pair, as it isn't otherwise clear @@ -1028,1962 +179,6 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode) } } -// Go through the reg notes rooted at NOTE, dropping those that we should drop, -// and preserving those that we want to keep by prepending them to (and -// returning) RESULT. EH_REGION is used to make sure we have at most one -// REG_EH_REGION note in the resulting list. FR_EXPR is used to return any -// REG_FRAME_RELATED_EXPR note we find, as these can need special handling in -// combine_reg_notes. -static rtx -filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr) -{ - for (; note; note = XEXP (note, 1)) - { - switch (REG_NOTE_KIND (note)) - { - case REG_DEAD: - // REG_DEAD notes aren't required to be maintained. - case REG_EQUAL: - case REG_EQUIV: - case REG_UNUSED: - case REG_NOALIAS: - // These can all be dropped. For REG_EQU{AL,IV} they cannot apply to - // non-single_set insns, and REG_UNUSED is re-computed by RTl-SSA, see - // rtl-ssa/changes.cc:update_notes. - // - // Similarly, REG_NOALIAS cannot apply to a parallel. - case REG_INC: - // When we form the pair insn, the reg update is implemented - // as just another SET in the parallel, so isn't really an - // auto-increment in the RTL sense, hence we drop the note. - break; - case REG_EH_REGION: - gcc_assert (!*eh_region); - *eh_region = true; - result = alloc_reg_note (REG_EH_REGION, XEXP (note, 0), result); - break; - case REG_CFA_DEF_CFA: - case REG_CFA_OFFSET: - case REG_CFA_RESTORE: - result = alloc_reg_note (REG_NOTE_KIND (note), - copy_rtx (XEXP (note, 0)), - result); - break; - case REG_FRAME_RELATED_EXPR: - gcc_assert (!*fr_expr); - *fr_expr = copy_rtx (XEXP (note, 0)); - break; - default: - // Unexpected REG_NOTE kind. - gcc_unreachable (); - } - } - - return result; -} - -// Return the notes that should be attached to a combination of I1 and I2, where -// *I1 < *I2. LOAD_P is true for loads. -static rtx -combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p) -{ - // Temporary storage for REG_FRAME_RELATED_EXPR notes. - rtx fr_expr[2] = {}; - - bool found_eh_region = false; - rtx result = NULL_RTX; - result = filter_notes (REG_NOTES (i2->rtl ()), result, - &found_eh_region, fr_expr); - result = filter_notes (REG_NOTES (i1->rtl ()), result, - &found_eh_region, fr_expr + 1); - - if (!load_p) - { - // Simple frame-related sp-relative saves don't need CFI notes, but when - // we combine them into an stp we will need a CFI note as dwarf2cfi can't - // interpret the unspec pair representation directly. - if (RTX_FRAME_RELATED_P (i1->rtl ()) && !fr_expr[0]) - fr_expr[0] = copy_rtx (PATTERN (i1->rtl ())); - if (RTX_FRAME_RELATED_P (i2->rtl ()) && !fr_expr[1]) - fr_expr[1] = copy_rtx (PATTERN (i2->rtl ())); - } - - rtx fr_pat = NULL_RTX; - if (fr_expr[0] && fr_expr[1]) - { - // Combining two frame-related insns, need to construct - // a REG_FRAME_RELATED_EXPR note which represents the combined - // operation. - RTX_FRAME_RELATED_P (fr_expr[1]) = 1; - fr_pat = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (2, fr_expr[0], fr_expr[1])); - } - else - fr_pat = fr_expr[0] ? fr_expr[0] : fr_expr[1]; - - if (fr_pat) - result = alloc_reg_note (REG_FRAME_RELATED_EXPR, - fr_pat, result); - - return result; -} - -// Given two memory accesses in PATS, at least one of which is of a -// writeback form, extract two non-writeback memory accesses addressed -// relative to the initial value of the base register, and output these -// in PATS. Return an rtx that represents the overall change to the -// base register. -static rtx -extract_writebacks (bool load_p, rtx pats[2], int changed) -{ - rtx base_reg = NULL_RTX; - poly_int64 current_offset = 0; - - poly_int64 offsets[2]; - - for (int i = 0; i < 2; i++) - { - rtx mem = XEXP (pats[i], load_p); - rtx reg = XEXP (pats[i], !load_p); - - rtx addr = XEXP (mem, 0); - const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC; - - poly_int64 offset; - rtx this_base = ldp_strip_offset (mem, &offset); - gcc_assert (REG_P (this_base)); - if (base_reg) - gcc_assert (rtx_equal_p (base_reg, this_base)); - else - base_reg = this_base; - - // If we changed base for the current insn, then we already - // derived the correct mem for this insn from the effective - // address of the other access. - if (i == changed) - { - gcc_checking_assert (!autoinc_p); - offsets[i] = offset; - continue; - } - - if (autoinc_p && any_pre_modify_p (addr)) - current_offset += offset; - - poly_int64 this_off = current_offset; - if (!autoinc_p) - this_off += offset; - - offsets[i] = this_off; - rtx new_mem = change_address (mem, GET_MODE (mem), - plus_constant (GET_MODE (base_reg), - base_reg, this_off)); - pats[i] = load_p - ? gen_rtx_SET (reg, new_mem) - : gen_rtx_SET (new_mem, reg); - - if (autoinc_p && any_post_modify_p (addr)) - current_offset += offset; - } - - if (known_eq (current_offset, 0)) - return NULL_RTX; - - return gen_rtx_SET (base_reg, plus_constant (GET_MODE (base_reg), - base_reg, current_offset)); -} - -// INSNS contains either {nullptr, pair insn} (when promoting an existing -// non-writeback pair) or contains the candidate insns used to form the pair -// (when fusing a new pair). -// -// PAIR_RANGE specifies where we want to form the final pair. -// INITIAL_OFFSET gives the current base offset for the pair. -// Bit I of INITIAL_WRITEBACK is set if INSNS[I] initially had writeback. -// ACCESS_SIZE gives the access size for a single arm of the pair. -// BASE_DEF gives the initial def of the base register consumed by the pair. -// -// Given the above, this function looks for a trailing destructive update of the -// base register. If there is one, we choose the first such update after -// PAIR_DST that is still in the same BB as our pair. We return the new def in -// *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT. -static insn_info * -find_trailing_add (insn_info *insns[2], - const insn_range_info &pair_range, - int initial_writeback, - rtx *writeback_effect, - def_info **add_def, - def_info *base_def, - poly_int64 initial_offset, - unsigned access_size) -{ - // Punt on frame-related insns, it is better to be conservative and - // not try to form writeback pairs here, and means we don't have to - // worry about the writeback case in forming REG_FRAME_RELATED_EXPR - // notes (see combine_reg_notes). - if ((insns[0] && RTX_FRAME_RELATED_P (insns[0]->rtl ())) - || RTX_FRAME_RELATED_P (insns[1]->rtl ())) - return nullptr; - - insn_info *pair_dst = pair_range.singleton (); - gcc_assert (pair_dst); - - def_info *def = base_def->next_def (); - - // In the case that either of the initial pair insns had writeback, - // then there will be intervening defs of the base register. - // Skip over these. - for (int i = 0; i < 2; i++) - if (initial_writeback & (1 << i)) - { - gcc_assert (def->insn () == insns[i]); - def = def->next_def (); - } - - if (!def || def->bb () != pair_dst->bb ()) - return nullptr; - - // DEF should now be the first def of the base register after PAIR_DST. - insn_info *cand = def->insn (); - gcc_assert (*cand > *pair_dst); - - const auto base_regno = base_def->regno (); - - // If CAND doesn't also use our base register, - // it can't destructively update it. - if (!find_access (cand->uses (), base_regno)) - return nullptr; - - auto rti = cand->rtl (); - - if (!INSN_P (rti)) - return nullptr; - - auto pat = PATTERN (rti); - if (GET_CODE (pat) != SET) - return nullptr; - - auto dest = XEXP (pat, 0); - if (!REG_P (dest) || REGNO (dest) != base_regno) - return nullptr; - - poly_int64 offset; - rtx rhs_base = strip_offset (XEXP (pat, 1), &offset); - if (!REG_P (rhs_base) - || REGNO (rhs_base) != base_regno - || !offset.is_constant ()) - return nullptr; - - // If the initial base offset is zero, we can handle any add offset - // (post-inc). Otherwise, we require the offsets to match (pre-inc). - if (!known_eq (initial_offset, 0) && !known_eq (offset, initial_offset)) - return nullptr; - - auto off_hwi = offset.to_constant (); - - if (off_hwi % access_size != 0) - return nullptr; - - off_hwi /= access_size; - - if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM) - return nullptr; - - auto dump_prefix = [&]() - { - if (!insns[0]) - fprintf (dump_file, "existing pair i%d: ", insns[1]->uid ()); - else - fprintf (dump_file, " (%d,%d)", - insns[0]->uid (), insns[1]->uid ()); - }; - - insn_info *hazard = latest_hazard_before (cand, nullptr, insns[1]); - if (!hazard || *hazard <= *pair_dst) - { - if (dump_file) - { - dump_prefix (); - fprintf (dump_file, - "folding in trailing add (%d) to use writeback form\n", - cand->uid ()); - } - - *add_def = def; - *writeback_effect = copy_rtx (pat); - return cand; - } - - if (dump_file) - { - dump_prefix (); - fprintf (dump_file, - "can't fold in trailing add (%d), hazard = %d\n", - cand->uid (), hazard->uid ()); - } - - return nullptr; -} - -// We just emitted a tombstone with uid UID, track it in a bitmap for -// this BB so we can easily identify it later when cleaning up tombstones. -void -ldp_bb_info::track_tombstone (int uid) -{ - if (!m_emitted_tombstone) - { - // Lazily initialize the bitmap for tracking tombstone insns. - bitmap_obstack_initialize (&m_bitmap_obstack); - bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack); - m_emitted_tombstone = true; - } - - if (!bitmap_set_bit (&m_tombstone_bitmap, uid)) - gcc_unreachable (); // Bit should have changed. -} - -// Reset the debug insn containing USE (the debug insn has been -// optimized away). -static void -reset_debug_use (use_info *use) -{ - auto use_insn = use->insn (); - auto use_rtl = use_insn->rtl (); - insn_change change (use_insn); - change.new_uses = {}; - INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC (); - crtl->ssa->change_insn (change); -} - -// USE is a debug use that needs updating because DEF (a def of the same -// register) is being re-ordered over it. If BASE is non-null, then DEF -// is an update of the register BASE by a constant, given by WB_OFFSET, -// and we can preserve debug info by accounting for the change in side -// effects. -static void -fixup_debug_use (obstack_watermark &attempt, - use_info *use, - def_info *def, - rtx base, - poly_int64 wb_offset) -{ - auto use_insn = use->insn (); - if (base) - { - auto use_rtl = use_insn->rtl (); - insn_change change (use_insn); - - gcc_checking_assert (REG_P (base) && use->regno () == REGNO (base)); - change.new_uses = check_remove_regno_access (attempt, - change.new_uses, - use->regno ()); - - // The effect of the writeback is to add WB_OFFSET to BASE. If - // we're re-ordering DEF below USE, then we update USE by adding - // WB_OFFSET to it. Otherwise, if we're re-ordering DEF above - // USE, we update USE by undoing the effect of the writeback - // (subtracting WB_OFFSET). - use_info *new_use; - if (*def->insn () > *use_insn) - { - // We now need USE_INSN to consume DEF. Create a new use of DEF. - // - // N.B. this means until we call change_insns for the main change - // group we will temporarily have a debug use consuming a def that - // comes after it, but RTL-SSA doesn't currently support updating - // debug insns as part of the main change group (together with - // nondebug changes), so we will have to live with this update - // leaving the IR being temporarily inconsistent. It seems to - // work out OK once the main change group is applied. - wb_offset *= -1; - new_use = crtl->ssa->create_use (attempt, - use_insn, - as_a<set_info *> (def)); - } - else - new_use = find_access (def->insn ()->uses (), use->regno ()); - - change.new_uses = insert_access (attempt, new_use, change.new_uses); - - if (dump_file) - { - const char *dir = (*def->insn () < *use_insn) ? "down" : "up"; - pretty_printer pp; - pp_string (&pp, "["); - pp_access (&pp, use, 0); - pp_string (&pp, "]"); - pp_string (&pp, " due to wb def "); - pp_string (&pp, "["); - pp_access (&pp, def, 0); - pp_string (&pp, "]"); - fprintf (dump_file, - " i%d: fix up debug use %s re-ordered %s, " - "sub r%u -> r%u + ", - use_insn->uid (), pp_formatted_text (&pp), - dir, REGNO (base), REGNO (base)); - print_dec (wb_offset, dump_file); - fprintf (dump_file, "\n"); - } - - insn_propagation prop (use_rtl, base, - plus_constant (GET_MODE (base), base, wb_offset)); - if (prop.apply_to_pattern (&INSN_VAR_LOCATION_LOC (use_rtl))) - crtl->ssa->change_insn (change); - else - { - if (dump_file) - fprintf (dump_file, " i%d: RTL substitution failed (%s)" - ", resetting debug insn", use_insn->uid (), - prop.failure_reason); - reset_debug_use (use); - } - } - else - { - if (dump_file) - { - pretty_printer pp; - pp_string (&pp, "["); - pp_access (&pp, use, 0); - pp_string (&pp, "] due to re-ordered load def ["); - pp_access (&pp, def, 0); - pp_string (&pp, "]"); - fprintf (dump_file, " i%d: resetting debug use %s\n", - use_insn->uid (), pp_formatted_text (&pp)); - } - reset_debug_use (use); - } -} - -// Update debug uses when folding in a trailing add insn to form a -// writeback pair. -// -// ATTEMPT is used to allocate RTL-SSA temporaries for the changes, -// the final pair is placed immediately after PAIR_DST, TRAILING_ADD -// is a trailing add insn which is being folded into the pair to make it -// use writeback addressing, and WRITEBACK_EFFECT is the pattern for -// TRAILING_ADD. -static void -fixup_debug_uses_trailing_add (obstack_watermark &attempt, - insn_info *pair_dst, - insn_info *trailing_add, - rtx writeback_effect) -{ - rtx base = SET_DEST (writeback_effect); - - poly_int64 wb_offset; - rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset); - gcc_checking_assert (rtx_equal_p (base, base2)); - - auto defs = trailing_add->defs (); - gcc_checking_assert (defs.size () == 1); - def_info *def = defs[0]; - - if (auto set = safe_dyn_cast<set_info *> (def->prev_def ())) - for (auto use : iterate_safely (set->debug_insn_uses ())) - if (*use->insn () > *pair_dst) - // DEF is getting re-ordered above USE, fix up USE accordingly. - fixup_debug_use (attempt, use, def, base, wb_offset); -} - -// Called from fuse_pair, fixes up any debug uses that will be affected -// by the changes. -// -// ATTEMPT is the obstack watermark used to allocate RTL-SSA temporaries for -// the changes, INSNS gives the candidate insns: at this point the use/def -// information should still be as on entry to fuse_pair, but the patterns may -// have changed, hence we pass ORIG_RTL which contains the original patterns -// for the candidate insns. -// -// The final pair will be placed immediately after PAIR_DST, LOAD_P is true if -// it is a load pair, bit I of WRITEBACK is set if INSNS[I] originally had -// writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to -// the base register in the final pair (if any). BASE_REGNO gives the register -// number of the base register used in the final pair. -static void -fixup_debug_uses (obstack_watermark &attempt, - insn_info *insns[2], - rtx orig_rtl[2], - insn_info *pair_dst, - insn_info *trailing_add, - bool load_p, - int writeback, - rtx writeback_effect, - unsigned base_regno) -{ - // USE is a debug use that needs updating because DEF (a def of the - // resource) is being re-ordered over it. If WRITEBACK_PAT is non-NULL, - // then it gives the original RTL pattern for DEF's insn, and DEF is a - // writeback update of the base register. - // - // This simply unpacks WRITEBACK_PAT if needed and calls fixup_debug_use. - auto update_debug_use = [&](use_info *use, def_info *def, - rtx writeback_pat) - { - poly_int64 offset = 0; - rtx base = NULL_RTX; - if (writeback_pat) - { - rtx mem = XEXP (writeback_pat, load_p); - gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) - == RTX_AUTOINC); - - base = ldp_strip_offset (mem, &offset); - gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno); - } - fixup_debug_use (attempt, use, def, base, offset); - }; - - // Reset any debug uses of mem over which we re-ordered a store. - // - // It would be nice to try and preserve debug info here, but it seems that - // would require doing alias analysis to see if the store aliases with the - // debug use, which seems a little extravagant just to preserve debug info. - if (!load_p) - { - auto def = memory_access (insns[0]->defs ()); - auto last_def = memory_access (insns[1]->defs ()); - for (; def != last_def; def = def->next_def ()) - { - auto set = as_a<set_info *> (def); - for (auto use : iterate_safely (set->debug_insn_uses ())) - { - if (dump_file) - fprintf (dump_file, " i%d: resetting debug use of mem\n", - use->insn ()->uid ()); - reset_debug_use (use); - } - } - } - - // Now let's take care of register uses, starting with debug uses - // attached to defs from our first insn. - for (auto def : insns[0]->defs ()) - { - auto set = dyn_cast<set_info *> (def); - if (!set || set->is_mem () || !set->first_debug_insn_use ()) - continue; - - def_info *defs[2] = { - def, - find_access (insns[1]->defs (), def->regno ()) - }; - - rtx writeback_pats[2] = {}; - if (def->regno () == base_regno) - for (int i = 0; i < 2; i++) - if (writeback & (1 << i)) - { - gcc_checking_assert (defs[i]); - writeback_pats[i] = orig_rtl[i]; - } - - // Now that we've characterized the defs involved, go through the - // debug uses and determine how to update them (if needed). - for (auto use : iterate_safely (set->debug_insn_uses ())) - { - if (*pair_dst < *use->insn () && defs[1]) - // We're re-ordering defs[1] above a previous use of the - // same resource. - update_debug_use (use, defs[1], writeback_pats[1]); - else if (*pair_dst >= *use->insn ()) - // We're re-ordering defs[0] below its use. - update_debug_use (use, defs[0], writeback_pats[0]); - } - } - - // Now let's look at registers which are def'd by the second insn - // but not by the first insn, there may still be debug uses of a - // previous def which can be affected by moving the second insn up. - for (auto def : insns[1]->defs ()) - { - // This should be M log N where N is the number of defs in - // insns[0] and M is the number of defs in insns[1]. - if (def->is_mem () || find_access (insns[0]->defs (), def->regno ())) - continue; - - auto prev_set = safe_dyn_cast<set_info *> (def->prev_def ()); - if (!prev_set) - continue; - - rtx writeback_pat = NULL_RTX; - if (def->regno () == base_regno && (writeback & 2)) - writeback_pat = orig_rtl[1]; - - // We have a def in insns[1] which isn't def'd by the first insn. - // Look to the previous def and see if it has any debug uses. - for (auto use : iterate_safely (prev_set->debug_insn_uses ())) - if (*pair_dst < *use->insn ()) - // We're ordering DEF above a previous use of the same register. - update_debug_use (use, def, writeback_pat); - } - - if ((writeback & 2) && !writeback_effect) - { - // If the second insn initially had writeback but the final - // pair does not, then there may be trailing debug uses of the - // second writeback def which need re-parenting: do that. - auto def = find_access (insns[1]->defs (), base_regno); - gcc_assert (def); - auto set = as_a<set_info *> (def); - for (auto use : iterate_safely (set->debug_insn_uses ())) - { - insn_change change (use->insn ()); - change.new_uses = check_remove_regno_access (attempt, - change.new_uses, - base_regno); - auto new_use = find_access (insns[0]->uses (), base_regno); - - // N.B. insns must have already shared a common base due to writeback. - gcc_assert (new_use); - - if (dump_file) - fprintf (dump_file, - " i%d: cancelling wb, re-parenting trailing debug use\n", - use->insn ()->uid ()); - - change.new_uses = insert_access (attempt, new_use, change.new_uses); - crtl->ssa->change_insn (change); - } - } - else if (trailing_add) - fixup_debug_uses_trailing_add (attempt, pair_dst, trailing_add, - writeback_effect); -} - -// Try and actually fuse the pair given by insns I1 and I2. -// -// Here we've done enough analysis to know this is safe, we only -// reject the pair at this stage if either the tuning policy says to, -// or recog fails on the final pair insn. -// -// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each -// candidate insn. Bit i of WRITEBACK is set if the ith insn (in program -// order) uses writeback. -// -// BASE gives the chosen base candidate for the pair and MOVE_RANGE is -// a singleton range which says where to place the pair. -bool -ldp_bb_info::fuse_pair (bool load_p, - unsigned access_size, - int writeback, - insn_info *i1, insn_info *i2, - base_cand &base, - const insn_range_info &move_range) -{ - auto attempt = crtl->ssa->new_change_attempt (); - - auto make_change = [&attempt](insn_info *insn) - { - return crtl->ssa->change_alloc<insn_change> (attempt, insn); - }; - auto make_delete = [&attempt](insn_info *insn) - { - return crtl->ssa->change_alloc<insn_change> (attempt, - insn, - insn_change::DELETE); - }; - - insn_info *first = (*i1 < *i2) ? i1 : i2; - insn_info *second = (first == i1) ? i2 : i1; - - insn_info *pair_dst = move_range.singleton (); - gcc_assert (pair_dst); - - insn_info *insns[2] = { first, second }; - - auto_vec<insn_change *> changes; - auto_vec<int, 2> tombstone_uids (2); - - rtx pats[2] = { - PATTERN (first->rtl ()), - PATTERN (second->rtl ()) - }; - - // Make copies of the patterns as we might need to refer to the original RTL - // later, for example when updating debug uses (which is after we've updated - // one or both of the patterns in the candidate insns). - rtx orig_rtl[2]; - for (int i = 0; i < 2; i++) - orig_rtl[i] = copy_rtx (pats[i]); - - use_array input_uses[2] = { first->uses (), second->uses () }; - def_array input_defs[2] = { first->defs (), second->defs () }; - - int changed_insn = -1; - if (base.from_insn != -1) - { - // If we're not already using a shared base, we need - // to re-write one of the accesses to use the base from - // the other insn. - gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1); - changed_insn = !base.from_insn; - - rtx base_pat = pats[base.from_insn]; - rtx change_pat = pats[changed_insn]; - rtx base_mem = XEXP (base_pat, load_p); - rtx change_mem = XEXP (change_pat, load_p); - - const bool lower_base_p = (insns[base.from_insn] == i1); - HOST_WIDE_INT adjust_amt = access_size; - if (!lower_base_p) - adjust_amt *= -1; - - rtx change_reg = XEXP (change_pat, !load_p); - machine_mode mode_for_mem = GET_MODE (change_mem); - rtx effective_base = drop_writeback (base_mem); - rtx new_mem = adjust_address_nv (effective_base, - mode_for_mem, - adjust_amt); - rtx new_set = load_p - ? gen_rtx_SET (change_reg, new_mem) - : gen_rtx_SET (new_mem, change_reg); - - pats[changed_insn] = new_set; - - auto keep_use = [&](use_info *u) - { - return refers_to_regno_p (u->regno (), u->regno () + 1, - change_pat, &XEXP (change_pat, load_p)); - }; - - // Drop any uses that only occur in the old address. - input_uses[changed_insn] = filter_accesses (attempt, - input_uses[changed_insn], - keep_use); - } - - rtx writeback_effect = NULL_RTX; - if (writeback) - writeback_effect = extract_writebacks (load_p, pats, changed_insn); - - const auto base_regno = base.def->regno (); - - if (base.from_insn == -1 && (writeback & 1)) - { - // If the first of the candidate insns had a writeback form, we'll need to - // drop the use of the updated base register from the second insn's uses. - // - // N.B. we needn't worry about the base register occurring as a store - // operand, as we checked that there was no non-address true dependence - // between the insns in try_fuse_pair. - gcc_checking_assert (find_access (input_uses[1], base_regno)); - input_uses[1] = check_remove_regno_access (attempt, - input_uses[1], - base_regno); - } - - // Go through and drop uses that only occur in register notes, - // as we won't be preserving those. - for (int i = 0; i < 2; i++) - { - auto rti = insns[i]->rtl (); - if (!REG_NOTES (rti)) - continue; - - input_uses[i] = remove_note_accesses (attempt, input_uses[i]); - } - - // Edge case: if the first insn is a writeback load and the - // second insn is a non-writeback load which transfers into the base - // register, then we should drop the writeback altogether as the - // update of the base register from the second load should prevail. - // - // For example: - // ldr x2, [x1], #8 - // ldr x1, [x1] - // --> - // ldp x2, x1, [x1] - if (writeback == 1 - && load_p - && find_access (input_defs[1], base_regno)) - { - if (dump_file) - fprintf (dump_file, - " ldp: i%d has wb but subsequent i%d has non-wb " - "update of base (r%d), dropping wb\n", - insns[0]->uid (), insns[1]->uid (), base_regno); - gcc_assert (writeback_effect); - writeback_effect = NULL_RTX; - } - - // So far the patterns have been in instruction order, - // now we want them in offset order. - if (i1 != first) - std::swap (pats[0], pats[1]); - - poly_int64 offsets[2]; - for (int i = 0; i < 2; i++) - { - rtx mem = XEXP (pats[i], load_p); - gcc_checking_assert (MEM_P (mem)); - rtx base = strip_offset (XEXP (mem, 0), offsets + i); - gcc_checking_assert (REG_P (base)); - gcc_checking_assert (base_regno == REGNO (base)); - } - - // If either of the original insns had writeback, but the resulting pair insn - // does not (can happen e.g. in the ldp edge case above, or if the writeback - // effects cancel out), then drop the def(s) of the base register as - // appropriate. - // - // Also drop the first def in the case that both of the original insns had - // writeback. The second def could well have uses, but the first def should - // only be used by the second insn (and we dropped that use above). - for (int i = 0; i < 2; i++) - if ((!writeback_effect && (writeback & (1 << i))) - || (i == 0 && writeback == 3)) - input_defs[i] = check_remove_regno_access (attempt, - input_defs[i], - base_regno); - - // If we don't currently have a writeback pair, and we don't have - // a load that clobbers the base register, look for a trailing destructive - // update of the base register and try and fold it in to make this into a - // writeback pair. - insn_info *trailing_add = nullptr; - if (aarch64_ldp_writeback > 1 - && !writeback_effect - && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1, - XEXP (pats[0], 0), nullptr) - && !refers_to_regno_p (base_regno, base_regno + 1, - XEXP (pats[1], 0), nullptr)))) - { - def_info *add_def; - trailing_add = find_trailing_add (insns, move_range, writeback, - &writeback_effect, - &add_def, base.def, offsets[0], - access_size); - if (trailing_add) - { - // The def of the base register from the trailing add should prevail. - input_defs[0] = insert_access (attempt, add_def, input_defs[0]); - gcc_assert (input_defs[0].is_valid ()); - } - } - - // Now that we know what base mem we're going to use, check if it's OK - // with the ldp/stp policy. - rtx first_mem = XEXP (pats[0], load_p); - if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem, - load_p, - GET_MODE (first_mem))) - { - if (dump_file) - fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n", - i1->uid (), i2->uid ()); - return false; - } - - rtx reg_notes = combine_reg_notes (first, second, load_p); - - rtx pair_pat; - if (writeback_effect) - { - auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]); - pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec); - } - else if (load_p) - pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0), - XEXP (pats[1], 0), - XEXP (pats[0], 1)); - else - pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0), - XEXP (pats[0], 1), - XEXP (pats[1], 1)); - - insn_change *pair_change = nullptr; - auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { - rtx_insn *rti = change->insn ()->rtl (); - validate_unshare_change (rti, &PATTERN (rti), pair_pat, true); - validate_change (rti, ®_NOTES (rti), reg_notes, true); - }; - - if (load_p) - { - changes.safe_push (make_delete (first)); - pair_change = make_change (second); - changes.safe_push (pair_change); - - pair_change->move_range = move_range; - pair_change->new_defs = merge_access_arrays (attempt, - input_defs[0], - input_defs[1]); - gcc_assert (pair_change->new_defs.is_valid ()); - - pair_change->new_uses - = merge_access_arrays (attempt, - drop_memory_access (input_uses[0]), - drop_memory_access (input_uses[1])); - gcc_assert (pair_change->new_uses.is_valid ()); - set_pair_pat (pair_change); - } - else - { - using Action = stp_change_builder::action; - insn_info *store_to_change = try_repurpose_store (first, second, - move_range); - stp_change_builder builder (insns, store_to_change, pair_dst); - insn_change *change; - set_info *new_set = nullptr; - for (; !builder.done (); builder.advance ()) - { - auto action = builder.get_change (); - change = (action.type == Action::INSERT) - ? nullptr : make_change (action.insn); - switch (action.type) - { - case Action::CHANGE: - { - set_pair_pat (change); - change->new_uses = merge_access_arrays (attempt, - input_uses[0], - input_uses[1]); - auto d1 = drop_memory_access (input_defs[0]); - auto d2 = drop_memory_access (input_defs[1]); - change->new_defs = merge_access_arrays (attempt, d1, d2); - gcc_assert (change->new_defs.is_valid ()); - def_info *stp_def = memory_access (change->insn ()->defs ()); - change->new_defs = insert_access (attempt, - stp_def, - change->new_defs); - gcc_assert (change->new_defs.is_valid ()); - change->move_range = move_range; - pair_change = change; - break; - } - case Action::TOMBSTONE: - { - tombstone_uids.quick_push (change->insn ()->uid ()); - rtx_insn *rti = change->insn ()->rtl (); - validate_change (rti, &PATTERN (rti), gen_tombstone (), true); - validate_change (rti, ®_NOTES (rti), NULL_RTX, true); - change->new_uses = use_array (nullptr, 0); - break; - } - case Action::INSERT: - { - if (dump_file) - fprintf (dump_file, - " stp: cannot re-purpose candidate stores\n"); - - auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat); - change = make_change (new_insn); - change->move_range = move_range; - change->new_uses = merge_access_arrays (attempt, - input_uses[0], - input_uses[1]); - gcc_assert (change->new_uses.is_valid ()); - - auto d1 = drop_memory_access (input_defs[0]); - auto d2 = drop_memory_access (input_defs[1]); - change->new_defs = merge_access_arrays (attempt, d1, d2); - gcc_assert (change->new_defs.is_valid ()); - - new_set = crtl->ssa->create_set (attempt, new_insn, memory); - change->new_defs = insert_access (attempt, new_set, - change->new_defs); - gcc_assert (change->new_defs.is_valid ()); - pair_change = change; - break; - } - case Action::FIXUP_USE: - { - // This use now needs to consume memory from our stp. - if (dump_file) - fprintf (dump_file, - " stp: changing i%d to use mem from new stp " - "(after i%d)\n", - action.insn->uid (), pair_dst->uid ()); - change->new_uses = drop_memory_access (change->new_uses); - gcc_assert (new_set); - auto new_use = crtl->ssa->create_use (attempt, action.insn, - new_set); - change->new_uses = insert_access (attempt, new_use, - change->new_uses); - break; - } - } - changes.safe_push (change); - } - } - - if (trailing_add) - changes.safe_push (make_delete (trailing_add)); - else if ((writeback & 2) && !writeback_effect) - { - // The second insn initially had writeback but now the pair does not, - // need to update any nondebug uses of the base register def in the - // second insn. We'll take care of debug uses later. - auto def = find_access (insns[1]->defs (), base_regno); - gcc_assert (def); - auto set = dyn_cast<set_info *> (def); - if (set && set->has_nondebug_uses ()) - { - auto orig_use = find_access (insns[0]->uses (), base_regno); - for (auto use : set->nondebug_insn_uses ()) - { - auto change = make_change (use->insn ()); - change->new_uses = check_remove_regno_access (attempt, - change->new_uses, - base_regno); - change->new_uses = insert_access (attempt, - orig_use, - change->new_uses); - changes.safe_push (change); - } - } - } - - auto is_changing = insn_is_changing (changes); - for (unsigned i = 0; i < changes.length (); i++) - gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing)); - - // Check the pair pattern is recog'd. - if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing)) - { - if (dump_file) - fprintf (dump_file, " failed to form pair, recog failed\n"); - - // Free any reg notes we allocated. - while (reg_notes) - { - rtx next = XEXP (reg_notes, 1); - free_EXPR_LIST_node (reg_notes); - reg_notes = next; - } - cancel_changes (0); - return false; - } - - gcc_assert (crtl->ssa->verify_insn_changes (changes)); - - // Fix up any debug uses that will be affected by the changes. - if (MAY_HAVE_DEBUG_INSNS) - fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add, - load_p, writeback, writeback_effect, base_regno); - - confirm_change_group (); - crtl->ssa->change_insns (changes); - - gcc_checking_assert (tombstone_uids.length () <= 2); - for (auto uid : tombstone_uids) - track_tombstone (uid); - - return true; -} - -// Return true if STORE_INSN may modify mem rtx MEM. Make sure we keep -// within our BUDGET for alias analysis. -static bool -store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget) -{ - if (!budget) - { - if (dump_file) - { - fprintf (dump_file, - "exceeded budget, assuming store %d aliases with mem ", - store_insn->uid ()); - print_simple_rtl (dump_file, mem); - fprintf (dump_file, "\n"); - } - - return true; - } - - budget--; - return memory_modified_in_insn_p (mem, store_insn->rtl ()); -} - -// Return true if LOAD may be modified by STORE. Make sure we keep -// within our BUDGET for alias analysis. -static bool -load_modified_by_store_p (insn_info *load, - insn_info *store, - int &budget) -{ - gcc_checking_assert (budget >= 0); - - if (!budget) - { - if (dump_file) - { - fprintf (dump_file, - "exceeded budget, assuming load %d aliases with store %d\n", - load->uid (), store->uid ()); - } - return true; - } - - // It isn't safe to re-order stores over calls. - if (CALL_P (load->rtl ())) - return true; - - budget--; - - // Iterate over all MEMs in the load, seeing if any alias with - // our store. - subrtx_var_iterator::array_type array; - rtx pat = PATTERN (load->rtl ()); - FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST) - if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ())) - return true; - - return false; -} - -// Virtual base class for load/store walkers used in alias analysis. -struct alias_walker -{ - virtual bool conflict_p (int &budget) const = 0; - virtual insn_info *insn () const = 0; - virtual bool valid () const = 0; - virtual void advance () = 0; -}; - -// Implement some common functionality used by both store_walker -// and load_walker. -template<bool reverse> -class def_walker : public alias_walker -{ -protected: - using def_iter_t = typename std::conditional<reverse, - reverse_def_iterator, def_iterator>::type; - - static use_info *start_use_chain (def_iter_t &def_iter) - { - set_info *set = nullptr; - for (; *def_iter; def_iter++) - { - set = dyn_cast<set_info *> (*def_iter); - if (!set) - continue; - - use_info *use = reverse - ? set->last_nondebug_insn_use () - : set->first_nondebug_insn_use (); - - if (use) - return use; - } - - return nullptr; - } - - def_iter_t def_iter; - insn_info *limit; - def_walker (def_info *def, insn_info *limit) : - def_iter (def), limit (limit) {} - - virtual bool iter_valid () const { return *def_iter; } - -public: - insn_info *insn () const override { return (*def_iter)->insn (); } - void advance () override { def_iter++; } - bool valid () const override final - { - if (!iter_valid ()) - return false; - - if (reverse) - return *(insn ()) > *limit; - else - return *(insn ()) < *limit; - } -}; - -// alias_walker that iterates over stores. -template<bool reverse, typename InsnPredicate> -class store_walker : public def_walker<reverse> -{ - rtx cand_mem; - InsnPredicate tombstone_p; - -public: - store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn, - InsnPredicate tombstone_fn) : - def_walker<reverse> (mem_def, limit_insn), - cand_mem (mem), tombstone_p (tombstone_fn) {} - - bool conflict_p (int &budget) const override final - { - if (tombstone_p (this->insn ())) - return false; - - return store_modifies_mem_p (cand_mem, this->insn (), budget); - } -}; - -// alias_walker that iterates over loads. -template<bool reverse> -class load_walker : public def_walker<reverse> -{ - using Base = def_walker<reverse>; - using use_iter_t = typename std::conditional<reverse, - reverse_use_iterator, nondebug_insn_use_iterator>::type; - - use_iter_t use_iter; - insn_info *cand_store; - - bool iter_valid () const override final { return *use_iter; } - -public: - void advance () override final - { - use_iter++; - if (*use_iter) - return; - this->def_iter++; - use_iter = Base::start_use_chain (this->def_iter); - } - - insn_info *insn () const override final - { - return (*use_iter)->insn (); - } - - bool conflict_p (int &budget) const override final - { - return load_modified_by_store_p (insn (), cand_store, budget); - } - - load_walker (def_info *def, insn_info *store, insn_info *limit_insn) - : Base (def, limit_insn), - use_iter (Base::start_use_chain (this->def_iter)), - cand_store (store) {} -}; - -// Process our alias_walkers in a round-robin fashion, proceeding until -// nothing more can be learned from alias analysis. -// -// We try to maintain the invariant that if a walker becomes invalid, we -// set its pointer to null. -static void -do_alias_analysis (insn_info *alias_hazards[4], - alias_walker *walkers[4], - bool load_p) -{ - const int n_walkers = 2 + (2 * !load_p); - int budget = aarch64_ldp_alias_check_limit; - - auto next_walker = [walkers,n_walkers](int current) -> int { - for (int j = 1; j <= n_walkers; j++) - { - int idx = (current + j) % n_walkers; - if (walkers[idx]) - return idx; - } - return -1; - }; - - int i = -1; - for (int j = 0; j < n_walkers; j++) - { - alias_hazards[j] = nullptr; - if (!walkers[j]) - continue; - - if (!walkers[j]->valid ()) - walkers[j] = nullptr; - else if (i == -1) - i = j; - } - - while (i >= 0) - { - int insn_i = i % 2; - int paired_i = (i & 2) + !insn_i; - int pair_fst = (i & 2); - int pair_snd = (i & 2) + 1; - - if (walkers[i]->conflict_p (budget)) - { - alias_hazards[i] = walkers[i]->insn (); - - // We got an aliasing conflict for this {load,store} walker, - // so we don't need to walk any further. - walkers[i] = nullptr; - - // If we have a pair of alias conflicts that prevent - // forming the pair, stop. There's no need to do further - // analysis. - if (alias_hazards[paired_i] - && (*alias_hazards[pair_fst] <= *alias_hazards[pair_snd])) - return; - - if (!load_p) - { - int other_pair_fst = (pair_fst ? 0 : 2); - int other_paired_i = other_pair_fst + !insn_i; - - int x_pair_fst = (i == pair_fst) ? i : other_paired_i; - int x_pair_snd = (i == pair_fst) ? other_paired_i : i; - - // Similarly, handle the case where we have a {load,store} - // or {store,load} alias hazard pair that prevents forming - // the pair. - if (alias_hazards[other_paired_i] - && *alias_hazards[x_pair_fst] <= *alias_hazards[x_pair_snd]) - return; - } - } - - if (walkers[i]) - { - walkers[i]->advance (); - - if (!walkers[i]->valid ()) - walkers[i] = nullptr; - } - - i = next_walker (i); - } -} - -// Given INSNS (in program order) which are known to be adjacent, look -// to see if either insn has a suitable RTL (register) base that we can -// use to form a pair. Push these to BASE_CANDS if we find any. CAND_MEMs -// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the -// size of a single candidate access, and REVERSED says whether the accesses -// are inverted in offset order. -// -// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback -// addressing. -static int -get_viable_bases (insn_info *insns[2], - vec<base_cand> &base_cands, - rtx cand_mems[2], - unsigned access_size, - bool reversed) -{ - // We discovered this pair through a common base. Need to ensure that - // we have a common base register that is live at both locations. - def_info *base_defs[2] = {}; - int writeback = 0; - for (int i = 0; i < 2; i++) - { - const bool is_lower = (i == reversed); - poly_int64 poly_off; - rtx base = ldp_strip_offset (cand_mems[i], &poly_off); - if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC) - writeback |= (1 << i); - - if (!REG_P (base) || !poly_off.is_constant ()) - continue; - - // Punt on accesses relative to eliminable regs. See the comment in - // ldp_bb_info::track_access for a detailed explanation of this. - if (!reload_completed - && (REGNO (base) == FRAME_POINTER_REGNUM - || REGNO (base) == ARG_POINTER_REGNUM)) - continue; - - HOST_WIDE_INT base_off = poly_off.to_constant (); - - // It should be unlikely that we ever punt here, since MEM_EXPR offset - // alignment should be a good proxy for register offset alignment. - if (base_off % access_size != 0) - { - if (dump_file) - fprintf (dump_file, - "base not viable, offset misaligned (insn %d)\n", - insns[i]->uid ()); - continue; - } - - base_off /= access_size; - - if (!is_lower) - base_off--; - - if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM) - continue; - - use_info *use = find_access (insns[i]->uses (), REGNO (base)); - gcc_assert (use); - base_defs[i] = use->def (); - } - - if (!base_defs[0] && !base_defs[1]) - { - if (dump_file) - fprintf (dump_file, "no viable base register for pair (%d,%d)\n", - insns[0]->uid (), insns[1]->uid ()); - return writeback; - } - - for (int i = 0; i < 2; i++) - if ((writeback & (1 << i)) && !base_defs[i]) - { - if (dump_file) - fprintf (dump_file, "insn %d has writeback but base isn't viable\n", - insns[i]->uid ()); - return writeback; - } - - if (writeback == 3 - && base_defs[0]->regno () != base_defs[1]->regno ()) - { - if (dump_file) - fprintf (dump_file, - "pair (%d,%d): double writeback with distinct regs (%d,%d): " - "punting\n", - insns[0]->uid (), insns[1]->uid (), - base_defs[0]->regno (), base_defs[1]->regno ()); - return writeback; - } - - if (base_defs[0] && base_defs[1] - && base_defs[0]->regno () == base_defs[1]->regno ()) - { - // Easy case: insns already share the same base reg. - base_cands.quick_push (base_defs[0]); - return writeback; - } - - // Otherwise, we know that one of the bases must change. - // - // Note that if there is writeback we must use the writeback base - // (we know now there is exactly one). - for (int i = 0; i < 2; i++) - if (base_defs[i] && (!writeback || (writeback & (1 << i)))) - base_cands.quick_push (base_cand { base_defs[i], i }); - - return writeback; -} - -// Given two adjacent memory accesses of the same size, I1 and I2, try -// and see if we can merge them into a ldp or stp. -// -// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true -// if the accesses are both loads, otherwise they are both stores. -bool -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size, - insn_info *i1, insn_info *i2) -{ - if (dump_file) - fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n", - load_p, i1->uid (), i2->uid ()); - - insn_info *insns[2]; - bool reversed = false; - if (*i1 < *i2) - { - insns[0] = i1; - insns[1] = i2; - } - else - { - insns[0] = i2; - insns[1] = i1; - reversed = true; - } - - rtx cand_mems[2]; - rtx reg_ops[2]; - rtx pats[2]; - for (int i = 0; i < 2; i++) - { - pats[i] = PATTERN (insns[i]->rtl ()); - cand_mems[i] = XEXP (pats[i], load_p); - reg_ops[i] = XEXP (pats[i], !load_p); - } - - if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) - { - if (dump_file) - fprintf (dump_file, - "punting on ldp due to reg conflcits (%d,%d)\n", - insns[0]->uid (), insns[1]->uid ()); - return false; - } - - if (cfun->can_throw_non_call_exceptions - && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX) - && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX)) - { - if (dump_file) - fprintf (dump_file, - "can't combine insns with EH side effects (%d,%d)\n", - insns[0]->uid (), insns[1]->uid ()); - return false; - } - - auto_vec<base_cand, 2> base_cands (2); - - int writeback = get_viable_bases (insns, base_cands, cand_mems, - access_size, reversed); - if (base_cands.is_empty ()) - { - if (dump_file) - fprintf (dump_file, "no viable base for pair (%d,%d)\n", - insns[0]->uid (), insns[1]->uid ()); - return false; - } - - // Punt on frame-related insns with writeback. We probably won't see - // these in practice, but this is conservative and ensures we don't - // have to worry about these later on. - if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ()) - || RTX_FRAME_RELATED_P (i2->rtl ()))) - { - if (dump_file) - fprintf (dump_file, - "rejecting pair (%d,%d): frame-related insn with writeback\n", - i1->uid (), i2->uid ()); - return false; - } - - rtx *ignore = &XEXP (pats[1], load_p); - for (auto use : insns[1]->uses ()) - if (!use->is_mem () - && refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore) - && use->def () && use->def ()->insn () == insns[0]) - { - // N.B. we allow a true dependence on the base address, as this - // happens in the case of auto-inc accesses. Consider a post-increment - // load followed by a regular indexed load, for example. - if (dump_file) - fprintf (dump_file, - "%d has non-address true dependence on %d, rejecting pair\n", - insns[1]->uid (), insns[0]->uid ()); - return false; - } - - unsigned i = 0; - while (i < base_cands.length ()) - { - base_cand &cand = base_cands[i]; - - rtx *ignore[2] = {}; - for (int j = 0; j < 2; j++) - if (cand.from_insn == !j) - ignore[j] = &XEXP (cand_mems[j], 0); - - insn_info *h = first_hazard_after (insns[0], ignore[0]); - if (h && *h < *insns[1]) - cand.hazards[0] = h; - - h = latest_hazard_before (insns[1], ignore[1]); - if (h && *h > *insns[0]) - cand.hazards[1] = h; - - if (!cand.viable ()) - { - if (dump_file) - fprintf (dump_file, - "pair (%d,%d): rejecting base %d due to dataflow " - "hazards (%d,%d)\n", - insns[0]->uid (), - insns[1]->uid (), - cand.def->regno (), - cand.hazards[0]->uid (), - cand.hazards[1]->uid ()); - - base_cands.ordered_remove (i); - } - else - i++; - } - - if (base_cands.is_empty ()) - { - if (dump_file) - fprintf (dump_file, - "can't form pair (%d,%d) due to dataflow hazards\n", - insns[0]->uid (), insns[1]->uid ()); - return false; - } - - insn_info *alias_hazards[4] = {}; - - // First def of memory after the first insn, and last def of memory - // before the second insn, respectively. - def_info *mem_defs[2] = {}; - if (load_p) - { - if (!MEM_READONLY_P (cand_mems[0])) - { - mem_defs[0] = memory_access (insns[0]->uses ())->def (); - gcc_checking_assert (mem_defs[0]); - mem_defs[0] = mem_defs[0]->next_def (); - } - if (!MEM_READONLY_P (cand_mems[1])) - { - mem_defs[1] = memory_access (insns[1]->uses ())->def (); - gcc_checking_assert (mem_defs[1]); - } - } - else - { - mem_defs[0] = memory_access (insns[0]->defs ())->next_def (); - mem_defs[1] = memory_access (insns[1]->defs ())->prev_def (); - gcc_checking_assert (mem_defs[0]); - gcc_checking_assert (mem_defs[1]); - } - - auto tombstone_p = [&](insn_info *insn) -> bool { - return m_emitted_tombstone - && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()); - }; - - store_walker<false, decltype(tombstone_p)> - forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p); - - store_walker<true, decltype(tombstone_p)> - backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p); - - alias_walker *walkers[4] = {}; - if (mem_defs[0]) - walkers[0] = &forward_store_walker; - if (mem_defs[1]) - walkers[1] = &backward_store_walker; - - if (load_p && (mem_defs[0] || mem_defs[1])) - do_alias_analysis (alias_hazards, walkers, load_p); - else - { - // We want to find any loads hanging off the first store. - mem_defs[0] = memory_access (insns[0]->defs ()); - load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]); - load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]); - walkers[2] = &forward_load_walker; - walkers[3] = &backward_load_walker; - do_alias_analysis (alias_hazards, walkers, load_p); - // Now consolidate hazards back down. - if (alias_hazards[2] - && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0]))) - alias_hazards[0] = alias_hazards[2]; - - if (alias_hazards[3] - && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1]))) - alias_hazards[1] = alias_hazards[3]; - } - - if (alias_hazards[0] && alias_hazards[1] - && *alias_hazards[0] <= *alias_hazards[1]) - { - if (dump_file) - fprintf (dump_file, - "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n", - i1->uid (), i2->uid (), - alias_hazards[0]->uid (), alias_hazards[1]->uid ()); - return false; - } - - // Now narrow the hazards on each base candidate using - // the alias hazards. - i = 0; - while (i < base_cands.length ()) - { - base_cand &cand = base_cands[i]; - if (alias_hazards[0] && (!cand.hazards[0] - || *alias_hazards[0] < *cand.hazards[0])) - cand.hazards[0] = alias_hazards[0]; - if (alias_hazards[1] && (!cand.hazards[1] - || *alias_hazards[1] > *cand.hazards[1])) - cand.hazards[1] = alias_hazards[1]; - - if (cand.viable ()) - i++; - else - { - if (dump_file) - fprintf (dump_file, "pair (%d,%d): rejecting base %d due to " - "alias/dataflow hazards (%d,%d)", - insns[0]->uid (), insns[1]->uid (), - cand.def->regno (), - cand.hazards[0]->uid (), - cand.hazards[1]->uid ()); - - base_cands.ordered_remove (i); - } - } - - if (base_cands.is_empty ()) - { - if (dump_file) - fprintf (dump_file, - "cannot form pair (%d,%d) due to alias/dataflow hazards", - insns[0]->uid (), insns[1]->uid ()); - - return false; - } - - base_cand *base = &base_cands[0]; - if (base_cands.length () > 1) - { - // If there are still multiple viable bases, it makes sense - // to choose one that allows us to reduce register pressure, - // for loads this means moving further down, for stores this - // means moving further up. - gcc_checking_assert (base_cands.length () == 2); - const int hazard_i = !load_p; - if (base->hazards[hazard_i]) - { - if (!base_cands[1].hazards[hazard_i]) - base = &base_cands[1]; - else if (load_p - && *base_cands[1].hazards[hazard_i] - > *(base->hazards[hazard_i])) - base = &base_cands[1]; - else if (!load_p - && *base_cands[1].hazards[hazard_i] - < *(base->hazards[hazard_i])) - base = &base_cands[1]; - } - } - - // Otherwise, hazards[0] > hazards[1]. - // Pair can be formed anywhere in (hazards[1], hazards[0]). - insn_range_info range (insns[0], insns[1]); - if (base->hazards[1]) - range.first = base->hazards[1]; - if (base->hazards[0]) - range.last = base->hazards[0]->prev_nondebug_insn (); - - // If the second insn can throw, narrow the move range to exactly that insn. - // This prevents us trying to move the second insn from the end of the BB. - if (cfun->can_throw_non_call_exceptions - && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX)) - { - gcc_assert (range.includes (insns[1])); - range = insn_range_info (insns[1]); - } - - // Placement strategy: push loads down and pull stores up, this should - // help register pressure by reducing live ranges. - if (load_p) - range.first = range.last; - else - range.last = range.first; - - if (dump_file) - { - auto print_hazard = [](insn_info *i) - { - if (i) - fprintf (dump_file, "%d", i->uid ()); - else - fprintf (dump_file, "-"); - }; - auto print_pair = [print_hazard](insn_info **i) - { - print_hazard (i[0]); - fprintf (dump_file, ","); - print_hazard (i[1]); - }; - - fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (", - load_p, insns[0]->uid (), insns[1]->uid (), - base->def->regno ()); - print_pair (base->hazards); - fprintf (dump_file, "), move_range: (%d,%d)\n", - range.first->uid (), range.last->uid ()); - } - - return fuse_pair (load_p, access_size, writeback, - i1, i2, *base, range); -} - -static void -dump_insn_list (FILE *f, const insn_list_t &l) -{ - fprintf (f, "("); - - auto i = l.begin (); - auto end = l.end (); - - if (i != end) - fprintf (f, "%d", (*i)->uid ()); - i++; - - for (; i != end; i++) - fprintf (f, ", %d", (*i)->uid ()); - - fprintf (f, ")"); -} - -DEBUG_FUNCTION void -debug (const insn_list_t &l) -{ - dump_insn_list (stderr, l); - fprintf (stderr, "\n"); -} - -// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns -// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST. -// -// This function traverses the resulting 2D matrix of possible pair candidates -// and attempts to merge them into pairs. -// -// The algorithm is straightforward: if we consider a combined list of -// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order, -// then we advance through X until we reach a crossing point (where X[i] and -// X[i+1] come from different source lists). -// -// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to -// fuse them into a pair. If this succeeds, we remove X[i] and X[i+1] from -// their original lists and continue as above. -// -// In the failure case, we advance through the source list containing X[i] and -// continue as above (proceeding to the next crossing point). -// -// The rationale for skipping over groups of consecutive candidates from the -// same source list is as follows: -// -// In the store case, the insns in the group can't be re-ordered over each -// other as they are guaranteed to store to the same location, so we're -// guaranteed not to lose opportunities by doing this. -// -// In the load case, subsequent loads from the same location are either -// redundant (in which case they should have been cleaned up by an earlier -// optimization pass) or there is an intervening aliasing hazard, in which case -// we can't re-order them anyway, so provided earlier passes have cleaned up -// redundant loads, we shouldn't miss opportunities by doing this. -void -ldp_bb_info::merge_pairs (insn_list_t &left_list, - insn_list_t &right_list, - bool load_p, - unsigned access_size) -{ - if (dump_file) - { - fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p); - dump_insn_list (dump_file, left_list); - fprintf (dump_file, " x "); - dump_insn_list (dump_file, right_list); - fprintf (dump_file, "\n"); - } - - auto iter_l = left_list.begin (); - auto iter_r = right_list.begin (); - - while (iter_l != left_list.end () && iter_r != right_list.end ()) - { - auto next_l = std::next (iter_l); - auto next_r = std::next (iter_r); - if (**iter_l < **iter_r - && next_l != left_list.end () - && **next_l < **iter_r) - iter_l = next_l; - else if (**iter_r < **iter_l - && next_r != right_list.end () - && **next_r < **iter_l) - iter_r = next_r; - else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r)) - { - left_list.erase (iter_l); - iter_l = next_l; - right_list.erase (iter_r); - iter_r = next_r; - } - else if (**iter_l < **iter_r) - iter_l = next_l; - else - iter_r = next_r; - } -} - -// Iterate over the accesses in GROUP, looking for adjacent sets -// of accesses. If we find two sets of adjacent accesses, call -// merge_pairs. -void -ldp_bb_info::transform_for_base (int encoded_lfs, - access_group &group) -{ - const auto lfs = decode_lfs (encoded_lfs); - const unsigned access_size = lfs.size; - - bool skip_next = true; - access_record *prev_access = nullptr; - - for (auto &access : group.list) - { - if (skip_next) - skip_next = false; - else if (known_eq (access.offset, prev_access->offset + access_size)) - { - merge_pairs (prev_access->cand_insns, - access.cand_insns, - lfs.load_p, - access_size); - skip_next = access.cand_insns.empty (); - } - prev_access = &access; - } -} - -// If we emitted tombstone insns for this BB, iterate through the BB -// and remove all the tombstone insns, being sure to reparent any uses -// of mem to previous defs when we do this. -void -ldp_bb_info::cleanup_tombstones () -{ - // No need to do anything if we didn't emit a tombstone insn for this BB. - if (!m_emitted_tombstone) - return; - - for (auto insn : iterate_safely (m_bb->nondebug_insns ())) - { - if (!insn->is_real () - || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ())) - continue; - - auto set = as_a<set_info *> (memory_access (insn->defs ())); - if (set->has_any_uses ()) - { - auto prev_set = as_a<set_info *> (set->prev_def ()); - while (set->first_use ()) - crtl->ssa->reparent_use (set->first_use (), prev_set); - } - - // Now set has no uses, we can delete it. - insn_change change (insn, insn_change::DELETE); - crtl->ssa->change_insn (change); - } -} - -template<typename Map> -void -ldp_bb_info::traverse_base_map (Map &map) -{ - for (auto kv : map) - { - const auto &key = kv.first; - auto &value = kv.second; - transform_for_base (key.second, value); - } -} - -void -ldp_bb_info::transform () -{ - traverse_base_map (expr_map); - traverse_base_map (def_map); -} - -static void -ldp_fusion_init () -{ - calculate_dominance_info (CDI_DOMINATORS); - df_analyze (); - crtl->ssa = new rtl_ssa::function_info (cfun); -} - -static void -ldp_fusion_destroy () -{ - if (crtl->ssa->perform_pending_updates ()) - cleanup_cfg (0); - - free_dominance_info (CDI_DOMINATORS); - - delete crtl->ssa; - crtl->ssa = nullptr; -} - // Given a load pair insn in PATTERN, unpack the insn, storing // the registers in REGS and returning the mem. static rtx @@ -3023,16 +218,19 @@ aarch64_destructure_store_pair (rtx regs[2], rtx pattern) return mem; } -// Given a pair mem in PAIR_MEM, register operands in REGS, and an rtx -// representing the effect of writeback on the base register in WB_EFFECT, -// return an insn representing a writeback variant of this pair. -// LOAD_P is true iff the pair is a load. -// -// This is used when promoting existing non-writeback pairs to writeback -// variants. -static rtx -aarch64_gen_writeback_pair (rtx wb_effect, rtx pair_mem, rtx regs[2], - bool load_p) +rtx +aarch64_pair_fusion::destructure_pair (rtx regs[2], rtx pattern, bool load_p) +{ + if (load_p) + return aarch64_destructure_load_pair (regs, pattern); + else + return aarch64_destructure_store_pair (regs, pattern); +} + +rtx +aarch64_pair_fusion::gen_promote_writeback_pair (rtx wb_effect, rtx pair_mem, + rtx regs[2], + bool load_p) { auto op_mode = aarch64_operand_mode_for_pair_mode (GET_MODE (pair_mem)); @@ -3064,155 +262,6 @@ aarch64_gen_writeback_pair (rtx wb_effect, rtx pair_mem, rtx regs[2], gen_rtvec (3, wb_effect, pats[0], pats[1])); } -// Given an existing pair insn INSN, look for a trailing update of -// the base register which we can fold in to make this pair use -// a writeback addressing mode. -static void -try_promote_writeback (insn_info *insn) -{ - auto rti = insn->rtl (); - const auto attr = get_attr_ldpstp (rti); - if (attr == LDPSTP_NONE) - return; - - bool load_p = (attr == LDPSTP_LDP); - gcc_checking_assert (load_p || attr == LDPSTP_STP); - - rtx regs[2]; - rtx mem = NULL_RTX; - if (load_p) - mem = aarch64_destructure_load_pair (regs, PATTERN (rti)); - else - mem = aarch64_destructure_store_pair (regs, PATTERN (rti)); - gcc_checking_assert (MEM_P (mem)); - - poly_int64 offset; - rtx base = strip_offset (XEXP (mem, 0), &offset); - gcc_assert (REG_P (base)); - - const auto access_size = GET_MODE_SIZE (GET_MODE (mem)).to_constant () / 2; - - if (find_access (insn->defs (), REGNO (base))) - { - gcc_assert (load_p); - if (dump_file) - fprintf (dump_file, - "ldp %d clobbers base r%d, can't promote to writeback\n", - insn->uid (), REGNO (base)); - return; - } - - auto base_use = find_access (insn->uses (), REGNO (base)); - gcc_assert (base_use); - - if (!base_use->def ()) - { - if (dump_file) - fprintf (dump_file, - "found pair (i%d, L=%d): but base r%d is upwards exposed\n", - insn->uid (), load_p, REGNO (base)); - return; - } - - auto base_def = base_use->def (); - - rtx wb_effect = NULL_RTX; - def_info *add_def; - const insn_range_info pair_range (insn); - insn_info *insns[2] = { nullptr, insn }; - insn_info *trailing_add = find_trailing_add (insns, pair_range, 0, &wb_effect, - &add_def, base_def, offset, - access_size); - if (!trailing_add) - return; - - auto attempt = crtl->ssa->new_change_attempt (); - - insn_change pair_change (insn); - insn_change del_change (trailing_add, insn_change::DELETE); - insn_change *changes[] = { &pair_change, &del_change }; - - rtx pair_pat = aarch64_gen_writeback_pair (wb_effect, mem, regs, load_p); - validate_unshare_change (rti, &PATTERN (rti), pair_pat, true); - - // The pair must gain the def of the base register from the add. - pair_change.new_defs = insert_access (attempt, - add_def, - pair_change.new_defs); - gcc_assert (pair_change.new_defs.is_valid ()); - - auto is_changing = insn_is_changing (changes); - for (unsigned i = 0; i < ARRAY_SIZE (changes); i++) - gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing)); - - if (!rtl_ssa::recog_ignoring (attempt, pair_change, is_changing)) - { - if (dump_file) - fprintf (dump_file, "i%d: recog failed on wb pair, bailing out\n", - insn->uid ()); - cancel_changes (0); - return; - } - - gcc_assert (crtl->ssa->verify_insn_changes (changes)); - - if (MAY_HAVE_DEBUG_INSNS) - fixup_debug_uses_trailing_add (attempt, insn, trailing_add, wb_effect); - - confirm_change_group (); - crtl->ssa->change_insns (changes); -} - -// Main function for the pass. Iterate over the insns in BB looking -// for load/store candidates. If running after RA, also try and promote -// non-writeback pairs to use writeback addressing. Then try to fuse -// candidates into pairs. -void ldp_fusion_bb (bb_info *bb) -{ - const bool track_loads - = aarch64_tune_params.ldp_policy_model != AARCH64_LDP_STP_POLICY_NEVER; - const bool track_stores - = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER; - - ldp_bb_info bb_state (bb); - - for (auto insn : bb->nondebug_insns ()) - { - rtx_insn *rti = insn->rtl (); - - if (!rti || !INSN_P (rti)) - continue; - - rtx pat = PATTERN (rti); - if (reload_completed - && aarch64_ldp_writeback > 1 - && GET_CODE (pat) == PARALLEL - && XVECLEN (pat, 0) == 2) - try_promote_writeback (insn); - - if (GET_CODE (pat) != SET) - continue; - - if (track_stores && MEM_P (XEXP (pat, 0))) - bb_state.track_access (insn, false, XEXP (pat, 0)); - else if (track_loads && MEM_P (XEXP (pat, 1))) - bb_state.track_access (insn, true, XEXP (pat, 1)); - } - - bb_state.transform (); - bb_state.cleanup_tombstones (); -} - -void ldp_fusion () -{ - ldp_fusion_init (); - - for (auto bb : crtl->ssa->bbs ()) - ldp_fusion_bb (bb); - - ldp_fusion_destroy (); -} - namespace { const pass_data pass_data_ldp_fusion = @@ -3242,14 +291,6 @@ public: if (!optimize || optimize_debug) return false; - // If the tuning policy says never to form ldps or stps, don't run - // the pass. - if ((aarch64_tune_params.ldp_policy_model - == AARCH64_LDP_STP_POLICY_NEVER) - && (aarch64_tune_params.stp_policy_model - == AARCH64_LDP_STP_POLICY_NEVER)) - return false; - if (reload_completed) return flag_aarch64_late_ldp_fusion; else @@ -3258,7 +299,8 @@ public: unsigned execute (function *) final override { - ldp_fusion (); + aarch64_pair_fusion pass; + pass.run (); return 0; } }; diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md index b7c0bf05cd1..7890a8cc32b 100644 --- a/gcc/config/aarch64/aarch64-ldpstp.md +++ b/gcc/config/aarch64/aarch64-ldpstp.md @@ -96,9 +96,7 @@ (set (match_operand:VQ2 2 "register_operand" "") (match_operand:VQ2 3 "memory_operand" ""))] "TARGET_FLOAT - && aarch64_operands_ok_for_ldpstp (operands, true) - && (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0" + && aarch64_operands_ok_for_ldpstp (operands, true)" [(const_int 0)] { aarch64_finish_ldpstp_peephole (operands, true); @@ -111,9 +109,7 @@ (set (match_operand:VQ2 2 "memory_operand" "") (match_operand:VQ2 3 "register_operand" ""))] "TARGET_FLOAT - && aarch64_operands_ok_for_ldpstp (operands, false) - && (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0" + && aarch64_operands_ok_for_ldpstp (operands, false)" [(const_int 0)] { aarch64_finish_ldpstp_peephole (operands, false); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 42639e9efcf..1d3f94c813e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1110,6 +1110,11 @@ extern void aarch64_output_patchable_area (unsigned int, bool); extern void aarch64_adjust_reg_alloc_order (); +extern void mingw_pe_maybe_record_exported_symbol (tree, const char *, int); +extern unsigned int mingw_pe_section_type_flags (tree, const char *, int); +extern void mingw_pe_unique_section (tree, int); +extern void mingw_pe_encode_section_info (tree, rtx, int); + bool aarch64_optimize_mode_switching (aarch64_mode_entity); void aarch64_restore_za (rtx); diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index da16f602a55..e65f73d7ba2 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -65,11 +65,6 @@ BUILTIN_VS (UNOP, ctz, 2, NONE) BUILTIN_VB (UNOP, popcount, 2, NONE) - /* Implemented by aarch64_get_low<mode>. */ - BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP) - /* Implemented by aarch64_get_high<mode>. */ - BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP) - /* Implemented by aarch64_<sur>q<r>shl<mode>. */ BUILTIN_VSDQ_I (BINOP, sqshl, 0, NONE) BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, NONE) @@ -960,10 +955,6 @@ VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf) VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf) - /* Implemented by aarch64_vget_lo/hi_halfv8bf. */ - VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf) - VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf) - /* Implemented by aarch64_simd_<sur>mmlav16qi. */ VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi) VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index f8bb973a278..0bb39091a38 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -288,28 +288,6 @@ } ) -(define_expand "aarch64_get_low<mode>" - [(match_operand:<VHALF> 0 "register_operand") - (match_operand:VQMOV 1 "register_operand")] - "TARGET_FLOAT" - { - rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); - emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo)); - DONE; - } -) - -(define_expand "aarch64_get_high<mode>" - [(match_operand:<VHALF> 0 "register_operand") - (match_operand:VQMOV 1 "register_operand")] - "TARGET_FLOAT" - { - rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); - emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], hi)); - DONE; - } -) - (define_insn_and_split "aarch64_simd_mov_from_<mode>low" [(set (match_operand:<VHALF> 0 "register_operand") (vec_select:<VHALF> @@ -399,8 +377,7 @@ (define_insn "aarch64_rbit<mode><vczle><vczbe>" [(set (match_operand:VB 0 "register_operand" "=w") - (unspec:VB [(match_operand:VB 1 "register_operand" "w")] - UNSPEC_RBIT))] + (bitreverse:VB (match_operand:VB 1 "register_operand" "w")))] "TARGET_SIMD" "rbit\\t%0.<Vbtype>, %1.<Vbtype>" [(set_attr "type" "neon_rbit")] @@ -3154,7 +3131,7 @@ DONE; } ) -(define_insn "aarch64_float_extend_lo_<Vwide>" +(define_insn "extend<mode><Vwide>2" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") (float_extend:<VWIDE> (match_operand:VDF 1 "register_operand" "w")))] @@ -3219,7 +3196,7 @@ } ) -(define_insn "aarch64_float_truncate_lo_<mode><vczle><vczbe>" +(define_insn "trunc<Vwide><mode>2<vczle><vczbe>" [(set (match_operand:VDF 0 "register_operand" "=w") (float_truncate:VDF (match_operand:<VWIDE> 1 "register_operand" "w")))] @@ -3278,7 +3255,7 @@ int lo = BYTES_BIG_ENDIAN ? 2 : 1; int hi = BYTES_BIG_ENDIAN ? 1 : 2; - emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo])); + emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo])); emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0], tmp, operands[hi])); DONE; @@ -3294,7 +3271,7 @@ { rtx tmp = gen_reg_rtx (V2SFmode); emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2])); - emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp)); + emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp)); DONE; } ) @@ -4388,7 +4365,7 @@ && (register_operand (operands[0], <VDBL>mode) || register_operand (operands[2], <MODE>mode))" {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] - [ w , 0 , w ; neon_ins<dblq> , simd ] ins\t%0.<single_type>[1], %2.<single_type>[0] + [ w , w , w ; neon_permute<dblq> , simd ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type> [ w , 0 , ?r ; neon_from_gp<dblq> , simd ] ins\t%0.<single_type>[1], %<single_wx>2 [ w , 0 , ?r ; f_mcr , * ] fmov\t%0.d[1], %2 [ w , 0 , Utv ; neon_load1_one_lane<dblq> , simd ] ld1\t{%0.<single_type>}[1], %2 @@ -4407,7 +4384,7 @@ && (register_operand (operands[0], <VDBL>mode) || register_operand (operands[2], <MODE>mode))" {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] - [ w , 0 , w ; neon_ins<dblq> , simd ] ins\t%0.<single_type>[1], %2.<single_type>[0] + [ w , w , w ; neon_permute<dblq> , simd ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type> [ w , 0 , ?r ; neon_from_gp<dblq> , simd ] ins\t%0.<single_type>[1], %<single_wx>2 [ w , 0 , ?r ; f_mcr , * ] fmov\t%0.d[1], %2 [ w , 0 , Utv ; neon_load1_one_lane<dblq> , simd ] ld1\t{%0.<single_type>}[1], %2 @@ -8496,7 +8473,7 @@ UNSPEC_CONCAT))] "TARGET_SIMD" "#" - "&& reload_completed" + "&& 1" [(const_int 0)] { aarch64_split_combinev16qi (operands); @@ -9774,27 +9751,6 @@ [(set_attr "type" "neon_dot<VDQSF:q>")] ) -;; vget_low/high_bf16 -(define_expand "aarch64_vget_lo_halfv8bf" - [(match_operand:V4BF 0 "register_operand") - (match_operand:V8BF 1 "register_operand")] - "TARGET_BF16_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false); - emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p)); - DONE; -}) - -(define_expand "aarch64_vget_hi_halfv8bf" - [(match_operand:V4BF 0 "register_operand") - (match_operand:V8BF 1 "register_operand")] - "TARGET_BF16_SIMD" -{ - rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, true); - emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p)); - DONE; -}) - ;; bfmmla (define_insn "aarch64_bfmmlaqv4sf" [(set (match_operand:V4SF 0 "register_operand" "=w") diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 0d2edf3f19e..823d60040f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -1174,7 +1174,7 @@ public: Advanced SIMD argument as an SVE vector. */ if (!BYTES_BIG_ENDIAN && is_undef (CALL_EXPR_ARG (e.call_expr, 0))) - return simplify_gen_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0); + return force_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0); rtx_vector_builder builder (VNx16BImode, 16, 2); for (unsigned int i = 0; i < 16; i++) @@ -3186,7 +3186,7 @@ FUNCTION (svqincp, svqdecp_svqincp_impl, (SS_PLUS, US_PLUS)) FUNCTION (svqincw, svqinc_bhwd_impl, (SImode)) FUNCTION (svqincw_pat, svqinc_bhwd_impl, (SImode)) FUNCTION (svqsub, rtx_code_function, (SS_MINUS, US_MINUS, -1)) -FUNCTION (svrbit, unspec_based_function, (UNSPEC_RBIT, UNSPEC_RBIT, -1)) +FUNCTION (svrbit, rtx_code_function, (BITREVERSE, BITREVERSE, -1)) FUNCTION (svrdffr, svrdffr_impl,) FUNCTION (svrecpe, unspec_based_function, (-1, UNSPEC_URECPE, UNSPEC_FRECPE)) FUNCTION (svrecps, unspec_based_function, (-1, -1, UNSPEC_FRECPS)) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 0434358122d..5331e7121d5 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1156,76 +1156,96 @@ ;; Likewise with zero predication. (define_insn "aarch64_rdffr_z" - [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + [(set (match_operand:VNx16BI 0 "register_operand") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) - (match_operand:VNx16BI 1 "register_operand" "Upa")))] + (match_operand:VNx16BI 1 "register_operand")))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffr\t%0.b, %1/z" + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ &Upa , Upa ; yes ] rdffr\t%0.b, %1/z + [ ?Upa , 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ + } ) ;; Read the FFR to test for a fault, without using the predicate result. (define_insn "*aarch64_rdffr_z_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 2 "aarch64_sve_ptrue_flag") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1))] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ &Upa , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa , 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ + } ) ;; Same for unpredicated RDFFR when tested with a known PTRUE. (define_insn "*aarch64_rdffr_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (const_int SVE_KNOWN_PTRUE) (reg:VNx16BI FFRT_REGNUM)] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ &Upa , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa , 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ + } ) ;; Read the FFR with zero predication and test the result. (define_insn "*aarch64_rdffr_z_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 2 "aarch64_sve_ptrue_flag") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1))] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1)))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ &Upa , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa , 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ + } ) ;; Same for unpredicated RDFFR when tested with a known PTRUE. (define_insn "*aarch64_rdffr_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (const_int SVE_KNOWN_PTRUE) (reg:VNx16BI FFRT_REGNUM)] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (reg:VNx16BI FFRT_REGNUM))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ &Upa , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa , 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ + } ) ;; [R3 in the block comment above about FFR handling] @@ -3063,6 +3083,7 @@ ;; - CLS (= clrsb) ;; - CLZ ;; - CNT (= popcount) +;; - RBIT (= bitreverse) ;; - NEG ;; - NOT ;; ------------------------------------------------------------------------- @@ -3151,7 +3172,6 @@ ;; ---- [INT] General unary arithmetic corresponding to unspecs ;; ------------------------------------------------------------------------- ;; Includes -;; - RBIT ;; - REVB ;; - REVH ;; - REVW @@ -6637,11 +6657,15 @@ ;; Doubling the second operand is the preferred implementation ;; of the MOV alias, so we use that instead of %1/z, %1, %2. (define_insn "and<mode>3" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand" "Upa") - (match_operand:PRED_ALL 2 "register_operand" "Upa")))] + [(set (match_operand:PRED_ALL 0 "register_operand") + (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand") + (match_operand:PRED_ALL 2 "register_operand")))] "TARGET_SVE" - "and\t%0.b, %1/z, %2.b, %2.b" + {@ [ cons: =0, 1 , 2 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa ; yes ] and\t%0.b, %1/z, %2.b, %2.b + [ ?Upa , 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa ; no ] ^ + } ) ;; Unpredicated predicate EOR and ORR. @@ -6660,14 +6684,18 @@ ;; Predicated predicate AND, EOR and ORR. (define_insn "@aarch64_pred_<optab><mode>_z" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + [(set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (LOGICAL:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa") - (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + (match_operand:PRED_ALL 2 "register_operand") + (match_operand:PRED_ALL 3 "register_operand")) + (match_operand:PRED_ALL 1 "register_operand")))] "TARGET_SVE" - "<logical>\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <logical>\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; Perform a logical operation on operands 2 and 3, using operand 1 as @@ -6676,38 +6704,46 @@ (define_insn "*<optab><mode>3_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (and:PRED_ALL (LOGICAL:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa") - (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand") + (match_operand:PRED_ALL 3 "register_operand")) (match_dup 4))] UNSPEC_PTEST)) - (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3)) (match_dup 4)))] "TARGET_SVE" - "<logical>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <logical>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; Same with just the flags result. (define_insn "*<optab><mode>3_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (and:PRED_ALL (LOGICAL:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa") - (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand") + (match_operand:PRED_ALL 3 "register_operand")) (match_dup 4))] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE" - "<logical>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <logical>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; ------------------------------------------------------------------------- @@ -6720,56 +6756,68 @@ ;; Predicated predicate BIC and ORN. (define_insn "aarch64_pred_<nlogical><mode>_z" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + [(set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (NLOGICAL:PRED_ALL - (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_operand:PRED_ALL 2 "register_operand" "Upa")) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand")) + (match_operand:PRED_ALL 2 "register_operand")) + (match_operand:PRED_ALL 1 "register_operand")))] "TARGET_SVE" - "<nlogical>\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <nlogical>\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; Same, but set the flags as a side-effect. (define_insn "*<nlogical><mode>3_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (and:PRED_ALL (NLOGICAL:PRED_ALL (not:PRED_ALL - (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_operand:PRED_ALL 3 "register_operand")) + (match_operand:PRED_ALL 2 "register_operand")) (match_dup 4))] UNSPEC_PTEST)) - (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (NLOGICAL:PRED_ALL (not:PRED_ALL (match_dup 3)) (match_dup 2)) (match_dup 4)))] "TARGET_SVE" - "<nlogical>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <nlogical>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; Same with just the flags result. (define_insn "*<nlogical><mode>3_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (and:PRED_ALL (NLOGICAL:PRED_ALL (not:PRED_ALL - (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_operand:PRED_ALL 3 "register_operand")) + (match_operand:PRED_ALL 2 "register_operand")) (match_dup 4))] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE" - "<nlogical>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <nlogical>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; ------------------------------------------------------------------------- @@ -6782,58 +6830,70 @@ ;; Predicated predicate NAND and NOR. (define_insn "aarch64_pred_<logical_nn><mode>_z" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + [(set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (NLOGICAL:PRED_ALL - (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) - (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa"))) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand")) + (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand"))) + (match_operand:PRED_ALL 1 "register_operand")))] "TARGET_SVE" - "<logical_nn>\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <logical_nn>\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; Same, but set the flags as a side-effect. (define_insn "*<logical_nn><mode>3_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (and:PRED_ALL (NLOGICAL:PRED_ALL (not:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand")) (not:PRED_ALL - (match_operand:PRED_ALL 3 "register_operand" "Upa"))) + (match_operand:PRED_ALL 3 "register_operand"))) (match_dup 4))] UNSPEC_PTEST)) - (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (set (match_operand:PRED_ALL 0 "register_operand") (and:PRED_ALL (NLOGICAL:PRED_ALL (not:PRED_ALL (match_dup 2)) (not:PRED_ALL (match_dup 3))) (match_dup 4)))] "TARGET_SVE" - "<logical_nn>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <logical_nn>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; Same with just the flags result. (define_insn "*<logical_nn><mode>3_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (and:PRED_ALL (NLOGICAL:PRED_ALL (not:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand")) (not:PRED_ALL - (match_operand:PRED_ALL 3 "register_operand" "Upa"))) + (match_operand:PRED_ALL 3 "register_operand"))) (match_dup 4))] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE" - "<logical_nn>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] <logical_nn>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; ========================================================================= @@ -8074,9 +8134,13 @@ UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE" - {@ [ cons: =0 , 1 , 3 , 4 ] - [ Upa , Upl , w , <sve_imm_con> ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, #%4 - [ Upa , Upl , w , w ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype> + {@ [ cons: =0 , 1 , 3 , 4 ; attrs: pred_clobber ] + [ &Upa , Upl, w , <sve_imm_con>; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, #%4 + [ ?Upl , 0 , w , <sve_imm_con>; yes ] ^ + [ Upa , Upl, w , <sve_imm_con>; no ] ^ + [ &Upa , Upl, w , w ; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype> + [ ?Upl , 0 , w , w ; yes ] ^ + [ Upa , Upl, w , w ; no ] ^ } ) @@ -8106,9 +8170,13 @@ UNSPEC_PRED_Z))] "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" - {@ [ cons: =0 , 1 , 2 , 3 ] - [ Upa , Upl , w , <sve_imm_con> ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, #%3 - [ Upa , Upl , w , w ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upl, w , <sve_imm_con>; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, #%3 + [ ?Upl , 0 , w , <sve_imm_con>; yes ] ^ + [ Upa , Upl, w , <sve_imm_con>; no ] ^ + [ &Upa , Upl, w , w ; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype> + [ ?Upl , 0 , w , w ; yes ] ^ + [ Upa , Upl, w , w ; no ] ^ } "&& !rtx_equal_p (operands[4], operands[6])" { @@ -8133,12 +8201,16 @@ (match_operand:SVE_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand"))] UNSPEC_PRED_Z)] UNSPEC_PTEST)) - (clobber (match_scratch:<VPRED> 0 "=Upa, Upa"))] + (clobber (match_scratch:<VPRED> 0))] "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" - {@ [ cons: 1 , 2 , 3 ] - [ Upl , w , <sve_imm_con> ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, #%3 - [ Upl , w , w ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype> + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upl, w , <sve_imm_con>; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, #%3 + [ ?Upl , 0 , w , <sve_imm_con>; yes ] ^ + [ Upa , Upl, w , <sve_imm_con>; no ] ^ + [ &Upa , Upl, w , w ; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype> + [ ?Upl , 0 , w , w ; yes ] ^ + [ Upa , Upl, w , w ; no ] ^ } "&& !rtx_equal_p (operands[4], operands[6])" { @@ -8180,18 +8252,22 @@ ;; Predicated integer wide comparisons. (define_insn "@aarch64_pred_cmp<cmp_op><mode>_wide" - [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") + [(set (match_operand:<VPRED> 0 "register_operand") (unspec:<VPRED> - [(match_operand:VNx16BI 1 "register_operand" "Upl") + [(match_operand:VNx16BI 1 "register_operand") (match_operand:SI 2 "aarch64_sve_ptrue_flag") (unspec:<VPRED> - [(match_operand:SVE_FULL_BHSI 3 "register_operand" "w") - (match_operand:VNx2DI 4 "register_operand" "w")] + [(match_operand:SVE_FULL_BHSI 3 "register_operand") + (match_operand:VNx2DI 4 "register_operand")] SVE_COND_INT_CMP_WIDE)] UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE" - "cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.d" + {@ [ cons: =0, 1 , 2, 3, 4; attrs: pred_clobber ] + [ &Upa , Upl, , w, w; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.d + [ ?Upl , 0 , , w, w; yes ] ^ + [ Upa , Upl, , w, w; no ] ^ + } ) ;; Predicated integer wide comparisons in which both the flag and @@ -8199,19 +8275,19 @@ (define_insn "*aarch64_pred_cmp<cmp_op><mode>_wide_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upl") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (unspec:<VPRED> - [(match_operand:VNx16BI 6 "register_operand" "Upl") + [(match_operand:VNx16BI 6 "register_operand") (match_operand:SI 7 "aarch64_sve_ptrue_flag") (unspec:<VPRED> - [(match_operand:SVE_FULL_BHSI 2 "register_operand" "w") - (match_operand:VNx2DI 3 "register_operand" "w")] + [(match_operand:SVE_FULL_BHSI 2 "register_operand") + (match_operand:VNx2DI 3 "register_operand")] SVE_COND_INT_CMP_WIDE)] UNSPEC_PRED_Z)] UNSPEC_PTEST)) - (set (match_operand:<VPRED> 0 "register_operand" "=Upa") + (set (match_operand:<VPRED> 0 "register_operand") (unspec:<VPRED> [(match_dup 6) (match_dup 7) @@ -8222,7 +8298,11 @@ UNSPEC_PRED_Z))] "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" - "cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d" + {@ [ cons: =0, 1 , 2, 3, 6 ; attrs: pred_clobber ] + [ &Upa , Upl, w, w, Upl; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d + [ ?Upl , 0 , w, w, Upl; yes ] ^ + [ Upa , Upl, w, w, Upl; no ] ^ + } ) ;; Predicated integer wide comparisons in which only the flags result @@ -8230,22 +8310,26 @@ (define_insn "*aarch64_pred_cmp<cmp_op><mode>_wide_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upl") + [(match_operand:VNx16BI 1 "register_operand") (match_operand 4) (match_operand:SI 5 "aarch64_sve_ptrue_flag") (unspec:<VPRED> - [(match_operand:VNx16BI 6 "register_operand" "Upl") + [(match_operand:VNx16BI 6 "register_operand") (match_operand:SI 7 "aarch64_sve_ptrue_flag") (unspec:<VPRED> - [(match_operand:SVE_FULL_BHSI 2 "register_operand" "w") - (match_operand:VNx2DI 3 "register_operand" "w")] + [(match_operand:SVE_FULL_BHSI 2 "register_operand") + (match_operand:VNx2DI 3 "register_operand")] SVE_COND_INT_CMP_WIDE)] UNSPEC_PRED_Z)] UNSPEC_PTEST)) - (clobber (match_scratch:<VPRED> 0 "=Upa"))] + (clobber (match_scratch:<VPRED> 0))] "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" - "cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d" + {@ [ cons: =0, 1 , 2, 3, 6 ; attrs: pred_clobber ] + [ &Upa , Upl, w, w, Upl; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d + [ ?Upl , 0 , w, w, Upl; yes ] ^ + [ Upa , Upl, w, w, Upl; no ] ^ + } ) ;; ------------------------------------------------------------------------- @@ -9912,9 +9996,13 @@ (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero")] SVE_BRK_UNARY))] "TARGET_SVE" - {@ [ cons: =0 , 1 , 2 , 3 ] - [ Upa , Upa , Upa , Dz ] brk<brk_op>\t%0.b, %1/z, %2.b - [ Upa , Upa , Upa , 0 ] brk<brk_op>\t%0.b, %1/m, %2.b + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Dz; yes ] brk<brk_op>\t%0.b, %1/z, %2.b + [ ?Upa , 0Upa, 0Upa, Dz; yes ] ^ + [ Upa , Upa , Upa , Dz; no ] ^ + [ &Upa , Upa , Upa , 0 ; yes ] brk<brk_op>\t%0.b, %1/m, %2.b + [ ?Upa , 0Upa, 0Upa, 0 ; yes ] ^ + [ Upa , Upa , Upa , 0 ; no ] ^ } ) @@ -9922,41 +10010,49 @@ (define_insn "*aarch64_brk<brk_op>_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 4 "aarch64_sve_ptrue_flag") (unspec:VNx16BI [(match_dup 1) - (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:VNx16BI 2 "register_operand") (match_operand:VNx16BI 3 "aarch64_simd_imm_zero")] SVE_BRK_UNARY)] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (unspec:VNx16BI [(match_dup 1) (match_dup 2) (match_dup 3)] SVE_BRK_UNARY))] "TARGET_SVE" - "brk<brk_op>s\t%0.b, %1/z, %2.b" + {@ [ cons: =0, 1 , 2 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa ; yes ] brk<brk_op>s\t%0.b, %1/z, %2.b + [ ?Upa , 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa ; no ] ^ + } ) ;; Same, but with only the flags result being interesting. (define_insn "*aarch64_brk<brk_op>_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 4 "aarch64_sve_ptrue_flag") (unspec:VNx16BI [(match_dup 1) - (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:VNx16BI 2 "register_operand") (match_operand:VNx16BI 3 "aarch64_simd_imm_zero")] SVE_BRK_UNARY)] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE" - "brk<brk_op>s\t%0.b, %1/z, %2.b" + {@ [ cons: =0, 1 , 2 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa ; yes ] brk<brk_op>s\t%0.b, %1/z, %2.b + [ ?Upa , 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa ; no ] ^ + } ) ;; ------------------------------------------------------------------------- @@ -9973,14 +10069,18 @@ ;; Binary BRKs (BRKN, BRKPA, BRKPB). (define_insn "@aarch64_brk<brk_op>" - [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + [(set (match_operand:VNx16BI 0 "register_operand") (unspec:VNx16BI - [(match_operand:VNx16BI 1 "register_operand" "Upa") - (match_operand:VNx16BI 2 "register_operand" "Upa") - (match_operand:VNx16BI 3 "register_operand" "<brk_reg_con>")] + [(match_operand:VNx16BI 1 "register_operand") + (match_operand:VNx16BI 2 "register_operand") + (match_operand:VNx16BI 3 "register_operand")] SVE_BRK_BINARY))] "TARGET_SVE" - "brk<brk_op>\t%0.b, %1/z, %2.b, %<brk_reg_opno>.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , <brk_reg_con> ; yes ] brk<brk_op>\t%0.b, %1/z, %2.b, %<brk_reg_opno>.b + [ ?Upa , 0Upa, 0Upa, 0<brk_reg_con>; yes ] ^ + [ Upa , Upa , Upa , <brk_reg_con> ; no ] ^ + } ) ;; BRKN, producing both a predicate and a flags result. Unlike other @@ -10041,41 +10141,49 @@ (define_insn "*aarch64_brk<brk_op>_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 4 "aarch64_sve_ptrue_flag") (unspec:VNx16BI [(match_dup 1) - (match_operand:VNx16BI 2 "register_operand" "Upa") - (match_operand:VNx16BI 3 "register_operand" "Upa")] + (match_operand:VNx16BI 2 "register_operand") + (match_operand:VNx16BI 3 "register_operand")] SVE_BRKP)] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (unspec:VNx16BI [(match_dup 1) (match_dup 2) (match_dup 3)] SVE_BRKP))] "TARGET_SVE" - "brk<brk_op>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 , 4; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa , ; yes ] brk<brk_op>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa, ; yes ] ^ + [ Upa , Upa , Upa , Upa , ; no ] ^ + } ) ;; Same, but with only the flags result being interesting. (define_insn "*aarch64_brk<brk_op>_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 4 "aarch64_sve_ptrue_flag") (unspec:VNx16BI [(match_dup 1) - (match_operand:VNx16BI 2 "register_operand" "Upa") - (match_operand:VNx16BI 3 "register_operand" "Upa")] + (match_operand:VNx16BI 2 "register_operand") + (match_operand:VNx16BI 3 "register_operand")] SVE_BRKP)] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE" - "brk<brk_op>s\t%0.b, %1/z, %2.b, %3.b" + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ &Upa , Upa , Upa , Upa ; yes ] brk<brk_op>s\t%0.b, %1/z, %2.b, %3.b + [ ?Upa , 0Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa , Upa ; no ] ^ + } ) ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 934e57055d3..972b03a4fef 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -3338,18 +3338,22 @@ ;; Predicated string matching. (define_insn "@aarch64_pred_<sve_int_op><mode>" - [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") + [(set (match_operand:<VPRED> 0 "register_operand") (unspec:<VPRED> - [(match_operand:<VPRED> 1 "register_operand" "Upl") + [(match_operand:<VPRED> 1 "register_operand") (match_operand:SI 2 "aarch64_sve_ptrue_flag") (unspec:<VPRED> - [(match_operand:SVE_FULL_BHI 3 "register_operand" "w") - (match_operand:SVE_FULL_BHI 4 "register_operand" "w")] + [(match_operand:SVE_FULL_BHI 3 "register_operand") + (match_operand:SVE_FULL_BHI 4 "register_operand")] SVE2_MATCH)] UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE2 && TARGET_NON_STREAMING" - "<sve_int_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>" + {@ [ cons: =0, 1 , 3, 4; attrs: pred_clobber ] + [ &Upa , Upl, w, w; yes ] <sve_int_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype> + [ ?Upl , 0 , w, w; yes ] ^ + [ Upa , Upl, w, w; no ] ^ + } ) ;; Predicated string matching in which both the flag and predicate results diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index abd3c9e0822..ba940f1c890 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,cobalt100,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,oryon1,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,cobalt100,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index d917da720b2..a9f48f5d3d4 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -36,9 +36,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) are not considered cheap. */ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) -/* Disallow load/store pair instructions on Q-registers. */ -AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS) - AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS) AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS) @@ -51,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA) +/* Enable is the target prefers to use a fresh register for predicate outputs + rather than re-use an input predicate register. */ +AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 1beec94629d..149e5b2f69a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -103,6 +103,10 @@ /* Defined for convenience. */ #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) +/* Maximum bytes set for an inline memset expansion. With -Os use 3 STP + and 1 MOVI/DUP (same size as a call). */ +#define MAX_SET_SIZE(speed) (speed ? 256 : 96) + /* Flags that describe how a function shares certain architectural state with its callers. @@ -10400,9 +10404,7 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode) || mode == SDmode || mode == DDmode || (aarch64_vector_mode_supported_p (mode) && (known_eq (GET_MODE_SIZE (mode), 8) - || (known_eq (GET_MODE_SIZE (mode), 16) - && (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0))); + || known_eq (GET_MODE_SIZE (mode), 16))); } /* Return true if REGNO is a virtual pointer register, or an eliminable @@ -14347,10 +14349,24 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, return false; case CTZ: - *cost = COSTS_N_INSNS (2); - - if (speed) - *cost += extra_cost->alu.clz + extra_cost->alu.rev; + if (VECTOR_MODE_P (mode)) + { + *cost = COSTS_N_INSNS (3); + if (speed) + *cost += extra_cost->vect.alu * 3; + } + else if (TARGET_CSSC) + { + *cost = COSTS_N_INSNS (1); + if (speed) + *cost += extra_cost->alu.clz; + } + else + { + *cost = COSTS_N_INSNS (2); + if (speed) + *cost += extra_cost->alu.clz + extra_cost->alu.rev; + } return false; case COMPARE: @@ -14674,6 +14690,7 @@ cost_plus: return true; } + case BITREVERSE: case BSWAP: *cost = COSTS_N_INSNS (1); @@ -15323,14 +15340,6 @@ cost_plus: return false; } - - if (XINT (x, 1) == UNSPEC_RBIT) - { - if (speed) - *cost += extra_cost->alu.rev; - - return false; - } break; case TRUNCATE: @@ -16519,10 +16528,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, return false; } - if (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) - return false; - return is_gimple_assign (stmt_info->stmt); } @@ -17170,9 +17175,6 @@ aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind, /* Count 1 insn per vector if we can't form STP Q pairs. */ if (aarch64_sve_mode_p (TYPE_MODE (vectype))) return count * 2; - if (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) - return count * 2; if (stmt_info) { @@ -25324,27 +25326,26 @@ aarch64_output_sve_ptrues (rtx const_unspec) void aarch64_split_combinev16qi (rtx operands[3]) { - unsigned int dest = REGNO (operands[0]); - unsigned int src1 = REGNO (operands[1]); - unsigned int src2 = REGNO (operands[2]); machine_mode halfmode = GET_MODE (operands[1]); - unsigned int halfregs = REG_NREGS (operands[1]); - rtx destlo, desthi; gcc_assert (halfmode == V16QImode); - if (src1 == dest && src2 == dest + halfregs) + rtx destlo = simplify_gen_subreg (halfmode, operands[0], + GET_MODE (operands[0]), 0); + rtx desthi = simplify_gen_subreg (halfmode, operands[0], + GET_MODE (operands[0]), + GET_MODE_SIZE (halfmode)); + + bool skiplo = rtx_equal_p (destlo, operands[1]); + bool skiphi = rtx_equal_p (desthi, operands[2]); + + if (skiplo && skiphi) { /* No-op move. Can't split to nothing; emit something. */ emit_note (NOTE_INSN_DELETED); return; } - /* Preserve register attributes for variable tracking. */ - destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0); - desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs, - GET_MODE_SIZE (halfmode)); - /* Special case of reversed high/low parts. */ if (reg_overlap_mentioned_p (operands[2], destlo) && reg_overlap_mentioned_p (operands[1], desthi)) @@ -25357,16 +25358,16 @@ aarch64_split_combinev16qi (rtx operands[3]) { /* Try to avoid unnecessary moves if part of the result is in the right place already. */ - if (src1 != dest) + if (!skiplo) emit_move_insn (destlo, operands[1]); - if (src2 != dest + halfregs) + if (!skiphi) emit_move_insn (desthi, operands[2]); } else { - if (src2 != dest + halfregs) + if (!skiphi) emit_move_insn (desthi, operands[2]); - if (src1 != dest) + if (!skiplo) emit_move_insn (destlo, operands[1]); } } @@ -25579,7 +25580,6 @@ static bool aarch64_evpc_reencode (struct expand_vec_perm_d *d) { expand_vec_perm_d newd; - unsigned HOST_WIDE_INT nelt; if (d->vec_flags != VEC_ADVSIMD) return false; @@ -25594,24 +25594,10 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) if (new_mode == word_mode) return false; - /* to_constant is safe since this routine is specific to Advanced SIMD - vectors. */ - nelt = d->perm.length ().to_constant (); - - vec_perm_builder newpermconst; - newpermconst.new_vector (nelt / 2, nelt / 2, 1); + vec_perm_indices newpermindices; - /* Convert the perm constant if we can. Require even, odd as the pairs. */ - for (unsigned int i = 0; i < nelt; i += 2) - { - poly_int64 elt0 = d->perm[i]; - poly_int64 elt1 = d->perm[i + 1]; - poly_int64 newelt; - if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1)) - return false; - newpermconst.quick_push (newelt.to_constant ()); - } - newpermconst.finalize (); + if (!newpermindices.new_shrunk_vector (d->perm, 2)) + return false; newd.vmode = new_mode; newd.vec_flags = VEC_ADVSIMD; @@ -25623,7 +25609,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) newd.testing_p = d->testing_p; newd.one_vector_p = d->one_vector_p; - newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2); + newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2, + newpermindices.nelts_per_input ()); return aarch64_expand_vec_perm_const_1 (&newd); } @@ -26574,15 +26561,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount) next, amount); } -/* Return a new RTX holding the result of moving POINTER forward by the - size of the mode it points to. */ - -static rtx -aarch64_progress_pointer (rtx pointer) -{ - return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); -} - /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove rather than memcpy. Return true iff we succeeded. */ @@ -26625,11 +26603,9 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove) return aarch64_expand_cpymem_mops (operands, is_memmove); unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); - bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS); /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */ - unsigned max_copy_size = use_ldpq ? 256 : 128; + unsigned max_copy_size = TARGET_SIMD ? 256 : 128; unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold : aarch64_mops_memcpy_size_threshold; @@ -26710,48 +26686,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove) return true; } -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where - SRC is a register we have created with the duplicated value to be set. */ -static void -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst, - machine_mode mode) -{ - /* If we are copying 128bits or 256bits, we can do that straight from - the SIMD register we prepared. */ - if (known_eq (GET_MODE_BITSIZE (mode), 256)) - { - mode = GET_MODE (src); - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, mode, 0); - /* Emit the memset. */ - emit_move_insn (*dst, src); - emit_move_insn (aarch64_move_pointer (*dst, 16), src); - - /* Move the pointers forward. */ - *dst = aarch64_move_pointer (*dst, 32); - return; - } - if (known_eq (GET_MODE_BITSIZE (mode), 128)) - { - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, GET_MODE (src), 0); - /* Emit the memset. */ - emit_move_insn (*dst, src); - /* Move the pointers forward. */ - *dst = aarch64_move_pointer (*dst, 16); - return; - } - /* For copying less, we have to extract the right amount from src. */ - rtx reg = lowpart_subreg (mode, src, GET_MODE (src)); - - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, mode, 0); - /* Emit the memset. */ - emit_move_insn (*dst, reg); - /* Move the pointer forward. */ - *dst = aarch64_progress_pointer (*dst); -} - /* Expand a setmem using the MOPS instructions. OPERANDS are the same as for the setmem pattern. Return true iff we succeed. */ static bool @@ -26778,24 +26712,21 @@ aarch64_expand_setmem_mops (rtx *operands) bool aarch64_expand_setmem (rtx *operands) { - int n, mode_bits; + int mode_bytes; unsigned HOST_WIDE_INT len; rtx dst = operands[0]; rtx val = operands[2], src; unsigned align = UINTVAL (operands[3]); rtx base; - machine_mode cur_mode = BLKmode, next_mode; + machine_mode mode = BLKmode, next_mode; /* Variable-sized or strict-align memset may use the MOPS expansion. */ if (!CONST_INT_P (operands[1]) || !TARGET_SIMD || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_setmem_mops (operands); - bool size_p = optimize_function_for_size_p (cfun); - - /* Default the maximum to 256-bytes when considering only libcall vs - SIMD broadcast sequence. */ - unsigned max_set_size = 256; + /* Set inline limits for memset. MOPS has a separate threshold. */ + unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun)); unsigned mops_threshold = aarch64_mops_memset_size_threshold; len = UINTVAL (operands[1]); @@ -26804,91 +26735,51 @@ aarch64_expand_setmem (rtx *operands) if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) return aarch64_expand_setmem_mops (operands); - int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); - /* The MOPS sequence takes: - 3 instructions for the memory storing - + 1 to move the constant size into a reg - + 1 if VAL is a non-zero constant to move into a reg - (zero constants can use XZR directly). */ - unsigned mops_cost = 3 + 1 + cst_val; - /* A libcall to memset in the worst case takes 3 instructions to prepare - the arguments + 1 for the call. */ - unsigned libcall_cost = 4; - - /* Attempt a sequence with a vector broadcast followed by stores. - Count the number of operations involved to see if it's worth it - against the alternatives. A simple counter simd_ops on the - algorithmically-relevant operations is used rather than an rtx_insn count - as all the pointer adjusmtents and mode reinterprets will be optimized - away later. */ - start_sequence (); - unsigned simd_ops = 0; - base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); dst = adjust_automodify_address (dst, VOIDmode, base, 0); /* Prepare the val using a DUP/MOVI v0.16B, val. */ - src = expand_vector_broadcast (V16QImode, val); - src = force_reg (V16QImode, src); - simd_ops++; - /* Convert len to bits to make the rest of the code simpler. */ - n = len * BITS_PER_UNIT; + val = expand_vector_broadcast (V16QImode, val); + val = force_reg (V16QImode, val); - /* Maximum amount to copy in one go. We allow 256-bit chunks based on the - AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */ - const int copy_limit = (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) - ? GET_MODE_BITSIZE (TImode) : 256; - - while (n > 0) + int offset = 0; + while (len > 0) { /* Find the largest mode in which to do the copy without over writing. */ opt_scalar_int_mode mode_iter; FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) - if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit)) - cur_mode = mode_iter.require (); + if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16)) + mode = mode_iter.require (); + + gcc_assert (mode != BLKmode); + + mode_bytes = GET_MODE_SIZE (mode).to_constant (); - gcc_assert (cur_mode != BLKmode); + src = val; - mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); - aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode); - simd_ops++; - n -= mode_bits; + /* Prefer Q-register accesses. */ + if (mode_bytes == 16) + mode = V16QImode; + else + src = lowpart_subreg (mode, src, GET_MODE (val)); + + emit_move_insn (adjust_address (dst, mode, offset), src); + len -= mode_bytes; + offset += mode_bytes; /* Emit trailing writes using overlapping unaligned accesses - (when !STRICT_ALIGNMENT) - this is smaller and faster. */ - if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ + if (len > 0 && len < 16 && !STRICT_ALIGNMENT) { - next_mode = smallest_mode_for_size (n, MODE_INT); - int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); - gcc_assert (n_bits <= mode_bits); - dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT); - n = n_bits; + next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT); + int n_bytes = GET_MODE_SIZE (next_mode).to_constant (); + gcc_assert (n_bytes <= mode_bytes); + offset -= n_bytes - len; + len = n_bytes; } } - rtx_insn *seq = get_insns (); - end_sequence (); - - if (size_p) - { - /* When optimizing for size we have 3 options: the SIMD broadcast sequence, - call to memset or the MOPS expansion. */ - if (TARGET_MOPS - && mops_cost <= libcall_cost - && mops_cost <= simd_ops) - return aarch64_expand_setmem_mops (operands); - /* If MOPS is not available or not shorter pick a libcall if the SIMD - sequence is too long. */ - else if (libcall_cost < simd_ops) - return false; - emit_insn (seq); - return true; - } - /* At this point the SIMD broadcast sequence is the best choice when - optimizing for speed. */ - emit_insn (seq); return true; } diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 4fa1dfc7906..0997b82dbc0 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; enabled through +gcs. */ #define TARGET_GCS (AARCH64_ISA_GCS) +/* Prefer different predicate registers for the output of a predicated + operation over re-using an existing input predicate. */ +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ + && (aarch64_tune_params.extra_tuning_flags \ + & AARCH64_EXTRA_TUNE_AVOID_PRED_RMW)) /* Standard register usage. */ @@ -537,11 +542,14 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; register. GCC internally uses the poly_int variable aarch64_sve_vg instead. */ +#define FIXED_X18 0 +#define CALL_USED_X18 1 + #define FIXED_REGISTERS \ { \ 0, 0, 0, 0, 0, 0, 0, 0, /* R0 - R7 */ \ 0, 0, 0, 0, 0, 0, 0, 0, /* R8 - R15 */ \ - 0, 0, 0, 0, 0, 0, 0, 0, /* R16 - R23 */ \ + 0, 0, FIXED_X18, 0, 0, 0, 0, 0, /* R16 - R23. */ \ 0, 0, 0, 0, 0, 1, 0, 1, /* R24 - R30, SP */ \ 0, 0, 0, 0, 0, 0, 0, 0, /* V0 - V7 */ \ 0, 0, 0, 0, 0, 0, 0, 0, /* V8 - V15 */ \ @@ -565,7 +573,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; { \ 1, 1, 1, 1, 1, 1, 1, 1, /* R0 - R7 */ \ 1, 1, 1, 1, 1, 1, 1, 1, /* R8 - R15 */ \ - 1, 1, 1, 0, 0, 0, 0, 0, /* R16 - R23 */ \ + 1, 1, CALL_USED_X18, 0, 0, 0, 0, 0, /* R16 - R23. */ \ 0, 0, 0, 0, 0, 1, 1, 1, /* R24 - R30, SP */ \ 1, 1, 1, 1, 1, 1, 1, 1, /* V0 - V7 */ \ 0, 0, 0, 0, 0, 0, 0, 0, /* V8 - V15 */ \ @@ -1043,6 +1051,9 @@ struct GTY (()) aarch64_frame bool is_scs_enabled; }; +/* Private to winnt.cc. */ +struct seh_frame_state; + #ifdef hash_set_h typedef struct GTY (()) machine_function { @@ -1083,6 +1094,9 @@ typedef struct GTY (()) machine_function still exists and still fulfils its original purpose. the same register can be reused by other code. */ rtx_insn *advsimd_zero_insn; + + /* During SEH output, this is non-null. */ + struct seh_frame_state * GTY ((skip (""))) seh; } machine_function; #endif #endif diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dbde066f747..9de6235b139 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -259,7 +259,6 @@ UNSPEC_PACIBSP UNSPEC_PRLG_STK UNSPEC_REV - UNSPEC_RBIT UNSPEC_SADALP UNSPEC_SCVTF UNSPEC_SETMEM @@ -445,6 +444,10 @@ ;; target-independent code. (define_attr "is_call" "no,yes" (const_string "no")) +;; Indicates whether we want to enable the pattern with an optional early +;; clobber for SVE predicates. +(define_attr "pred_clobber" "any,no,yes" (const_string "any")) + ;; [For compatibility with Arm in pipeline models] ;; Attribute that specifies whether or not the instruction touches fp ;; registers. @@ -460,7 +463,17 @@ (define_attr "arch_enabled" "no,yes" (if_then_else - (ior + (and + (ior + (and + (eq_attr "pred_clobber" "no") + (match_test "!TARGET_SVE_PRED_CLOBBER")) + (and + (eq_attr "pred_clobber" "yes") + (match_test "TARGET_SVE_PRED_CLOBBER")) + (eq_attr "pred_clobber" "any")) + + (ior (eq_attr "arch" "any") (and (eq_attr "arch" "rcpc8_4") @@ -488,7 +501,7 @@ (match_test "TARGET_SVE")) (and (eq_attr "arch" "sme") - (match_test "TARGET_SME"))) + (match_test "TARGET_SME")))) (const_string "yes") (const_string "no"))) @@ -1447,7 +1460,7 @@ [w , m ; load_4 , fp , 4] ldr\t%s0, %1 [m , r Z; store_4 , * , 4] str\t%w1, %0 [m , w ; store_4 , fp , 4] str\t%s1, %0 - [r , Usw; load_4 , * , 8] adrp\t%x0, %A1;ldr\t%w0, [%x0, %L1] + [r , Usw; load_4 , * , 8] adrp\t%x0, %A1\;ldr\t%w0, [%x0, %L1] [r , Usa; adr , * , 4] adr\t%x0, %c1 [r , Ush; adr , * , 4] adrp\t%x0, %A1 [w , r Z; f_mcr , fp , 4] fmov\t%s0, %w1 @@ -1484,7 +1497,7 @@ [w, m ; load_8 , fp , 4] ldr\t%d0, %1 [m, r Z; store_8 , * , 4] str\t%x1, %0 [m, w ; store_8 , fp , 4] str\t%d1, %0 - [r, Usw; load_8 , * , 8] << TARGET_ILP32 ? "adrp\t%0, %A1;ldr\t%w0, [%0, %L1]" : "adrp\t%0, %A1;ldr\t%0, [%0, %L1]"; + [r, Usw; load_8 , * , 8] << TARGET_ILP32 ? "adrp\t%0, %A1\;ldr\t%w0, [%0, %L1]" : "adrp\t%0, %A1\;ldr\t%0, [%0, %L1]"; [r, Usa; adr , * , 4] adr\t%x0, %c1 [r, Ush; adr , * , 4] adrp\t%x0, %A1 [w, r Z; f_mcr , fp , 4] fmov\t%d0, %x1 @@ -5354,7 +5367,7 @@ (define_insn "@aarch64_rbit<mode>" [(set (match_operand:GPI 0 "register_operand" "=r") - (unspec:GPI [(match_operand:GPI 1 "register_operand" "r")] UNSPEC_RBIT))] + (bitreverse:GPI (match_operand:GPI 1 "register_operand" "r")))] "" "rbit\\t%<w>0, %<w>1" [(set_attr "type" "rbit")] diff --git a/gcc/config/aarch64/aarch64.opt.urls b/gcc/config/aarch64/aarch64.opt.urls index 993634c52f8..4fa90384378 100644 --- a/gcc/config/aarch64/aarch64.opt.urls +++ b/gcc/config/aarch64/aarch64.opt.urls @@ -18,7 +18,8 @@ UrlSuffix(gcc/AArch64-Options.html#index-mfix-cortex-a53-843419) mlittle-endian UrlSuffix(gcc/AArch64-Options.html#index-mlittle-endian) -; skipping UrlSuffix for 'mcmodel=' due to finding no URLs +mcmodel= +UrlSuffix(gcc/AArch64-Options.html#index-mcmodel_003d) mtp= UrlSuffix(gcc/AArch64-Options.html#index-mtp) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0ee325dccad..c4a09528ffd 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -3027,202 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index) return __aarch64_vset_lane_any (__elem, __vec, __index); } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_f16 (float16x8_t __a) -{ - return __builtin_aarch64_get_lowv8hf (__a); -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_f32 (float32x4_t __a) -{ - return __builtin_aarch64_get_lowv4sf (__a); -} - -__extension__ extern __inline float64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_f64 (float64x2_t __a) -{ - return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)}; -} - -__extension__ extern __inline poly8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_p8 (poly8x16_t __a) -{ - return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a); -} - -__extension__ extern __inline poly16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_p16 (poly16x8_t __a) -{ - return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a); -} - -__extension__ extern __inline poly64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_p64 (poly64x2_t __a) -{ - return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a); -} - -__extension__ extern __inline int8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_s8 (int8x16_t __a) -{ - return __builtin_aarch64_get_lowv16qi (__a); -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_s16 (int16x8_t __a) -{ - return __builtin_aarch64_get_lowv8hi (__a); -} - -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_s32 (int32x4_t __a) -{ - return __builtin_aarch64_get_lowv4si (__a); -} - -__extension__ extern __inline int64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_s64 (int64x2_t __a) -{ - return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)}; -} - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_u8 (uint8x16_t __a) -{ - return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a); -} - -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_u16 (uint16x8_t __a) -{ - return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a); -} - -__extension__ extern __inline uint32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_u32 (uint32x4_t __a) -{ - return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a); -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_u64 (uint64x2_t __a) -{ - return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)}; -} - -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_f16 (float16x8_t __a) -{ - return __builtin_aarch64_get_highv8hf (__a); -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_f32 (float32x4_t __a) -{ - return __builtin_aarch64_get_highv4sf (__a); -} - -__extension__ extern __inline float64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_f64 (float64x2_t __a) -{ - return (float64x1_t) {__builtin_aarch64_get_highv2df (__a)}; -} - -__extension__ extern __inline poly8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_p8 (poly8x16_t __a) -{ - return (poly8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a); -} - -__extension__ extern __inline poly16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_p16 (poly16x8_t __a) -{ - return (poly16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a); -} - -__extension__ extern __inline poly64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_p64 (poly64x2_t __a) -{ - return (poly64x1_t) __builtin_aarch64_get_highv2di ((int64x2_t) __a); -} - -__extension__ extern __inline int8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_s8 (int8x16_t __a) -{ - return __builtin_aarch64_get_highv16qi (__a); -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_s16 (int16x8_t __a) -{ - return __builtin_aarch64_get_highv8hi (__a); -} - -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_s32 (int32x4_t __a) -{ - return __builtin_aarch64_get_highv4si (__a); -} - -__extension__ extern __inline int64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_s64 (int64x2_t __a) -{ - return (int64x1_t) {__builtin_aarch64_get_highv2di (__a)}; -} - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_u8 (uint8x16_t __a) -{ - return (uint8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a); -} - -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_u16 (uint16x8_t __a) -{ - return (uint16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a); -} - -__extension__ extern __inline uint32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_u32 (uint32x4_t __a) -{ - return (uint32x2_t) __builtin_aarch64_get_highv4si ((int32x4_t) __a); -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_u64 (uint64x2_t __a) -{ - return (uint64x1_t) {__builtin_aarch64_get_highv2di ((int64x2_t) __a)}; -} - __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -28479,20 +28283,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index); } -__extension__ extern __inline bfloat16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_low_bf16 (bfloat16x8_t __a) -{ - return __builtin_aarch64_vget_lo_halfv8bf (__a); -} - -__extension__ extern __inline bfloat16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vget_high_bf16 (bfloat16x8_t __a) -{ - return __builtin_aarch64_vget_hi_halfv8bf (__a); -} - __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_f32_bf16 (bfloat16x4_t __a) diff --git a/gcc/config/aarch64/cygming.h b/gcc/config/aarch64/cygming.h new file mode 100644 index 00000000000..2e7b01feb76 --- /dev/null +++ b/gcc/config/aarch64/cygming.h @@ -0,0 +1,172 @@ +/* Operating system specific defines to be used when targeting GCC for + hosting on Windows32, using a Unix style C library and tools. + Copyright (C) 1995-2024 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_CYGMING_H +#define GCC_AARCH64_CYGMING_H + +#undef PREFERRED_DEBUGGING_TYPE +#define PREFERRED_DEBUGGING_TYPE DINFO_TYPE_NONE + +#define FASTCALL_PREFIX '@' + +#define print_reg(rtx, code, file) (gcc_unreachable ()) + +#define SYMBOL_FLAG_DLLIMPORT 0 +#define SYMBOL_FLAG_DLLEXPORT 0 + +#define SYMBOL_REF_DLLEXPORT_P(X) \ + ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLEXPORT) != 0) + +/* Disable SEH and declare the required SEH-related macros that are +still needed for compilation. */ +#undef TARGET_SEH +#define TARGET_SEH 0 + +#define SSE_REGNO_P(N) (gcc_unreachable (), 0) +#define GENERAL_REGNO_P(N) (gcc_unreachable (), 0) +#define SEH_MAX_FRAME_SIZE (gcc_unreachable (), 0) + +#undef TARGET_PECOFF +#define TARGET_PECOFF 1 + +#include <stdbool.h> +#ifdef __MINGW32__ +#include <stdio.h> +#endif + +extern void mingw_pe_asm_named_section (const char *, unsigned int, tree); +extern void mingw_pe_declare_function_type (FILE *file, const char *name, + int pub); + +#define TARGET_ASM_NAMED_SECTION mingw_pe_asm_named_section + +/* Select attributes for named sections. */ +#define TARGET_SECTION_TYPE_FLAGS mingw_pe_section_type_flags + +#define TARGET_ASM_UNIQUE_SECTION mingw_pe_unique_section +#define TARGET_ENCODE_SECTION_INFO mingw_pe_encode_section_info + +/* Declare the type properly for any external libcall. */ +#define ASM_OUTPUT_EXTERNAL_LIBCALL(FILE, FUN) \ + mingw_pe_declare_function_type (FILE, XSTR (FUN, 0), 1) + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + builtin_define ("__MSVCRT__"); \ + builtin_define ("__MINGW32__"); \ + builtin_define ("_WIN32"); \ + builtin_define_std ("WIN32"); \ + builtin_define_std ("WINNT"); \ + builtin_define_with_int_value ("_INTEGRAL_MAX_BITS", \ + TYPE_PRECISION (intmax_type_node)); \ + builtin_define ("__MINGW64__"); \ + builtin_define_std ("WIN64"); \ + builtin_define ("_WIN64"); \ + builtin_define ("__stdcall=__attribute__((__stdcall__))"); \ + builtin_define ("__fastcall=__attribute__((__fastcall__))"); \ + builtin_define ("__thiscall=__attribute__((__thiscall__))"); \ + builtin_define ("__cdecl=__attribute__((__cdecl__))"); \ + } \ + while (0) + +/* Windows64 continues to use a 32-bit long type. */ +#undef LONG_TYPE_SIZE +#define LONG_TYPE_SIZE 32 + +#undef DEFAULT_SIGNED_CHAR +#define DEFAULT_SIGNED_CHAR 1 + +#undef SIZE_TYPE +#undef PTRDIFF_TYPE +#define SIZE_TYPE "long long unsigned int" +#define PTRDIFF_TYPE "long long int" + +#undef WCHAR_TYPE_SIZE +#undef WCHAR_TYPE +#define WCHAR_TYPE_SIZE 16 +#define WCHAR_TYPE "short unsigned int" + +#define drectve_section() \ + (fprintf (asm_out_file, "\t.section\t.drectve\n"), \ + in_section = NULL) + + +/* Enable alias attribute support. */ +#ifndef SET_ASM_OP +#define SET_ASM_OP "\t.set\t" +#endif + +/* GNU as supports weak symbols on PECOFF. */ +#define ASM_WEAKEN_LABEL(FILE, NAME) \ + do \ + { \ + fputs ("\t.weak\t", (FILE)); \ + assemble_name ((FILE), (NAME)); \ + fputc ('\n', (FILE)); \ + } \ + while (0) + +/* Get tree.cc to declare a target-specific specialization of + merge_decl_attributes. */ +#define TARGET_DLLIMPORT_DECL_ATTRIBUTES 1 + +#define ASM_OUTPUT_ADDR_DIFF_ELT(STREAM, BODY, VALUE, REL) \ + do { \ + switch (GET_MODE (BODY)) \ + { \ + case E_QImode: \ + asm_fprintf (STREAM, "\t.byte\t(%LL%d - %LLrtx%d) / 4\n", \ + VALUE, REL); \ + break; \ + case E_HImode: \ + asm_fprintf (STREAM, "\t.2byte\t(%LL%d - %LLrtx%d) / 4\n", \ + VALUE, REL); \ + break; \ + case E_SImode: \ + case E_DImode: /* See comment in aarch64_output_casesi. */ \ + asm_fprintf (STREAM, "\t.word\t(%LL%d - %LLrtx%d) / 4\n", \ + VALUE, REL); \ + break; \ + default: \ + gcc_unreachable (); \ + } \ + } while (0) + +#define READONLY_DATA_SECTION_ASM_OP "\t.section\t.rdata,\"dr\"" + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS \ + do { \ + flag_stack_check = STATIC_BUILTIN_STACK_CHECK; \ + } while (0) + + +#define SUPPORTS_ONE_ONLY 1 + +/* Define this to be nonzero if static stack checking is supported. */ +#define STACK_CHECK_STATIC_BUILTIN 1 + +#define HAVE_GAS_ALIGNED_COMM 1 + +#undef MAX_OFILE_ALIGNMENT +#define MAX_OFILE_ALIGNMENT (8192 * 8) + +#endif diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 99cde46f1ba..f527b2cfeb8 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2525,6 +2525,7 @@ ;; SVE integer unary operations. (define_code_iterator SVE_INT_UNARY [abs neg not clrsb clz popcount + bitreverse (ss_abs "TARGET_SVE2") (ss_neg "TARGET_SVE2")]) @@ -2573,6 +2574,7 @@ (clrsb "clrsb") (clz "clz") (popcount "popcount") + (bitreverse "rbit") (and "and") (ior "ior") (xor "xor") @@ -2785,6 +2787,7 @@ (clrsb "cls") (clz "clz") (popcount "cnt") + (bitreverse "rbit") (ss_plus "sqadd") (us_plus "uqadd") (ss_minus "sqsub") @@ -2990,7 +2993,7 @@ (define_int_iterator LAST [UNSPEC_LASTA UNSPEC_LASTB]) -(define_int_iterator SVE_INT_UNARY [UNSPEC_RBIT UNSPEC_REVB +(define_int_iterator SVE_INT_UNARY [UNSPEC_REVB UNSPEC_REVH UNSPEC_REVW]) (define_int_iterator SVE_FP_UNARY [UNSPEC_FRECPE UNSPEC_RSQRTE]) @@ -3568,7 +3571,6 @@ (UNSPEC_FRECPS "frecps") (UNSPEC_RSQRTE "frsqrte") (UNSPEC_RSQRTS "frsqrts") - (UNSPEC_RBIT "rbit") (UNSPEC_REVB "revb") (UNSPEC_REVD "revd") (UNSPEC_REVH "revh") @@ -4039,7 +4041,6 @@ (UNSPEC_PMULLT_PAIR "pmullt") (UNSPEC_RADDHNB "raddhnb") (UNSPEC_RADDHNT "raddhnt") - (UNSPEC_RBIT "rbit") (UNSPEC_REVB "revb") (UNSPEC_REVH "revh") (UNSPEC_REVW "revw") @@ -4416,8 +4417,7 @@ (UNSPEC_PFIRST "8") (UNSPEC_PNEXT "64")]) ;; The minimum number of element bits that an instruction can handle. -(define_int_attr min_elem_bits [(UNSPEC_RBIT "8") - (UNSPEC_REVB "16") +(define_int_attr min_elem_bits [(UNSPEC_REVB "16") (UNSPEC_REVH "32") (UNSPEC_REVW "64")]) diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index 78713558e7d..c2a0715e9ab 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -201,9 +201,8 @@ aarch64-early-ra.o: $(srcdir)/config/aarch64/aarch64-early-ra.cc \ $(srcdir)/config/aarch64/aarch64-early-ra.cc aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \ - $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ - $(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \ - fold-const.h tree-hash-traits.h print-tree.h + $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) \ + tree-pass.h pair-fusion.h $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h index cbaf8853ec4..b6a9c9e2eb1 100644 --- a/gcc/config/aarch64/tuning_models/emag.h +++ b/gcc/config/aarch64/tuning_models/emag.h @@ -51,7 +51,7 @@ static const struct tune_params emag_tunings = 2, /* min_div_recip_mul_df. */ 17, /* max_case_values. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ &xgene1_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h index 7e799bbe762..be9a48ac3ad 100644 --- a/gcc/config/aarch64/tuning_models/neoversen2.h +++ b/gcc/config/aarch64/tuning_models/neoversen2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ &generic_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h index 9363f2ad98a..0fc41ce6a41 100644 --- a/gcc/config/aarch64/tuning_models/neoversev1.h +++ b/gcc/config/aarch64/tuning_models/neoversev1.h @@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings = (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT - | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ &generic_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h index bc01ed767c9..f76e4ef358f 100644 --- a/gcc/config/aarch64/tuning_models/neoversev2.h +++ b/gcc/config/aarch64/tuning_models/neoversev2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ &generic_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h index 3301f025260..432793eba9c 100644 --- a/gcc/config/aarch64/tuning_models/xgene1.h +++ b/gcc/config/aarch64/tuning_models/xgene1.h @@ -136,7 +136,7 @@ static const struct tune_params xgene1_tunings = 2, /* min_div_recip_mul_df. */ 17, /* max_case_values. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ &xgene1_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ |