Diffstat (limited to 'gcc/config/i386/i386.c')
-rw-r--r--  gcc/config/i386/i386.c  2041
1 file changed, 1765 insertions(+), 276 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7e89dbde7b9..4af4e5958b7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1672,7 +1672,7 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
- 2, /* cost for loading QImode using movzbl */
+ 4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
@@ -3057,6 +3057,22 @@ ix86_option_override_internal (bool main_args_p)
PTA_64BIT /* flags are only used for -march switch. */ },
};
+ /* -mrecip options. */
+ static struct
+ {
+ const char *string; /* option name */
+ unsigned int mask; /* mask bits to set */
+ }
+ const recip_options[] =
+ {
+ { "all", RECIP_MASK_ALL },
+ { "none", RECIP_MASK_NONE },
+ { "div", RECIP_MASK_DIV },
+ { "sqrt", RECIP_MASK_SQRT },
+ { "vec-div", RECIP_MASK_VEC_DIV },
+ { "vec-sqrt", RECIP_MASK_VEC_SQRT },
+ };
+
int const pta_size = ARRAY_SIZE (processor_alias_table);
/* Set up prefix/suffix so the error messages refer to either the command
@@ -3814,6 +3830,56 @@ ix86_option_override_internal (bool main_args_p)
target_flags &= ~MASK_VZEROUPPER;
}
+ if (ix86_recip_name)
+ {
+ char *p = ASTRDUP (ix86_recip_name);
+ char *q;
+ unsigned int mask, i;
+ bool invert;
+
+ while ((q = strtok (p, ",")) != NULL)
+ {
+ p = NULL;
+ if (*q == '!')
+ {
+ invert = true;
+ q++;
+ }
+ else
+ invert = false;
+
+ if (!strcmp (q, "default"))
+ mask = RECIP_MASK_ALL;
+ else
+ {
+ for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+ if (!strcmp (q, recip_options[i].string))
+ {
+ mask = recip_options[i].mask;
+ break;
+ }
+
+ if (i == ARRAY_SIZE (recip_options))
+ {
+ error ("unknown option for -mrecip=%s", q);
+ invert = false;
+ mask = RECIP_MASK_NONE;
+ }
+ }
+
+ recip_mask_explicit |= mask;
+ if (invert)
+ recip_mask &= ~mask;
+ else
+ recip_mask |= mask;
+ }
+ }
+
+ if (TARGET_RECIP)
+ recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
+ else if (target_flags_explicit & MASK_RECIP)
+ recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
+
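
For illustration, here is a minimal standalone sketch of the parsing loop above (a hypothetical driver with stand-in RECIP_MASK_* values, not the patch's actual definitions); parsing "all,!sqrt" leaves only the division bits set:

/* Hedged sketch of the -mrecip= parser above; mask values are stand-ins.  */
#include <stdio.h>
#include <string.h>

#define RECIP_MASK_NONE 0x0
#define RECIP_MASK_DIV  0x1
#define RECIP_MASK_SQRT 0x2
#define RECIP_MASK_ALL  (RECIP_MASK_DIV | RECIP_MASK_SQRT)

int
main (void)
{
  char buf[] = "all,!sqrt";
  unsigned int recip_mask = RECIP_MASK_NONE;
  char *p = buf, *q;

  while ((q = strtok (p, ",")) != NULL)
    {
      unsigned int mask = RECIP_MASK_NONE;
      int invert = 0;

      p = NULL;			/* strtok continues on the same string.  */
      if (*q == '!')
	{
	  invert = 1;
	  q++;
	}
      if (!strcmp (q, "all"))
	mask = RECIP_MASK_ALL;
      else if (!strcmp (q, "div"))
	mask = RECIP_MASK_DIV;
      else if (!strcmp (q, "sqrt"))
	mask = RECIP_MASK_SQRT;

      if (invert)
	recip_mask &= ~mask;
      else
	recip_mask |= mask;
    }
  printf ("recip_mask = 0x%x\n", recip_mask);	/* 0x1: div only.  */
  return 0;
}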
/* Save the initial options in case the user does function specific
options. */
if (main_args_p)
@@ -3946,6 +4012,7 @@ ix86_function_specific_save (struct cl_target_option *ptr)
ptr->arch_specified = ix86_arch_specified;
ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
ptr->ix86_target_flags_explicit = target_flags_explicit;
+ ptr->x_recip_mask_explicit = recip_mask_explicit;
/* The fields are char but the variables are not; make sure the
values fit in the fields. */
@@ -3973,6 +4040,7 @@ ix86_function_specific_restore (struct cl_target_option *ptr)
ix86_arch_specified = ptr->arch_specified;
ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
target_flags_explicit = ptr->ix86_target_flags_explicit;
+ recip_mask_explicit = ptr->x_recip_mask_explicit;
/* Recreate the arch feature tests if the arch changed */
if (old_arch != ix86_arch)
@@ -7959,7 +8027,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
else
{
tree copy
- = build_call_expr (implicit_built_in_decls[BUILT_IN_MEMCPY],
+ = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
3, dest_addr, src_addr,
size_int (cur_size));
gimplify_and_add (copy, pre_p);
@@ -9134,7 +9202,8 @@ static GTY(()) rtx queued_cfa_restores;
static void
ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
{
- if (cfa_offset <= cfun->machine->fs.red_zone_offset)
+ if (!crtl->shrink_wrapped
+ && cfa_offset <= cfun->machine->fs.red_zone_offset)
return;
if (insn)
@@ -10738,6 +10807,8 @@ ix86_expand_epilogue (int style)
GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
style, true);
}
+ else
+ ix86_add_queued_cfa_restore_notes (get_last_insn ());
/* Sibcall epilogues don't want a return instruction. */
if (style == 0)
@@ -10779,13 +10850,13 @@ ix86_expand_epilogue (int style)
pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
popc, -1, true);
- emit_jump_insn (gen_return_indirect_internal (ecx));
+ emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
else
- emit_jump_insn (gen_return_pop_internal (popc));
+ emit_jump_insn (gen_simple_return_pop_internal (popc));
}
else
- emit_jump_insn (gen_return_internal ());
+ emit_jump_insn (gen_simple_return_internal ());
/* Restore the state back to the state from the prologue,
so that it's correct for the next epilogue. */
@@ -15727,6 +15798,12 @@ ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
if (MEM_P (src1) && !rtx_equal_p (dst, src1))
src1 = force_reg (mode, src1);
+ /* Improve address combine. */
+ if (code == PLUS
+ && GET_MODE_CLASS (mode) == MODE_INT
+ && MEM_P (src2))
+ src2 = force_reg (mode, src2);
+
operands[1] = src1;
operands[2] = src2;
return dst;
@@ -16139,19 +16216,20 @@ distance_non_agu_define (unsigned int regno1, unsigned int regno2,
FOR_EACH_EDGE (e, ei, bb->preds)
{
- int bb_dist = distance_non_agu_define_in_bb (regno1, regno2,
- insn, distance,
- BB_END (e->src),
- &found_in_bb);
+ int bb_dist
+ = distance_non_agu_define_in_bb (regno1, regno2,
+ insn, distance,
+ BB_END (e->src),
+ &found_in_bb);
if (found_in_bb)
{
if (shortest_dist < 0)
shortest_dist = bb_dist;
else if (bb_dist > 0)
shortest_dist = MIN (bb_dist, shortest_dist);
- }
- found = found || found_in_bb;
+ found = true;
+ }
}
distance = shortest_dist;
@@ -16164,11 +16242,9 @@ distance_non_agu_define (unsigned int regno1, unsigned int regno2,
extract_insn_cached (insn);
if (!found)
- distance = -1;
- else
- distance = distance >> 1;
+ return -1;
- return distance;
+ return distance >> 1;
}
/* Return the distance in half-cycles between INSN and the next
@@ -16181,9 +16257,9 @@ distance_non_agu_define (unsigned int regno1, unsigned int regno2,
found and false otherwise. */
static int
-distance_agu_use_in_bb(unsigned int regno,
- rtx insn, int distance, rtx start,
- bool *found, bool *redefined)
+distance_agu_use_in_bb (unsigned int regno,
+ rtx insn, int distance, rtx start,
+ bool *found, bool *redefined)
{
basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
rtx next = start;
@@ -16268,18 +16344,19 @@ distance_agu_use (unsigned int regno0, rtx insn)
FOR_EACH_EDGE (e, ei, bb->succs)
{
- int bb_dist = distance_agu_use_in_bb (regno0, insn,
- distance, BB_HEAD (e->dest),
- &found_in_bb, &redefined_in_bb);
+ int bb_dist
+ = distance_agu_use_in_bb (regno0, insn,
+ distance, BB_HEAD (e->dest),
+ &found_in_bb, &redefined_in_bb);
if (found_in_bb)
{
if (shortest_dist < 0)
shortest_dist = bb_dist;
else if (bb_dist > 0)
shortest_dist = MIN (bb_dist, shortest_dist);
- }
- found = found || found_in_bb;
+ found = true;
+ }
}
distance = shortest_dist;
@@ -16287,11 +16364,9 @@ distance_agu_use (unsigned int regno0, rtx insn)
}
if (!found || redefined)
- distance = -1;
- else
- distance = distance >> 1;
+ return -1;
- return distance;
+ return distance >> 1;
}
/* Define this macro to tune LEA priority vs ADD; it takes effect when
@@ -16346,7 +16421,7 @@ ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
false otherwise. */
static bool
-ix86_ok_to_clobber_flags(rtx insn)
+ix86_ok_to_clobber_flags (rtx insn)
{
basic_block bb = BLOCK_FOR_INSN (insn);
df_ref *use;
@@ -16470,6 +16545,21 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
}
+/* Emit x86 binary operation CODE in mode MODE, where the first operand
+ matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
+
+static void
+ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
+ rtx dst, rtx src)
+{
+ rtx op, clob;
+
+ op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
+ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+}
+
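
For reference, a call such as ix86_emit_binop (PLUS, SImode, dst, src) emits a single insn of the following shape (a sketch of the resulting RTL, with dst and src as placeholder registers):

(parallel [(set (reg:SI dst)
                (plus:SI (reg:SI dst) (reg:SI src)))
           (clobber (reg:CC 17 flags))])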
/* Split lea instructions into a sequence of instructions
which are executed on ALU to avoid AGU stalls.
It is assumed that it is allowed to clobber flags register
@@ -16482,8 +16572,7 @@ ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
unsigned int regno1 = INVALID_REGNUM;
unsigned int regno2 = INVALID_REGNUM;
struct ix86_address parts;
- rtx tmp, clob;
- rtvec par;
+ rtx tmp;
int ok, adds;
ok = ix86_decompose_address (operands[1], &parts);
@@ -16515,14 +16604,7 @@ ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
gcc_assert (regno2 != regno0);
for (adds = parts.scale; adds > 0; adds--)
- {
- tmp = gen_rtx_PLUS (mode, operands[0], parts.index);
- tmp = gen_rtx_SET (VOIDmode, operands[0], tmp);
- clob = gen_rtx_CLOBBER (VOIDmode,
- gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (2, tmp, clob);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
- }
+ ix86_emit_binop (PLUS, mode, operands[0], parts.index);
}
else
{
@@ -16531,30 +16613,14 @@ ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
/* Use shift for scaling. */
- tmp = gen_rtx_ASHIFT (mode, operands[0],
- GEN_INT (exact_log2 (parts.scale)));
- tmp = gen_rtx_SET (VOIDmode, operands[0], tmp);
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (2, tmp, clob);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
+ ix86_emit_binop (ASHIFT, mode, operands[0],
+ GEN_INT (exact_log2 (parts.scale)));
if (parts.base)
- {
- tmp = gen_rtx_PLUS (mode, operands[0], parts.base);
- tmp = gen_rtx_SET (VOIDmode, operands[0], tmp);
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (2, tmp, clob);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
- }
+ ix86_emit_binop (PLUS, mode, operands[0], parts.base);
if (parts.disp && parts.disp != const0_rtx)
- {
- tmp = gen_rtx_PLUS (mode, operands[0], parts.disp);
- tmp = gen_rtx_SET (VOIDmode, operands[0], tmp);
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (2, tmp, clob);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
- }
+ ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
}
}
else if (!parts.base && !parts.index)
@@ -16565,41 +16631,32 @@ ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
else
{
if (!parts.base)
- {
- if (regno0 != regno2)
- emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
- }
+ {
+ if (regno0 != regno2)
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
+ }
else if (!parts.index)
- {
- if (regno0 != regno1)
- emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
- }
- else
- {
- if (regno0 == regno1)
- tmp = gen_rtx_PLUS (mode, operands[0], parts.index);
- else if (regno0 == regno2)
- tmp = gen_rtx_PLUS (mode, operands[0], parts.base);
- else
- {
+ {
+ if (regno0 != regno1)
emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
- tmp = gen_rtx_PLUS (mode, operands[0], parts.index);
- }
+ }
+ else
+ {
+ if (regno0 == regno1)
+ tmp = parts.index;
+ else if (regno0 == regno2)
+ tmp = parts.base;
+ else
+ {
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
+ tmp = parts.index;
+ }
- tmp = gen_rtx_SET (VOIDmode, operands[0], tmp);
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (2, tmp, clob);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
- }
+ ix86_emit_binop (PLUS, mode, operands[0], tmp);
+ }
if (parts.disp && parts.disp != const0_rtx)
- {
- tmp = gen_rtx_PLUS (mode, operands[0], parts.disp);
- tmp = gen_rtx_SET (VOIDmode, operands[0], tmp);
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (2, tmp, clob);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
- }
+ ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
}
}
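
As an illustration of the split, an insn such as lea 0x4(%ebx,%esi,4), %eax would on this path become a sequence of plain ALU operations (a sketch in AT&T syntax; each instruction clobbers the flags, which is why ix86_ok_to_clobber_flags must agree):

	movl	%esi, %eax	# copy index into the destination
	sall	$2, %eax	# scale by 4 via a shift
	addl	%ebx, %eax	# add the base register
	addl	$4, %eax	# add the displacement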
@@ -16951,6 +17008,10 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
switch (mode)
{
+ case V32QImode:
+ case V16QImode:
+ case V16HImode:
+ case V8HImode:
case V8SImode:
case V4SImode:
case V4DImode:
@@ -18890,7 +18951,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
enum machine_mode mode = GET_MODE (dest);
rtx t2, t3, x;
- if (vector_all_ones_operand (op_true, GET_MODE (op_true))
+ if (vector_all_ones_operand (op_true, mode)
&& rtx_equal_p (op_false, CONST0_RTX (mode)))
{
emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
@@ -19119,7 +19180,8 @@ ix86_expand_fp_vcond (rtx operands[])
bool
ix86_expand_int_vcond (rtx operands[])
{
- enum machine_mode mode = GET_MODE (operands[0]);
+ enum machine_mode data_mode = GET_MODE (operands[0]);
+ enum machine_mode mode = GET_MODE (operands[4]);
enum rtx_code code = GET_CODE (operands[3]);
bool negate = false;
rtx x, cop0, cop1;
@@ -19246,14 +19308,372 @@ ix86_expand_int_vcond (rtx operands[])
}
}
- x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
- operands[1+negate], operands[2-negate]);
+ /* Allow the comparison to be done in one mode, but the movcc to
+ happen in another mode. */
+ if (data_mode == mode)
+ {
+ x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
+ operands[1+negate], operands[2-negate]);
+ }
+ else
+ {
+ gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
+ x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
+ code, cop0, cop1,
+ operands[1+negate], operands[2-negate]);
+ x = gen_lowpart (data_mode, x);
+ }
ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
operands[2-negate]);
return true;
}
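
A source-level case that exercises this split, sketched with GCC vector extensions (assuming a compiler recent enough to accept vector conditionals): the comparison is carried out in V4SImode while the select moves V4SFmode data.

/* Sketch only: the condition vector and the data vectors have different
   element types but the same 16-byte size.  */
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));

v4sf
select_by_int (v4si a, v4si b, v4sf x, v4sf y)
{
  /* data_mode is V4SF (operands[0]); the comparison mode is V4SI.  */
  return a > b ? x : y;
}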
+/* Expand a variable vector permutation. */
+
+void
+ix86_expand_vec_perm (rtx operands[])
+{
+ rtx target = operands[0];
+ rtx op0 = operands[1];
+ rtx op1 = operands[2];
+ rtx mask = operands[3];
+ rtx t1, t2, t3, t4, vt, vt2, vec[32];
+ enum machine_mode mode = GET_MODE (op0);
+ enum machine_mode maskmode = GET_MODE (mask);
+ int w, e, i;
+ bool one_operand_shuffle = rtx_equal_p (op0, op1);
+
+ /* Number of elements in the vector. */
+ w = GET_MODE_NUNITS (mode);
+ e = GET_MODE_UNIT_SIZE (mode);
+ gcc_assert (w <= 32);
+
+ if (TARGET_AVX2)
+ {
+ if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
+ {
+ /* Unfortunately, the VPERMQ and VPERMPD instructions only support
+ a constant shuffle operand. With a tiny bit of effort we can
+ use VPERMD instead. A re-interpretation stall for V4DFmode is
+ unfortunate but there's no avoiding it.
+ Similarly, for V16HImode we don't have instructions for variable
+ shuffling, while for V32QImode we can use vpshufb; vpshufb;
+ vpermq; vpor after preparing suitable masks. */
+
+ if (mode == V16HImode)
+ {
+ maskmode = mode = V32QImode;
+ w = 32;
+ e = 1;
+ }
+ else
+ {
+ maskmode = mode = V8SImode;
+ w = 8;
+ e = 4;
+ }
+ t1 = gen_reg_rtx (maskmode);
+
+ /* Replicate the low bits of the V4DImode mask into V8SImode:
+ mask = { A B C D }
+ t1 = { A A B B C C D D }. */
+ for (i = 0; i < w / 2; ++i)
+ vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
+ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+ vt = force_reg (maskmode, vt);
+ mask = gen_lowpart (maskmode, mask);
+ if (maskmode == V8SImode)
+ emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+ else
+ emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
+
+ /* Multiply the shuffle indices by two. */
+ t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
+ OPTAB_DIRECT);
+
+ /* Add one to the odd shuffle indices:
+ t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
+ for (i = 0; i < w / 2; ++i)
+ {
+ vec[i * 2] = const0_rtx;
+ vec[i * 2 + 1] = const1_rtx;
+ }
+ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+ vt = force_const_mem (maskmode, vt);
+ t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
+ OPTAB_DIRECT);
+
+ /* Continue as if V8SImode (resp. V32QImode) was used initially. */
+ operands[3] = mask = t1;
+ target = gen_lowpart (mode, target);
+ op0 = gen_lowpart (mode, op0);
+ op1 = gen_lowpart (mode, op1);
+ }
+
+ switch (mode)
+ {
+ case V8SImode:
+ /* The VPERMD and VPERMPS instructions already properly ignore
+ the high bits of the shuffle elements. No need for us to
+ perform an AND ourselves. */
+ if (one_operand_shuffle)
+ emit_insn (gen_avx2_permvarv8si (target, mask, op0));
+ else
+ {
+ t1 = gen_reg_rtx (V8SImode);
+ t2 = gen_reg_rtx (V8SImode);
+ emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
+ emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
+ goto merge_two;
+ }
+ return;
+
+ case V8SFmode:
+ mask = gen_lowpart (V8SFmode, mask);
+ if (one_operand_shuffle)
+ emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
+ else
+ {
+ t1 = gen_reg_rtx (V8SFmode);
+ t2 = gen_reg_rtx (V8SFmode);
+ emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
+ emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
+ goto merge_two;
+ }
+ return;
+
+ case V4SImode:
+ /* By combining the two 128-bit input vectors into one 256-bit
+ input vector, we can use VPERMD and VPERMPS for the full
+ two-operand shuffle. */
+ t1 = gen_reg_rtx (V8SImode);
+ t2 = gen_reg_rtx (V8SImode);
+ emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
+ emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
+ emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
+ emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
+ return;
+
+ case V4SFmode:
+ t1 = gen_reg_rtx (V8SFmode);
+ t2 = gen_reg_rtx (V8SFmode);
+ mask = gen_lowpart (V4SFmode, mask);
+ emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
+ emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
+ emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
+ emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
+ return;
+
+ case V32QImode:
+ t1 = gen_reg_rtx (V32QImode);
+ t2 = gen_reg_rtx (V32QImode);
+ t3 = gen_reg_rtx (V32QImode);
+ vt2 = GEN_INT (128);
+ for (i = 0; i < 32; i++)
+ vec[i] = vt2;
+ vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+ vt = force_reg (V32QImode, vt);
+ for (i = 0; i < 32; i++)
+ vec[i] = i < 16 ? vt2 : const0_rtx;
+ vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+ vt2 = force_reg (V32QImode, vt2);
+ /* From mask create two adjusted masks, which contain the same
+ bits as mask in the low 7 bits of each vector element.
+ The first mask will have the most significant bit clear
+ if it requests element from the same 128-bit lane
+ and MSB set if it requests element from the other 128-bit lane.
+ The second mask will have the opposite values of the MSB,
+ and additionally will have its 128-bit lanes swapped.
+ E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
+ t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
+ t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
+ stands for the other 12 bytes. */
+ /* The bit that tells whether an element comes from the same lane or
+ the other lane is bit 4, so shift it up by 3 to the MSB position. */
+ emit_insn (gen_avx2_lshlv4di3 (gen_lowpart (V4DImode, t1),
+ gen_lowpart (V4DImode, mask),
+ GEN_INT (3)));
+ /* Clear MSB bits from the mask just in case it had them set. */
+ emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
+ /* After this t1 will have MSB set for elements from other lane. */
+ emit_insn (gen_xorv32qi3 (t1, t1, vt2));
+ /* Clear bits other than MSB. */
+ emit_insn (gen_andv32qi3 (t1, t1, vt));
+ /* Or in the lower bits from mask into t3. */
+ emit_insn (gen_iorv32qi3 (t3, t1, t2));
+ /* And invert MSB bits in t1, so MSB is set for elements from the same
+ lane. */
+ emit_insn (gen_xorv32qi3 (t1, t1, vt));
+ /* Swap 128-bit lanes in t3. */
+ emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+ gen_lowpart (V4DImode, t3),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ /* And or in the lower bits from mask into t1. */
+ emit_insn (gen_iorv32qi3 (t1, t1, t2));
+ if (one_operand_shuffle)
+ {
+ /* Each of these shuffles will put 0s in places where
+ an element from the other 128-bit lane is needed;
+ otherwise it will shuffle in the requested value. */
+ emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
+ emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
+ /* For t3 the 128-bit lanes are swapped again. */
+ emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+ gen_lowpart (V4DImode, t3),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ /* And oring both together leads to the result. */
+ emit_insn (gen_iorv32qi3 (target, t1, t3));
+ return;
+ }
+
+ t4 = gen_reg_rtx (V32QImode);
+ /* Similar to the above one_operand_shuffle code, just
+ repeated twice for each operand. The merge_two code
+ below will merge the two results together. */
+ emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
+ emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
+ emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
+ emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
+ emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
+ gen_lowpart (V4DImode, t4),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+ gen_lowpart (V4DImode, t3),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ emit_insn (gen_iorv32qi3 (t4, t2, t4));
+ emit_insn (gen_iorv32qi3 (t3, t1, t3));
+ t1 = t4;
+ t2 = t3;
+ goto merge_two;
+
+ default:
+ gcc_assert (GET_MODE_SIZE (mode) <= 16);
+ break;
+ }
+ }
+
+ if (TARGET_XOP)
+ {
+ /* The XOP VPPERM insn supports three inputs. By ignoring the
+ one_operand_shuffle special case, we avoid creating another
+ set of constant vectors in memory. */
+ one_operand_shuffle = false;
+
+ /* mask = mask & {2*w-1, ...} */
+ vt = GEN_INT (2*w - 1);
+ }
+ else
+ {
+ /* mask = mask & {w-1, ...} */
+ vt = GEN_INT (w - 1);
+ }
+
+ for (i = 0; i < w; i++)
+ vec[i] = vt;
+ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+ mask = expand_simple_binop (maskmode, AND, mask, vt,
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* For non-QImode operations, convert the word permutation control
+ into a byte permutation control. */
+ if (mode != V16QImode)
+ {
+ mask = expand_simple_binop (maskmode, ASHIFT, mask,
+ GEN_INT (exact_log2 (e)),
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* Convert mask to vector of chars. */
+ mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
+
+ /* Replicate each of the input bytes into byte positions:
+ (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
+ (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+ (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
+ for (i = 0; i < 16; ++i)
+ vec[i] = GEN_INT (i/e * e);
+ vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+ vt = force_const_mem (V16QImode, vt);
+ if (TARGET_XOP)
+ emit_insn (gen_xop_pperm (mask, mask, mask, vt));
+ else
+ emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
+
+ /* Convert it into the byte positions by doing
+ mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
+ for (i = 0; i < 16; ++i)
+ vec[i] = GEN_INT (i % e);
+ vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+ vt = force_const_mem (V16QImode, vt);
+ emit_insn (gen_addv16qi3 (mask, mask, vt));
+ }
+
+ /* The actual shuffle operations all operate on V16QImode. */
+ op0 = gen_lowpart (V16QImode, op0);
+ op1 = gen_lowpart (V16QImode, op1);
+ target = gen_lowpart (V16QImode, target);
+
+ if (TARGET_XOP)
+ {
+ emit_insn (gen_xop_pperm (target, op0, op1, mask));
+ }
+ else if (one_operand_shuffle)
+ {
+ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
+ }
+ else
+ {
+ rtx xops[6];
+ bool ok;
+
+ /* Shuffle the two input vectors independently. */
+ t1 = gen_reg_rtx (V16QImode);
+ t2 = gen_reg_rtx (V16QImode);
+ emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
+ emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
+
+ merge_two:
+ /* Then merge them together. The key is whether any given control
+ element contained a bit set that indicates the second word. */
+ mask = operands[3];
+ vt = GEN_INT (w);
+ if (maskmode == V2DImode && !TARGET_SSE4_1)
+ {
+ /* Without SSE4.1, we don't have V2DImode EQ. Perform one
+ more shuffle to convert the V2DI input mask into a V4SI
+ input mask, at which point the masking done by
+ ix86_expand_int_vcond will work as desired. */
+ rtx t3 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
+ const0_rtx, const0_rtx,
+ const2_rtx, const2_rtx));
+ mask = t3;
+ maskmode = V4SImode;
+ e = w = 4;
+ }
+
+ for (i = 0; i < w; i++)
+ vec[i] = vt;
+ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+ vt = force_reg (maskmode, vt);
+ mask = expand_simple_binop (maskmode, AND, mask, vt,
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ xops[0] = gen_lowpart (mode, operands[0]);
+ xops[1] = gen_lowpart (mode, t2);
+ xops[2] = gen_lowpart (mode, t1);
+ xops[3] = gen_rtx_EQ (maskmode, mask, vt);
+ xops[4] = mask;
+ xops[5] = vt;
+ ok = ix86_expand_int_vcond (xops);
+ gcc_assert (ok);
+ }
+}
+
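
To see the non-QImode control conversion in ix86_expand_vec_perm concretely, here is a scalar model (hypothetical, mirroring the AND, shift, replicate and add steps for a V8HImode mask, where e == 2):

#include <stdio.h>

int
main (void)
{
  unsigned int mask[8] = { 7, 0, 3, 5, 1, 1, 6, 2 };	/* word indices */
  unsigned char bytes[16];
  unsigned int e = 2, w = 8, i;				/* 2 bytes per word */

  for (i = 0; i < 16; i++)
    /* (mask & (w-1)) << log2(e), replicated per byte, plus i % e.  */
    bytes[i] = ((mask[i / e] & (w - 1)) << 1) + (i % e);

  for (i = 0; i < 16; i++)
    printf ("%u ", bytes[i]);	/* 14 15 0 1 6 7 10 11 2 3 2 3 12 13 4 5 */
  printf ("\n");
  return 0;
}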
/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
true if we should do zero extension, else sign extension. HIGH_P is
true if we want the N/2 high elements, else the low elements. */
@@ -19267,9 +19687,38 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
if (TARGET_SSE4_1)
{
rtx (*unpack)(rtx, rtx);
+ rtx (*extract)(rtx, rtx) = NULL;
+ enum machine_mode halfmode = BLKmode;
switch (imode)
{
+ case V32QImode:
+ if (unsigned_p)
+ unpack = gen_avx2_zero_extendv16qiv16hi2;
+ else
+ unpack = gen_avx2_sign_extendv16qiv16hi2;
+ halfmode = V16QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
+ break;
+ case V16HImode:
+ if (unsigned_p)
+ unpack = gen_avx2_zero_extendv8hiv8si2;
+ else
+ unpack = gen_avx2_sign_extendv8hiv8si2;
+ halfmode = V8HImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
+ break;
+ case V8SImode:
+ if (unsigned_p)
+ unpack = gen_avx2_zero_extendv4siv4di2;
+ else
+ unpack = gen_avx2_sign_extendv4siv4di2;
+ halfmode = V4SImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
+ break;
case V16QImode:
if (unsigned_p)
unpack = gen_sse4_1_zero_extendv8qiv8hi2;
@@ -19292,7 +19741,12 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
gcc_unreachable ();
}
- if (high_p)
+ if (GET_MODE_SIZE (imode) == 32)
+ {
+ tmp = gen_reg_rtx (halfmode);
+ emit_insn (extract (tmp, operands[1]));
+ }
+ else if (high_p)
{
/* Shift higher 8 bytes to lower 8 bytes. */
tmp = gen_reg_rtx (imode);
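
The new 32-byte path corresponds to what one would write with AVX2 intrinsics (an illustrative sketch, not code from the patch):

#include <immintrin.h>

/* Sign-extend the high 16 bytes of a 32-byte vector to 16 words,
   mirroring the extract-then-unpack sequence above.  */
__m256i
unpack_high_s8_to_s16 (__m256i x)
{
  __m128i hi = _mm256_extracti128_si256 (x, 1); /* vec_extract_hi_v32qi */
  return _mm256_cvtepi8_epi16 (hi);	       /* avx2_sign_extendv16qiv16hi2 */
}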
@@ -25797,7 +26251,7 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
- { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv4di, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
@@ -25861,7 +26315,7 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
- { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlqv4di3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
@@ -25872,7 +26326,7 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
- { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrqv4di3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
@@ -27502,6 +27956,11 @@ ix86_expand_args_builtin (const struct builtin_description *d,
rmode = V1TImode;
nargs_constant = 1;
break;
+ case V4DI_FTYPE_V4DI_INT_CONVERT:
+ nargs = 2;
+ rmode = V2TImode;
+ nargs_constant = 1;
+ break;
case V8HI_FTYPE_V8HI_INT:
case V8HI_FTYPE_V8SF_INT:
case V8HI_FTYPE_V4SF_INT:
@@ -27779,7 +28238,6 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
klass = store;
memory = 0;
break;
- break;
case UINT64_FTYPE_VOID:
case UNSIGNED_FTYPE_VOID:
nargs = 0;
@@ -28432,7 +28890,6 @@ rdrand_step:
op4 = expand_normal (arg4);
/* Note the arg order is different from the operand order. */
mode0 = insn_data[icode].operand[1].mode;
- mode1 = insn_data[icode].operand[2].mode;
mode2 = insn_data[icode].operand[3].mode;
mode3 = insn_data[icode].operand[4].mode;
mode4 = insn_data[icode].operand[5].mode;
@@ -28446,12 +28903,11 @@ rdrand_step:
if (GET_MODE (op1) != Pmode)
op1 = convert_to_mode (Pmode, op1, 1);
op1 = force_reg (Pmode, op1);
- op1 = gen_rtx_MEM (mode1, op1);
if (!insn_data[icode].operand[1].predicate (op0, mode0))
op0 = copy_to_mode_reg (mode0, op0);
- if (!insn_data[icode].operand[2].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
+ if (!insn_data[icode].operand[2].predicate (op1, Pmode))
+ op1 = copy_to_mode_reg (Pmode, op1);
if (!insn_data[icode].operand[3].predicate (op2, mode2))
op2 = copy_to_mode_reg (mode2, op2);
if (!insn_data[icode].operand[4].predicate (op3, mode3))
@@ -28844,7 +29300,7 @@ ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
return NULL_TREE;
}
- bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
+ bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
if (fn == BUILT_IN_LOGF)
strcpy (name, "vmlsLn4");
@@ -28862,7 +29318,8 @@ ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
name[4] &= ~0x20;
arity = 0;
- for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
+ for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
+ args;
args = TREE_CHAIN (args))
arity++;
@@ -28943,11 +29400,12 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
return NULL_TREE;
}
- bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
+ bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
sprintf (name + 7, "%s", bname+10);
arity = 0;
- for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
+ for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
+ args;
args = TREE_CHAIN (args))
arity++;
@@ -30799,7 +31257,7 @@ x86_output_mi_thunk (FILE *file,
}
}
- emit_insn (ix86_gen_add3 (delta_dst, delta_dst, delta_rtx));
+ ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
}
/* Adjust the this parameter by a value stored in the vtable. */
@@ -30842,7 +31300,7 @@ x86_output_mi_thunk (FILE *file,
REGNO (this_reg)),
vcall_mem));
else
- emit_insn (ix86_gen_add3 (this_reg, this_reg, vcall_mem));
+ ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
}
/* If necessary, drop THIS back to its stack slot. */
@@ -31189,7 +31647,7 @@ ix86_pad_returns (void)
}
if (replace)
{
- emit_jump_insn_before (gen_return_internal_long (), ret);
+ emit_jump_insn_before (gen_simple_return_internal_long (), ret);
delete_insn (ret);
}
}
@@ -31457,9 +31915,9 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
-/* AVX does not support 32-byte integer vector operations,
- thus the longest vector we are faced with is V16QImode. */
-#define MAX_VECT_LEN 16
+/* AVX2 does support 32-byte integer vector operations,
+ thus the longest vector we are faced with is V32QImode. */
+#define MAX_VECT_LEN 32
struct expand_vec_perm_d
{
@@ -31472,6 +31930,9 @@ struct expand_vec_perm_d
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+static int extract_vec_perm_cst (struct expand_vec_perm_d *, tree);
+static bool ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask);
+
/* Get a vector mode of the same size as the original but with elements
twice as wide. This is only guaranteed to apply to integral vectors. */
@@ -32793,72 +33254,100 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
}
}
-/* Expand a vector reduction. FN is the binary pattern to reduce;
- DEST is the destination; IN is the input vector. */
+/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
+ to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
+ The upper bits of DEST are undefined, though they shouldn't cause
+ exceptions (some bits from src or all zeros are ok). */
-void
-ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+static void
+emit_reduc_half (rtx dest, rtx src, int i)
{
- rtx tmp1, tmp2, tmp3, tmp4, tmp5;
- enum machine_mode mode = GET_MODE (in);
- int i;
-
- tmp1 = gen_reg_rtx (mode);
- tmp2 = gen_reg_rtx (mode);
- tmp3 = gen_reg_rtx (mode);
-
- switch (mode)
+ rtx tem;
+ switch (GET_MODE (src))
{
case V4SFmode:
- emit_insn (gen_sse_movhlps (tmp1, in, in));
- emit_insn (fn (tmp2, tmp1, in));
- emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
- const1_rtx, const1_rtx,
- GEN_INT (1+4), GEN_INT (1+4)));
+ if (i == 128)
+ tem = gen_sse_movhlps (dest, src, src);
+ else
+ tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
+ GEN_INT (1 + 4), GEN_INT (1 + 4));
+ break;
+ case V2DFmode:
+ tem = gen_vec_interleave_highv2df (dest, src, src);
+ break;
+ case V16QImode:
+ case V8HImode:
+ case V4SImode:
+ case V2DImode:
+ tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
+ gen_lowpart (V1TImode, src),
+ GEN_INT (i / 2));
break;
case V8SFmode:
- tmp4 = gen_reg_rtx (mode);
- tmp5 = gen_reg_rtx (mode);
- emit_insn (gen_avx_vperm2f128v8sf3 (tmp4, in, in, const1_rtx));
- emit_insn (fn (tmp5, tmp4, in));
- emit_insn (gen_avx_shufps256 (tmp1, tmp5, tmp5, GEN_INT (2+12)));
- emit_insn (fn (tmp2, tmp1, tmp5));
- emit_insn (gen_avx_shufps256 (tmp3, tmp2, tmp2, const1_rtx));
+ if (i == 256)
+ tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
+ else
+ tem = gen_avx_shufps256 (dest, src, src,
+ GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
break;
case V4DFmode:
- emit_insn (gen_avx_vperm2f128v4df3 (tmp1, in, in, const1_rtx));
- emit_insn (fn (tmp2, tmp1, in));
- emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx));
+ if (i == 256)
+ tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
+ else
+ tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
break;
case V32QImode:
case V16HImode:
case V8SImode:
case V4DImode:
- emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, tmp1),
- gen_lowpart (V4DImode, in),
- gen_lowpart (V4DImode, in),
- const1_rtx));
- tmp4 = in;
- tmp5 = tmp1;
- for (i = 64; i >= GET_MODE_BITSIZE (GET_MODE_INNER (mode)); i >>= 1)
- {
- if (i != 64)
- {
- tmp2 = gen_reg_rtx (mode);
- tmp3 = gen_reg_rtx (mode);
- }
- emit_insn (fn (tmp2, tmp4, tmp5));
- emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, tmp3),
- gen_lowpart (V2TImode, tmp2),
- GEN_INT (i)));
- tmp4 = tmp2;
- tmp5 = tmp3;
- }
+ if (i == 256)
+ tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
+ gen_lowpart (V4DImode, src),
+ gen_lowpart (V4DImode, src),
+ const1_rtx);
+ else
+ tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
+ gen_lowpart (V2TImode, src),
+ GEN_INT (i / 2));
break;
default:
gcc_unreachable ();
}
- emit_insn (fn (dest, tmp2, tmp3));
+ emit_insn (tem);
+}
+
+/* Expand a vector reduction. FN is the binary pattern to reduce;
+ DEST is the destination; IN is the input vector. */
+
+void
+ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+{
+ rtx half, dst, vec = in;
+ enum machine_mode mode = GET_MODE (in);
+ int i;
+
+ /* SSE4 has a special instruction for V8HImode UMIN reduction. */
+ if (TARGET_SSE4_1
+ && mode == V8HImode
+ && fn == gen_uminv8hi3)
+ {
+ emit_insn (gen_sse4_1_phminposuw (dest, in));
+ return;
+ }
+
+ for (i = GET_MODE_BITSIZE (mode);
+ i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
+ i >>= 1)
+ {
+ half = gen_reg_rtx (mode);
+ emit_reduc_half (half, vec, i);
+ if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
+ dst = dest;
+ else
+ dst = gen_reg_rtx (mode);
+ emit_insn (fn (dst, half, vec));
+ vec = dst;
+ }
}
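
A scalar model of the halving loop (hypothetical, with fn standing in for a smax pattern on a V8SImode vector):

#include <stdio.h>

int
main (void)
{
  int v[8] = { 3, 1, 4, 1, 5, 9, 2, 6 };	/* stand-in for a V8SI reg */
  int half, j;

  /* Combine the vector with a copy shifted down by half the remaining
     width until a single element is left, as emit_reduc_half arranges.  */
  for (half = 4; half >= 1; half /= 2)
    for (j = 0; j < half; j++)
      v[j] = v[j] > v[j + half] ? v[j] : v[j + half];

  printf ("%d\n", v[0]);	/* 9 */
  return 0;
}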
/* Target hook for scalar_mode_supported_p. */
@@ -33103,7 +33592,7 @@ void ix86_emit_i387_round (rtx op0, rtx op1)
res = gen_reg_rtx (outmode);
half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
-
+
/* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
/* scratch = fxam(op1) */
@@ -34263,7 +34752,7 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of blendp[sd] / pblendw / pblendvb. */
+ in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
@@ -34271,10 +34760,17 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
enum machine_mode vmode = d->vmode;
unsigned i, mask, nelt = d->nelt;
rtx target, op0, op1, x;
+ rtx rperm[32], vperm;
- if (!TARGET_SSE4_1 || d->op0 == d->op1)
+ if (d->op0 == d->op1)
return false;
- if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
+ if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ ;
+ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+ ;
+ else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ ;
+ else
return false;
/* This is a blend, not a permute. Elements must stay in their
@@ -34292,30 +34788,6 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
/* ??? Without SSE4.1, we could implement this with and/andn/or. This
decision should be extracted elsewhere, so that we only try that
sequence once all budget==3 options have been tried. */
-
- /* For bytes, see if bytes move in pairs so we can use pblendw with
- an immediate argument, rather than pblendvb with a vector argument. */
- if (vmode == V16QImode)
- {
- bool pblendw_ok = true;
- for (i = 0; i < 16 && pblendw_ok; i += 2)
- pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
-
- if (!pblendw_ok)
- {
- rtx rperm[16], vperm;
-
- for (i = 0; i < nelt; ++i)
- rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
-
- vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
- vperm = force_reg (V16QImode, vperm);
-
- emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
- return true;
- }
- }
-
target = d->target;
op0 = d->op0;
op1 = d->op1;
@@ -34328,6 +34800,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
case V2DFmode:
case V4SFmode:
case V8HImode:
+ case V8SImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -34335,24 +34808,122 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
case V2DImode:
for (i = 0; i < 2; ++i)
mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+ vmode = V8HImode;
goto do_subreg;
case V4SImode:
for (i = 0; i < 4; ++i)
mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8HImode;
goto do_subreg;
case V16QImode:
+ /* See if bytes move in pairs so we can use pblendw with
+ an immediate argument, rather than pblendvb with a vector
+ argument. */
+ for (i = 0; i < 16; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ {
+ use_pblendvb:
+ for (i = 0; i < nelt; ++i)
+ rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+ finish_pblendvb:
+ vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+ vperm = force_reg (vmode, vperm);
+
+ if (GET_MODE_SIZE (vmode) == 16)
+ emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
+ else
+ emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+ return true;
+ }
+
for (i = 0; i < 8; ++i)
mask |= (d->perm[i * 2] >= 16) << i;
+ vmode = V8HImode;
+ /* FALLTHRU */
do_subreg:
- vmode = V8HImode;
target = gen_lowpart (vmode, target);
op0 = gen_lowpart (vmode, op0);
op1 = gen_lowpart (vmode, op1);
break;
+ case V32QImode:
+ /* See if bytes move in pairs. If not, vpblendvb must be used. */
+ for (i = 0; i < 32; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ goto use_pblendvb;
+ /* See if bytes move in quadruplets. If yes, vpblendd
+ with immediate can be used. */
+ for (i = 0; i < 32; i += 4)
+ if (d->perm[i] + 2 != d->perm[i + 2])
+ break;
+ if (i < 32)
+ {
+ /* See if bytes move the same in both lanes. If yes,
+ vpblendw with immediate can be used. */
+ for (i = 0; i < 16; i += 2)
+ if (d->perm[i] + 16 != d->perm[i + 16])
+ goto use_pblendvb;
+
+ /* Use vpblendw. */
+ for (i = 0; i < 16; ++i)
+ mask |= (d->perm[i * 2] >= 32) << i;
+ vmode = V16HImode;
+ goto do_subreg;
+ }
+
+ /* Use vpblendd. */
+ for (i = 0; i < 8; ++i)
+ mask |= (d->perm[i * 4] >= 32) << i;
+ vmode = V8SImode;
+ goto do_subreg;
+
+ case V16HImode:
+ /* See if words move in pairs. If yes, vpblendd can be used. */
+ for (i = 0; i < 16; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ break;
+ if (i < 16)
+ {
+ /* See if words move the same in both lanes. If not,
+ vpblendvb must be used. */
+ for (i = 0; i < 8; i++)
+ if (d->perm[i] + 8 != d->perm[i + 8])
+ {
+ /* Use vpblendvb. */
+ for (i = 0; i < 32; ++i)
+ rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
+
+ vmode = V32QImode;
+ nelt = 32;
+ target = gen_lowpart (vmode, target);
+ op0 = gen_lowpart (vmode, op0);
+ op1 = gen_lowpart (vmode, op1);
+ goto finish_pblendvb;
+ }
+
+ /* Use vpblendw. */
+ for (i = 0; i < 16; ++i)
+ mask |= (d->perm[i] >= 16) << i;
+ break;
+ }
+
+ /* Use vpblendd. */
+ for (i = 0; i < 8; ++i)
+ mask |= (d->perm[i * 2] >= 16) << i;
+ vmode = V8SImode;
+ goto do_subreg;
+
+ case V4DImode:
+ /* Use vpblendd. */
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8SImode;
+ goto do_subreg;
+
default:
gcc_unreachable ();
}
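
A worked example of the immediate computation in expand_vec_perm_blend (hypothetical permutation): a V8HImode blend taking even words from op0 and odd words from op1 yields a pblendw immediate of 0xaa.

#include <stdio.h>

int
main (void)
{
  unsigned char perm[8] = { 0, 9, 2, 11, 4, 13, 6, 15 };
  unsigned int nelt = 8, mask = 0, i;

  for (i = 0; i < nelt; i++)
    mask |= (perm[i] >= nelt) << i;	/* bit set: element from op1 */

  printf ("pblendw immediate: 0x%x\n", mask);	/* 0xaa */
  return 0;
}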
@@ -34413,43 +34984,165 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of pshufb or vpperm. */
+/* Return true if permutation D can be performed as VMODE permutation
+ instead. */
static bool
-expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
{
- unsigned i, nelt, eltsz;
- rtx rperm[16], vperm, target, op0, op1;
+ unsigned int i, j, chunk;
- if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
- return false;
- if (GET_MODE_SIZE (d->vmode) != 16)
+ if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
+ || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
+ || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
return false;
- if (d->testing_p)
+ if (GET_MODE_NUNITS (vmode) >= d->nelt)
return true;
+ chunk = d->nelt / GET_MODE_NUNITS (vmode);
+ for (i = 0; i < d->nelt; i += chunk)
+ if (d->perm[i] & (chunk - 1))
+ return false;
+ else
+ for (j = 1; j < chunk; ++j)
+ if (d->perm[i] + j != d->perm[i + j])
+ return false;
+
+ return true;
+}
+
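
For example (a hypothetical control vector), the V16QImode permutation below moves bytes in aligned groups of four and therefore qualifies as the V4SImode permutation { 1, 0, 3, 2 }; the sketch re-implements the chunk test:

#include <stdbool.h>
#include <stdio.h>

static bool
valid_as_chunks (const unsigned char *perm, unsigned int nelt,
		 unsigned int chunk)
{
  unsigned int i, j;

  for (i = 0; i < nelt; i += chunk)
    {
      if (perm[i] & (chunk - 1))
	return false;		/* group must start chunk-aligned */
      for (j = 1; j < chunk; j++)
	if (perm[i] + j != perm[i + j])
	  return false;		/* and stay contiguous */
    }
  return true;
}

int
main (void)
{
  unsigned char perm[16]
    = { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };

  printf ("%d\n", valid_as_chunks (perm, 16, 4));	/* 1 */
  return 0;
}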
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt, eltsz, mask;
+ unsigned char perm[32];
+ enum machine_mode vmode = V16QImode;
+ rtx rperm[32], vperm, target, op0, op1;
+
nelt = d->nelt;
- eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
- for (i = 0; i < nelt; ++i)
+ if (d->op0 != d->op1)
{
- unsigned j, e = d->perm[i];
- for (j = 0; j < eltsz; ++j)
- rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+ if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
+ {
+ if (TARGET_AVX2
+ && valid_perm_using_mode_p (V2TImode, d))
+ {
+ if (d->testing_p)
+ return true;
+
+ /* Use vperm2i128 insn. The pattern uses
+ V4DImode instead of V2TImode. */
+ target = gen_lowpart (V4DImode, d->target);
+ op0 = gen_lowpart (V4DImode, d->op0);
+ op1 = gen_lowpart (V4DImode, d->op1);
+ rperm[0]
+ = GEN_INT ((d->perm[0] / (nelt / 2))
+ | ((d->perm[nelt / 2] / (nelt / 2)) << 4));
+ emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+ return true;
+ }
+ return false;
+ }
}
+ else
+ {
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (!TARGET_SSSE3)
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX2)
+ return false;
- vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
- vperm = force_reg (V16QImode, vperm);
+ /* V4DImode should already be handled through
+ expand_vselect by the vpermq instruction. */
+ gcc_assert (d->vmode != V4DImode);
+
+ vmode = V32QImode;
+ if (d->vmode == V8SImode
+ || d->vmode == V16HImode
+ || d->vmode == V32QImode)
+ {
+ /* First see if vpermq can be used for
+ V8SImode/V16HImode/V32QImode. */
+ if (valid_perm_using_mode_p (V4DImode, d))
+ {
+ for (i = 0; i < 4; i++)
+ perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+ if (d->testing_p)
+ return true;
+ return expand_vselect (gen_lowpart (V4DImode, d->target),
+ gen_lowpart (V4DImode, d->op0),
+ perm, 4);
+ }
+
+ /* Next see if vpermd can be used. */
+ if (valid_perm_using_mode_p (V8SImode, d))
+ vmode = V8SImode;
+ }
- target = gen_lowpart (V16QImode, d->target);
- op0 = gen_lowpart (V16QImode, d->op0);
+ if (vmode == V32QImode)
+ {
+ /* vpshufb only works intra-lane; it is not
+ possible to shuffle bytes between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 2))
+ return false;
+ }
+ }
+ else
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ if (vmode == V8SImode)
+ for (i = 0; i < 8; ++i)
+ rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
+ else
+ {
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+ if (d->op0 != d->op1)
+ mask = 2 * nelt - 1;
+ else if (vmode == V16QImode)
+ mask = nelt - 1;
+ else
+ mask = nelt / 2 - 1;
+
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & mask;
+ for (j = 0; j < eltsz; ++j)
+ rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (vmode,
+ gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
+ vperm = force_reg (vmode, vperm);
+
+ target = gen_lowpart (vmode, d->target);
+ op0 = gen_lowpart (vmode, d->op0);
if (d->op0 == d->op1)
- emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+ {
+ if (vmode == V16QImode)
+ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+ else if (vmode == V32QImode)
+ emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else
+ emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
+ }
else
{
- op1 = gen_lowpart (V16QImode, d->op1);
+ op1 = gen_lowpart (vmode, d->op1);
emit_insn (gen_xop_pperm (target, op0, op1, vperm));
}
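
A worked example of the vperm2i128 immediate built above (hypothetical values): if the low result lane should come from 128-bit lane 2 (the low half of op1) and the high result lane from lane 1 (the high half of op0), the immediate is 2 | (1 << 4) = 0x12.

#include <stdio.h>

int
main (void)
{
  unsigned int nelt = 32;	/* V32QImode */
  unsigned int perm_lo = 32;	/* d->perm[0] */
  unsigned int perm_hi = 16;	/* d->perm[nelt / 2] */
  unsigned int imm = (perm_lo / (nelt / 2))
		     | ((perm_hi / (nelt / 2)) << 4);

  printf ("0x%x\n", imm);	/* 0x12 */
  return 0;
}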
@@ -34471,9 +35164,58 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (d->op0 == d->op1)
{
int mask = nelt - 1;
+ bool identity_perm = true;
+ bool broadcast_perm = true;
for (i = 0; i < nelt; i++)
- perm2[i] = d->perm[i] & mask;
+ {
+ perm2[i] = d->perm[i] & mask;
+ if (perm2[i] != i)
+ identity_perm = false;
+ if (perm2[i])
+ broadcast_perm = false;
+ }
+
+ if (identity_perm)
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, d->op0);
+ return true;
+ }
+ else if (broadcast_perm && TARGET_AVX2)
+ {
+ /* Use vpbroadcast{b,w,d}. */
+ rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+ switch (d->vmode)
+ {
+ case V32QImode:
+ op = gen_lowpart (V16QImode, op);
+ gen = gen_avx2_pbroadcastv32qi;
+ break;
+ case V16HImode:
+ op = gen_lowpart (V8HImode, op);
+ gen = gen_avx2_pbroadcastv16hi;
+ break;
+ case V8SImode:
+ op = gen_lowpart (V4SImode, op);
+ gen = gen_avx2_pbroadcastv8si;
+ break;
+ case V16QImode:
+ gen = gen_avx2_pbroadcastv16qi;
+ break;
+ case V8HImode:
+ gen = gen_avx2_pbroadcastv8hi;
+ break;
+ /* For other modes prefer other shuffles this function creates. */
+ default: break;
+ }
+ if (gen != NULL)
+ {
+ if (!d->testing_p)
+ emit_insn (gen (d->target, op));
+ return true;
+ }
+ }
if (expand_vselect (d->target, d->op0, perm2, nelt))
return true;
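
With intrinsics, the broadcast case selected above looks like this (an illustrative sketch assuming AVX2 and <immintrin.h>):

#include <immintrin.h>

/* A V32QI permutation whose control elements are all zero becomes a
   single vpbroadcastb of the low byte.  */
__m256i
splat_byte0 (__m256i x)
{
  return _mm256_broadcastb_epi8 (_mm256_castsi256_si128 (x));
}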
@@ -34537,7 +35279,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_vpermil (d))
return true;
- /* Try the SSSE3 pshufb or XOP vpperm variable permutation. */
+ /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
+ vpshufb, vpermd or vpermq variable permutation. */
if (expand_vec_perm_pshufb (d))
return true;
@@ -34656,93 +35399,210 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
struct expand_vec_perm_d dremap, dfinal;
unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
- unsigned contents, h1, h2, h3, h4;
+ unsigned HOST_WIDE_INT contents;
unsigned char remap[2 * MAX_VECT_LEN];
rtx seq;
- bool ok;
-
- if (d->op0 == d->op1)
- return false;
+ bool ok, same_halves = false;
- /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
- lanes. We can use similar techniques with the vperm2f128 instruction,
- but it requires slightly different logic. */
- if (GET_MODE_SIZE (d->vmode) != 16)
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (d->op0 == d->op1)
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX)
+ return false;
+ /* For 32-byte modes allow even d->op0 == d->op1.
+ The lack of cross-lane shuffling in some instructions
+ might prevent a single insn shuffle. */
+ }
+ else
return false;
/* Examine from whence the elements come. */
contents = 0;
for (i = 0; i < nelt; ++i)
- contents |= 1u << d->perm[i];
-
- /* Split the two input vectors into 4 halves. */
- h1 = (1u << nelt2) - 1;
- h2 = h1 << nelt2;
- h3 = h2 << nelt2;
- h4 = h3 << nelt2;
+ contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
memset (remap, 0xff, sizeof (remap));
dremap = *d;
- /* If the elements from the low halves use interleave low, and similarly
- for interleave high. If the elements are from mis-matched halves, we
- can use shufps for V4SF/V4SI or do a DImode shuffle. */
- if ((contents & (h1 | h3)) == contents)
+ if (GET_MODE_SIZE (d->vmode) == 16)
{
- for (i = 0; i < nelt2; ++i)
+ unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+ /* Split the two input vectors into 4 halves. */
+ h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
+ h2 = h1 << nelt2;
+ h3 = h2 << nelt2;
+ h4 = h3 << nelt2;
+
+ /* If the elements are all from the low halves, use interleave low;
+ similarly for interleave high. If the elements are from mis-matched
+ halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
+ if ((contents & (h1 | h3)) == contents)
{
- remap[i] = i * 2;
- remap[i + nelt] = i * 2 + 1;
- dremap.perm[i * 2] = i;
- dremap.perm[i * 2 + 1] = i + nelt;
+ /* punpckl* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ }
}
- }
- else if ((contents & (h2 | h4)) == contents)
- {
- for (i = 0; i < nelt2; ++i)
+ else if ((contents & (h2 | h4)) == contents)
{
- remap[i + nelt2] = i * 2;
- remap[i + nelt + nelt2] = i * 2 + 1;
- dremap.perm[i * 2] = i + nelt2;
- dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ /* punpckh* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i * 2;
+ remap[i + nelt + nelt2] = i * 2 + 1;
+ dremap.perm[i * 2] = i + nelt2;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ }
}
- }
- else if ((contents & (h1 | h4)) == contents)
- {
- for (i = 0; i < nelt2; ++i)
+ else if ((contents & (h1 | h4)) == contents)
{
- remap[i] = i;
- remap[i + nelt + nelt2] = i + nelt2;
- dremap.perm[i] = i;
- dremap.perm[i + nelt2] = i + nelt + nelt2;
+ /* shufps */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i;
+ remap[i + nelt + nelt2] = i + nelt2;
+ dremap.perm[i] = i;
+ dremap.perm[i + nelt2] = i + nelt + nelt2;
+ }
+ if (nelt != 4)
+ {
+ /* shufpd */
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 0;
+ dremap.perm[1] = 3;
+ }
}
- if (nelt != 4)
+ else if ((contents & (h2 | h3)) == contents)
{
- dremap.vmode = V2DImode;
- dremap.nelt = 2;
- dremap.perm[0] = 0;
- dremap.perm[1] = 3;
+ /* shufps */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i;
+ remap[i + nelt] = i + nelt2;
+ dremap.perm[i] = i + nelt2;
+ dremap.perm[i + nelt2] = i + nelt;
+ }
+ if (nelt != 4)
+ {
+ /* shufpd */
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 1;
+ dremap.perm[1] = 2;
+ }
}
+ else
+ return false;
}
- else if ((contents & (h2 | h3)) == contents)
+ else
{
- for (i = 0; i < nelt2; ++i)
+ unsigned int nelt4 = nelt / 4, nzcnt = 0;
+ unsigned HOST_WIDE_INT q[8];
+ unsigned int nonzero_halves[4];
+
+ /* Split the two input vectors into 8 quarters. */
+ q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
+ for (i = 1; i < 8; ++i)
+ q[i] = q[0] << (nelt4 * i);
+ for (i = 0; i < 4; ++i)
+ if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+ {
+ nonzero_halves[nzcnt] = i;
+ ++nzcnt;
+ }
+
+ if (nzcnt == 1)
{
- remap[i + nelt2] = i;
- remap[i + nelt] = i + nelt2;
- dremap.perm[i] = i + nelt2;
- dremap.perm[i + nelt2] = i + nelt;
+ gcc_assert (d->op0 == d->op1);
+ nonzero_halves[1] = nonzero_halves[0];
+ same_halves = true;
}
- if (nelt != 4)
+ else if (d->op0 == d->op1)
{
- dremap.vmode = V2DImode;
- dremap.nelt = 2;
- dremap.perm[0] = 1;
- dremap.perm[1] = 2;
+ gcc_assert (nonzero_halves[0] == 0);
+ gcc_assert (nonzero_halves[1] == 1);
}
+
+ if (nzcnt <= 2)
+ {
+ if (d->perm[0] / nelt2 == nonzero_halves[1])
+ {
+ /* Attempt to increase the likelihood that the dfinal
+ shuffle will be intra-lane. */
+ char tmph = nonzero_halves[0];
+ nonzero_halves[0] = nonzero_halves[1];
+ nonzero_halves[1] = tmph;
+ }
+
+ /* vperm2f128 or vperm2i128. */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+ remap[i + nonzero_halves[0] * nelt2] = i;
+ dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+ dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+ }
+
+ if (d->vmode != V8SFmode
+ && d->vmode != V4DFmode
+ && d->vmode != V8SImode)
+ {
+ dremap.vmode = V8SImode;
+ dremap.nelt = 8;
+ for (i = 0; i < 4; ++i)
+ {
+ dremap.perm[i] = i + nonzero_halves[0] * 4;
+ dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+ }
+ }
+ }
+ else if (d->op0 == d->op1)
+ return false;
+ else if (TARGET_AVX2
+ && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
+ {
+ /* vpunpckl* */
+ for (i = 0; i < nelt4; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ remap[i + nelt2] = i * 2 + nelt2;
+ remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ dremap.perm[i * 2 + nelt2] = i + nelt2;
+ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+ }
+ }
+ else if (TARGET_AVX2
+ && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
+ {
+ /* vpunpckh* */
+ for (i = 0; i < nelt4; ++i)
+ {
+ remap[i + nelt4] = i * 2;
+ remap[i + nelt + nelt4] = i * 2 + 1;
+ remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+ remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+ dremap.perm[i * 2] = i + nelt4;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+ dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+ }
+ }
+ else
+ return false;
}
- else
- return false;
/* Use the remapping array set up above to move the elements from their
swizzled locations into their final destinations. */
@@ -34751,7 +35611,15 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
unsigned e = remap[d->perm[i]];
gcc_assert (e < nelt);
- dfinal.perm[i] = e;
+ /* If same_halves is true, both halves of the remapped vector are the
+ same. Avoid cross-lane accesses if possible. */
+ if (same_halves && i >= nelt2)
+ {
+ gcc_assert (e < nelt2);
+ dfinal.perm[i] = e + nelt2;
+ }
+ else
+ dfinal.perm[i] = e;
}
dfinal.op0 = gen_reg_rtx (dfinal.vmode);
dfinal.op1 = dfinal.op0;
@@ -34767,6 +35635,9 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
if (!ok)
return false;
+ if (d->testing_p)
+ return true;
+
if (dremap.vmode != dfinal.vmode)
{
dremap.target = gen_lowpart (dremap.vmode, dremap.target);
@@ -34781,6 +35652,159 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
return true;
}
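
The dremap/dfinal split above can be illustrated with a hypothetical
standalone sketch (not part of the patch; it assumes AVX intrinsics from
immintrin.h and compilation with -mavx).  For one V8SFmode operand and
the permutation { 5, 4, 6, 7, 5, 4, 6, 7 }, both result halves read only
the source's high half (the same_halves case), so the remap step
(vperm2f128) copies that half into both lanes and the final shuffle
(vpermilps) stays intra-lane:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m256 src = _mm256_setr_ps (0, 1, 2, 3, 4, 5, 6, 7);
  /* dremap: both 128-bit lanes <- the high lane of src (imm 0x11).  */
  __m256 t = _mm256_permute2f128_ps (src, src, 0x11);
  /* dfinal: in-lane order { 1, 0, 2, 3 } == imm 0xe1.  */
  __m256 r = _mm256_permute_ps (t, 0xe1);
  float out[8];
  _mm256_storeu_ps (out, r);
  for (int i = 0; i < 8; i++)
    printf ("%g ", out[i]);	/* prints 5 4 6 7 5 4 6 7 */
  printf ("\n");
  return 0;
}
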
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+ a single vector cross-lane permutation into vpermq followed
+ by any of the single insn permutations. */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dremap, dfinal;
+ unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+ unsigned contents[2];
+ bool ok;
+
+ if (!(TARGET_AVX2
+ && (d->vmode == V32QImode || d->vmode == V16HImode)
+ && d->op0 == d->op1))
+ return false;
+
+ contents[0] = 0;
+ contents[1] = 0;
+ for (i = 0; i < nelt2; ++i)
+ {
+ contents[0] |= 1u << (d->perm[i] / nelt4);
+ contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned int cnt = 0;
+ for (j = 0; j < 4; ++j)
+ if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ dremap = *d;
+ dremap.vmode = V4DImode;
+ dremap.nelt = 4;
+ dremap.target = gen_reg_rtx (V4DImode);
+ dremap.op0 = gen_lowpart (V4DImode, d->op0);
+ dremap.op1 = dremap.op0;
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned int cnt = 0;
+ for (j = 0; j < 4; ++j)
+ if ((contents[i] & (1u << j)) != 0)
+ dremap.perm[2 * i + cnt++] = j;
+ for (; cnt < 2; ++cnt)
+ dremap.perm[2 * i + cnt] = 0;
+ }
+
+ dfinal = *d;
+ dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+ dfinal.op1 = dfinal.op0;
+ for (i = 0, j = 0; i < nelt; ++i)
+ {
+ if (i == nelt2)
+ j = 2;
+ dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+ if ((d->perm[i] / nelt4) == dremap.perm[j])
+ ;
+ else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+ dfinal.perm[i] |= nelt4;
+ else
+ gcc_unreachable ();
+ }
+
+ ok = expand_vec_perm_1 (&dremap);
+ gcc_assert (ok);
+
+ ok = expand_vec_perm_1 (&dfinal);
+ gcc_assert (ok);
+
+ return true;
+}
+
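
A hypothetical demonstration of the vpermq-then-single-insn idea (not part
of the patch; assumes AVX2 and immintrin.h): reversing all 32 bytes of one
V32QImode operand needs only quarters { 2, 3 } for the low result half and
{ 0, 1 } for the high half, so one vpermq gathers the quarters and an
intra-lane vpshufb finishes the job:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned char in[32], out[32];
  for (int i = 0; i < 32; i++)
    in[i] = i;
  __m256i src = _mm256_loadu_si256 ((const __m256i *) in);
  /* dremap: vpermq with quarter order { 2, 3, 0, 1 } == imm 0x4e.  */
  __m256i t = _mm256_permute4x64_epi64 (src, 0x4e);
  /* dfinal: reverse the bytes within each 128-bit lane.  */
  __m256i rev = _mm256_setr_epi8 (15, 14, 13, 12, 11, 10, 9, 8,
				  7, 6, 5, 4, 3, 2, 1, 0,
				  15, 14, 13, 12, 11, 10, 9, 8,
				  7, 6, 5, 4, 3, 2, 1, 0);
  __m256i r = _mm256_shuffle_epi8 (t, rev);
  _mm256_storeu_si256 ((__m256i *) out, r);
  for (int i = 0; i < 32; i++)
    printf ("%d ", out[i]);	/* prints 31 30 ... 1 0 */
  printf ("\n");
  return 0;
}
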
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+ a two vector permutation using 2 intra-lane interleave insns
+ and cross-lane shuffle for 32-byte vectors. */
+
+static bool
+expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt;
+ rtx (*gen) (rtx, rtx, rtx);
+
+ if (d->op0 == d->op1)
+ return false;
+ if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
+ ;
+ else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
+ ;
+ else
+ return false;
+
+ nelt = d->nelt;
+ if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
+ return false;
+ for (i = 0; i < nelt; i += 2)
+ if (d->perm[i] != d->perm[0] + i / 2
+ || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ switch (d->vmode)
+ {
+ case V32QImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv32qi;
+ else
+ gen = gen_vec_interleave_lowv32qi;
+ break;
+ case V16HImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv16hi;
+ else
+ gen = gen_vec_interleave_lowv16hi;
+ break;
+ case V8SImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv8si;
+ else
+ gen = gen_vec_interleave_lowv8si;
+ break;
+ case V4DImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv4di;
+ else
+ gen = gen_vec_interleave_lowv4di;
+ break;
+ case V8SFmode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv8sf;
+ else
+ gen = gen_vec_interleave_lowv8sf;
+ break;
+ case V4DFmode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv4df;
+ else
+ gen = gen_vec_interleave_lowv4df;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_insn (gen (d->target, d->op0, d->op1));
+ return true;
+}
+
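
For intuition, a hypothetical standalone equivalent of the V32QImode
interleave-low expansion (not part of the patch; assumes AVX2):
vpunpcklbw/vpunpckhbw interleave within 128-bit lanes only, so a
vperm2i128 selecting the two low lanes stitches the full-width result:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned char a[32], b[32], out[32];
  for (int i = 0; i < 32; i++)
    {
      a[i] = i;
      b[i] = 100 + i;
    }
  __m256i va = _mm256_loadu_si256 ((const __m256i *) a);
  __m256i vb = _mm256_loadu_si256 ((const __m256i *) b);
  /* Intra-lane interleaves:
     lo = { a0 b0 .. a7 b7   | a16 b16 .. a23 b23 },
     hi = { a8 b8 .. a15 b15 | a24 b24 .. a31 b31 }.  */
  __m256i lo = _mm256_unpacklo_epi8 (va, vb);
  __m256i hi = _mm256_unpackhi_epi8 (va, vb);
  /* Cross-lane fixup: low lane of lo, then low lane of hi (imm 0x20).  */
  __m256i r = _mm256_permute2x128_si256 (lo, hi, 0x20);
  _mm256_storeu_si256 ((__m256i *) out, r);
  for (int i = 0; i < 32; i++)
    printf ("%d ", out[i]);	/* prints 0 100 1 101 ... 15 115 */
  printf ("\n");
  return 0;
}
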
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
@@ -34837,6 +35861,152 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
return true;
}
+/* Implement arbitrary permutation of one V32QImode or V16HImode operand
+   with two vpshufb insns, vpermq and vpor.  We should have already failed
+   all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
+{
+ rtx rperm[2][32], vperm, l, h, hp, op, m128;
+ unsigned int i, nelt, eltsz;
+
+ if (!TARGET_AVX2
+ || d->op0 != d->op1
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element is from
+     the other lane, force a zero by setting bit 7 in the permutation
+     mask.  The other mask has non-negative elements only where an
+     element is requested from the other lane; such elements are also
+     placed into the other lane, so that the two V2TImode halves of the
+     vpshufb result can be swapped.  */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+
+ for (j = 0; j < eltsz; ++j)
+ {
+ rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
+ rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+ vperm = force_reg (V32QImode, vperm);
+
+ h = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  /* Swap the 128-bit lanes of h into hp.  */
+ hp = gen_reg_rtx (V4DImode);
+ op = gen_lowpart (V4DImode, h);
+ emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
+ const1_rtx));
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+ vperm = force_reg (V32QImode, vperm);
+
+ l = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+ op = gen_lowpart (V32QImode, d->target);
+ emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+
+ return true;
+}
+
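
The mask construction above can be sketched outside the compiler as
follows (a hypothetical demo, not part of the patch; assumes AVX2, and
perm[] is just an example pattern).  Bytes wanted from the current lane
go through one vpshufb mask; bytes wanted from the other lane are
produced pre-swapped by the second vpshufb, moved across lanes with
vpermq 0x4e and merged with vpor:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned char perm[32], in[32], out[32];
  signed char lmask[32], hmask[32];
  for (int i = 0; i < 32; i++)
    {
      perm[i] = (i * 7) & 31;	/* an arbitrary cross-lane permutation */
      in[i] = i;
      lmask[i] = -128;		/* bit 7 set -> vpshufb writes zero */
      hmask[i] = -128;
    }
  for (int i = 0; i < 32; i++)
    {
      int e = perm[i] & 15;
      int cross = (perm[i] ^ i) & 16;	/* source byte in the other lane?  */
      if (cross)
	hmask[i ^ 16] = e;		/* placed pre-swapped */
      else
	lmask[i] = e;
    }
  __m256i src = _mm256_loadu_si256 ((const __m256i *) in);
  __m256i l = _mm256_shuffle_epi8 (src,
				   _mm256_loadu_si256 ((const __m256i *) lmask));
  __m256i h = _mm256_shuffle_epi8 (src,
				   _mm256_loadu_si256 ((const __m256i *) hmask));
  h = _mm256_permute4x64_epi64 (h, 0x4e);	/* swap the 128-bit lanes */
  _mm256_storeu_si256 ((__m256i *) out, _mm256_or_si256 (l, h));
  for (int i = 0; i < 32; i++)
    printf ("%d ", out[i]);	/* out[i] == perm[i] */
  printf ("\n");
  return 0;
}
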
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V32QImode or V16HImode operands
+   with two vpshufb insns, vpor and vpermq.  We should have already
+   failed all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
+{
+ rtx rperm[2][32], vperm, l, h, ior, op, m128;
+ unsigned int i, nelt, eltsz;
+
+ if (!TARGET_AVX2
+ || d->op0 == d->op1
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ for (i = 0; i < d->nelt; ++i)
+ if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  In the first permutation mask
+     the first quarter contains indexes for the first half of op0,
+     the second quarter has bit 7 set, the third quarter contains
+     indexes for the second half of op0 and the last quarter has
+     bit 7 set.  In the second permutation mask the first quarter
+     has bit 7 set, the second quarter contains indexes for the
+     first half of op1, the third quarter has bit 7 set and the
+     last quarter contains indexes for the second half of op1.
+     E.g. for a V32QImode extract-even the first mask will be
+     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
+     (all values masked with 0xf except for -128) and the second
+     mask will be
+     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned which = d->perm[i] >= nelt;
+ unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
+
+ for (j = 0; j < eltsz; ++j)
+ {
+ rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
+ rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+ vperm = force_reg (V32QImode, vperm);
+
+ l = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+ vperm = force_reg (V32QImode, vperm);
+
+ h = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op1);
+ emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+ ior = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (ior, l, h));
+
+  /* Permute the V4DImode quarters using the { 0, 2, 1, 3 } permutation.  */
+ op = gen_lowpart (V4DImode, d->target);
+ ior = gen_lowpart (V4DImode, ior);
+ emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+
+ return true;
+}
+
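
A hypothetical worked example of the extract-even sequence just defined
(not part of the patch; assumes AVX2), using exactly the two masks from
the comment above plus the final vpermq { 0, 2, 1, 3 } (imm 0xd8):

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned char a[32], b[32], out[32];
  signed char m0[32], m1[32];
  for (int i = 0; i < 32; i++)
    {
      a[i] = i;
      b[i] = 100 + i;
    }
  for (int i = 0; i < 32; i++)
    {
      int quarter = i / 8;
      /* Even quarters of m0 pick even bytes of a; odd quarters of m1
	 pick even bytes of b; everything else is zeroed via bit 7.  */
      m0[i] = (quarter & 1) ? -128 : (i & 7) * 2;
      m1[i] = (quarter & 1) ? (i & 7) * 2 : -128;
    }
  __m256i va = _mm256_loadu_si256 ((const __m256i *) a);
  __m256i vb = _mm256_loadu_si256 ((const __m256i *) b);
  __m256i l = _mm256_shuffle_epi8 (va,
				   _mm256_loadu_si256 ((const __m256i *) m0));
  __m256i h = _mm256_shuffle_epi8 (vb,
				   _mm256_loadu_si256 ((const __m256i *) m1));
  __m256i ior = _mm256_or_si256 (l, h);
  /* Reorder the qwords { 0, 2, 1, 3 } == imm 0xd8.  */
  __m256i r = _mm256_permute4x64_epi64 (ior, 0xd8);
  _mm256_storeu_si256 ((__m256i *) out, r);
  for (int i = 0; i < 32; i++)
    printf ("%d ", out[i]);	/* 0 2 .. 30, then 100 102 .. 130 */
  printf ("\n");
  return 0;
}
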
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
@@ -34946,6 +36116,61 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
}
break;
+ case V16HImode:
+ case V32QImode:
+ return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+
+ case V4DImode:
+ t1 = gen_reg_rtx (V4DImode);
+ t2 = gen_reg_rtx (V4DImode);
+
+ /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
+ emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
+ emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+      /* Now a vpunpck[lh]qdq will produce the required result.  */
+ if (odd)
+ t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
+ else
+ t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
+ emit_insn (t3);
+ break;
+
+ case V8SImode:
+ t1 = gen_reg_rtx (V8SImode);
+ t2 = gen_reg_rtx (V8SImode);
+
+ /* Shuffle the lanes around into
+ { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
+ emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
+ gen_lowpart (V4DImode, d->op0),
+ gen_lowpart (V4DImode, d->op1),
+ GEN_INT (0x20)));
+ emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
+ gen_lowpart (V4DImode, d->op0),
+ gen_lowpart (V4DImode, d->op1),
+ GEN_INT (0x31)));
+
+ /* Swap the 2nd and 3rd position in each lane into
+ { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
+ emit_insn (gen_avx2_pshufdv3 (t1, t1,
+ GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+ emit_insn (gen_avx2_pshufdv3 (t2, t2,
+ GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+
+      /* Now a vpunpck[lh]qdq will produce
+	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
+ if (odd)
+ t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
+ gen_lowpart (V4DImode, t1),
+ gen_lowpart (V4DImode, t2));
+ else
+ t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
+ gen_lowpart (V4DImode, t1),
+ gen_lowpart (V4DImode, t2));
+ emit_insn (t3);
+ break;
+
default:
gcc_unreachable ();
}
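
The V8SImode case above corresponds to this hypothetical standalone
sequence (not part of the patch; assumes AVX2), shown for extract-even;
extract-odd would end with vpunpckhqdq instead:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  int a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int b[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  int out[8];
  __m256i va = _mm256_loadu_si256 ((const __m256i *) a);
  __m256i vb = _mm256_loadu_si256 ((const __m256i *) b);
  /* Lane shuffles: { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
  __m256i t1 = _mm256_permute2x128_si256 (va, vb, 0x20);
  __m256i t2 = _mm256_permute2x128_si256 (va, vb, 0x31);
  /* In-lane order { 0, 2, 1, 3 } == imm 0xd8:
     { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
  t1 = _mm256_shuffle_epi32 (t1, 0xd8);
  t2 = _mm256_shuffle_epi32 (t2, 0xd8);
  /* vpunpcklqdq yields the even elements { 0 2 4 6 8 a c e }.  */
  __m256i r = _mm256_unpacklo_epi64 (t1, t2);
  _mm256_storeu_si256 ((__m256i *) out, r);
  for (int i = 0; i < 8; i++)
    printf ("%d ", out[i]);	/* prints 0 2 4 6 8 10 12 14 */
  printf ("\n");
  return 0;
}
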
@@ -35026,6 +36251,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
gcc_assert (ok);
return true;
+ case V32QImode:
+ case V16HImode:
+ case V8SImode:
+ case V4DImode:
+      /* For AVX2, broadcasts of the first element should already have
+	 been handled by expand_vec_perm_1 via vpbroadcast* or vpermq.  */
+ gcc_assert (!TARGET_AVX2 || d->perm[0]);
+ return false;
+
default:
gcc_unreachable ();
}
@@ -35050,6 +36284,117 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
return expand_vec_perm_broadcast_1 (d);
}
+/* Implement arbitrary permutation of two V32QImode or V16HImode operands
+   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
+   all the shorter instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+ rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+ unsigned int i, nelt, eltsz;
+ bool used[4];
+
+ if (!TARGET_AVX2
+ || d->op0 == d->op1
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate 4 permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element is from
+     the other lane, force a zero by setting bit 7 in the permutation
+     mask.  The cross-lane masks have non-negative elements only where
+     an element is requested from the other lane; such elements are
+     also placed into the other lane, so that the two V2TImode halves
+     of the vpshufb result can be swapped.  */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < 32; ++i)
+ {
+ rperm[0][i] = m128;
+ rperm[1][i] = m128;
+ rperm[2][i] = m128;
+ rperm[3][i] = m128;
+ }
+ used[0] = false;
+ used[1] = false;
+ used[2] = false;
+ used[3] = false;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+ unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+ for (j = 0; j < eltsz; ++j)
+ rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+ used[which] = true;
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (!used[2 * i + 1])
+ {
+ h[i] = NULL_RTX;
+ continue;
+ }
+ vperm = gen_rtx_CONST_VECTOR (V32QImode,
+ gen_rtvec_v (32, rperm[2 * i + 1]));
+ vperm = force_reg (V32QImode, vperm);
+ h[i] = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+ }
+
+  /* Swap the 128-bit lanes of h[X].  */
+ for (i = 0; i < 2; ++i)
+ {
+ if (h[i] == NULL_RTX)
+ continue;
+ op = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+ const2_rtx, GEN_INT (3), const0_rtx,
+ const1_rtx));
+ h[i] = gen_lowpart (V32QImode, op);
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (!used[2 * i])
+ {
+ l[i] = NULL_RTX;
+ continue;
+ }
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+ vperm = force_reg (V32QImode, vperm);
+ l[i] = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (h[i] && l[i])
+ {
+ op = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+ l[i] = op;
+ }
+ else if (h[i])
+ l[i] = h[i];
+ }
+
+ gcc_assert (l[0] && l[1]);
+ op = gen_lowpart (V32QImode, d->target);
+ emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+ return true;
+}
+
/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
With all of the interface bits taken care of, perform the expansion
in D and return true on success. */
@@ -35075,11 +36420,25 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_broadcast (d))
return true;
+ if (expand_vec_perm_vpermq_perm_1 (d))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_pshufb2 (d))
return true;
+ if (expand_vec_perm_interleave3 (d))
+ return true;
+
+ /* Try sequences of four instructions. */
+
+ if (expand_vec_perm_vpshufb2_vpermq (d))
+ return true;
+
+ if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
+ return true;
+
/* ??? Look for narrow permutations whose element orderings would
allow the promotion to a wider mode. */
@@ -35093,6 +36452,10 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_even_odd (d))
return true;
+ /* Even longer sequences. */
+ if (expand_vec_perm_vpshufb4_vpermq2 (d))
+ return true;
+
return false;
}
@@ -35135,6 +36498,7 @@ ix86_expand_vec_perm_builtin (tree exp)
{
struct expand_vec_perm_d d;
tree arg0, arg1, arg2;
+ bool maybe_retry = false;
arg0 = CALL_EXPR_ARG (exp, 0);
arg1 = CALL_EXPR_ARG (exp, 1);
@@ -35180,6 +36544,7 @@ ix86_expand_vec_perm_builtin (tree exp)
for (i = 0; i < nelt; ++i)
if (d.perm[i] >= nelt)
d.perm[i] -= nelt;
+ maybe_retry = true;
}
/* FALLTHRU */
@@ -35200,6 +36565,28 @@ ix86_expand_vec_perm_builtin (tree exp)
if (ix86_expand_vec_perm_builtin_1 (&d))
return d.target;
+  /* If the mask says both arguments are needed, but they are the same,
+     the above tried to expand with d.op0 == d.op1.  If that didn't work,
+     retry with d.op0 != d.op1, which is what the expanders have been
+     tested with.  */
+ if (maybe_retry)
+ {
+ rtx seq;
+ bool ok;
+
+ extract_vec_perm_cst (&d, arg2);
+ d.op1 = gen_reg_rtx (d.vmode);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_builtin_1 (&d);
+ seq = get_insns ();
+ end_sequence ();
+ if (ok)
+ {
+ emit_move_insn (d.op1, d.op0);
+ emit_insn (seq);
+ return d.target;
+ }
+ }
+
/* For compiler generated permutations, we should never get here, because
the compiler should also be checking the ok hook.  But since this is a
builtin the user has access to, don't abort.  */
@@ -35225,6 +36612,19 @@ ix86_expand_vec_perm_builtin (tree exp)
d.perm[8], d.perm[9], d.perm[10], d.perm[11],
d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
break;
+ case 32:
+ sorry ("vector permutation "
+ "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d "
+ "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
+ d.perm[0], d.perm[1], d.perm[2], d.perm[3],
+ d.perm[4], d.perm[5], d.perm[6], d.perm[7],
+ d.perm[8], d.perm[9], d.perm[10], d.perm[11],
+ d.perm[12], d.perm[13], d.perm[14], d.perm[15],
+ d.perm[16], d.perm[17], d.perm[18], d.perm[19],
+ d.perm[20], d.perm[21], d.perm[22], d.perm[23],
+ d.perm[24], d.perm[25], d.perm[26], d.perm[27],
+ d.perm[28], d.perm[29], d.perm[30], d.perm[31]);
+ break;
default:
gcc_unreachable ();
}
@@ -35232,6 +36632,95 @@ ix86_expand_vec_perm_builtin (tree exp)
return CONST0_RTX (d.vmode);
}
+bool
+ix86_expand_vec_perm_const (rtx operands[4])
+{
+ struct expand_vec_perm_d d;
+ unsigned char perm[MAX_VECT_LEN];
+ int i, nelt, which;
+ rtx sel;
+
+ d.target = operands[0];
+ d.op0 = operands[1];
+ d.op1 = operands[2];
+ sel = operands[3];
+
+ d.vmode = GET_MODE (d.target);
+ gcc_assert (VECTOR_MODE_P (d.vmode));
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = false;
+
+ gcc_assert (GET_CODE (sel) == CONST_VECTOR);
+ gcc_assert (XVECLEN (sel, 0) == nelt);
+ gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
+
+ for (i = which = 0; i < nelt; ++i)
+ {
+ rtx e = XVECEXP (sel, 0, i);
+ int ei = INTVAL (e) & (2 * nelt - 1);
+
+ which |= (ei < nelt ? 1 : 2);
+ d.perm[i] = ei;
+ perm[i] = ei;
+ }
+
+ switch (which)
+ {
+ default:
+      gcc_unreachable ();
+
+ case 3:
+ if (!rtx_equal_p (d.op0, d.op1))
+ break;
+
+ /* The elements of PERM do not suggest that only the first operand
+ is used, but both operands are identical. Allow easier matching
+ of the permutation by folding the permutation into the single
+ input vector. */
+ for (i = 0; i < nelt; ++i)
+ if (d.perm[i] >= nelt)
+ d.perm[i] -= nelt;
+ /* FALLTHRU */
+
+ case 1:
+ d.op1 = d.op0;
+ break;
+
+ case 2:
+ for (i = 0; i < nelt; ++i)
+ d.perm[i] -= nelt;
+ d.op0 = d.op1;
+ break;
+ }
+
+ if (ix86_expand_vec_perm_builtin_1 (&d))
+ return true;
+
+  /* If the mask says both arguments are needed, but they are the same,
+     the above tried to expand with d.op0 == d.op1.  If that didn't work,
+     retry with d.op0 != d.op1, which is what the expanders have been
+     tested with.  */
+ if (which == 3 && d.op0 == d.op1)
+ {
+ rtx seq;
+ bool ok;
+
+ memcpy (d.perm, perm, sizeof (perm));
+ d.op1 = gen_reg_rtx (d.vmode);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_builtin_1 (&d);
+ seq = get_insns ();
+ end_sequence ();
+ if (ok)
+ {
+ emit_move_insn (d.op1, d.op0);
+ emit_insn (seq);
+ return true;
+ }
+ }
+
+ return false;
+}
+
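
For reference, a hypothetical user-level trigger (not part of the patch):
with a constant selector, GCC 4.7's __builtin_shuffle is expanded through
the vec_perm_const path, so code like the following should exercise the
AVX2 sequences added above when compiled with -mavx2:

typedef unsigned char v32qi __attribute__ ((vector_size (32)));

v32qi
reverse_bytes (v32qi x)
{
  /* Constant-mask one-operand shuffle; a likely candidate is the
     vpermq + vpshufb path from expand_vec_perm_vpermq_perm_1.  */
  v32qi mask = { 31, 30, 29, 28, 27, 26, 25, 24,
		 23, 22, 21, 20, 19, 18, 17, 16,
		 15, 14, 13, 12, 11, 10, 9, 8,
		 7, 6, 5, 4, 3, 2, 1, 0 };
  return __builtin_shuffle (x, mask);
}
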
/* Implement targetm.vectorize.builtin_vec_perm_ok. */
static bool
@@ -35262,10 +36751,10 @@ ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
vec_mask = extract_vec_perm_cst (&d, mask);
- /* This hook is cannot be called in response to something that the
- user does (unlike the builtin expander) so we shouldn't ever see
- an error generated from the extract. */
- gcc_assert (vec_mask > 0 && vec_mask <= 3);
+ /* Check whether the mask can be applied to the vector type. */
+ if (vec_mask < 0 || vec_mask > 3)
+ return false;
+
one_vec = (vec_mask != 3);
/* Implementable with shufps or pshufd. */