author     Sanjay Patel <spatel@rotateright.com>   2019-01-15 16:11:05 +0000
committer  Sanjay Patel <spatel@rotateright.com>   2019-01-15 16:11:05 +0000
commit     47d902511b7de7292813ef031a540d99852c3919
tree       7745ae7b605fce07ffddd5a1c1fa278cfdbb01ae
parent     e51de053d347ed5d259f323e54caf0789d4ce9dd
[DAGCombiner] reduce buildvec of zexted extracted element to shuffle
The motivating case for this is shown in the first regression test: we are transferring to scalar and back rather than just zero-extending with 'vpmovzxdq'.

That's a special case of a more general pattern, as shown here. In all tests, we're avoiding the vector-scalar-vector moves in favor of vector ops. We aren't producing optimal shuffle code in some cases though, so the patch is limited to reduce regressions.

Differential Revision: https://reviews.llvm.org/D56281
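To make the mask construction in the new combine concrete, here is a minimal standalone sketch (not part of the patch; the driver program and variable values are illustrative only) that recomputes the shuffle mask for the motivating case from the first regression test: a <2 x i64> build vector whose only defined element is (zext i64 (extractelement <4 x i32> %x, i32 0)).

// Standalone sketch mirroring the shuffle-mask construction in the patch below
// for: build_vector (zext i64 (extractelement <4 x i32> %x, i32 0)), undef
#include <cstdio>
#include <vector>

int main() {
  const int NumBVOps = 2;    // the build vector produces <2 x i64>
  const int ZextElt = 0;     // index of the single non-undef build vector operand
  const int DestSize = 64;   // bit width of the zext result (i64)
  const int SrcSize = 32;    // bit width of the extracted element (i32)
  const int ExtractIdx = 0;  // constant index of the extractelement

  const int ZextRatio = DestSize / SrcSize;      // 2
  const int NumMaskElts = NumBVOps * ZextRatio;  // 4 lanes, in <4 x i32> terms
  std::vector<int> Mask(NumMaskElts, -1);        // lanes of undef BV elements stay -1

  for (int i = 0; i != NumMaskElts; ++i) {
    if (i / ZextRatio != ZextElt)
      continue;
    // The low lane takes the extracted element of %x; the high (zero) lane takes
    // element 0 of the all-zeros second shuffle operand (mask value NumMaskElts).
    Mask[i] = (i % ZextRatio == 0) ? ExtractIdx : NumMaskElts;
  }

  for (int M : Mask)
    printf("%d ", M);  // prints "0 4 -1 -1"
  printf("\n");
  return 0;
}

Shuffling %x against an all-zeros vector with that mask yields <x[0], 0, undef, undef>, and the bitcast to <2 x i64> is the zero-extended element in lane 0, which the X86 backend can lower to 'vpmovzxdq' (or an unpack with zero on SSE2), as the updated checks show.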
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  |  75
-rw-r--r--  llvm/test/CodeGen/X86/buildvec-extract.ll      | 316
2 files changed, 236 insertions, 155 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b1770920f06..2ffaa9054ff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16195,6 +16195,78 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
return Shuffle;
}
+static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
+ assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+ // First, determine where the build vector is not undef.
+ // TODO: We could extend this to handle zero elements as well as undefs.
+ int NumBVOps = BV->getNumOperands();
+ int ZextElt = -1;
+ for (int i = 0; i != NumBVOps; ++i) {
+ SDValue Op = BV->getOperand(i);
+ if (Op.isUndef())
+ continue;
+ if (ZextElt == -1)
+ ZextElt = i;
+ else
+ return SDValue();
+ }
+ // Bail out if there's no non-undef element.
+ if (ZextElt == -1)
+ return SDValue();
+
+ // The build vector contains some number of undef elements and exactly
+ // one other element. That other element must be a zero-extended scalar
+ // extracted from a vector at a constant index to turn this into a shuffle.
+ // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
+ SDValue Zext = BV->getOperand(ZextElt);
+ if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
+ Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)))
+ return SDValue();
+
+ // The zero-extend must be a multiple of the source size.
+ SDValue Extract = Zext.getOperand(0);
+ unsigned DestSize = Zext.getValueSizeInBits();
+ unsigned SrcSize = Extract.getValueSizeInBits();
+ if (DestSize % SrcSize != 0)
+ return SDValue();
+
+ // Create a shuffle mask that will combine the extracted element with zeros
+ // and undefs.
+ int ZextRatio = DestSize / SrcSize;
+ int NumMaskElts = NumBVOps * ZextRatio;
+ SmallVector<int, 32> ShufMask(NumMaskElts, -1);
+ for (int i = 0; i != NumMaskElts; ++i) {
+ if (i / ZextRatio == ZextElt) {
+ // The low bits of the (potentially translated) extracted element map to
+ // the source vector. The high bits map to zero. We will use a zero vector
+ // as the 2nd source operand of the shuffle, so use the 1st element of
+ // that vector (mask value is number-of-elements) for the high bits.
+ if (i % ZextRatio == 0)
+ ShufMask[i] = Extract.getConstantOperandVal(1);
+ else
+ ShufMask[i] = NumMaskElts;
+ }
+
+ // Undef elements of the build vector remain undef because we initialize
+ // the shuffle mask with -1.
+ }
+
+ // Turn this into a shuffle with zero if that's legal.
+ EVT VecVT = Extract.getOperand(0).getValueType();
+ if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(ShufMask, VecVT))
+ return SDValue();
+
+ // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
+ // bitcast (shuffle V, ZeroVec, VectorMask)
+ SDLoc DL(BV);
+ SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
+ SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, Extract.getOperand(0), ZeroVec,
+ ShufMask);
+ return DAG.getBitcast(BV->getValueType(0), Shuf);
+}
+
// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
// operations. If the types of the vectors we're extracting from allow it,
// turn this into a vector_shuffle node.
@@ -16206,6 +16278,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
if (!isTypeLegal(VT))
return SDValue();
+ if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
+ return V;
+
// May only combine to shuffle after legalize if shuffle is legal.
if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
return SDValue();
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 15386907e24..267eec4bdab 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -4,16 +4,20 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ANY,AVX
define <2 x i64> @extract0_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE-LABEL: extract0_i32_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i32_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i32_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract0_i32_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%z = zext i32 %e to i64
@@ -40,23 +44,14 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
}
define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract1_i32_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract1_i32_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $1, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: extract1_i32_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: extract1_i32_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vextractps $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64
@@ -90,23 +85,16 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
}
define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract2_i32_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract2_i32_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: extract2_i32_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
;
; AVX-LABEL: extract2_i32_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vextractps $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
@@ -140,23 +128,14 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
}
define <2 x i64> @extract3_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract3_i32_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_i32_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: extract3_i32_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i32_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vextractps $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 3
%z = zext i32 %e to i64
@@ -190,18 +169,25 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
}
define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) {
-; SSE-LABEL: extract0_i32_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i32_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i32_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract0_i32_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%z = zext i32 %e to i64
@@ -232,24 +218,18 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_zero(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
; SSE2-LABEL: extract1_i32_zext_insert1_i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract1_i32_zext_insert1_i64_undef:
; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $1, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: extract1_i32_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vextractps $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64
@@ -288,24 +268,19 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_zero(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert1_i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert1_i64_undef:
; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $2, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: extract2_i32_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vextractps $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
@@ -342,26 +317,14 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
}
define <2 x i64> @extract3_i32_zext_insert1_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract3_i32_zext_insert1_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_i32_zext_insert1_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: extract3_i32_zext_insert1_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i32_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vextractps $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 3
%z = zext i32 %e to i64
@@ -398,16 +361,21 @@ define <2 x i64> @extract3_i32_zext_insert1_i64_zero(<4 x i32> %x) {
}
define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $0, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract0_i16_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 0
%z = zext i16 %e to i64
@@ -434,16 +402,25 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
}
define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract1_i16_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,1,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
%z = zext i16 %e to i64
@@ -470,16 +447,26 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
}
define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract2_i16_zext_insert0_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract2_i16_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 2
%z = zext i16 %e to i64
@@ -508,14 +495,12 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
; SSE-LABEL: extract3_i16_zext_insert0_i64_undef:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i16_zext_insert0_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 3
%z = zext i16 %e to i64
@@ -542,18 +527,24 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
}
define <2 x i64> @extract0_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $0, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract0_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract0_i16_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $0, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 0
%z = zext i16 %e to i64
@@ -582,18 +573,21 @@ define <2 x i64> @extract0_i16_zext_insert1_i64_zero(<8 x i16> %x) {
}
define <2 x i64> @extract1_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract1_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract1_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract1_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
%z = zext i16 %e to i64
@@ -622,18 +616,24 @@ define <2 x i64> @extract1_i16_zext_insert1_i64_zero(<8 x i16> %x) {
}
define <2 x i64> @extract2_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract2_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract2_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract2_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract2_i16_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 2
%z = zext i16 %e to i64
@@ -662,18 +662,24 @@ define <2 x i64> @extract2_i16_zext_insert1_i64_zero(<8 x i16> %x) {
}
define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract3_i16_zext_insert1_i64_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extract3_i16_zext_insert1_i64_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: extract3_i16_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 3
%z = zext i16 %e to i64