diff options
author | Craig Topper <craig.topper@intel.com> | 2018-09-27 21:28:52 +0000 |
---|---|---|
committer | Craig Topper <craig.topper@intel.com> | 2018-09-27 21:28:52 +0000 |
commit | 745b5ffc593250c4efe8b0b0f450b8d6c2b55525 (patch) | |
tree | 1aaa79f53e84effa08d73b611607e9727c97894a | |
parent | 22589ff5d889fbde887a2f62a240248dacff694e (diff) |
[ScalarizeMaskedMemIntrin] When expanding masked loads, start with the passthru value and insert each conditional load result into its element.
Previously we started with undef and did one final merge at the end with a select.
-rw-r--r-- | llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 34 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/masked_memop.ll | 54 | ||||
-rw-r--r-- | llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll | 17 |
3 files changed, 31 insertions, 74 deletions
diff --git a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp index 35580df42f1..b039cdd01d4 100644 --- a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -90,7 +90,7 @@ FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() { // cond.load: ; preds = %0 // %3 = getelementptr i32* %1, i32 0 // %4 = load i32* %3 -// %5 = insertelement <16 x i32> undef, i32 %4, i32 0 +// %5 = insertelement <16 x i32> %passthru, i32 %4, i32 0 // br label %else // // else: ; preds = %0, %cond.load @@ -146,10 +146,8 @@ static void scalarizeMaskedLoad(CallInst *CI) { Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); unsigned VectorWidth = VecType->getNumElements(); - Value *UndefVal = UndefValue::get(VecType); - // The result vector - Value *VResult = UndefVal; + Value *VResult = Src0; if (isa<Constant>(Mask)) { for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { @@ -161,15 +159,11 @@ static void scalarizeMaskedLoad(CallInst *CI) { VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); + CI->replaceAllUsesWith(VResult); CI->eraseFromParent(); return; } - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // @@ -177,13 +171,6 @@ static void scalarizeMaskedLoad(CallInst *CI) { // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // br i1 %mask_1, label %cond.load, label %else // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); @@ -200,7 +187,8 @@ static void scalarizeMaskedLoad(CallInst *CI) { Value *Gep = Builder.CreateInBoundsGEP(EltTy, 
FirstEltPtr, Builder.getInt32(Idx)); LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); + Value *NewVResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx)); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = @@ -211,13 +199,15 @@ static void scalarizeMaskedLoad(CallInst *CI) { OldBr->eraseFromParent(); PrevIfBlock = IfBlock; IfBlock = NewIfBlock; + + // Create the phi to join the new and previous value. + PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(NewVResult, CondBlock); + Phi->addIncoming(VResult, PrevIfBlock); + VResult = Phi; } - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); + CI->replaceAllUsesWith(VResult); CI->eraseFromParent(); } diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index 50da0ed68a5..04cf58f556e 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -12,50 +12,20 @@ define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double ; AVX-LABEL: loadv1: ; AVX: ## %bb.0: ; AVX-NEXT: testq %rdi, %rdi -; AVX-NEXT: ## implicit-def: $xmm1 -; AVX-NEXT: je LBB0_1 -; AVX-NEXT: ## %bb.2: ## %else -; AVX-NEXT: testq %rdi, %rdi -; AVX-NEXT: jne LBB0_3 -; AVX-NEXT: LBB0_4: ## %else -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq -; AVX-NEXT: LBB0_1: ## %cond.load -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: testq %rdi, %rdi -; AVX-NEXT: je LBB0_4 -; AVX-NEXT: LBB0_3: ## %else -; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: jne LBB0_2 +; AVX-NEXT: ## %bb.1: ## %cond.load +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: LBB0_2: ## %else ; AVX-NEXT: retq 
; -; AVX512F-LABEL: loadv1: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: testq %rdi, %rdi -; AVX512F-NEXT: ## implicit-def: $xmm1 -; AVX512F-NEXT: jne LBB0_2 -; AVX512F-NEXT: ## %bb.1: ## %cond.load -; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: LBB0_2: ## %else -; AVX512F-NEXT: testq %rdi, %rdi -; AVX512F-NEXT: sete %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} -; AVX512F-NEXT: retq -; -; SKX-LABEL: loadv1: -; SKX: ## %bb.0: -; SKX-NEXT: testq %rdi, %rdi -; SKX-NEXT: ## implicit-def: $xmm1 -; SKX-NEXT: jne LBB0_2 -; SKX-NEXT: ## %bb.1: ## %cond.load -; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; SKX-NEXT: LBB0_2: ## %else -; SKX-NEXT: testq %rdi, %rdi -; SKX-NEXT: sete %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} -; SKX-NEXT: retq +; AVX512-LABEL: loadv1: +; AVX512: ## %bb.0: +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: jne LBB0_2 +; AVX512-NEXT: ## %bb.1: ## %cond.load +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: LBB0_2: ## %else +; AVX512-NEXT: retq %mask = icmp eq <1 x i64> %trigger, zeroinitializer %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst) ret <1 x double> %res diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll index cd575f8bfaa..96ebb470283 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll @@ -9,10 +9,10 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %pass ; CHECK: cond.load: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; CHECK-NEXT: 
[[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP4]], i32 0 ; CHECK-NEXT: br label [[ELSE]] ; CHECK: else: -; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ undef, [[TMP0:%.*]] ] +; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i32 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] ; CHECK: cond.load1: @@ -21,9 +21,8 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %pass ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: br label [[ELSE2]] ; CHECK: else2: -; CHECK-NEXT: [[RES_PHI_SELECT:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ] -; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[MASK]], <2 x i64> [[RES_PHI_SELECT]], <2 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ] +; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]] ; %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> %mask, <2 x i64> %passthru) ret <2 x i64> %ret @@ -41,8 +40,7 @@ define <2 x i64> @scalarize_v2i64_ones_mask(<2 x i64>* %p, <2 x i64> %passthru) define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %passthru) { ; CHECK-LABEL: @scalarize_v2i64_zero_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64* -; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x i64> undef, <2 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; CHECK-NEXT: ret <2 x i64> [[PASSTHRU:%.*]] ; %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru) ret <2 x i64> %ret @@ -53,9 +51,8 @@ define <2 x i64> 
@scalarize_v2i64_const_mask(<2 x i64>* %p, <2 x i64> %passthru) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64* ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> <i1 false, i1 true>, <2 x i64> [[TMP4]], <2 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: ret <2 x i64> [[TMP4]] ; %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru) ret <2 x i64> %ret |