[SLP]Use TTI::getScalarizationOverhead where possible #125725
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes: Better to use TTI::getScalarizationOverhead instead of TTI::getVectorInstrCost to correctly calculate the costs of buildvectors/extracts.

Patch is 22.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125725.diff

7 Files Affected:
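As context for the patch below: the change replaces per-lane cost queries with a single demanded-elements query, so targets see the whole set of lanes at once. A minimal standalone sketch of the two strategies, assuming only a `TargetTransformInfo` reference; the helper names are hypothetical, and only the two TTI calls mirror the patch:

```cpp
// Minimal sketch, not from the patch: contrasts per-lane costing with an
// aggregated demanded-elements query. Helper names are hypothetical.
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Old style: one TTI query per extracted lane. The target never sees the
// full set of lanes, so it cannot discount cheap lane-0 extracts or lanes
// that share a shuffle.
static InstructionCost costExtractsPerLane(const TargetTransformInfo &TTI,
                                           FixedVectorType *VecTy,
                                           ArrayRef<unsigned> Lanes,
                                           TTI::TargetCostKind CostKind) {
  InstructionCost Cost = 0;
  for (unsigned Lane : Lanes)
    Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                   CostKind, Lane);
  return Cost;
}

// New style: collect the demanded lanes into an APInt mask and issue a
// single scalarization-overhead query for the whole extract sequence.
static InstructionCost costExtractsAggregated(const TargetTransformInfo &TTI,
                                              FixedVectorType *VecTy,
                                              ArrayRef<unsigned> Lanes,
                                              TTI::TargetCostKind CostKind) {
  APInt DemandedElts = APInt::getZero(VecTy->getNumElements());
  for (unsigned Lane : Lanes)
    DemandedElts.setBit(Lane);
  return TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/false,
                                      /*Extract=*/true, CostKind);
}
```

The aggregated form is what lets a target report, for example, that extracting both lanes of a `<2 x i32>` is cheaper than twice the cost of a single extract; that difference drives the test-check updates further down.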
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 558d75c5eb388f..7b3526cca119f5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10706,6 +10706,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
SmallPtrSet<Value *, 4> UniqueBases;
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
for (unsigned Part : seq<unsigned>(NumParts)) {
unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
@@ -10756,10 +10757,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
continue;
}
}
- Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
- CostKind, Idx);
- }
- }
+ APInt &DemandedElts =
+ VectorOpsToExtracts
+ .try_emplace(VecBase,
+ APInt::getZero(getNumElements(VecBase->getType())))
+ .first->getSecond();
+ DemandedElts.setBit(Idx);
+ }
+ }
+ for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
+ Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
+ DemandedElts, /*Insert=*/false,
+ /*Extract=*/true, CostKind);
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
// Found the bunch of extractelement instructions that must be gathered
@@ -11283,24 +11292,27 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
+ APInt DemandedElts;
+ VectorType *SrcVecTy = nullptr;
auto GetScalarCost = [&](unsigned Idx) {
if (isa<PoisonValue>(UniqueValues[Idx]))
return InstructionCost(TTI::TCC_Free);
auto *I = cast<Instruction>(UniqueValues[Idx]);
- VectorType *SrcVecTy;
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *EE = cast<ExtractElementInst>(I);
- SrcVecTy = EE->getVectorOperandType();
- } else {
- auto *EV = cast<ExtractValueInst>(I);
- Type *AggregateTy = EV->getAggregateOperand()->getType();
- unsigned NumElts;
- if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
- NumElts = ATy->getNumElements();
- else
- NumElts = AggregateTy->getStructNumElements();
- SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
+ if (!SrcVecTy) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *EE = cast<ExtractElementInst>(I);
+ SrcVecTy = EE->getVectorOperandType();
+ } else {
+ auto *EV = cast<ExtractValueInst>(I);
+ Type *AggregateTy = EV->getAggregateOperand()->getType();
+ unsigned NumElts;
+ if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = AggregateTy->getStructNumElements();
+ SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
+ }
}
if (I->hasOneUse()) {
Instruction *Ext = I->user_back();
@@ -11317,10 +11329,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return Cost;
}
}
- return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
- CostKind, *getExtractIndex(I));
+ if (DemandedElts.isZero())
+ DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
+ DemandedElts.setBit(*getExtractIndex(I));
+ return InstructionCost(TTI::TCC_Free);
+ };
+ auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+ return CommonCost - (DemandedElts.isZero()
+ ? TTI::TCC_Free
+ : TTI.getScalarizationOverhead(
+ SrcVecTy, DemandedElts, /*Insert=*/false,
+ /*Extract=*/true, CostKind));
};
- auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
return GetCostDiff(GetScalarCost, GetVectorCost);
}
case Instruction::InsertElement: {
@@ -13663,6 +13683,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
APInt ShuffledElements = APInt::getZero(VL.size());
+ APInt DemandedElements = APInt::getZero(VL.size());
DenseMap<Value *, unsigned> UniqueElements;
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost;
@@ -13673,9 +13694,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
V = nullptr;
}
if (!ForPoisonSrc)
- Cost +=
- TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
- I, Constant::getNullValue(VecTy), V);
+ DemandedElements.setBit(I);
};
SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -13698,6 +13717,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
ShuffledElements.setBit(I);
ShuffleMask[I] = Res.first->second;
}
+ if (!DemandedElements.isZero())
+ Cost +=
+ TTI->getScalarizationOverhead(VecTy, DemandedElements, /*Insert=*/true,
+ /*Extract=*/false, CostKind, VL);
if (ForPoisonSrc) {
if (isa<FixedVectorType>(ScalarTy)) {
assert(SLPReVec && "Only supported by REVEC.");
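The hunks above apply this in two places. In the shuffle-cost estimator, extracts are first grouped by their source vector (the `SmallDenseMap<Value *, APInt>`), so each distinct vector gets one query; in `getGatherCost`, the lanes that need a real `insertelement` are collected and costed once, with the scalars passed through so targets can inspect the actual values. A sketch of both patterns with hypothetical inputs (`Extracts`, `Scalars`); the `try_emplace` shape and the TTI calls mirror the patch, the rest is scaffolding:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Group extract lanes by source vector, then cost each vector once.
static InstructionCost
extractDiscount(const TargetTransformInfo &TTI,
                ArrayRef<ExtractElementInst *> Extracts,
                TTI::TargetCostKind CostKind) {
  SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
  for (ExtractElementInst *EE : Extracts) {
    Value *VecBase = EE->getVectorOperand();
    auto *VecTy = cast<FixedVectorType>(VecBase->getType());
    APInt &DemandedElts =
        VectorOpsToExtracts
            .try_emplace(VecBase, APInt::getZero(VecTy->getNumElements()))
            .first->getSecond();
    // Assume constant extract indices for this sketch.
    DemandedElts.setBit(
        cast<ConstantInt>(EE->getIndexOperand())->getZExtValue());
  }
  InstructionCost Discount = 0;
  for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
    Discount += TTI.getScalarizationOverhead(
        cast<VectorType>(Vec->getType()), DemandedElts, /*Insert=*/false,
        /*Extract=*/true, CostKind);
  return Discount;
}

// Buildvector side: cost all needed insertelements with one query. Passing
// the scalars (VL) lets targets refine the estimate from the actual values.
// Scalars.size() is assumed to equal the number of vector lanes.
static InstructionCost gatherInsertCost(const TargetTransformInfo &TTI,
                                        FixedVectorType *VecTy,
                                        ArrayRef<Value *> Scalars,
                                        TTI::TargetCostKind CostKind) {
  APInt DemandedElements = APInt::getZero(Scalars.size());
  for (auto [I, V] : enumerate(Scalars))
    if (V && !isa<Constant>(V)) // constant/poison lanes need no insert
      DemandedElements.setBit(I);
  if (DemandedElements.isZero())
    return TTI::TCC_Free;
  return TTI.getScalarizationOverhead(VecTy, DemandedElements,
                                      /*Insert=*/true, /*Extract=*/false,
                                      CostKind, Scalars);
}
```

The test-file diffs that follow are the downstream effect of these two changes: some sequences become profitable to vectorize (div.ll), some stop being profitable (reduction-transpose.ll on AVX512), and reported costs shift (crash_clear_undefs.ll).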
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
index 29bd81998cdb2a..bb88edff116347 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
@@ -607,35 +607,13 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3
; computes (a/const + x - y) * z
define <2 x i32> @sdiv_v2i32_const_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
-; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
-; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; NO-SVE-NEXT: [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
-; NO-SVE-NEXT: [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
-; NO-SVE-NEXT: [[TMP1:%.*]] = sdiv i32 [[A0]], 2
-; NO-SVE-NEXT: [[TMP2:%.*]] = sdiv i32 [[A1]], 4
-; NO-SVE-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
-; NO-SVE-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT: [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT: [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT: [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
-; NO-SVE-NEXT: [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; NO-SVE-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
-; NO-SVE-NEXT: ret <2 x i32> [[RES1]]
-;
-; SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
-; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
-; SVE-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
-; SVE-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
-; SVE-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
-; SVE-NEXT: ret <2 x i32> [[TMP4]]
+; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
+; CHECK-NEXT: ret <2 x i32> [[TMP4]]
;
{
%a0 = extractelement <2 x i32> %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
index 579239bc659bd8..75a413ffc1fb19 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
@@ -10,15 +10,18 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) {
; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8
; CHECK-NEXT: [[I4277:%.*]] = load double, ptr [[IE]], align 8
; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[I4238]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[I4252]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[I4264]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[I4277]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP7]]
-; CHECK-NEXT: ret <4 x double> [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I4238]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I4252]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[I4264]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[I4277]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[I44281:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x double> [[I44281]]
;
%i4238 = load double, ptr %ia, align 8
%i4252 = load double, ptr %ib, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
index de99654d84eb81..c2369a6a89ec1d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-3'
+; YAML-NEXT: - Cost: '-4'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '10'
; YAML-NEXT: ...
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index 2cdbd5cff4468c..cb4783010965e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -49,11 +49,24 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
;
; AVX512-LABEL: @reduce_and4(
; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX512-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX512-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
+; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
+; AVX512-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
+; AVX512-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
+; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
+; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
+; AVX512-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
+; AVX512-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
+; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[VECEXT8]], i32 8
+; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT7]], i32 9
+; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT10]], i32 10
+; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT12]], i32 11
+; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 12
+; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT]], i32 13
+; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT2]], i32 14
+; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT4]], i32 15
+; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP8]])
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
; AVX512-NEXT: ret i32 [[OP_RDX1]]
;
@@ -131,11 +144,24 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
; AVX2-NEXT: ret i32 [[OP_RDX]]
;
; AVX512-LABEL: @reduce_and4_transpose(
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX512-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX512-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
+; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
+; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
+; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
+; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
+; AVX512-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
+; AVX512-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
+; AVX512-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
+; AVX512-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT24]], i32 8
+; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT16]], i32 9
+; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT8]], i32 10
+; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 11
+; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT23]], i32 12
+; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT15]], i32 13
+; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT7]], i32 14
+; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[VECEXT]], i32 15
+; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP9]])
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
; AVX512-NEXT: ret i32 [[OP_RDX1]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
index f1034f39711351..ae5018a63e2148 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
@@ -1,22 +1,43 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
define void @tes() {
-; CHECK-LABEL: define void @tes() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
-; CHECK-NEXT: br label [[TMP1:%.*]]
-; CHECK: 1:
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP4]], i1 false
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false
-; CHECK-NEXT: br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP5:%.*]]
-; CHECK: 4:
-; CHECK-NEXT: ret void
-; CHECK: 5:
-; CHECK-NEXT: ret void
+; X86-LABEL: define void @tes() {
+; X86-NEXT: entry:
+; X86-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
+; X86-NEXT: br label [[TMP1:%.*]]
+; X86: 1:
+; X86-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
+; X86-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
+; X86-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP3]], i1 false
+; X86-NEXT: [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false
+; X86-NEXT: br i1 [[OP_RDX1]], label [[TMP4:%.*]], label [[TMP5:%.*]]
+; X86: 4:
+; X86-NEXT: ret void
+; X86: 5:
+; X86-NEXT: ret void
+;
+; AARCH64-LABEL: define void @tes() {
+; AARCH64-NEXT: entry:
+; AARCH64-NEXT: [[TMP0:%.*]] = extracte...
[truncated]
LGTM - cheers
Better to use TTI::getScalarizationOverhead instead of TTI::getVectorInstrCost to correctly calculate the costs of buildvectors/extracts.

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: llvm/llvm-project#125725
Hi @alexey-bataev, I believe, unfortunately, this PR caused a performance regression, so I filed #127244. Any chance this change can be reverted or fixed forward?