@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
180
180
// Statistic counters declared through the STATISTIC macro; each pairs a
// counter name with its human-readable description string.
STATISTIC (LoopsVectorized, " Number of loops vectorized" );
181
181
STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
182
182
STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
183
+ STATISTIC (CSAsVectorized,
184
+ " Number of conditional scalar assignments vectorized" );
183
185
184
186
static cl::opt<bool > EnableEpilogueVectorization (
185
187
" enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500
502
virtual std::pair<BasicBlock *, Value *>
501
503
createVectorizedLoopSkeleton (const SCEV2ValueTy &ExpandedSCEVs);
502
504
505
+ // / For all vectorized CSAs, replace uses of live-out scalar from the original
506
+ // / loop with the extracted scalar from the vector loop.
507
+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
508
+
503
509
// / Fix the vectorized code, taking care of header phi's, live-outs, and more.
504
510
void fixVectorizedLoop (VPTransformState &State, VPlan &Plan);
505
511
@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2932
2938
TargetTransformInfo::TCK_RecipThroughput);
2933
2939
}
2934
2940
2941
+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2942
+ for (const auto &CSA : Plan.getCSAStates ()) {
2943
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2944
+ assert (VPDataUpdate &&
2945
+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2946
+ Value *V = VPDataUpdate->getUnderlyingValue ();
2947
+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2948
+ /* NeedsScalar=*/ true );
2949
+ // Fix LCSSAPhis
2950
+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2951
+ for (User *U : V->users ())
2952
+ if (auto *Phi = dyn_cast<PHINode>(U);
2953
+ Phi && Phi->getParent () == LoopExitBlock)
2954
+ ToFix.insert (Phi);
2955
+ for (PHINode *Phi : ToFix)
2956
+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2957
+ }
2958
+ }
2959
+
2935
2960
void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State,
2936
2961
VPlan &Plan) {
2937
2962
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2972
2997
getOrCreateVectorTripCount (VectorLoop->getLoopPreheader ()),
2973
2998
IVEndValues[Entry.first ], LoopMiddleBlock,
2974
2999
VectorLoop->getHeader (), Plan, State);
3000
+
3001
+ fixCSALiveOuts (State, Plan);
2975
3002
}
2976
3003
2977
3004
// Fix live-out phis not already fixed earlier.
@@ -4482,6 +4509,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4482
4509
case VPDef::VPEVLBasedIVPHISC:
4483
4510
case VPDef::VPPredInstPHISC:
4484
4511
case VPDef::VPBranchOnMaskSC:
4512
+ case VPRecipeBase::VPCSADataUpdateSC:
4513
+ case VPRecipeBase::VPCSAExtractScalarSC:
4514
+ case VPRecipeBase::VPCSAHeaderPHISC:
4485
4515
continue ;
4486
4516
case VPDef::VPReductionSC:
4487
4517
case VPDef::VPActiveLaneMaskPHISC:
@@ -8481,9 +8511,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8481
8511
return Recipe;
8482
8512
8483
8513
VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8484
- assert ((Legal->isReductionVariable (Phi) ||
8485
- Legal->isFixedOrderRecurrence (Phi)) &&
8486
- " can only widen reductions and fixed-order recurrences here" );
8487
8514
VPValue *StartV = Operands[0 ];
8488
8515
if (Legal->isReductionVariable (Phi)) {
8489
8516
const RecurrenceDescriptor &RdxDesc =
@@ -8493,12 +8520,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8493
8520
PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
8494
8521
CM.isInLoopReduction (Phi),
8495
8522
CM.useOrderedReductions (RdxDesc));
8496
- } else {
8523
+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
8497
8524
// TODO: Currently fixed-order recurrences are modeled as chains of
8498
8525
// first-order recurrences. If there are no users of the intermediate
8499
8526
// recurrences in the chain, the fixed order recurrence should be modeled
8500
8527
// directly, enabling more efficient codegen.
8501
8528
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8529
+ } else if (Legal->isCSAPhi (Phi)) {
8530
+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8531
+ VPValue *InitData = State->getVPInitData ();
8532
+ // When the VF=getFixed(1), InitData is just InitScalar.
8533
+ if (!InitData)
8534
+ InitData = State->getVPInitScalar ();
8535
+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8536
+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8537
+ } else {
8538
+ llvm_unreachable (
8539
+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
8502
8540
}
8503
8541
8504
8542
PhisToFix.push_back (PhiRecipe);
@@ -8528,6 +8566,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8528
8566
make_range (Operands.begin (), Operands.end ()));
8529
8567
8530
8568
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8569
+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8570
+ return CSADescriptor::isCSASelect (CSA.second , SI);
8571
+ });
8572
+ if (CSADescIt != Legal->getCSAs ().end ()) {
8573
+ PHINode *CSAPhi = CSADescIt->first ;
8574
+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8575
+ VPValue *VPDataPhi = State->getPhiRecipe ();
8576
+ auto *R = new VPCSADataUpdateRecipe (
8577
+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8578
+ State->setDataUpdate (R);
8579
+ return R;
8580
+ }
8581
+
8531
8582
return new VPWidenSelectRecipe (
8532
8583
*SI, make_range (Operands.begin (), Operands.end ()));
8533
8584
}
@@ -8540,6 +8591,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8540
8591
return tryToWiden (Instr, Operands, VPBB);
8541
8592
}
8542
8593
8594
+ // / Add CSA Recipes that can occur before each instruction in the input IR
8595
+ // / is processed and introduced into VPlan.
8596
+ static void
8597
+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8598
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8599
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8600
+ VPlan &Plan) {
8601
+
8602
+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8603
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8604
+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8605
+
8606
+ for (const auto &CSA : CSAs) {
8607
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8608
+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8609
+
8610
+ // Scalar VF builds the scalar version of the loop. In that case,
8611
+ // no maintenence of mask nor extraction in middle block is needed.
8612
+ if (IsScalarVF) {
8613
+ VPCSAState *S = new VPCSAState (VPInitScalar);
8614
+ Plan.addCSAState (CSA.first , S);
8615
+ continue ;
8616
+ }
8617
+
8618
+ auto *VPInitMask =
8619
+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8620
+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8621
+ {VPInitScalar}, DL, " csa.init.data" );
8622
+ PreheaderVPBB->appendRecipe (VPInitMask);
8623
+ PreheaderVPBB->appendRecipe (VPInitData);
8624
+
8625
+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8626
+ DL, " csa.mask.phi" );
8627
+ HeaderVPBB->appendRecipe (VPMaskPhi);
8628
+
8629
+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8630
+ Plan.addCSAState (CSA.first , S);
8631
+ }
8632
+ }
8633
+
8634
+ // / Add CSA Recipes that must occur after each instruction in the input IR
8635
+ // / is processed and introduced into VPlan.
8636
+ static void
8637
+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8638
+ const LoopVectorizationLegality::CSAList &CSAs,
8639
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8640
+ VPlan &Plan) {
8641
+ // Don't build CSA for VF=ElementCount::getFixed(1)
8642
+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8643
+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8644
+ return ;
8645
+
8646
+ for (const auto &CSA : CSAs) {
8647
+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8648
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8649
+
8650
+ assert (VPDataUpdate &&
8651
+ " VPDataUpdate must have been introduced prior to postprocess" );
8652
+ assert (CSA.second .getCond () &&
8653
+ " CSADescriptor must know how to describe the condition" );
8654
+ auto GetVPValue = [&](Value *I) {
8655
+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8656
+ };
8657
+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8658
+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8659
+
8660
+ // The CSA optimization wants to use a condition such that when it is
8661
+ // true, a new value is assigned. However, it is possible that a true lane
8662
+ // in WidenedCond corresponds to selection of the initial value instead.
8663
+ // In that case, we must use the negation of WidenedCond.
8664
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8665
+ VPValue *CondToUse = WidenedCond;
8666
+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8667
+ CSA.first ) {
8668
+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8669
+ VPNotCond->insertBefore (
8670
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8671
+ CondToUse = VPNotCond;
8672
+ }
8673
+
8674
+ auto *VPAnyActive = new VPInstruction (
8675
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8676
+ VPAnyActive->insertBefore (
8677
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8678
+
8679
+ auto *VPMaskSel = new VPInstruction (
8680
+ VPInstruction::CSAMaskSel,
8681
+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8682
+ VPMaskSel->insertAfter (VPAnyActive);
8683
+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8684
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8685
+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8686
+
8687
+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8688
+
8689
+ // Update CSAState with new recipes
8690
+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8691
+ CSAState->setVPAnyActive (VPAnyActive);
8692
+ }
8693
+ }
8694
+
8543
8695
void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8544
8696
ElementCount MaxVF) {
8545
8697
assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8596,7 +8748,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8596
8748
// VPWidenPointerInductionRecipe and induction increments.
8597
8749
static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
8598
8750
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8599
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8751
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8752
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
8600
8753
auto MiddleVPBB =
8601
8754
cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
8602
8755
// No edge from the middle block to the unique exit block has been inserted
@@ -8625,6 +8778,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
8625
8778
return P && Inductions.contains (P);
8626
8779
})))
8627
8780
continue ;
8781
+ // Exit values for CSAs are computed and updated outside of VPlan and
8782
+ // independent of induction recipes.
8783
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8784
+ // live-outs.
8785
+ if (isa<VPCSADataUpdateRecipe>(V) &&
8786
+ (isa<Instruction>(IncomingValue) &&
8787
+ any_of (IncomingValue->users (), [&CSAs](User *U) {
8788
+ auto *P = dyn_cast<PHINode>(U);
8789
+ return P && CSAs.contains (P);
8790
+ })))
8791
+ continue ;
8628
8792
ExitingValuesToFix.insert ({&ExitPhi, V});
8629
8793
}
8630
8794
return ExitingValuesToFix;
@@ -8866,6 +9030,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8866
9030
bool HasNUW = Style == TailFoldingStyle::None;
8867
9031
addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
8868
9032
9033
+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9034
+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9035
+ Range, *Plan);
9036
+
8869
9037
VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8870
9038
8871
9039
// ---------------------------------------------------------------------------
@@ -8972,6 +9140,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8972
9140
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
8973
9141
}
8974
9142
9143
+ VPBasicBlock *MiddleVPBB =
9144
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9145
+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9146
+ Range, *Plan);
9147
+
8975
9148
// After here, VPBB should not be used.
8976
9149
VPBB = nullptr ;
8977
9150
@@ -8981,8 +9154,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8981
9154
" VPBasicBlock" );
8982
9155
RecipeBuilder.fixHeaderPhis ();
8983
9156
8984
- MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
8985
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9157
+ MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9158
+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9159
+ Legal->getInductionVars (), Legal->getCSAs ());
8986
9160
8987
9161
addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
8988
9162
addUsersInExitBlock (*Plan, ExitingValuesToFix);
@@ -10079,6 +10253,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10079
10253
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan (
10080
10254
EPI.MainLoopVF , EPI.MainLoopUF , *BestMainPlan, MainILV, DT, true );
10081
10255
++LoopsVectorized;
10256
+ CSAsVectorized += LVL.getCSAs ().size ();
10082
10257
10083
10258
// Second pass vectorizes the epilogue and adjusts the control flow
10084
10259
// edges from the first pass.
@@ -10171,6 +10346,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10171
10346
PSI, Checks);
10172
10347
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10173
10348
++LoopsVectorized;
10349
+ CSAsVectorized += LVL.getCSAs ().size ();
10174
10350
10175
10351
// Add metadata to disable runtime unrolling a scalar loop when there
10176
10352
// are no runtime checks about strides and memory. A scalar loop that is
0 commit comments