@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
180180STATISTIC (LoopsVectorized, " Number of loops vectorized" );
181181STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
182182STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
183+ STATISTIC (CSAsVectorized,
184+ " Number of conditional scalar assignments vectorized" );
183185
184186static cl::opt<bool > EnableEpilogueVectorization (
185187 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500502 virtual std::pair<BasicBlock *, Value *>
501503 createVectorizedLoopSkeleton (const SCEV2ValueTy &ExpandedSCEVs);
502504
505+ // / For all vectorized CSAs, replace uses of the live-out scalar from the
506+ // / original loop with the scalar extracted from the vector loop.
507+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
508+
503509 // / Fix the vectorized code, taking care of header phi's, live-outs, and more.
504510 void fixVectorizedLoop (VPTransformState &State, VPlan &Plan);
505511
@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29322938 TargetTransformInfo::TCK_RecipThroughput);
29332939}
29342940
// For every CSA state recorded in Plan, feed the scalar produced by the CSA's
// extract-scalar recipe into the exit-block phis that use the CSA's underlying
// IR value, as their incoming value from LoopMiddleBlock.
// NOTE(review): in the visible code the extract-scalar recipe is only set by
// addCSAPostprocessRecipes, which returns early for a scalar VF — confirm this
// function is never reached with a plan built for a scalar VF.
2941+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2942+ for (const auto &CSA: Plan.getCSAStates ()) {
2943+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2944+ assert (VPDataUpdate &&
2945+ " VPDataUpdate must have been introduced prior to fixing live outs" );
// V is the IR value the data-update recipe was created from (presumably the
// CSA select of the original loop — verify against the recipe's constructor).
2946+ Value *V = VPDataUpdate->getUnderlyingValue ();
// Fetch the generated value of the extract-scalar recipe for part 0,
// requesting its scalar form.
2947+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2948+ /* NeedsScalar=*/ true );
2949+ // Fix the LCSSA phis: gather the phi users of V that live in LoopExitBlock.
// A set is used so that each phi receives exactly one new incoming entry.
2950+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2951+ for (User *U : V->users ())
2952+ if (auto *Phi = dyn_cast<PHINode>(U);
2953+ Phi && Phi->getParent () == LoopExitBlock)
2954+ ToFix.insert (Phi);
// Add the extracted scalar as the value arriving from the middle block.
2955+ for (PHINode *Phi : ToFix)
2956+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2957+ }
2958+ }
2959+
29352960void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State,
29362961 VPlan &Plan) {
29372962 // Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
29722997 getOrCreateVectorTripCount (VectorLoop->getLoopPreheader ()),
29732998 IVEndValues[Entry.first ], LoopMiddleBlock,
29742999 VectorLoop->getHeader (), Plan, State);
3000+
3001+ fixCSALiveOuts (State, Plan);
29753002 }
29763003
29773004 // Fix live-out phis not already fixed earlier.
@@ -4110,7 +4137,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
41104137 // found modulo the vectorization factor is not zero, try to fold the tail
41114138 // by masking.
41124139 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4113- setTailFoldingStyles (MaxFactors.ScalableVF .isScalable (), UserIC);
41144140 if (foldTailByMasking ()) {
41154141 if (getTailFoldingStyle () == TailFoldingStyle::DataWithEVL) {
41164142 LLVM_DEBUG (
@@ -4482,6 +4508,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44824508 case VPDef::VPEVLBasedIVPHISC:
44834509 case VPDef::VPPredInstPHISC:
44844510 case VPDef::VPBranchOnMaskSC:
4511+ case VPRecipeBase::VPCSADataUpdateSC:
4512+ case VPRecipeBase::VPCSAExtractScalarSC:
4513+ case VPRecipeBase::VPCSAHeaderPHISC:
44854514 continue ;
44864515 case VPDef::VPReductionSC:
44874516 case VPDef::VPActiveLaneMaskPHISC:
@@ -6995,6 +7024,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
69957024 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
69967025 return ;
69977026
7027+ CM.setTailFoldingStyles (MaxFactors.ScalableVF .isScalable (), UserIC);
7028+
69987029 // Invalidate interleave groups if all blocks of loop will be predicated.
69997030 if (CM.blockNeedsPredicationForAnyReason (OrigLoop->getHeader ()) &&
70007031 !useMaskedInterleavedAccesses (TTI)) {
@@ -8476,9 +8507,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
84768507 return Recipe;
84778508
84788509 VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8479- assert ((Legal->isReductionVariable (Phi) ||
8480- Legal->isFixedOrderRecurrence (Phi)) &&
8481- " can only widen reductions and fixed-order recurrences here" );
84828510 VPValue *StartV = Operands[0 ];
84838511 if (Legal->isReductionVariable (Phi)) {
84848512 const RecurrenceDescriptor &RdxDesc =
@@ -8488,12 +8516,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
84888516 PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
84898517 CM.isInLoopReduction (Phi),
84908518 CM.useOrderedReductions (RdxDesc));
8491- } else {
8519+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
84928520 // TODO: Currently fixed-order recurrences are modeled as chains of
84938521 // first-order recurrences. If there are no users of the intermediate
84948522 // recurrences in the chain, the fixed order recurrence should be modeled
84958523 // directly, enabling more efficient codegen.
84968524 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8525+ } else if (Legal->isCSAPhi (Phi)) {
8526+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8527+ VPValue *InitData = State->getVPInitData ();
8528+ // When the VF=getFixed(1), InitData is just InitScalar.
8529+ if (!InitData)
8530+ InitData = State->getVPInitScalar ();
8531+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8532+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8533+ } else {
8534+ llvm_unreachable (
8535+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
84978536 }
84988537
84998538 PhisToFix.push_back (PhiRecipe);
@@ -8523,6 +8562,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85238562 make_range (Operands.begin (), Operands.end ()));
85248563
85258564 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8565+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8566+ return CSADescriptor::isCSASelect (CSA.second , SI);
8567+ });
8568+ if (CSADescIt != Legal->getCSAs ().end ()) {
8569+ PHINode *CSAPhi = CSADescIt->first ;
8570+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8571+ VPValue *VPDataPhi = State->getPhiRecipe ();
8572+ auto *R = new VPCSADataUpdateRecipe (
8573+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8574+ State->setDataUpdate (R);
8575+ return R;
8576+ }
8577+
85268578 return new VPWidenSelectRecipe (
85278579 *SI, make_range (Operands.begin (), Operands.end ()));
85288580 }
@@ -8535,6 +8587,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85358587 return tryToWiden (Instr, Operands, VPBB);
85368588}
85378589
8590+ // / Add the CSA recipes that can be created before the instructions of the
8591+ // / input IR are processed and introduced into VPlan.
// / For every entry of \p CSAs a VPCSAState is registered in \p Plan, keyed by
// / the CSA phi. For a scalar VF the state only carries the initial scalar
// / live-in. Otherwise csa.init.mask and csa.init.data are appended to
// / \p PreheaderVPBB, csa.mask.phi is appended to \p HeaderVPBB, and the state
// / records the initial scalar, the initial data and the mask phi.
// / \p DL is the debug location attached to the created VPInstructions.
8592+ static void
8593+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8594+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8595+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8596+ VPlan &Plan) {
8597+
8598+ // Don't build the full CSA for VF=ElementCount::getFixed(1).
// The decision is taken via getDecisionAndClampRange over Range (presumably
// this also clamps Range so all remaining VFs share the decision — confirm).
8599+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8600+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8601+
8602+ for (const auto &CSA : CSAs) {
// The value the CSA phi receives from the original loop's preheader is
// modelled as a live-in of the plan.
8603+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8604+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8605+
8606+ // Scalar VF builds the scalar version of the loop. In that case,
8607+ // no maintenance of the mask nor extraction in the middle block is needed.
8608+ if (IsScalarVF) {
8609+ VPCSAState *S = new VPCSAState (VPInitScalar);
8610+ Plan.addCSAState (CSA.first , S);
8611+ continue ;
8612+ }
8613+
// Initial mask (no operands) and initial data (built from the initial
// scalar); both are placed in the plan's preheader block.
8614+ auto *VPInitMask = new VPInstruction (VPInstruction::CSAInitMask, {}, DL,
8615+ " csa.init.mask" );
8616+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8617+ {VPInitScalar}, DL, " csa.init.data" );
8618+ PreheaderVPBB->appendRecipe (VPInitMask);
8619+ PreheaderVPBB->appendRecipe (VPInitData);
8620+
// Mask phi in the header block; only its initial-mask operand is set here.
8621+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8622+ DL, " csa.mask.phi" );
8623+ HeaderVPBB->appendRecipe (VPMaskPhi);
8624+
// Record everything created so far so later steps can look it up by phi.
8625+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8626+ Plan.addCSAState (CSA.first , S);
8627+ }
8628+ }
8629+
8630+ // / Add the CSA recipes that must be created after the instructions of the
8631+ // / input IR have been processed and introduced into VPlan.
// / Does nothing for a scalar VF. Otherwise, for every entry of \p CSAs, this
// / creates csa.cond.anyactive and csa.mask.sel in front of the recipe that
// / defines the CSA assignment (preceded by a Not of the condition when the
// / select keeps the old value on its true side), hands both to the CSA's
// / data-update recipe, and inserts a VPCSAExtractScalarRecipe at the first
// / non-phi position of \p MiddleVPBB. The CSA state in \p Plan is updated
// / with the new extract-scalar and any-active recipes.
8632+ static void
8633+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8634+ const LoopVectorizationLegality::CSAList &CSAs,
8635+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8636+ VPlan &Plan) {
8637+ // Don't build CSA for VF=ElementCount::getFixed(1)
8638+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8639+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8640+ return ;
8641+
8642+ for (const auto &CSA : CSAs) {
// NOTE(review): the result of find() is dereferenced without an end() check;
// this presumably relies on a state having been registered for every CSA
// phi beforehand — confirm.
8643+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8644+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8645+
8646+ assert (VPDataUpdate &&
8647+ " VPDataUpdate must have been introduced prior to postprocess" );
8648+ assert (CSA.second .getCond () &&
8649+ " CSADescriptor must know how to describe the condition" );
// Maps an IR instruction to the single VPValue defined by its recipe.
8650+ auto GetVPValue = [&](Value *I) {
8651+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8652+ };
8653+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8654+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8655+
8656+ // The CSA optimization wants to use a condition such that when it is
8657+ // true, a new value is assigned. However, it is possible that a true lane
8658+ // in WidenedCond corresponds to selection of the initial value instead.
8659+ // In that case, we must use the negation of WidenedCond.
8660+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8661+ VPValue *CondToUse = WidenedCond;
8662+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8663+ CSA.first ) {
// The select's true operand is the CSA phi itself (the old value), so the
// negated condition is the one that marks lanes receiving a new value.
8664+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8665+ VPNotCond->insertBefore (
8666+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8667+ CondToUse = VPNotCond;
8668+ }
8669+
// Placed right before the recipe that defines the assignment's VPValue.
8670+ auto *VPAnyActive = new VPInstruction (
8671+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8672+ VPAnyActive->insertBefore (
8673+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8674+
// Mask select over the condition, the header mask phi and the any-active
// value; inserted directly after csa.cond.anyactive.
8675+ auto *VPMaskSel = new VPInstruction (
8676+ VPInstruction::CSAMaskSel,
8677+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8678+ VPMaskSel->insertAfter (VPAnyActive);
8679+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
// The extract-scalar recipe takes the initial scalar, the selected mask and
// the data-update recipe as operands.
8680+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8681+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8682+
8683+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8684+
8685+ // Update CSAState with the new recipes.
8686+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8687+ CSAState->setVPAnyActive (VPAnyActive);
8688+ }
8689+ }
8690+
85388691void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
85398692 ElementCount MaxVF) {
85408693 assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8591,7 +8744,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
85918744// VPWidenPointerInductionRecipe and induction increments.
85928745static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
85938746 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8594- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8747+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8748+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
85958749 auto MiddleVPBB =
85968750 cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
85978751 // No edge from the middle block to the unique exit block has been inserted
@@ -8620,6 +8774,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
86208774 return P && Inductions.contains (P);
86218775 })))
86228776 continue ;
8777+ // Exit values for CSAs are computed and updated outside of VPlan and
8778+ // independent of induction recipes.
8779+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8780+ // live-outs.
8781+ if (isa<VPCSADataUpdateRecipe>(V) &&
8782+ (isa<Instruction>(IncomingValue) &&
8783+ any_of (IncomingValue->users (), [&CSAs](User *U) {
8784+ auto *P = dyn_cast<PHINode>(U);
8785+ return P && CSAs.contains (P);
8786+ })))
8787+ continue ;
86238788 ExitingValuesToFix.insert ({&ExitPhi, V});
86248789 }
86258790 return ExitingValuesToFix;
@@ -8861,6 +9026,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
88619026 bool HasNUW = Style == TailFoldingStyle::None;
88629027 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
88639028
9029+ // CSA vectorization is only supported for None or DataWithEVL tail folding
9030+ // styles.
9031+ // FIXME: Implement CSA for more tail folding styles
9032+ if (Style != TailFoldingStyle::None &&
9033+ Style != TailFoldingStyle::DataWithEVL && !Legal->getCSAs ().empty ())
9034+ return nullptr ;
9035+
9036+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9037+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9038+ Range, *Plan);
9039+
88649040 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
88659041
88669042 // ---------------------------------------------------------------------------
@@ -8967,6 +9143,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89679143 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
89689144 }
89699145
9146+ VPBasicBlock *MiddleVPBB =
9147+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9148+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9149+ Range, *Plan);
9150+
89709151 // After here, VPBB should not be used.
89719152 VPBB = nullptr ;
89729153
@@ -8976,8 +9157,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89769157 " VPBasicBlock" );
89779158 RecipeBuilder.fixHeaderPhis ();
89789159
8979- MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
8980- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9160+ MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9161+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9162+ Legal->getInductionVars (), Legal->getCSAs ());
89819163
89829164 addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
89839165 addUsersInExitBlock (*Plan, ExitingValuesToFix);
@@ -10074,6 +10256,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1007410256 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan (
1007510257 EPI.MainLoopVF , EPI.MainLoopUF , *BestMainPlan, MainILV, DT, true );
1007610258 ++LoopsVectorized;
10259+ CSAsVectorized += LVL.getCSAs ().size ();
1007710260
1007810261 // Second pass vectorizes the epilogue and adjusts the control flow
1007910262 // edges from the first pass.
@@ -10166,6 +10349,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1016610349 PSI, Checks);
1016710350 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1016810351 ++LoopsVectorized;
10352+ CSAsVectorized += LVL.getCSAs ().size ();
1016910353
1017010354 // Add metadata to disable runtime unrolling a scalar loop when there
1017110355 // are no runtime checks about strides and memory. A scalar loop that is
0 commit comments