@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
180180STATISTIC (LoopsVectorized, " Number of loops vectorized" );
181181STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
182182STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
183+ STATISTIC (CSAsVectorized,
184+ " Number of conditional scalar assignments vectorized" );
183185
184186static cl::opt<bool > EnableEpilogueVectorization (
185187 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500502 virtual std::pair<BasicBlock *, Value *>
501503 createVectorizedLoopSkeleton (const SCEV2ValueTy &ExpandedSCEVs);
502504
505+ // / For all vectorized CSAs, replace uses of the live-out scalar from the
506+ // / original loop with the scalar extracted from the vector loop.
507+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
508+
503509 // / Fix the vectorized code, taking care of header phi's, live-outs, and more.
504510 void fixVectorizedLoop (VPTransformState &State, VPlan &Plan);
505511
@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29322938 TargetTransformInfo::TCK_RecipThroughput);
29332939}
29342940
2941+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2942+ for (const auto &CSA : Plan.getCSAStates ()) {
2943+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2944+ assert (VPDataUpdate &&
2945+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2946+ Value *V = VPDataUpdate->getUnderlyingValue ();
2947+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2948+ /* NeedsScalar=*/ true );
2949+ // Fix LCSSAPhis
2950+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2951+ for (User *U : V->users ())
2952+ if (auto *Phi = dyn_cast<PHINode>(U);
2953+ Phi && Phi->getParent () == LoopExitBlock)
2954+ ToFix.insert (Phi);
2955+ for (PHINode *Phi : ToFix)
2956+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2957+ }
2958+ }
2959+
29352960void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State,
29362961 VPlan &Plan) {
29372962 // Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
29722997 getOrCreateVectorTripCount (VectorLoop->getLoopPreheader ()),
29732998 IVEndValues[Entry.first ], LoopMiddleBlock,
29742999 VectorLoop->getHeader (), Plan, State);
3000+
3001+ fixCSALiveOuts (State, Plan);
29753002 }
29763003
29773004 // Fix live-out phis not already fixed earlier.
@@ -4482,6 +4509,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44824509 case VPDef::VPEVLBasedIVPHISC:
44834510 case VPDef::VPPredInstPHISC:
44844511 case VPDef::VPBranchOnMaskSC:
4512+ case VPRecipeBase::VPCSADataUpdateSC:
4513+ case VPRecipeBase::VPCSAExtractScalarSC:
4514+ case VPRecipeBase::VPCSAHeaderPHISC:
44854515 continue ;
44864516 case VPDef::VPReductionSC:
44874517 case VPDef::VPActiveLaneMaskPHISC:
@@ -8508,9 +8538,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85088538 return Recipe;
85098539
85108540 VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8511- assert ((Legal->isReductionVariable (Phi) ||
8512- Legal->isFixedOrderRecurrence (Phi)) &&
8513- " can only widen reductions and fixed-order recurrences here" );
85148541 VPValue *StartV = Operands[0 ];
85158542 if (Legal->isReductionVariable (Phi)) {
85168543 const RecurrenceDescriptor &RdxDesc =
@@ -8520,12 +8547,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85208547 PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
85218548 CM.isInLoopReduction (Phi),
85228549 CM.useOrderedReductions (RdxDesc));
8523- } else {
8550+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
85248551 // TODO: Currently fixed-order recurrences are modeled as chains of
85258552 // first-order recurrences. If there are no users of the intermediate
85268553 // recurrences in the chain, the fixed order recurrence should be modeled
85278554 // directly, enabling more efficient codegen.
85288555 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8556+ } else if (Legal->isCSAPhi (Phi)) {
8557+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8558+ VPValue *InitData = State->getVPInitData ();
8559+ // When the VF=getFixed(1), InitData is just InitScalar.
8560+ if (!InitData)
8561+ InitData = State->getVPInitScalar ();
8562+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8563+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8564+ } else {
8565+ llvm_unreachable (
8566+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
85298567 }
85308568
85318569 PhisToFix.push_back (PhiRecipe);
@@ -8555,6 +8593,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85558593 make_range (Operands.begin (), Operands.end ()));
85568594
85578595 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8596+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8597+ return CSADescriptor::isCSASelect (CSA.second , SI);
8598+ });
8599+ if (CSADescIt != Legal->getCSAs ().end ()) {
8600+ PHINode *CSAPhi = CSADescIt->first ;
8601+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8602+ VPValue *VPDataPhi = State->getPhiRecipe ();
8603+ auto *R = new VPCSADataUpdateRecipe (
8604+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8605+ State->setDataUpdate (R);
8606+ return R;
8607+ }
8608+
85588609 return new VPWidenSelectRecipe (
85598610 *SI, make_range (Operands.begin (), Operands.end ()));
85608611 }
@@ -8567,6 +8618,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85678618 return tryToWiden (Instr, Operands, VPBB);
85688619}
85698620
8621+ // / Add CSA Recipes that can occur before each instruction in the input IR
8622+ // / is processed and introduced into VPlan.
8623+ static void
8624+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8625+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8626+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8627+ VPlan &Plan) {
8628+
8629+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8630+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8631+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8632+
8633+ for (const auto &CSA : CSAs) {
8634+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8635+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8636+
8637+ // Scalar VF builds the scalar version of the loop. In that case,
8638+ // no maintenence of mask nor extraction in middle block is needed.
8639+ if (IsScalarVF) {
8640+ VPCSAState *S = new VPCSAState (VPInitScalar);
8641+ Plan.addCSAState (CSA.first , S);
8642+ continue ;
8643+ }
8644+
8645+ auto *VPInitMask =
8646+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8647+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8648+ {VPInitScalar}, DL, " csa.init.data" );
8649+ PreheaderVPBB->appendRecipe (VPInitMask);
8650+ PreheaderVPBB->appendRecipe (VPInitData);
8651+
8652+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8653+ DL, " csa.mask.phi" );
8654+ HeaderVPBB->appendRecipe (VPMaskPhi);
8655+
8656+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8657+ Plan.addCSAState (CSA.first , S);
8658+ }
8659+ }
8660+
8661+ // / Add CSA Recipes that must occur after each instruction in the input IR
8662+ // / is processed and introduced into VPlan.
8663+ static void
8664+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8665+ const LoopVectorizationLegality::CSAList &CSAs,
8666+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8667+ VPlan &Plan) {
8668+ // Don't build CSA for VF=ElementCount::getFixed(1)
8669+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8670+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8671+ return ;
8672+
8673+ for (const auto &CSA : CSAs) {
8674+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8675+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8676+
8677+ assert (VPDataUpdate &&
8678+ " VPDataUpdate must have been introduced prior to postprocess" );
8679+ assert (CSA.second .getCond () &&
8680+ " CSADescriptor must know how to describe the condition" );
8681+ auto GetVPValue = [&](Value *I) {
8682+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8683+ };
8684+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8685+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8686+
8687+ // The CSA optimization wants to use a condition such that when it is
8688+ // true, a new value is assigned. However, it is possible that a true lane
8689+ // in WidenedCond corresponds to selection of the initial value instead.
8690+ // In that case, we must use the negation of WidenedCond.
8691+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8692+ VPValue *CondToUse = WidenedCond;
8693+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8694+ CSA.first ) {
8695+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8696+ VPNotCond->insertBefore (
8697+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8698+ CondToUse = VPNotCond;
8699+ }
8700+
8701+ auto *VPAnyActive = new VPInstruction (
8702+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8703+ VPAnyActive->insertBefore (
8704+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8705+
8706+ auto *VPMaskSel = new VPInstruction (
8707+ VPInstruction::CSAMaskSel,
8708+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8709+ VPMaskSel->insertAfter (VPAnyActive);
8710+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8711+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8712+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8713+
8714+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8715+
8716+ // Update CSAState with new recipes
8717+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8718+ CSAState->setVPAnyActive (VPAnyActive);
8719+ }
8720+ }
8721+
85708722void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
85718723 ElementCount MaxVF) {
85728724 assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8623,7 +8775,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
86238775// VPWidenPointerInductionRecipe and induction increments.
86248776static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
86258777 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8626- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8778+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8779+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
86278780 auto MiddleVPBB =
86288781 cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
86298782 // No edge from the middle block to the unique exit block has been inserted
@@ -8652,6 +8805,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
86528805 return P && Inductions.contains (P);
86538806 })))
86548807 continue ;
8808+ // Exit values for CSAs are computed and updated outside of VPlan and
8809+ // independent of induction recipes.
8810+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8811+ // live-outs.
8812+ if (isa<VPCSADataUpdateRecipe>(V) &&
8813+ (isa<Instruction>(IncomingValue) &&
8814+ any_of (IncomingValue->users (), [&CSAs](User *U) {
8815+ auto *P = dyn_cast<PHINode>(U);
8816+ return P && CSAs.contains (P);
8817+ })))
8818+ continue ;
86558819 ExitingValuesToFix.insert ({&ExitPhi, V});
86568820 }
86578821 return ExitingValuesToFix;
@@ -8893,6 +9057,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
88939057 bool HasNUW = Style == TailFoldingStyle::None;
88949058 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
88959059
9060+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9061+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9062+ Range, *Plan);
9063+
88969064 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
88979065
88989066 // ---------------------------------------------------------------------------
@@ -8999,6 +9167,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89999167 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
90009168 }
90019169
9170+ VPBasicBlock *MiddleVPBB =
9171+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9172+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9173+ Range, *Plan);
9174+
90029175 // After here, VPBB should not be used.
90039176 VPBB = nullptr ;
90049177
@@ -9008,8 +9181,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90089181 " VPBasicBlock" );
90099182 RecipeBuilder.fixHeaderPhis ();
90109183
9011- MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
9012- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9184+ MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9185+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9186+ Legal->getInductionVars (), Legal->getCSAs ());
90139187
90149188 addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
90159189 addUsersInExitBlock (*Plan, ExitingValuesToFix);
@@ -10106,6 +10280,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1010610280 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan (
1010710281 EPI.MainLoopVF , EPI.MainLoopUF , *BestMainPlan, MainILV, DT, true );
1010810282 ++LoopsVectorized;
10283+ CSAsVectorized += LVL.getCSAs ().size ();
1010910284
1011010285 // Second pass vectorizes the epilogue and adjusts the control flow
1011110286 // edges from the first pass.
@@ -10198,6 +10373,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1019810373 PSI, Checks);
1019910374 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1020010375 ++LoopsVectorized;
10376+ CSAsVectorized += LVL.getCSAs ().size ();
1020110377
1020210378 // Add metadata to disable runtime unrolling a scalar loop when there
1020310379 // are no runtime checks about strides and memory. A scalar loop that is
0 commit comments