@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
// Counters declared through the STATISTIC macro; each pairs a counter name
// with the description string given as its second argument.
180180STATISTIC (LoopsVectorized, " Number of loops vectorized" );
181181STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
182182STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
// Bumped in processLoop by LVL.getCSAs().size() right after each
// ++LoopsVectorized, i.e. it counts CSAs per vectorized loop, not loops.
183+ STATISTIC (CSAsVectorized,
184+ " Number of conditional scalar assignments vectorized" );
183185
184186static cl::opt<bool > EnableEpilogueVectorization (
185187 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500502 virtual std::pair<BasicBlock *, Value *>
501503 createVectorizedLoopSkeleton (const SCEV2ValueTy &ExpandedSCEVs);
502504
505+ // / For all vectorized CSAs, replace uses of the live-out scalar from the original
506+ // / loop with the scalar extracted from the vector loop.
507+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
508+
503509 // / Fix the vectorized code, taking care of header phi's, live-outs, and more.
504510 void fixVectorizedLoop (VPTransformState &State, VPlan &Plan);
505511
@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29322938 TargetTransformInfo::TCK_RecipThroughput);
29332939}
29342940
/// Patch exit-block PHIs for every conditional scalar assignment (CSA)
/// recorded in \p Plan.
///
/// For each CSA state the function looks up the IR value underlying the CSA's
/// data-update recipe, fetches the scalar produced by the CSA's
/// extract-scalar recipe from \p State, and adds that scalar as an incoming
/// value from LoopMiddleBlock to every PHI user of the underlying value that
/// lives in LoopExitBlock.
///
/// \param State transform state queried for the generated extracted scalar.
/// \param Plan  plan whose CSA states are walked.
2941+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2942+ for (const auto &CSA : Plan.getCSAStates ()) {
2943+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2944+ assert (VPDataUpdate &&
2945+ " VPDataUpdate must have been introduced prior to fixing live outs" );
      // V is the original-loop IR value the data-update recipe was built from;
      // its users are scanned below to find the exit PHIs.
2946+ Value *V = VPDataUpdate->getUnderlyingValue ();
      // Request the value as a scalar (NeedsScalar=true) using index 0.
      // NOTE(review): presumably index 0 is the unroll part -- confirm against
      // the VPTransformState::get signature in this revision.
2947+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2948+ /* NeedsScalar=*/ true );
2949+ // Fix LCSSAPhis
      // Collect the PHIs first, then mutate them in a second loop. Presumably
      // the set also de-duplicates a PHI that is reported more than once by
      // users(), so that each PHI receives exactly one new incoming edge.
2950+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2951+ for (User *U : V->users ())
2952+ if (auto *Phi = dyn_cast<PHINode>(U);
2953+ Phi && Phi->getParent () == LoopExitBlock)
2954+ ToFix.insert (Phi);
      // Each collected exit PHI gains an edge from the middle block that
      // carries the scalar extracted from the vector loop.
2955+ for (PHINode *Phi : ToFix)
2956+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2957+ }
2958+ }
2959+
29352960void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State,
29362961 VPlan &Plan) {
29372962 // Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
29722997 getOrCreateVectorTripCount (VectorLoop->getLoopPreheader ()),
29732998 IVEndValues[Entry.first ], LoopMiddleBlock,
29742999 VectorLoop->getHeader (), Plan, State);
3000+
3001+ fixCSALiveOuts (State, Plan);
29753002 }
29763003
29773004 // Fix live-out phis not already fixed earlier.
@@ -4482,6 +4509,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44824509 case VPDef::VPEVLBasedIVPHISC:
44834510 case VPDef::VPPredInstPHISC:
44844511 case VPDef::VPBranchOnMaskSC:
4512+ case VPRecipeBase::VPCSADataUpdateSC:
4513+ case VPRecipeBase::VPCSAExtractScalarSC:
4514+ case VPRecipeBase::VPCSAHeaderPHISC:
44854515 continue ;
44864516 case VPDef::VPReductionSC:
44874517 case VPDef::VPActiveLaneMaskPHISC:
@@ -8480,9 +8510,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
84808510 return Recipe;
84818511
84828512 VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8483- assert ((Legal->isReductionVariable (Phi) ||
8484- Legal->isFixedOrderRecurrence (Phi)) &&
8485- " can only widen reductions and fixed-order recurrences here" );
84868513 VPValue *StartV = Operands[0 ];
84878514 if (Legal->isReductionVariable (Phi)) {
84888515 const RecurrenceDescriptor &RdxDesc =
@@ -8492,12 +8519,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
84928519 PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
84938520 CM.isInLoopReduction (Phi),
84948521 CM.useOrderedReductions (RdxDesc));
8495- } else {
8522+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
84968523 // TODO: Currently fixed-order recurrences are modeled as chains of
84978524 // first-order recurrences. If there are no users of the intermediate
84988525 // recurrences in the chain, the fixed order recurrence should be modeled
84998526 // directly, enabling more efficient codegen.
85008527 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8528+ } else if (Legal->isCSAPhi (Phi)) {
8529+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8530+ VPValue *InitData = State->getVPInitData ();
8531+ // When the VF=getFixed(1), InitData is just InitScalar.
8532+ if (!InitData)
8533+ InitData = State->getVPInitScalar ();
8534+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8535+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8536+ } else {
8537+ llvm_unreachable (
8538+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
85018539 }
85028540
85038541 PhisToFix.push_back (PhiRecipe);
@@ -8527,6 +8565,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85278565 make_range (Operands.begin (), Operands.end ()));
85288566
85298567 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8568+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8569+ return CSADescriptor::isCSASelect (CSA.second , SI);
8570+ });
8571+ if (CSADescIt != Legal->getCSAs ().end ()) {
8572+ PHINode *CSAPhi = CSADescIt->first ;
8573+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8574+ VPValue *VPDataPhi = State->getPhiRecipe ();
8575+ auto *R = new VPCSADataUpdateRecipe (
8576+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8577+ State->setDataUpdate (R);
8578+ return R;
8579+ }
8580+
85308581 return new VPWidenSelectRecipe (
85318582 *SI, make_range (Operands.begin (), Operands.end ()));
85328583 }
@@ -8539,6 +8590,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
85398590 return tryToWiden (Instr, Operands, VPBB);
85408591}
85418592
8593+ // / Add CSA Recipes that can occur before each instruction in the input IR
8594+ // / is processed and introduced into VPlan.
// /
// / For every CSA in \p CSAs a VPCSAState is created and registered on \p Plan,
// / keyed by the CSA's header PHI (CSA.first). For non-scalar VFs this also
// / emits the initial-mask and initial-data instructions into
// / \p PreheaderVPBB and the mask PHI into \p HeaderVPBB.
// /
// / \param CSAs          CSAs to set up, as (PHI, descriptor) pairs.
// / \param OrigLoop      original loop; its preheader supplies each PHI's
// /                      initial scalar value.
// / \param PreheaderVPBB block that receives the init mask/data instructions.
// / \param HeaderVPBB    block that receives the mask PHI.
// / \param DL            debug location attached to the created VPInstructions.
// / \param Range         VF range handed to getDecisionAndClampRange.
// / \param Plan          plan that receives the live-ins and the CSA states.
8595+ static void
8596+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8597+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8598+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8599+ VPlan &Plan) {
8600+
8601+ // Don't build full CSA for VF=ElementCount::getFixed(1)
      // NOTE(review): going by its name, getDecisionAndClampRange may also
      // narrow Range so all remaining VFs share this decision -- confirm.
8602+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8603+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8604+
8605+ for (const auto &CSA : CSAs) {
      // The value flowing into the CSA PHI from the original loop's preheader
      // is modelled as a live-in of the plan.
8606+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8607+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8608+
8609+ // Scalar VF builds the scalar version of the loop. In that case,
8610+ // no maintenance of mask nor extraction in middle block is needed.
8611+ if (IsScalarVF) {
      // State carrying only the initial scalar; no init-data or mask-PHI.
      // NOTE(review): the raw `new` is handed to Plan.addCSAState; ownership
      // is presumably taken by the plan -- confirm.
8612+ VPCSAState *S = new VPCSAState (VPInitScalar);
8613+ Plan.addCSAState (CSA.first , S);
8614+ continue ;
8615+ }
8616+
      // Vector VF: the initial mask takes no operands, the initial data takes
      // the initial scalar; both are appended to the preheader block.
8617+ auto *VPInitMask =
8618+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8619+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8620+ {VPInitScalar}, DL, " csa.init.data" );
8621+ PreheaderVPBB->appendRecipe (VPInitMask);
8622+ PreheaderVPBB->appendRecipe (VPInitData);
8623+
      // The mask PHI is created with only the initial mask as operand and is
      // appended to the header block.
8624+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8625+ DL, " csa.mask.phi" );
8626+ HeaderVPBB->appendRecipe (VPMaskPhi);
8627+
      // Register the full state (initial scalar, initial data, mask PHI).
8628+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8629+ Plan.addCSAState (CSA.first , S);
8630+ }
8631+ }
8632+
8633+ // / Add CSA Recipes that must occur after each instruction in the input IR
8634+ // / is processed and introduced into VPlan.
// /
// / Returns without doing anything when the scalar-VF decision holds for
// / \p Range. Otherwise, for every CSA, it creates the any-active and
// / mask-select instructions (plus a Not when the select's true operand is
// / the CSA PHI), attaches them to the CSA's data-update recipe, and inserts
// / an extract-scalar recipe into \p MiddleVPBB.
// /
// / \param RecipeBuilder used to map original IR instructions to their recipes.
// / \param CSAs          CSAs to finish, as (PHI, descriptor) pairs.
// / \param MiddleVPBB    block that receives the extract-scalar recipe.
// / \param DL            debug location attached to the created VPInstructions.
// / \param Range         VF range handed to getDecisionAndClampRange.
// / \param Plan          plan holding the CSA states that are looked up here.
8635+ static void
8636+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8637+ const LoopVectorizationLegality::CSAList &CSAs,
8638+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8639+ VPlan &Plan) {
8640+ // Don't build CSA for VF=ElementCount::getFixed(1)
8641+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8642+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8643+ return ;
8644+
8645+ for (const auto &CSA : CSAs) {
      // NOTE(review): find() is dereferenced without an end() check, so this
      // relies on a state having been registered for every CSA PHI beforehand.
8646+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8647+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8648+
8649+ assert (VPDataUpdate &&
8650+ " VPDataUpdate must have been introduced prior to postprocess" );
8651+ assert (CSA.second .getCond () &&
8652+ " CSADescriptor must know how to describe the condition" );
      // Maps an original IR value to the single VPValue defined by its recipe.
      // NOTE(review): cast<Instruction> asserts that the value is an
      // Instruction, so a non-instruction condition is not handled here.
8653+ auto GetVPValue = [&](Value *I) {
8654+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8655+ };
8656+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8657+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8658+
8659+ // The CSA optimization wants to use a condition such that when it is
8660+ // true, a new value is assigned. However, it is possible that a true lane
8661+ // in WidenedCond corresponds to selection of the initial value instead.
8662+ // In that case, we must use the negation of WidenedCond.
8663+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8664+ VPValue *CondToUse = WidenedCond;
      // The select's true operand being the CSA PHI itself means "true" keeps
      // the old value, so the condition is inverted.
8665+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8666+ CSA.first ) {
8667+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
      // Placed immediately before the recipe that defines the assignment.
8668+ VPNotCond->insertBefore (
8669+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8670+ CondToUse = VPNotCond;
8671+ }
8672+
      // The any-active instruction takes the (possibly negated) condition and
      // is likewise inserted before the assignment's defining recipe.
8673+ auto *VPAnyActive = new VPInstruction (
8674+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8675+ VPAnyActive->insertBefore (
8676+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8677+
      // The mask select takes the condition, the mask PHI and the any-active
      // value, and is inserted right after the any-active instruction.
8678+ auto *VPMaskSel = new VPInstruction (
8679+ VPInstruction::CSAMaskSel,
8680+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8681+ VPMaskSel->insertAfter (VPAnyActive);
      // Hand the new mask and any-active values to the data-update recipe.
8682+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8683+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8684+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8685+
      // The extraction goes into the middle block, at its first non-PHI
      // position.
8686+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8687+
8688+ // Update CSAState with new recipes
8689+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8690+ CSAState->setVPAnyActive (VPAnyActive);
8691+ }
8692+ }
8693+
85428694void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
85438695 ElementCount MaxVF) {
85448696 assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8595,7 +8747,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
85958747// VPWidenPointerInductionRecipe and induction increments.
85968748static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
85978749 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8598- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8750+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8751+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
85998752 auto MiddleVPBB =
86008753 cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
86018754 // No edge from the middle block to the unique exit block has been inserted
@@ -8624,6 +8777,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
86248777 return P && Inductions.contains (P);
86258778 })))
86268779 continue ;
8780+ // Exit values for CSAs are computed and updated outside of VPlan and
8781+ // independent of induction recipes.
8782+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8783+ // live-outs.
8784+ if (isa<VPCSADataUpdateRecipe>(V) &&
8785+ (isa<Instruction>(IncomingValue) &&
8786+ any_of (IncomingValue->users (), [&CSAs](User *U) {
8787+ auto *P = dyn_cast<PHINode>(U);
8788+ return P && CSAs.contains (P);
8789+ })))
8790+ continue ;
86278791 ExitingValuesToFix.insert ({&ExitPhi, V});
86288792 }
86298793 return ExitingValuesToFix;
@@ -8865,6 +9029,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
88659029 bool HasNUW = Style == TailFoldingStyle::None;
88669030 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
88679031
9032+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9033+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9034+ Range, *Plan);
9035+
88689036 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
88699037
88709038 // ---------------------------------------------------------------------------
@@ -8971,6 +9139,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89719139 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
89729140 }
89739141
9142+ VPBasicBlock *MiddleVPBB =
9143+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9144+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9145+ Range, *Plan);
9146+
89749147 // After here, VPBB should not be used.
89759148 VPBB = nullptr ;
89769149
@@ -8980,8 +9153,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89809153 " VPBasicBlock" );
89819154 RecipeBuilder.fixHeaderPhis ();
89829155
8983- MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
8984- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9156+ MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9157+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9158+ Legal->getInductionVars (), Legal->getCSAs ());
89859159
89869160 addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
89879161 addUsersInExitBlock (*Plan, ExitingValuesToFix);
@@ -10078,6 +10252,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1007810252 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan (
1007910253 EPI.MainLoopVF , EPI.MainLoopUF , *BestMainPlan, MainILV, DT, true );
1008010254 ++LoopsVectorized;
10255+ CSAsVectorized += LVL.getCSAs ().size ();
1008110256
1008210257 // Second pass vectorizes the epilogue and adjusts the control flow
1008310258 // edges from the first pass.
@@ -10170,6 +10345,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1017010345 PSI, Checks);
1017110346 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1017210347 ++LoopsVectorized;
10348+ CSAsVectorized += LVL.getCSAs ().size ();
1017310349
1017410350 // Add metadata to disable runtime unrolling a scalar loop when there
1017510351 // are no runtime checks about strides and memory. A scalar loop that is
0 commit comments