@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
// Pass-level counters declared through the STATISTIC macro; each entry pairs a
// counter name with the description string reported for it.
173173STATISTIC (LoopsVectorized, " Number of loops vectorized" );
174174STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
175175STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
// Counter for conditional scalar assignments (CSAs) that were vectorized.
176+ STATISTIC (CSAsVectorized,
177+ " Number of conditional scalar assignments vectorized" );
176178
177179static cl::opt<bool > EnableEpilogueVectorization (
178180 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -497,6 +499,10 @@ class InnerLoopVectorizer {
497499 // / Fix the vectorized code, taking care of header phi's, and more.
498500 void fixVectorizedLoop (VPTransformState &State);
499501
502+ // / For all vectorized CSAs, replace uses of live-out scalar from the original
503+ // / loop with the extracted scalar from the vector loop.
504+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
505+
500506 // Return true if any runtime check is added.
501507 bool areSafetyChecksAdded () { return AddedSafetyChecks; }
502508
@@ -2937,6 +2943,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29372943 TargetTransformInfo::TCK_RecipThroughput);
29382944}
29392945
/// Patch the exit-block phis that consume a conditional scalar assignment
/// (CSA) so that they also receive the value computed by the vector loop.
///
/// For every CSA state registered in \p Plan, the scalar produced by the
/// state's extract-scalar recipe is looked up in \p State and added as an
/// incoming value, from LoopMiddleBlock, to each phi in LoopExitBlock that
/// uses the original (underlying) IR value of the CSA data-update recipe.
///
/// \param State Transform state used to fetch the generated scalar value.
/// \param Plan  The VPlan whose CSA states are walked.
2946+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2947+ for (const auto &CSA : Plan.getCSAStates ()) {
// CSA.second is the per-CSA state object; its data-update recipe must have
// been attached before this fix-up runs.
2948+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2949+ assert (VPDataUpdate &&
2950+ " VPDataUpdate must have been introduced prior to fixing live outs" );
// V is the original IR value behind the data-update recipe; its users in the
// exit block are the live-outs that need patching.
2951+ Value *V = VPDataUpdate->getUnderlyingValue ();
// NOTE(review): the extract-scalar recipe is only registered by
// addCSAPostprocessRecipes when the VF range is not scalar; confirm this
// function is never reached for a scalar-VF plan that still has CSA states,
// otherwise getExtractScalarRecipe() may be null here.
// The `0` argument presumably selects the first part/lane - TODO confirm
// against VPTransformState::get.
2952+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2953+ /* NeedsScalar=*/ true );
2954+ // Fix LCSSA phis: collect the phis located in the loop exit block that use V.
// A set is used so that each phi is patched exactly once even if it shows up
// more than once while walking V's users.
2955+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2956+ for (User *U : V->users ())
2957+ if (auto *Phi = dyn_cast<PHINode>(U);
2958+ Phi && Phi->getParent () == LoopExitBlock)
2959+ ToFix.insert (Phi);
// Add the vector-loop result as the value arriving from the middle block.
2960+ for (PHINode *Phi : ToFix)
2961+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2962+ }
2963+ }
2964+
29402965void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State) {
29412966 // Fix widened non-induction PHIs by setting up the PHI operands.
29422967 if (EnableVPlanNativePath)
@@ -2972,6 +2997,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29722997 fixupIVUsers (Entry.first , Entry.second ,
29732998 getOrCreateVectorTripCount (nullptr ),
29742999 IVEndValues[Entry.first ], LoopMiddleBlock, State);
3000+ fixCSALiveOuts (State, Plan);
29753001 }
29763002
29773003 for (Instruction *PI : PredicatedInstructions)
@@ -4497,6 +4523,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44974523 case VPDef::VPEVLBasedIVPHISC:
44984524 case VPDef::VPPredInstPHISC:
44994525 case VPDef::VPBranchOnMaskSC:
4526+ case VPRecipeBase::VPCSADataUpdateSC:
4527+ case VPRecipeBase::VPCSAExtractScalarSC:
4528+ case VPRecipeBase::VPCSAHeaderPHISC:
45004529 continue ;
45014530 case VPDef::VPReductionSC:
45024531 case VPDef::VPActiveLaneMaskPHISC:
@@ -8680,9 +8709,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
86808709 return Recipe;
86818710
86828711 VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8683- assert ((Legal->isReductionVariable (Phi) ||
8684- Legal->isFixedOrderRecurrence (Phi)) &&
8685- " can only widen reductions and fixed-order recurrences here" );
86868712 VPValue *StartV = Operands[0 ];
86878713 if (Legal->isReductionVariable (Phi)) {
86888714 const RecurrenceDescriptor &RdxDesc =
@@ -8692,12 +8718,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
86928718 PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
86938719 CM.isInLoopReduction (Phi),
86948720 CM.useOrderedReductions (RdxDesc));
8695- } else {
8721+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
86968722 // TODO: Currently fixed-order recurrences are modeled as chains of
86978723 // first-order recurrences. If there are no users of the intermediate
86988724 // recurrences in the chain, the fixed order recurrence should be modeled
86998725 // directly, enabling more efficient codegen.
87008726 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8727+ } else if (Legal->isCSAPhi (Phi)) {
8728+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8729+ VPValue *InitData = State->getVPInitData ();
8730+ // When the VF=getFixed(1), InitData is just InitScalar.
8731+ if (!InitData)
8732+ InitData = State->getVPInitScalar ();
8733+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8734+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8735+ } else {
8736+ llvm_unreachable (
8737+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
87018738 }
87028739
87038740 PhisToFix.push_back (PhiRecipe);
@@ -8731,6 +8768,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87318768 make_range (Operands.begin (), Operands.end ()));
87328769
87338770 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8771+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8772+ return CSADescriptor::isCSASelect (CSA.second , SI);
8773+ });
8774+ if (CSADescIt != Legal->getCSAs ().end ()) {
8775+ PHINode *CSAPhi = CSADescIt->first ;
8776+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8777+ VPValue *VPDataPhi = State->getPhiRecipe ();
8778+ auto *R = new VPCSADataUpdateRecipe (
8779+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8780+ State->setDataUpdate (R);
8781+ return R;
8782+ }
8783+
87348784 return new VPWidenSelectRecipe (
87358785 *SI, make_range (Operands.begin (), Operands.end ()));
87368786 }
@@ -8743,6 +8793,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87438793 return tryToWiden (Instr, Operands, VPBB);
87448794}
87458795
8796+ // / Add CSA Recipes that can occur before each instruction in the input IR
8797+ // / is processed and introduced into VPlan.
///
/// For every entry of \p CSAs a VPCSAState is created and registered in
/// \p Plan, keyed by the CSA's header phi (CSA.first). For non-scalar VFs the
/// state additionally references the initial-data and mask-phi recipes that
/// this function creates.
///
/// \param CSAs          (phi, descriptor) pairs to set up.
/// \param OrigLoop      Original loop; its preheader selects the phi's
///                      initial incoming value.
/// \param PreheaderVPBB Block that receives the csa.init.mask and
///                      csa.init.data recipes.
/// \param HeaderVPBB    Block that receives the csa.mask.phi recipe.
/// \param DL            Debug location attached to the new VPInstructions.
/// \param Range         VF range passed to getDecisionAndClampRange (which,
///                      going by its name, may clamp it).
/// \param Plan          Plan that receives the live-in and the CSA states.
8798+ static void
8799+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8800+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8801+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8802+ VPlan &Plan) {
8803+
8804+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8805+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8806+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8807+
8808+ for (const auto &CSA : CSAs) {
// The initial scalar is the value the CSA phi receives from the original
// loop's preheader, modelled as a live-in of the plan.
8809+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8810+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8811+
8812+ // Scalar VF builds the scalar version of the loop. In that case,
8813+ // no maintenance of mask nor extraction in middle block is needed.
8814+ if (IsScalarVF) {
// Only the initial scalar is recorded in the state for the scalar case.
// NOTE(review): allocated with raw `new`; presumably the plan takes
// ownership via addCSAState - confirm.
8815+ VPCSAState *S = new VPCSAState (VPInitScalar);
8816+ Plan.addCSAState (CSA.first , S);
8817+ continue ;
8818+ }
8819+
// Vector case: create the initial mask (no operands) and the initial data
// (built from the initial scalar) in the preheader.
8820+ auto *VPInitMask =
8821+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8822+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8823+ {VPInitScalar}, DL, " csa.init.data" );
8824+ PreheaderVPBB->appendRecipe (VPInitMask);
8825+ PreheaderVPBB->appendRecipe (VPInitData);
8826+
// The mask phi lives in the loop header and starts from the initial mask.
8827+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8828+ DL, " csa.mask.phi" );
8829+ HeaderVPBB->appendRecipe (VPMaskPhi);
8830+
// Record everything created so far; later stages look the state up by phi.
8831+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8832+ Plan.addCSAState (CSA.first , S);
8833+ }
8834+ }
8835+
8836+ // / Add CSA Recipes that must occur after each instruction in the input IR
8837+ // / is processed and introduced into VPlan.
///
/// For each CSA this creates the any-active and mask-select recipes next to
/// the recipe of the CSA's assignment, hands them to the data-update recipe,
/// and inserts an extract-scalar recipe into \p MiddleVPBB. Nothing is built
/// when the VF decision for \p Range is scalar.
///
/// \param RecipeBuilder Used to map original IR instructions to their recipes.
/// \param CSAs          (phi, descriptor) pairs to finish.
/// \param MiddleVPBB    Block that receives the extract-scalar recipe.
/// \param DL            Debug location attached to the new VPInstructions.
/// \param Range         VF range passed to getDecisionAndClampRange.
/// \param Plan          Plan holding the CSA states to update.
8838+ static void
8839+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8840+ const LoopVectorizationLegality::CSAList &CSAs,
8841+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8842+ VPlan &Plan) {
8843+ // Don't build CSA for VF=ElementCount::getFixed(1)
8844+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8845+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8846+ return ;
8847+
8848+ for (const auto &CSA : CSAs) {
// NOTE(review): the find() result is dereferenced without an end() check;
// this relies on a state having been registered for every CSA phi
// (addCSAPreprocessRecipes registers one per entry of its CSA list).
8849+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8850+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8851+
8852+ assert (VPDataUpdate &&
8853+ " VPDataUpdate must have been introduced prior to postprocess" );
8854+ assert (CSA.second .getCond () &&
8855+ " CSADescriptor must know how to describe the condition" );
// Maps an original IR value to the single VPValue defined by its recipe.
// NOTE(review): cast<Instruction> requires the value to be an Instruction;
// confirm the CSA condition can never be a non-instruction value.
8856+ auto GetVPValue = [&](Value *I) {
8857+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8858+ };
8859+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8860+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8861+
8862+ // The CSA optimization wants to use a condition such that when it is
8863+ // true, a new value is assigned. However, it is possible that a true lane
8864+ // in WidenedCond corresponds to selection of the initial value instead.
8865+ // In that case, we must use the negation of WidenedCond.
8866+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8867+ VPValue *CondToUse = WidenedCond;
// The select's true operand being the CSA phi itself means "true" keeps the
// old value, so the condition is inverted.
8868+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8869+ CSA.first ) {
8870+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
// The negation is placed right before the recipe of the assignment.
8871+ VPNotCond->insertBefore (
8872+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8873+ CondToUse = VPNotCond;
8874+ }
8875+
// any-active is built from the (possibly negated) condition and is also
// inserted before the recipe of the assignment.
8876+ auto *VPAnyActive = new VPInstruction (
8877+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8878+ VPAnyActive->insertBefore (
8879+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8880+
// mask-select takes the condition, the header mask phi and any-active; it is
// placed directly after any-active.
8881+ auto *VPMaskSel = new VPInstruction (
8882+ VPInstruction::CSAMaskSel,
8883+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8884+ VPMaskSel->insertAfter (VPAnyActive);
// Hand the new mask and any-active values to the data-update recipe.
8885+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
// The extract-scalar recipe is built from the initial scalar, the selected
// mask and the updated data.
8886+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8887+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8888+
// It goes into the middle block, at the first non-phi position.
8889+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8890+
8891+ // Update CSAState with new recipes
8892+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8893+ CSAState->setVPAnyActive (VPAnyActive);
8894+ }
8895+ }
8896+
87468897void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
87478898 ElementCount MaxVF) {
87488899 assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8835,7 +8986,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
88358986// increments.
88368987static SetVector<VPIRInstruction *> collectUsersInExitBlock (
88378988 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8838- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8989+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8990+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
88398991 auto *MiddleVPBB = Plan.getMiddleBlock ();
88408992 // No edge from the middle block to the unique exit block has been inserted
88418993 // and there is nothing to fix from vector loop; phis should have incoming
@@ -8867,6 +9019,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
88679019 return P && Inductions.contains (P);
88689020 })))
88699021 continue ;
9022+ // Exit values for CSAs are computed and updated outside of VPlan and
9023+ // independent of induction recipes.
9024+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9025+ // live-outs.
9026+ if (isa<VPCSADataUpdateRecipe>(V) &&
9027+ (isa<Instruction>(IncomingValue) &&
9028+ any_of (IncomingValue->users (), [&CSAs](User *U) {
9029+ auto *P = dyn_cast<PHINode>(U);
9030+ return P && CSAs.contains (P);
9031+ })))
9032+ continue ;
88709033 ExitUsersToFix.insert (ExitIRI);
88719034 ExitIRI->addOperand (V);
88729035 }
@@ -9043,6 +9206,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90439206 bool HasNUW = Style == TailFoldingStyle::None;
90449207 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
90459208
9209+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9210+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9211+ Range, *Plan);
9212+
90469213 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
90479214
90489215 // ---------------------------------------------------------------------------
@@ -9160,6 +9327,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91609327 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
91619328 }
91629329
9330+ VPBasicBlock *MiddleVPBB =
9331+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9332+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9333+ Range, *Plan);
9334+
91639335 // After here, VPBB should not be used.
91649336 VPBB = nullptr ;
91659337
@@ -9170,8 +9342,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91709342 RecipeBuilder.fixHeaderPhis ();
91719343
91729344 addScalarResumePhis (RecipeBuilder, *Plan);
9173- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock (
9174- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9345+ SetVector<VPIRInstruction *> ExitUsersToFix =
9346+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9347+ Legal->getInductionVars (), Legal->getCSAs ());
91759348 addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
91769349 addUsersInExitBlock (*Plan, ExitUsersToFix);
91779350 // ---------------------------------------------------------------------------
@@ -10238,6 +10411,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1023810411 auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
1023910412 *BestMainPlan, MainILV, DT, false );
1024010413 ++LoopsVectorized;
10414+ CSAsVectorized += LVL.getCSAs ().size ();
1024110415
1024210416 // Second pass vectorizes the epilogue and adjusts the control flow
1024310417 // edges from the first pass.
@@ -10333,6 +10507,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1033310507 PSI, Checks, BestPlan);
1033410508 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1033510509 ++LoopsVectorized;
10510+ CSAsVectorized += LVL.getCSAs ().size ();
1033610511
1033710512 // Add metadata to disable runtime unrolling a scalar loop when there
1033810513 // are no runtime checks about strides and memory. A scalar loop that is
0 commit comments