diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2ca703ac01352..9224f6712f089 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1405,6 +1405,11 @@ class LoopVectorizationCostModel { return InLoopReductions.contains(Phi); } + /// Returns the set of in-loop reduction PHIs. + const SmallPtrSetImpl &getInLoopReductions() const { + return InLoopReductions; + } + /// Returns true if the predicated reduction select should be used to set the /// incoming value for the reduction phi. bool usePredicatedReductionSelect() const { @@ -7683,60 +7688,6 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, Consecutive, Reverse, *VPI, VPI->getDebugLoc()); } -/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will -/// also insert a recipe to expand the step for the induction recipe. -static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipes(VPInstruction *PhiR, - const InductionDescriptor &IndDesc, VPlan &Plan, - ScalarEvolution &SE, Loop &OrigLoop) { - assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && - "step must be loop invariant"); - - VPValue *Start = PhiR->getOperand(0); - assert((Plan.getLiveIn(IndDesc.getStartValue()) == Start || - (SE.isSCEVable(IndDesc.getStartValue()->getType()) && - SE.getSCEV(IndDesc.getStartValue()) == - vputils::getSCEVExprForVPValue(Start, SE))) && - "Start VPValue must match IndDesc's start value"); - - // It is always safe to copy over the NoWrap and FastMath flags. In - // particular, when folding tail by masking, the masked-off lanes are never - // used, so it is safe. - VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); - - // Update wide induction increments to use the same step as the corresponding - // wide induction. 
This enables detecting induction increments directly in - // VPlan and removes redundant splats. - using namespace llvm::VPlanPatternMatch; - if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue()))) - PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step); - - PHINode *Phi = cast(PhiR->getUnderlyingInstr()); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, Flags, PhiR->getDebugLoc()); -} - -VPHeaderPHIRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI) { - auto *Phi = cast(VPI->getUnderlyingInstr()); - - // Check if this is an integer or fp induction. If so, build the recipe that - // produces its scalar and vector values. - if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop); - - // Check if this is pointer induction. If so, build the recipe for it. - if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { - VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); - return new VPWidenPointerInductionRecipe(Phi, VPI->getOperand(0), Step, - &Plan.getVFxUF(), *II, - VPI->getDebugLoc()); - } - return nullptr; -} - VPWidenIntOrFpInductionRecipe * VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI, VFRange &Range) { @@ -8218,56 +8169,15 @@ bool VPRecipeBuilder::getScaledReductions( return false; } -VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, - VFRange &Range) { - // First, check for specific widening recipes that deal with inductions, Phi - // nodes, calls and memory operations. 
- VPRecipeBase *Recipe; - if (auto *PhiR = dyn_cast(R)) { - VPBasicBlock *Parent = PhiR->getParent(); - [[maybe_unused]] VPRegionBlock *LoopRegionOf = - Parent->getEnclosingLoopRegion(); - assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && - "Non-header phis should have been handled during predication"); - auto *Phi = cast(R->getUnderlyingInstr()); - assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis"); - if ((Recipe = tryToOptimizeInductionPHI(PhiR))) - return Recipe; - - assert((Legal->isReductionVariable(Phi) || - Legal->isFixedOrderRecurrence(Phi)) && - "can only widen reductions and fixed-order recurrences here"); - VPValue *StartV = R->getOperand(0); - VPValue *BackedgeValue = R->getOperand(1); - if (Legal->isReductionVariable(Phi)) { - const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi); - assert(RdxDesc.getRecurrenceStartValue() == - Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - - // If the PHI is used by a partial reduction, set the scale factor. - bool UseInLoopReduction = CM.isInLoopReduction(Phi); - bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc); - // Will be updated later to >1 if reduction is partial. - unsigned ScaleFactor = 1; - - return new VPReductionPHIRecipe( - Phi, RdxDesc.getRecurrenceKind(), *StartV, *BackedgeValue, - getReductionStyle(UseInLoopReduction, UseOrderedReductions, - ScaleFactor), - RdxDesc.hasUsesOutsideReductionChain()); - } - - // TODO: Currently fixed-order recurrences are modeled as chains of - // first-order recurrences. If there are no users of the intermediate - // recurrences in the chain, the fixed order recurrence should be modeled - // directly, enabling more efficient codegen. 
- return new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV, *BackedgeValue); - } - - assert(!R->isPhi() && "only VPPhi nodes expected at this point"); +VPRecipeBase * +VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, + VFRange &Range) { + assert(!R->isPhi() && "phis must be handled earlier"); + // First, check for specific widening recipes that deal with optimizing + // truncates, calls and memory operations. + VPRecipeBase *Recipe; auto *VPI = cast(R); - Instruction *Instr = R->getUnderlyingInstr(); if (VPI->getOpcode() == Instruction::Trunc && (Recipe = tryToOptimizeInductionTruncate(VPI, Range))) return Recipe; @@ -8280,6 +8190,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, if (VPI->getOpcode() == Instruction::Call) return tryToWidenCall(VPI, Range); + Instruction *Instr = R->getUnderlyingInstr(); if (VPI->getOpcode() == Instruction::Store) if (auto HistInfo = Legal->getHistogramInfo(cast(Instr))) return tryToWidenHistogram(*HistInfo, VPI); @@ -8377,6 +8288,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, OrigLoop, *LI, Legal->getWidestInductionType(), getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); + // Create recipes for header phis. + VPlanTransforms::createHeaderPhiRecipes( + *VPlan0, *PSE.getSE(), *OrigLoop, Legal->getInductionVars(), + Legal->getReductionVars(), Legal->getFixedOrderRecurrences(), + CM.getInLoopReductions(), Hints.allowReordering()); + auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; @@ -8482,8 +8399,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Construct wide recipes and apply predication for original scalar // VPInstructions in the loop. 
// --------------------------------------------------------------------------- - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, Builder, + BlockMaskCache); // TODO: Handle partial reductions with EVL tail folding. if (!CM.foldTailWithEVL()) RecipeBuilder.collectScaledReductions(Range); @@ -8499,26 +8416,22 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Mapping from VPValues in the initial plan to their widened VPValues. Needed // temporarily to update created block masks. DenseMap Old2New; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { // Convert input VPInstructions to widened recipes. for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - auto *SingleDef = cast(&R); - auto *UnderlyingValue = SingleDef->getUnderlyingValue(); - // Skip recipes that do not need transforming, including canonical IV, - // wide canonical IV and VPInstructions without underlying values. The - // latter are added above for masking. - // FIXME: Migrate code relying on the underlying instruction from VPlan0 - // to construct recipes below to not use the underlying instruction. - if (isa( - &R) || - (isa(&R) && !UnderlyingValue)) + auto *VPI = dyn_cast(&R); + // Skip recipes that do not need transforming, including + // non-VPInstructions (such as ...) and VPInstructions without underlying + // values. The latter are added above for masking. + if (!VPI || !VPI->getUnderlyingValue()) continue; - assert(isa(&R) && UnderlyingValue && "unsupported recipe"); // TODO: Gradually replace uses of underlying instruction by analyses on - // VPlan. - Instruction *Instr = cast(UnderlyingValue); - Builder.setInsertPoint(SingleDef); + // VPlan. Migrate code relying on the underlying instruction from VPlan0 + // to construct recipes below to not use the underlying instruction. 
+ Instruction *Instr = cast(VPI->getUnderlyingValue()); + Builder.setInsertPoint(VPI); // The stores with invariant address inside the loop will be deleted, and // in the exit block, a uniform store recipe will be created for the final @@ -8528,7 +8441,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. if (Legal->isInvariantStoreOfReduction(SI)) { - auto *VPI = cast(SingleDef); auto *Recipe = new VPReplicateRecipe( SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc()); @@ -8539,10 +8451,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( } VPRecipeBase *Recipe = - RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); + RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range); if (!Recipe) - Recipe = RecipeBuilder.handleReplication(cast(SingleDef), - Range); + Recipe = + RecipeBuilder.handleReplication(cast(VPI), Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe) && isa(Instr)) { @@ -8553,8 +8465,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( Builder.insert(Recipe); } if (Recipe->getNumDefinedValues() == 1) { - SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue()); - Old2New[SingleDef] = Recipe->getVPSingleValue(); + VPI->replaceAllUsesWith(Recipe->getVPSingleValue()); + Old2New[VPI] = Recipe->getVPSingleValue(); } else { assert(Recipe->getNumDefinedValues() == 0 && "Unexpected multidef recipe"); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 1808be118cd2a..59f72ea29c946 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -63,8 +63,6 @@ class VPRecipeBuilder { /// The profitablity analysis. 
LoopVectorizationCostModel &CM; - PredicatedScalarEvolution &PSE; - VPBuilder &Builder; /// The mask of each VPBB, generated earlier and used for predicating recipes @@ -94,10 +92,6 @@ class VPRecipeBuilder { /// recipe that takes an additional VPInstruction for the mask. VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range); - /// Check if an induction recipe should be constructed for \p VPI. If so build - /// and return it. If not, return null. - VPHeaderPHIRecipe *tryToOptimizeInductionPHI(VPInstruction *VPI); - /// Optimize the special case where the operand of \p VPI is a constant /// integer induction variable. VPWidenIntOrFpInductionRecipe * @@ -137,11 +131,10 @@ class VPRecipeBuilder { VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, - PredicatedScalarEvolution &PSE, VPBuilder &Builder, + LoopVectorizationCostModel &CM, VPBuilder &Builder, DenseMap &BlockMaskCache) : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache) {} + CM(CM), Builder(Builder), BlockMaskCache(BlockMaskCache) {} std::optional getScalingForReduction(const Instruction *ExitInst) { auto It = ScaledReductionMap.find(ExitInst); @@ -153,9 +146,10 @@ class VPRecipeBuilder { /// that are valid so recipes can be formed later. void collectScaledReductions(VFRange &Range); - /// Create and return a widened recipe for \p R if one can be created within - /// the given VF \p Range. - VPRecipeBase *tryToCreateWidenRecipe(VPSingleDefRecipe *R, VFRange &Range); + /// Create and return a widened recipe for a non-phi recipe \p R if one can be + /// created within the given VF \p Range. 
+ VPRecipeBase *tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, + VFRange &Range); /// Create and return a partial reduction recipe for a reduction instruction /// along with binary operation and reduction phi operands. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index f9a61969f201f..318c05d8ef7c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -594,6 +594,111 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, return VPlan0; } +/// Creates a VPWidenIntOrFpInductionRecipe or VPWidenPointerInductionRecipe +/// for \p Phi based on \p IndDesc. +static VPHeaderPHIRecipe * +createWidenInductionRecipe(PHINode *Phi, VPPhi *PhiR, VPValue *Start, + const InductionDescriptor &IndDesc, VPlan &Plan, + ScalarEvolution &SE, Loop &OrigLoop, DebugLoc DL) { + assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && + "step must be loop invariant"); + assert((Plan.getLiveIn(IndDesc.getStartValue()) == Start || + (SE.isSCEVable(IndDesc.getStartValue()->getType()) && + SE.getSCEV(IndDesc.getStartValue()) == + vputils::getSCEVExprForVPValue(Start, SE))) && + "Start VPValue must match IndDesc's start value"); + + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); + + if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) + return new VPWidenPointerInductionRecipe(Phi, Start, Step, &Plan.getVFxUF(), + IndDesc, DL); + + assert((IndDesc.getKind() == InductionDescriptor::IK_IntInduction || + IndDesc.getKind() == InductionDescriptor::IK_FpInduction) && + "must have an integer or float induction at this point"); + + // Update wide induction increments to use the same step as the corresponding + // wide induction. This enables detecting induction increments directly in + // VPlan and removes redundant splats. 
+ using namespace llvm::VPlanPatternMatch; + if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue()))) + PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step); + + // It is always safe to copy over the NoWrap and FastMath flags. In + // particular, when folding tail by masking, the masked-off lanes are never + // used, so it is safe. + VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc); + + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), + IndDesc, Flags, DL); +} + +void VPlanTransforms::createHeaderPhiRecipes( + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + const MapVector &Inductions, + const MapVector &Reductions, + const SmallPtrSetImpl &FixedOrderRecurrences, + const SmallPtrSetImpl &InLoopReductions, bool AllowReordering) { + // Retrieve the header manually from the initial plain-CFG VPlan. + VPBasicBlock *HeaderVPBB = cast( + Plan.getEntry()->getSuccessors()[1]->getSingleSuccessor()); + assert(VPDominatorTree(Plan).dominates(HeaderVPBB, + HeaderVPBB->getPredecessors()[1]) && + "header must dominate its latch"); + + auto CreateHeaderPhiRecipe = [&](VPPhi *PhiR) -> VPHeaderPHIRecipe * { + // TODO: Gradually replace uses of underlying instruction by analyses on + // VPlan. + auto *Phi = cast(PhiR->getUnderlyingInstr()); + assert(PhiR->getNumOperands() == 2 && + "Must have 2 operands for header phis"); + + // Extract common values once. + VPValue *Start = PhiR->getOperand(0); + VPValue *BackedgeValue = PhiR->getOperand(1); + + if (FixedOrderRecurrences.contains(Phi)) { + // TODO: Currently fixed-order recurrences are modeled as chains of + // first-order recurrences. If there are no users of the intermediate + // recurrences in the chain, the fixed order recurrence should be + // modeled directly, enabling more efficient codegen. 
+ return new VPFirstOrderRecurrencePHIRecipe(Phi, *Start, *BackedgeValue); + } + + auto InductionIt = Inductions.find(Phi); + if (InductionIt != Inductions.end()) + return createWidenInductionRecipe(Phi, PhiR, Start, InductionIt->second, + Plan, SE, OrigLoop, + PhiR->getDebugLoc()); + + assert(Reductions.contains(Phi) && "only reductions are expected now"); + const RecurrenceDescriptor &RdxDesc = Reductions.lookup(Phi); + assert(RdxDesc.getRecurrenceStartValue() == + Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()) && + "incoming value must match start value"); + // Will be updated later to >1 if reduction is partial. + unsigned ScaleFactor = 1; + bool UseOrderedReductions = !AllowReordering && RdxDesc.isOrdered(); + return new VPReductionPHIRecipe( + Phi, RdxDesc.getRecurrenceKind(), *Start, *BackedgeValue, + getReductionStyle(InLoopReductions.contains(Phi), UseOrderedReductions, + ScaleFactor), + RdxDesc.hasUsesOutsideReductionChain()); + }; + + for (VPRecipeBase &R : make_early_inc_range(HeaderVPBB->phis())) { + if (isa(&R)) + continue; + auto *PhiR = cast(&R); + VPHeaderPHIRecipe *HeaderPhiR = CreateHeaderPhiRecipe(PhiR); + HeaderPhiR->insertBefore(PhiR); + PhiR->replaceAllUsesWith(HeaderPhiR); + PhiR->eraseFromParent(); + } +} + void VPlanTransforms::handleEarlyExits(VPlan &Plan, bool HasUncountableEarlyExit) { auto *MiddleVPBB = cast( diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index afdf1655b4622..1a3ff4f9b9bbc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -102,6 +102,17 @@ struct VPlanTransforms { buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr); + /// Replace VPPhi recipes in \p Plan's header with corresponding + /// VPHeaderPHIRecipe subclasses for inductions, reductions, and + /// fixed-order recurrences. 
This processes all header phis and creates + /// the appropriate widened recipe for each one. + static void createHeaderPhiRecipes( + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + const MapVector &Inductions, + const MapVector &Reductions, + const SmallPtrSetImpl &FixedOrderRecurrences, + const SmallPtrSetImpl &InLoopReductions, bool AllowReordering); + /// Update \p Plan to account for all early exits. LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, bool HasUncountableExit);