diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c28314f6ab124..44502ac143857 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -246,12 +246,16 @@ class Vectorizer {
   const DataLayout &DL;
   IRBuilder<> Builder;

-  // We could erase instrs right after vectorizing them, but that can mess up
-  // our BB iterators, and also can make the equivalence class keys point to
-  // freed memory. This is fixable, but it's simpler just to wait until we're
-  // done with the BB and erase all at once.
+  /// We could erase instrs right after vectorizing them, but that can mess up
+  /// our BB iterators, and also can make the equivalence class keys point to
+  /// freed memory. This is fixable, but it's simpler just to wait until we're
+  /// done with the BB and erase all at once.
   SmallVector<Instruction *, 128> ToErase;

+  /// We insert load/store instructions and GEPs to fill gaps and extend chains
+  /// to enable vectorization. Keep track of them and delete them later.
+  DenseSet<Instruction *> ExtraElements;
+
 public:
   Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
              DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
@@ -279,13 +283,15 @@ class Vectorizer {
   bool runOnChain(Chain &C);

   /// Splits the chain into subchains of instructions which read/write a
-  /// contiguous block of memory. Discards any length-1 subchains (because
-  /// there's nothing to vectorize in there).
+  /// contiguous block of memory. Discards any length-1 subchains (because
+  /// there's nothing to vectorize in there). Also attempts to fill gaps with
+  /// "extra" elements to artificially make chains contiguous in some cases.
   std::vector<Chain> splitChainByContiguity(Chain &C);

   /// Splits the chain into subchains where it's safe to hoist loads up to the
   /// beginning of the sub-chain and it's safe to sink loads up to the end of
-  /// the sub-chain. Discards any length-1 subchains.
+  /// the sub-chain. Discards any length-1 subchains. Also attempts to extend
+  /// non-power-of-two chains by adding "extra" elements in some cases.
   std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

   /// Splits the chain into subchains that make legal, aligned accesses.
@@ -344,6 +350,30 @@ class Vectorizer {
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  /// Checks if a potential vector load/store with a given alignment is allowed
+  /// and fast. Aligned accesses are always allowed and fast, while misaligned
+  /// accesses depend on TTI checks to determine whether they can and should be
+  /// vectorized or kept as element-wise accesses.
+  bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
+                              unsigned VecElemBits) const;
+
+  /// Create a new GEP and a new Load/Store instruction such that the GEP
+  /// points at PrevElem + Offset. In the case of stores, store poison.
+  /// Extra elements will either be combined into a masked load/store or
+  /// deleted before the end of the pass.
+  ChainElem createExtraElementAfter(const ChainElem &PrevElem, Type *Ty,
+                                    APInt Offset, StringRef Prefix,
+                                    Align Alignment = Align());
+
+  /// Create a mask that masks off the extra elements in the chain, to be used
+  /// for the creation of a masked load/store vector.
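+  /// For example (illustrative), a four-lane chain whose two middle lanes are
+  /// extra elements produces the constant mask
+  /// <4 x i1> <i1 true, i1 false, i1 false, i1 true>.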
+  Value *createMaskForExtraElements(const ArrayRef<ChainElem> C,
+                                    FixedVectorType *VecTy);
+
+  /// Delete dead GEPs and extra Load/Store instructions created by
+  /// createExtraElementAfter.
+  void deleteExtraElements();
 };

 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -457,12 +487,21 @@ bool Vectorizer::run() {
     Changed |= runOnPseudoBB(*It, *std::next(It));

     for (Instruction *I : ToErase) {
+      // These will get deleted in deleteExtraElements.
+      // This is because ExtraElements will include both extra elements
+      // that *were* vectorized and extra elements that *were not*
+      // vectorized. ToErase will only include extra elements that *were*
+      // vectorized, so in order to avoid double deletion we skip them here and
+      // handle them in deleteExtraElements.
+      if (ExtraElements.contains(I))
+        continue;
       auto *PtrOperand = getLoadStorePointerOperand(I);
       if (I->use_empty())
         I->eraseFromParent();
       RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
     }
     ToErase.clear();
+    deleteExtraElements();
   }

   return Changed;
@@ -623,15 +662,68 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
     dumpChain(C);
   });

+  // If the chain is not contiguous, try to fill the gaps with "extra"
+  // elements to artificially make it contiguous and enable vectorization.
+  // We only fill gaps if there is potential to end up with a legal masked
+  // load/store given the target, address space, and element type. At this
+  // point, when querying the TTI, optimistically assume max alignment and
+  // max vector size, as splitChainByAlignment will ensure the final vector
+  // shape passes the legalization check.
+  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+  Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType();
+  unsigned MaxVecRegBits = TTI.getLoadStoreVecRegBitWidth(AS);
+  Align OptimisticAlign = Align(MaxVecRegBits / 8);
+  unsigned int MaxVectorNumElems =
+      MaxVecRegBits / DL.getTypeSizeInBits(ElementType);
+  // Note: This check decides whether to try to fill gaps based on the masked
+  // legality of the target's maximum vector size (getLoadStoreVecRegBitWidth).
+  // If a target *does not* support a masked load/store with this max vector
+  // size, but *does* support a masked load/store with a *smaller* vector
+  // size, that optimization will be missed. This does not occur in any of
+  // the targets that currently support this API.
+  FixedVectorType *OptimisticVectorType =
+      FixedVectorType::get(ElementType, MaxVectorNumElems);
+  bool TryFillGaps =
+      isa<LoadInst>(C[0].Inst)
+          ? TTI.isLegalMaskedLoad(OptimisticVectorType, OptimisticAlign, AS,
+                                  TTI::MaskKind::ConstantMask)
+          : TTI.isLegalMaskedStore(OptimisticVectorType, OptimisticAlign, AS,
+                                   TTI::MaskKind::ConstantMask);
+
+  // Cache the best aligned element in the chain for use when creating extra
+  // elements.
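+  // For example (illustrative): if the best-aligned element has align 16 at
+  // offset 4 from the leader, an extra element created at offset 12 can be
+  // given commonAlignment(16, |12 - 4|) = Align(8).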
+ Align BestAlignedElemAlign = getLoadStoreAlignment(C[0].Inst); + APInt OffsetOfBestAlignedElemFromLeader = C[0].OffsetFromLeader; + for (const auto &E : C) { + Align ElementAlignment = getLoadStoreAlignment(E.Inst); + if (ElementAlignment > BestAlignedElemAlign) { + BestAlignedElemAlign = ElementAlignment; + OffsetOfBestAlignedElemFromLeader = E.OffsetFromLeader; + } + } + + auto DeriveAlignFromBestAlignedElem = [&](APInt NewElemOffsetFromLeader) { + return commonAlignment( + BestAlignedElemAlign, + (NewElemOffsetFromLeader - OffsetOfBestAlignedElemFromLeader) + .abs() + .getLimitedValue()); + }; + + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); + std::vector Ret; Ret.push_back({C.front()}); unsigned ChainElemTyBits = DL.getTypeSizeInBits(getChainElemTy(C)); - APInt PrevReadEnd = C[0].OffsetFromLeader + - DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst)); + ChainElem &Prev = C[0]; for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) { auto &CurChain = Ret.back(); - unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst)); + + APInt PrevSzBytes = + APInt(ASPtrBits, DL.getTypeStoreSize(getLoadStoreType(Prev.Inst))); + APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes; + unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(It->Inst)); // Add this instruction to the end of the current chain, or start a new one. assert( @@ -653,11 +745,48 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { << *It->Inst << " (starts at offset " << It->OffsetFromLeader << ")\n"); - if (AreContiguous) + // If the chain is not contiguous, try to fill in gaps between Prev and + // Curr. For now, we aren't filling gaps between load/stores of different + // sizes. Additionally, as a conservative heuristic, we only fill gaps of + // 1-2 elements. Generating loads/stores with too many unused bytes has a + // side effect of increasing register pressure (on NVIDIA targets at least), + // which could cancel out the benefits of reducing number of load/stores. + bool GapFilled = false; + if (!AreContiguous && TryFillGaps && PrevSzBytes == SzBytes) { + APInt GapSzBytes = It->OffsetFromLeader - PrevReadEnd; + if (GapSzBytes == PrevSzBytes) { + // There is a single gap between Prev and Curr, create one extra element + ChainElem NewElem = createExtraElementAfter( + Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill", + DeriveAlignFromBestAlignedElem(PrevReadEnd)); + CurChain.push_back(NewElem); + GapFilled = true; + } + // There are two gaps between Prev and Curr, only create two extra + // elements if Prev is the first element in a sequence of four. + // This has the highest chance of resulting in a beneficial vectorization. + if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) { + ChainElem NewElem1 = createExtraElementAfter( + Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill", + DeriveAlignFromBestAlignedElem(PrevReadEnd)); + ChainElem NewElem2 = createExtraElementAfter( + NewElem1, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill", + DeriveAlignFromBestAlignedElem(PrevReadEnd + PrevSzBytes)); + CurChain.push_back(NewElem1); + CurChain.push_back(NewElem2); + GapFilled = true; + } + } + + if (AreContiguous || GapFilled) CurChain.push_back(*It); else Ret.push_back({*It}); - PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd); + // In certain cases when handling redundant elements with partial overlaps, + // the previous element may still extend beyond the current element. Only + // update Prev if the current element is the new end of the chain. 
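+    // For example (illustrative): a 16-byte load at offset 0 followed by a
+    // redundant 4-byte load at offset 4 leaves PrevReadEnd at 16, past the
+    // current ReadEnd of 8, so Prev keeps pointing at the wider load.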
+ if (ReadEnd.sge(PrevReadEnd)) + Prev = *It; } // Filter out length-1 chains, these are uninteresting. @@ -733,6 +862,12 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { unsigned AS = getLoadStoreAddressSpace(C[0].Inst); unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8; + // For compile time reasons, we cache whether or not the superset + // of all candidate chains contains any extra loads/stores from earlier gap + // filling. + bool CandidateChainsMayContainExtraLoadsStores = any_of( + C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); }); + std::vector Ret; for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) { // Find candidate chains of size not greater than the largest vector reg. @@ -787,41 +922,6 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { continue; } - // Is a load/store with this alignment allowed by TTI and at least as fast - // as an unvectorized load/store? - // - // TTI and F are passed as explicit captures to WAR an MSVC misparse (??). - auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI, - &F = F](Align Alignment) { - if (Alignment.value() % SizeBytes == 0) - return true; - unsigned VectorizedSpeed = 0; - bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( - F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); - if (!AllowsMisaligned) { - LLVM_DEBUG(dbgs() - << "LSV: Access of " << SizeBytes << "B in addrspace " - << AS << " with alignment " << Alignment.value() - << " is misaligned, and therefore can't be vectorized.\n"); - return false; - } - - unsigned ElementwiseSpeed = 0; - (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, - Alignment, &ElementwiseSpeed); - if (VectorizedSpeed < ElementwiseSpeed) { - LLVM_DEBUG(dbgs() - << "LSV: Access of " << SizeBytes << "B in addrspace " - << AS << " with alignment " << Alignment.value() - << " has relative speed " << VectorizedSpeed - << ", which is lower than the elementwise speed of " - << ElementwiseSpeed - << ". Therefore this access won't be vectorized.\n"); - return false; - } - return true; - }; - // If we're loading/storing from an alloca, align it if possible. // // FIXME: We eagerly upgrade the alignment, regardless of whether TTI @@ -837,7 +937,7 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { Align Alignment = getLoadStoreAlignment(C[CBegin].Inst); Align PrefAlign = Align(StackAdjustedAlignment); if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 && - IsAllowedAndFast(PrefAlign)) { + accessIsAllowedAndFast(SizeBytes, AS, PrefAlign, VecElemBits)) { Align NewAlign = getOrEnforceKnownAlignment( PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT); if (NewAlign >= Alignment) { @@ -849,12 +949,79 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { } } - if (!IsAllowedAndFast(Alignment)) { - LLVM_DEBUG( - dbgs() << "LSV: splitChainByAlignment discarding candidate chain " - "because its alignment is not AllowedAndFast: " - << Alignment.value() << "\n"); - continue; + Chain ExtendingLoadsStores; + if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) { + // If we have a non-power-of-2 element count, attempt to extend the + // chain to the next power-of-2 if it makes the access allowed and + // fast. + bool AllowedAndFast = false; + if (NumVecElems < TargetVF && !isPowerOf2_32(NumVecElems) && + VecElemBits >= 8) { + // TargetVF may be a lot higher than NumVecElems, + // so only extend to the next power of 2. 
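+        // For example (illustrative): a chain of three i32 elements
+        // (NumVecElems = 3, VecElemBits = 32) extends to NewNumVecElems = 4
+        // and NewSizeBytes = 16, even when TargetVF is larger.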
+ assert(VecElemBits % 8 == 0); + unsigned VecElemBytes = VecElemBits / 8; + unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems); + unsigned NewSizeBytes = VecElemBytes * NewNumVecElems; + + assert(isPowerOf2_32(TargetVF) && + "TargetVF expected to be a power of 2"); + assert(NewNumVecElems <= TargetVF && + "Should not extend past TargetVF"); + + LLVM_DEBUG(dbgs() + << "LSV: attempting to extend chain of " << NumVecElems + << " " << (IsLoadChain ? "loads" : "stores") << " to " + << NewNumVecElems << " elements\n"); + bool IsLegalToExtend = + IsLoadChain ? TTI.isLegalMaskedLoad( + FixedVectorType::get(VecElemTy, NewNumVecElems), + Alignment, AS, TTI::MaskKind::ConstantMask) + : TTI.isLegalMaskedStore( + FixedVectorType::get(VecElemTy, NewNumVecElems), + Alignment, AS, TTI::MaskKind::ConstantMask); + // Only artificially increase the chain if it would be AllowedAndFast + // and if the resulting masked load/store will be legal for the + // target. + if (IsLegalToExtend && + accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, + VecElemBits)) { + LLVM_DEBUG(dbgs() + << "LSV: extending " << (IsLoadChain ? "load" : "store") + << " chain of " << NumVecElems << " " + << (IsLoadChain ? "loads" : "stores") + << " with total byte size of " << SizeBytes << " to " + << NewNumVecElems << " " + << (IsLoadChain ? "loads" : "stores") + << " with total byte size of " << NewSizeBytes + << ", TargetVF=" << TargetVF << " \n"); + + // Create (NewNumVecElems - NumVecElems) extra elements. + // We are basing each extra element on CBegin, which means the + // offsets should be based on SizeBytes, which represents the offset + // from CBegin to the current end of the chain. + unsigned ASPtrBits = DL.getIndexSizeInBits(AS); + for (unsigned I = 0; I < (NewNumVecElems - NumVecElems); I++) { + ChainElem NewElem = createExtraElementAfter( + C[CBegin], VecElemTy, + APInt(ASPtrBits, SizeBytes + I * VecElemBytes), "Extend"); + ExtendingLoadsStores.push_back(NewElem); + } + + // Update the size and number of elements for upcoming checks. + SizeBytes = NewSizeBytes; + NumVecElems = NewNumVecElems; + AllowedAndFast = true; + } + } + if (!AllowedAndFast) { + // We were not able to achieve legality by extending the chain. + LLVM_DEBUG(dbgs() + << "LSV: splitChainByAlignment discarding candidate chain " + "because its alignment is not AllowedAndFast: " + << Alignment.value() << "\n"); + continue; + } } if ((IsLoadChain && @@ -867,10 +1034,43 @@ std::vector Vectorizer::splitChainByAlignment(Chain &C) { continue; } + if (CandidateChainsMayContainExtraLoadsStores) { + // If the candidate chain contains extra loads/stores from an earlier + // optimization, confirm legality now. This filter is essential because + // when filling gaps in splitChainByContiguity, we queried the API to + // check that (for a given element type and address space) there *may* + // have been a legal masked load/store we could possibly create. Now, we + // need to check if the actual chain we ended up with is legal to turn + // into a masked load/store. This is relevant for NVPTX, for example, + // where a masked store is only legal if we have ended up with a 256-bit + // vector. + bool CurrCandContainsExtraLoadsStores = llvm::any_of( + ArrayRef(C).slice(CBegin, CEnd - CBegin + 1), + [this](const ChainElem &E) { + return ExtraElements.contains(E.Inst); + }); + + if (CurrCandContainsExtraLoadsStores && + (IsLoadChain ? 
!TTI.isLegalMaskedLoad( + FixedVectorType::get(VecElemTy, NumVecElems), + Alignment, AS, TTI::MaskKind::ConstantMask) + : !TTI.isLegalMaskedStore( + FixedVectorType::get(VecElemTy, NumVecElems), + Alignment, AS, TTI::MaskKind::ConstantMask))) { + LLVM_DEBUG(dbgs() + << "LSV: splitChainByAlignment discarding candidate chain " + "because it contains extra loads/stores that we cannot " + "legally vectorize into a masked load/store \n"); + continue; + } + } + // Hooray, we can vectorize this chain! Chain &NewChain = Ret.emplace_back(); for (unsigned I = CBegin; I <= CEnd; ++I) NewChain.emplace_back(C[I]); + for (ChainElem E : ExtendingLoadsStores) + NewChain.emplace_back(E); CBegin = CEnd; // Skip over the instructions we've added to the chain. break; } @@ -882,6 +1082,14 @@ bool Vectorizer::vectorizeChain(Chain &C) { if (C.size() < 2) return false; + bool ChainContainsExtraLoadsStores = llvm::any_of( + C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); }); + + // If we are left with a two-element chain, and one of the elements is an + // extra element, we don't want to vectorize + if (C.size() == 2 && ChainContainsExtraLoadsStores) + return false; + sortChainInOffsetOrder(C); LLVM_DEBUG({ @@ -936,15 +1144,25 @@ bool Vectorizer::vectorizeChain(Chain &C) { llvm::min_element(C, [](const auto &A, const auto &B) { return A.Inst->comesBefore(B.Inst); })->Inst); - // This can happen due to a chain of redundant loads. - // In this case, just use the element-type, and avoid ExtractElement. - if (NumElem == 1) - VecTy = VecElemTy; - // Chain is in offset order, so C[0] is the instr with the lowest offset, - // i.e. the root of the vector. - VecInst = Builder.CreateAlignedLoad(VecTy, - getLoadStorePointerOperand(C[0].Inst), - Alignment); + + // If the chain contains extra loads, we need to vectorize into a + // masked load. + if (ChainContainsExtraLoadsStores) { + assert(TTI.isLegalMaskedLoad(VecTy, Alignment, AS, + TTI::MaskKind::ConstantMask)); + Value *Mask = createMaskForExtraElements(C, cast(VecTy)); + VecInst = Builder.CreateMaskedLoad( + VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask); + } else { + // This can happen due to a chain of redundant loads. + // In this case, just use the element-type, and avoid ExtractElement. + if (NumElem == 1) + VecTy = VecElemTy; + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. + VecInst = Builder.CreateAlignedLoad( + VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment); + } for (const ChainElem &E : C) { Instruction *I = E.Inst; @@ -1018,12 +1236,21 @@ bool Vectorizer::vectorizeChain(Chain &C) { } } - // Chain is in offset order, so C[0] is the instr with the lowest offset, - // i.e. the root of the vector. - VecInst = Builder.CreateAlignedStore( - Vec, - getLoadStorePointerOperand(C[0].Inst), - Alignment); + // If the chain originates from extra stores, we need to vectorize into a + // masked store. + if (ChainContainsExtraLoadsStores) { + assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS, + TTI::MaskKind::ConstantMask)); + Value *Mask = + createMaskForExtraElements(C, cast(Vec->getType())); + VecInst = Builder.CreateMaskedStore( + Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask); + } else { + // Chain is in offset order, so C[0] is the instr with the lowest offset, + // i.e. the root of the vector. 
+ VecInst = Builder.CreateAlignedStore( + Vec, getLoadStorePointerOperand(C[0].Inst), Alignment); + } } propagateMetadata(VecInst, C); @@ -1676,3 +1903,114 @@ std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, .sextOrTrunc(OrigBitWidth); return std::nullopt; } + +bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, + Align Alignment, + unsigned VecElemBits) const { + // Aligned vector accesses are ALWAYS faster than element-wise accesses. + if (Alignment.value() % SizeBytes == 0) + return true; + + // Ask TTI whether misaligned accesses are faster as vector or element-wise. + unsigned VectorizedSpeed = 0; + bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses( + F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed); + if (!AllowsMisaligned) { + LLVM_DEBUG( + dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS + << " with alignment " << Alignment.value() + << " is misaligned, and therefore can't be vectorized.\n"); + return false; + } + + unsigned ElementwiseSpeed = 0; + (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS, + Alignment, &ElementwiseSpeed); + if (VectorizedSpeed < ElementwiseSpeed) { + LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " + << AS << " with alignment " << Alignment.value() + << " has relative speed " << VectorizedSpeed + << ", which is lower than the elementwise speed of " + << ElementwiseSpeed + << ". Therefore this access won't be vectorized.\n"); + return false; + } + return true; +} + +ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, Type *Ty, + APInt Offset, StringRef Prefix, + Align Alignment) { + Instruction *NewElement = nullptr; + Builder.SetInsertPoint(Prev.Inst->getNextNode()); + if (LoadInst *PrevLoad = dyn_cast(Prev.Inst)) { + Value *NewGep = Builder.CreatePtrAdd( + PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP"); + LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n"); + NewElement = Builder.CreateAlignedLoad(Ty, NewGep, Alignment, Prefix); + } else { + StoreInst *PrevStore = cast(Prev.Inst); + + Value *NewGep = Builder.CreatePtrAdd( + PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP"); + LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n"); + NewElement = + Builder.CreateAlignedStore(PoisonValue::get(Ty), NewGep, Alignment); + } + + // Attach all metadata to the new element. + // propagateMetadata will fold it into the final vector when applicable. + NewElement->copyMetadata(*Prev.Inst); + + // Cache created elements for tracking and cleanup + ExtraElements.insert(NewElement); + + APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset; + LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n" + << *NewElement + << " OffsetFromLeader: " << NewOffsetFromLeader << "\n"); + return ChainElem{NewElement, NewOffsetFromLeader}; +} + +Value *Vectorizer::createMaskForExtraElements(const ArrayRef C, + FixedVectorType *VecTy) { + // Start each mask element as false + SmallVector MaskElts(VecTy->getNumElements(), + Builder.getInt1(false)); + // Iterate over the chain and set the corresponding mask element to true for + // each element that is not an extra element. 
+ for (const ChainElem &E : C) { + if (ExtraElements.contains(E.Inst)) + continue; + unsigned EOffset = + (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue(); + unsigned VecIdx = + 8 * EOffset / DL.getTypeSizeInBits(VecTy->getScalarType()); + if (FixedVectorType *VT = + dyn_cast(getLoadStoreType(E.Inst))) + for (unsigned J = 0; J < VT->getNumElements(); ++J) + MaskElts[VecIdx + J] = Builder.getInt1(true); + else + MaskElts[VecIdx] = Builder.getInt1(true); + } + return ConstantVector::get(MaskElts); +} + +void Vectorizer::deleteExtraElements() { + for (auto *ExtraElement : ExtraElements) { + if (isa(ExtraElement)) { + [[maybe_unused]] bool Deleted = + RecursivelyDeleteTriviallyDeadInstructions(ExtraElement); + assert(Deleted && "Extra Load should always be trivially dead"); + } else { + // Unlike Extra Loads, Extra Stores won't be "dead", but should all be + // deleted regardless. They will have either been combined into a masked + // store, or will be left behind and need to be cleaned up. + auto *PtrOperand = getLoadStorePointerOperand(ExtraElement); + ExtraElement->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + } + + ExtraElements.clear(); +} diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index dd9a472984c25..a75ddd032d4c0 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -45,29 +45,32 @@ define half @fh(ptr %p) { ; ENABLED-LABEL: fh( ; ENABLED: { ; ENABLED-NEXT: .reg .b16 %rs<10>; -; ENABLED-NEXT: .reg .b32 %r<13>; +; ENABLED-NEXT: .reg .b32 %r<17>; ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: ; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; -; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8]; -; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2; -; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1; -; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3; -; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4; -; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3; -; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6; -; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7; -; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6; -; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9; -; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8; -; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5; -; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11; -; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12; +; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff"; +; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } +; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2; +; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5; +; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4; +; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7; +; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3; +; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2; +; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10; +; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7; +; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6; +; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13; +; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8; +; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1; +; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15; +; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16; ; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9; ; ENABLED-NEXT: ret; ; 
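At the IR level, the chain of five contiguous half loads in @fh above is extended to eight lanes and emitted as a constant-masked load of roughly the following shape (an illustrative sketch, with %p standing in for the 16-byte-aligned base pointer); the three disabled trailing lanes are what lower to the "used_bytes_mask 0x3ff" pragma (bytes 0-9) on the ld.v4.b32 seen in the checks:

  %v = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 16 %p, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x half> poison)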
diff --git a/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll b/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll new file mode 100644 index 0000000000000..bba240c694040 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; This is testing the lowering behavior of this case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll +; where two 3xhalfs are chained together and extended to 8xhalf. +define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) { +; CHECK-LABEL: halfx3_extend_chain( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0]; +; CHECK-NEXT: .pragma "used_bytes_mask 0xfff"; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {_, %rs3}, %r2; +; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1}; +; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4}; +; CHECK-NEXT: mov.b32 %r7, 0; +; CHECK-NEXT: max.f16x2 %r8, %r2, %r7; +; CHECK-NEXT: max.f16x2 %r9, %r1, %r7; +; CHECK-NEXT: st.b32 [%rd1], %r9; +; CHECK-NEXT: mov.b32 {%rs5, _}, %r8; +; CHECK-NEXT: st.b16 [%rd1+4], %rs5; +; CHECK-NEXT: max.f16x2 %r10, %r6, %r7; +; CHECK-NEXT: max.f16x2 %r11, %r5, %r7; +; CHECK-NEXT: st.b32 [%rd1+6], %r11; +; CHECK-NEXT: mov.b32 {%rs6, _}, %r10; +; CHECK-NEXT: st.b16 [%rd1+10], %rs6; +; CHECK-NEXT: ret; + %load1 = load <3 x half>, ptr %rd0, align 16 + %p1 = fcmp ogt <3 x half> %load1, zeroinitializer + %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer + store <3 x half> %s1, ptr %rd0, align 16 + %in2 = getelementptr half, ptr %rd0, i64 3 + %load2 = load <3 x half>, ptr %in2, align 4 + %p2 = fcmp ogt <3 x half> %load2, zeroinitializer + %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer + store <3 x half> %s2, ptr %in2, align 4 + ret void +} + +; This disables the vectorization by reducing the alignment. 
+define void @halfx3_no_align(ptr align 4 captures(none) %rd0) { +; CHECK-LABEL: halfx3_no_align( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_no_align_param_0]; +; CHECK-NEXT: ld.b16 %rs1, [%rd1+4]; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-NEXT: ld.b32 %r2, [%rd1]; +; CHECK-NEXT: mov.b32 %r3, 0; +; CHECK-NEXT: max.f16x2 %r4, %r1, %r3; +; CHECK-NEXT: max.f16x2 %r5, %r2, %r3; +; CHECK-NEXT: st.b32 [%rd1], %r5; +; CHECK-NEXT: mov.b32 {%rs3, _}, %r4; +; CHECK-NEXT: st.b16 [%rd1+4], %rs3; +; CHECK-NEXT: ld.b16 %rs4, [%rd1+10]; +; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs5}; +; CHECK-NEXT: ld.b32 %r7, [%rd1+6]; +; CHECK-NEXT: max.f16x2 %r8, %r6, %r3; +; CHECK-NEXT: max.f16x2 %r9, %r7, %r3; +; CHECK-NEXT: st.b32 [%rd1+6], %r9; +; CHECK-NEXT: mov.b32 {%rs6, _}, %r8; +; CHECK-NEXT: st.b16 [%rd1+10], %rs6; +; CHECK-NEXT: ret; + %load1 = load <3 x half>, ptr %rd0, align 4 + %p1 = fcmp ogt <3 x half> %load1, zeroinitializer + %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer + store <3 x half> %s1, ptr %rd0, align 4 + %in2 = getelementptr half, ptr %rd0, i64 3 + %load2 = load <3 x half>, ptr %in2, align 4 + %p2 = fcmp ogt <3 x half> %load2, zeroinitializer + %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer + store <3 x half> %s2, ptr %in2, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index 51f6b00601069..643de006f14c4 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -171,8 +171,8 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; - ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: .pragma "used_bytes_mask 0xfff"; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0]; ; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK-NEXT: ret; @@ -394,8 +394,8 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; - ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: .pragma "used_bytes_mask 0xfff"; + ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]]; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll index 3c2b9933c59a3..a6becb0c7c28c 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll +++ 
b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll @@ -1,6 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s define void @ldg_f16(ptr nocapture align 16 %rd0) { +; CHECK-LABEL: define void @ldg_f16( +; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[RD0]], align 16 +; CHECK-NEXT: [[LOAD11:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> +; CHECK-NEXT: [[LOAD22:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> +; CHECK-NEXT: [[LOAD33:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> +; CHECK-NEXT: [[LOAD44:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> +; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <2 x half> [[LOAD11]], zeroinitializer +; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x half> [[LOAD11]], <2 x half> zeroinitializer +; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <2 x half> [[LOAD22]], zeroinitializer +; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x half> [[LOAD22]], <2 x half> zeroinitializer +; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <2 x half> [[LOAD33]], zeroinitializer +; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x half> [[LOAD33]], <2 x half> zeroinitializer +; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <2 x half> [[LOAD44]], zeroinitializer +; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x half> [[LOAD44]], <2 x half> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[S1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[S1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x half> [[S2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP5]], half [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[S2]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP7]], half [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[S3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP9]], half [[TMP10]], i32 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[S3]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x half> [[S4]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x half> [[S4]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x half> [[TMP15]], half [[TMP16]], i32 7 +; CHECK-NEXT: store <8 x half> [[TMP17]], ptr [[RD0]], align 16 +; CHECK-NEXT: ret void +; %load1 = load <2 x half>, ptr %rd0, align 16 %p1 = fcmp ogt <2 x half> %load1, zeroinitializer %s1 = select <2 x i1> %p1, <2 x half> %load1, <2 x half> zeroinitializer @@ -22,20 +57,39 @@ define void @ldg_f16(ptr nocapture align 16 %rd0) { store <2 x half> %s4, ptr %in4, align 4 ret void -; CHECK-LABEL: @ldg_f16 -; CHECK: %[[LD:.*]] = load <8 x half>, ptr -; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> -; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> -; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> -; CHECK: shufflevector <8 x half> 
%[[LD]], <8 x half> poison, <2 x i32> -; CHECK: store <8 x half> } define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) { - %load1 = load <3 x half>, ptr %rd0, align 4 +; CHECK-LABEL: define void @no_nonpow2_vector( +; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 16 [[RD0]], <8 x i1> , <8 x half> poison) +; CHECK-NEXT: [[LOAD13:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> +; CHECK-NEXT: [[LOAD24:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> +; CHECK-NEXT: [[EXTEND5:%.*]] = extractelement <8 x half> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTEND26:%.*]] = extractelement <8 x half> [[TMP1]], i32 7 +; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <3 x half> [[LOAD13]], zeroinitializer +; CHECK-NEXT: [[S1:%.*]] = select <3 x i1> [[P1]], <3 x half> [[LOAD13]], <3 x half> zeroinitializer +; CHECK-NEXT: store <3 x half> [[S1]], ptr [[RD0]], align 16 +; CHECK-NEXT: [[IN2:%.*]] = getelementptr half, ptr [[RD0]], i64 3 +; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <3 x half> [[LOAD24]], zeroinitializer +; CHECK-NEXT: [[S2:%.*]] = select <3 x i1> [[P2]], <3 x half> [[LOAD24]], <3 x half> zeroinitializer +; CHECK-NEXT: store <3 x half> [[S2]], ptr [[IN2]], align 4 +; CHECK-NEXT: [[IN3:%.*]] = getelementptr half, ptr [[RD0]], i64 6 +; CHECK-NEXT: [[LOAD3:%.*]] = load <3 x half>, ptr [[IN3]], align 4 +; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <3 x half> [[LOAD3]], zeroinitializer +; CHECK-NEXT: [[S3:%.*]] = select <3 x i1> [[P3]], <3 x half> [[LOAD3]], <3 x half> zeroinitializer +; CHECK-NEXT: store <3 x half> [[S3]], ptr [[IN3]], align 4 +; CHECK-NEXT: [[IN4:%.*]] = getelementptr half, ptr [[RD0]], i64 9 +; CHECK-NEXT: [[LOAD4:%.*]] = load <3 x half>, ptr [[IN4]], align 4 +; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <3 x half> [[LOAD4]], zeroinitializer +; CHECK-NEXT: [[S4:%.*]] = select <3 x i1> [[P4]], <3 x half> [[LOAD4]], <3 x half> zeroinitializer +; CHECK-NEXT: store <3 x half> [[S4]], ptr [[IN4]], align 4 +; CHECK-NEXT: ret void +; + %load1 = load <3 x half>, ptr %rd0, align 16 %p1 = fcmp ogt <3 x half> %load1, zeroinitializer %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer - store <3 x half> %s1, ptr %rd0, align 4 + store <3 x half> %s1, ptr %rd0, align 16 %in2 = getelementptr half, ptr %rd0, i64 3 %load2 = load <3 x half>, ptr %in2, align 4 %p2 = fcmp ogt <3 x half> %load2, zeroinitializer @@ -52,16 +106,36 @@ define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) { %s4 = select <3 x i1> %p4, <3 x half> %load4, <3 x half> zeroinitializer store <3 x half> %s4, ptr %in4, align 4 ret void - -; CHECK-LABEL: @no_nonpow2_vector -; CHECK-NOT: shufflevector } define void @no_pointer_vector(ptr nocapture align 16 %rd0) { - %load1 = load <2 x ptr>, ptr %rd0, align 4 +; CHECK-LABEL: define void @no_pointer_vector( +; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) { +; CHECK-NEXT: [[LOAD1:%.*]] = load <2 x ptr>, ptr [[RD0]], align 16 +; CHECK-NEXT: [[P1:%.*]] = icmp ne <2 x ptr> [[LOAD1]], zeroinitializer +; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x ptr> [[LOAD1]], <2 x ptr> zeroinitializer +; CHECK-NEXT: store <2 x ptr> [[S1]], ptr [[RD0]], align 16 +; CHECK-NEXT: [[IN2:%.*]] = getelementptr ptr, ptr [[RD0]], i64 2 +; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x ptr>, ptr [[IN2]], align 4 +; CHECK-NEXT: [[P2:%.*]] = icmp ne <2 x ptr> [[LOAD2]], zeroinitializer +; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x ptr> [[LOAD2]], <2 x ptr> 
zeroinitializer +; CHECK-NEXT: store <2 x ptr> [[S2]], ptr [[IN2]], align 4 +; CHECK-NEXT: [[IN3:%.*]] = getelementptr ptr, ptr [[RD0]], i64 4 +; CHECK-NEXT: [[LOAD3:%.*]] = load <2 x ptr>, ptr [[IN3]], align 4 +; CHECK-NEXT: [[P3:%.*]] = icmp ne <2 x ptr> [[LOAD3]], zeroinitializer +; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x ptr> [[LOAD3]], <2 x ptr> zeroinitializer +; CHECK-NEXT: store <2 x ptr> [[S3]], ptr [[IN3]], align 4 +; CHECK-NEXT: [[IN4:%.*]] = getelementptr ptr, ptr [[RD0]], i64 6 +; CHECK-NEXT: [[LOAD4:%.*]] = load <2 x ptr>, ptr [[IN4]], align 4 +; CHECK-NEXT: [[P4:%.*]] = icmp ne <2 x ptr> [[LOAD4]], zeroinitializer +; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x ptr> [[LOAD4]], <2 x ptr> zeroinitializer +; CHECK-NEXT: store <2 x ptr> [[S4]], ptr [[IN4]], align 4 +; CHECK-NEXT: ret void +; + %load1 = load <2 x ptr>, ptr %rd0, align 16 %p1 = icmp ne <2 x ptr> %load1, zeroinitializer %s1 = select <2 x i1> %p1, <2 x ptr> %load1, <2 x ptr> zeroinitializer - store <2 x ptr> %s1, ptr %rd0, align 4 + store <2 x ptr> %s1, ptr %rd0, align 16 %in2 = getelementptr ptr, ptr %rd0, i64 2 %load2 = load <2 x ptr>, ptr %in2, align 4 %p2 = icmp ne <2 x ptr> %load2, zeroinitializer @@ -78,7 +152,4 @@ define void @no_pointer_vector(ptr nocapture align 16 %rd0) { %s4 = select <2 x i1> %p4, <2 x ptr> %load4, <2 x ptr> zeroinitializer store <2 x ptr> %s4, ptr %in4, align 4 ret void - -; CHECK-LABEL: @no_pointer_vector -; CHECK-NOT: shufflevector } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll new file mode 100644 index 0000000000000..5c3757867f71f --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +;; Check that the vectorizer extends a Chain to the next power of two, +;; essentially loading more vector elements than the original +;; code. Alignment and other requirement for vectorization should +;; still be met. 
+ +define void @load3to4(ptr %p) { +; CHECK-LABEL: define void @load3to4( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[P_0]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i32, ptr %p, i32 0 + %p.1 = getelementptr i32, ptr %p, i32 1 + %p.2 = getelementptr i32, ptr %p, i32 2 + + %v0 = load i32, ptr %p.0, align 16 + %v1 = load i32, ptr %p.1, align 4 + %v2 = load i32, ptr %p.2, align 8 + + ret void +} + +define void @load5to8(ptr %p) { +; CHECK-LABEL: define void @load5to8( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[P_0]], <8 x i1> , <8 x i16> poison) +; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i16, ptr %p, i32 0 + %p.1 = getelementptr i16, ptr %p, i32 1 + %p.2 = getelementptr i16, ptr %p, i32 2 + %p.3 = getelementptr i16, ptr %p, i32 3 + %p.4 = getelementptr i16, ptr %p, i32 4 + + %v0 = load i16, ptr %p.0, align 16 + %v1 = load i16, ptr %p.1, align 2 + %v2 = load i16, ptr %p.2, align 4 + %v3 = load i16, ptr %p.3, align 2 + %v4 = load i16, ptr %p.4, align 8 + + ret void +} + +define void @load6to8(ptr %p) { +; CHECK-LABEL: define void @load6to8( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[P_0]], <8 x i1> , <8 x i16> poison) +; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i16, ptr %p, i32 0 + %p.1 = getelementptr i16, ptr %p, i32 1 + %p.2 = getelementptr i16, ptr %p, i32 2 + %p.3 = getelementptr i16, ptr %p, i32 3 + %p.4 = getelementptr i16, ptr %p, i32 4 + %p.5 = getelementptr i16, ptr %p, i32 5 + + %v0 = load i16, ptr %p.0, align 16 + %v1 = load i16, ptr %p.1, align 2 + %v2 = load i16, ptr %p.2, align 4 + %v3 = load i16, ptr %p.3, align 2 + %v4 = load i16, ptr %p.4, align 8 + %v5 = load i16, ptr %p.5, align 2 + + ret void +} + 
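+; With only 8-byte alignment, the chain below is not extended to a 16-byte
+; access; only the first two loads are vectorized.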
+define void @load3to4_unaligned(ptr %p) { +; CHECK-LABEL: define void @load3to4_unaligned( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0 +; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8 +; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8 +; CHECK-NEXT: ret void +; + %p.0 = getelementptr i32, ptr %p, i32 0 + %p.1 = getelementptr i32, ptr %p, i32 1 + %p.2 = getelementptr i32, ptr %p, i32 2 + + %v0 = load i32, ptr %p.0, align 8 + %v1 = load i32, ptr %p.1, align 4 + %v2 = load i32, ptr %p.2, align 8 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll new file mode 100644 index 0000000000000..7b659b0feeb03 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Test that gap filled instructions get deleted if they are not used +%struct.S10 = type { i32, i32, i32, i32 } + +; First, confirm that gap instructions get generated and would be vectorized if the alignment is correct +define void @fillTwoGapsCanVectorize(ptr %in) { +; CHECK-LABEL: define void @fillTwoGapsCanVectorize( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[IN]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr %in, align 16 + %getElem = getelementptr i8, ptr %in, i64 12 + %load3 = load i32, ptr %getElem, align 4 + ret void +} + +; Then, confirm that gap instructions get deleted if the alignment prevents the vectorization +define void @fillTwoGapsCantVectorize(ptr %in) { +; CHECK-LABEL: define void @fillTwoGapsCantVectorize( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[IN]], align 4 +; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[IN]], i64 12 +; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEM]], align 4 +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr %in, align 4 + %getElem = getelementptr i8, ptr %in, i64 12 + %load3 = load i32, ptr %getElem, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll new file mode 100644 index 0000000000000..145512863f4d7 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Test that gap filled instructions don't lose invariant metadata +%struct.S10 = type { i32, i32, i32, i32 } + +; With no gaps, if every load is invariant, the vectorized load will be too. 
+define i32 @noGaps(ptr %in) { +; CHECK-LABEL: define i32 @noGaps( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]] +; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]] +; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]] +; CHECK-NEXT: ret i32 [[SUM0123]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1 + %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0 + %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2 + %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0 + %sum01 = add i32 %load0, %load1 + %sum012 = add i32 %sum01, %load2 + %sum0123 = add i32 %sum012, %load3 + ret i32 %sum0123 +} + +; If one of the loads is not invariant, the vectorized load will not be invariant. +define i32 @noGapsMissingInvariant(ptr %in) { +; CHECK-LABEL: define i32 @noGapsMissingInvariant( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16 +; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]] +; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]] +; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]] +; CHECK-NEXT: ret i32 [[SUM0123]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1 + %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0 + %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2 + %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4 + %sum01 = add i32 %load0, %load1 + %sum012 = add i32 %sum01, %load2 + %sum0123 = add i32 %sum012, %load3 + ret i32 %sum0123 +} + +; With two gaps, if every real load is invariant, the vectorized load will be too. 
+define i32 @twoGaps(ptr %in) { +; CHECK-LABEL: define i32 @twoGaps( +; CHECK-SAME: ptr [[IN:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[IN]], <4 x i1> , <4 x i32> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LOAD03]], [[LOAD36]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %load0 = load i32, ptr %in, align 16, !invariant.load !0 + %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3 + %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0 + %sum = add i32 %load0, %load3 + ret i32 %sum +} + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll new file mode 100644 index 0000000000000..15a92fbb452ac --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll @@ -0,0 +1,166 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s + +; The LSV can handle vector inputs, and gap filling can too, with one exception: +; currently, we do not gap fill when the loads enclosing the gap are different sizes +; Otherwise, vectors are treated the same as any other scalar types + +; Gap of two load <2 x i8>s gets filled +define void @i8x2_gap_gap_i8x2(ptr %ptr) { +; CHECK-LABEL: define void @i8x2_gap_gap_i8x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 8 [[PTR0]], <8 x i1> , <8 x i8> poison) +; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 6 + + %l0 = load <2 x i8>, ptr %ptr0, align 8 + %l3 = load <2 x i8>, ptr %ptr3, align 2 + + ret void +} + +; The chain elements are different sizes, gap filling won't kick in +define void @i8x2_gap_gap_i8(ptr %ptr) { +; CHECK-LABEL: define void @i8x2_gap_gap_i8( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr [[PTR]], i64 6 +; CHECK-NEXT: [[L0:%.*]] = load <2 x i8>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[PTR3]], align 1 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 6 + + %l0 = load <2 x i8>, ptr %ptr0, align 8 + %l3 = load i8, ptr %ptr3, align 1 + + ret void +} + + +define void @i16x2_gap_i16x2_i16x2(ptr %ptr) { +; CHECK-LABEL: define void @i16x2_gap_i16x2_i16x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; 
CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[PTR0]], <8 x i1> , <8 x i16> poison) +; CHECK-NEXT: [[L01:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L23:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L34:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr2 = getelementptr i8, ptr %ptr, i64 8 + %ptr3 = getelementptr i8, ptr %ptr, i64 12 + + %l0 = load <2 x i16>, ptr %ptr0, align 16 + %l2 = load <2 x i16>, ptr %ptr2, align 2 + %l3 = load <2 x i16>, ptr %ptr3, align 2 + + ret void +} + +define void @i16x2_gap_gap_i16x2(ptr %ptr) { +; CHECK-LABEL: define void @i16x2_gap_gap_i16x2( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[PTR0]], <8 x i1> , <8 x i16> poison) +; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 12 + + %l0 = load <2 x i16>, ptr %ptr0, align 16 + %l3 = load <2 x i16>, ptr %ptr3, align 4 + + ret void +} + +define void @i32x2_i32x2_gap_i32x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i32x2_i32x2_gap_i32x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[IN]], <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[GAPFILL3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[VEC34:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i32>, ptr addrspace(1) %in, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %vec1 = load <2 x i32>, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %vec3 = load <2 x i32>, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; This gap is filled but then eventually discarded because the total size +; of the vector is larger than the target supports. 
+define void @i64x2_gap_i64x2_i64x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i64x2_gap_i64x2_i64x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC0:%.*]] = load <2 x i64>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 32 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[GETELEM3]], align 32 +; CHECK-NEXT: [[VEC31:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 16 + ret void +} + +; This gap is filled but then eventually discarded because the total size +; of the vector is larger than the target supports. +define void @i64x2_i64x2_gap_i64x2(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @i64x2_i64x2_gap_i64x2( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[VEC32:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48 +; CHECK-NEXT: [[VEC1:%.*]] = load <2 x i64>, ptr addrspace(1) [[GETELEM1]], align 8 +; CHECK-NEXT: ret void +; + %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 16 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 8 + ret void +} + +; Masked loads are not supported for sub-byte element types. 
+define void @i1x8_gap_gap_i1x8(ptr %ptr) { +; CHECK-LABEL: define void @i1x8_gap_gap_i1x8( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load <8 x i1>, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load <8 x i1>, ptr [[PTR3]], align 1 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr i8, ptr %ptr, i64 0 + %ptr3 = getelementptr i8, ptr %ptr, i64 3 + + %l0 = load <8 x i1>, ptr %ptr0, align 4 + %l3 = load <8 x i1>, ptr %ptr3, align 1 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-with-redundant-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-with-redundant-elements.ll new file mode 100644 index 0000000000000..e22035b4ed922 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-with-redundant-elements.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S -o - %s | FileCheck %s + +define void @test_redundant_no_gap(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define void @test_redundant_no_gap( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[PTR]], align 32 +; CHECK-NEXT: [[LD03:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD14:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LD25:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD37:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD45:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ld0 = load <2 x i32>, ptr addrspace(1) %ptr, align 32 + %gep1 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 4 + %ld1 = load i32, ptr addrspace(1) %gep1, align 4 + %gep2 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 8 + %ld2 = load <2 x i32>, ptr addrspace(1) %gep2, align 8 + %gep3 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 16 + %ld3 = load <2 x i32>, ptr addrspace(1) %gep3, align 16 + %gep4 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 24 + %ld4 = load <2 x i32>, ptr addrspace(1) %gep4, align 8 + ret void +} + +; This fills the 2-byte gap between ld0 and ld3. +; ld1 is folded into the vector but ld0 is treated as the end of the chain +; at the point when the gap is considered, because it reads further than ld1. 
+define void @test_redundant_gap(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define void @test_redundant_gap( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[PTR]], <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[LD01:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD12:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD34:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD45:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: ret void +; + %ld0 = load <2 x i32>, ptr addrspace(1) %ptr, align 32 + %ld1 = load i32, ptr addrspace(1) %ptr, align 4 + %gep3 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 16 + %ld3 = load <2 x i32>, ptr addrspace(1) %gep3, align 16 + %gep4 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 24 + %ld4 = load <2 x i32>, ptr addrspace(1) %gep4, align 8 + ret void +} + +; This chain contains two elements, one before a gap, +; and one before the end of the chain. Chain should be correctly extended. +define void @test_redundant_gap_and_extend(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define void @test_redundant_gap_and_extend( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[PTR]], <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[LD03:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD14:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD36:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD47:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTEND8:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTEND29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: ret void +; + %ld0 = load <2 x i32>, ptr addrspace(1) %ptr, align 32 + %ld1 = load i32, ptr addrspace(1) %ptr, align 4 + %gep3 = getelementptr inbounds i8, ptr addrspace(1) %ptr, i32 16 + %ld3 = load <2 x i32>, ptr addrspace(1) %gep3, align 16 + %ld4 = load i32, ptr addrspace(1) %gep3, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll new file mode 100644 index 0000000000000..83152ece5c4d1 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s + +; Load elements 0, 1, and 3, filling the gap with a generated load of element 2 +define void @test(ptr %ptr) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[PTR]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> 
[[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; Load elements 0, 2, and 3, filling the gap with a generated load of element 1 +define void @test2(ptr %ptr) { +; CHECK-LABEL: define void @test2( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[PTR]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LD23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8 + %ld2 = load i32, ptr %gep2, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; This gap can be filled, but the types are too large to do a v4 load, +; So we should end up with a v2 load and a single scalar load +define void @test3(ptr %ptr) { +; CHECK-LABEL: define void @test3( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[PTR]], align 16 +; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 24 +; CHECK-NEXT: [[LD3:%.*]] = load i64, ptr [[GEP3]], align 4 +; CHECK-NEXT: ret void +; + %ld0 = load i64, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 8 + %ld1 = load i64, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld3 = load i64, ptr %gep3, align 4 + ret void +} + +; This gap can be filled, but the types are too large to do a v4 load, +; So we should end up with a v2 load and a single scalar load +define void @test4(ptr %ptr) { +; CHECK-LABEL: define void @test4( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR]], align 16 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[GEP2]], align 16 +; CHECK-NEXT: [[LD21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD32:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: ret void +; + %ld0 = load i64, ptr %ptr, align 16 + %gep2 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld2 = load i64, ptr %gep2, align 16 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld3 = load i64, ptr %gep3, align 4 + ret void +} + +; Load elements 0 and 3, filling the gap with a generated load of element 1 and 2 +define void @test5(ptr %ptr) { +; CHECK-LABEL: define void @test5( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[PTR]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep3 = getelementptr inbounds i8, ptr %ptr, 
i32 12 + %ld3 = load i32, ptr %gep3, align 4 + ret void +} + +; Load elements 0, 1, 3, 4, 6, and 7, filling gaps at elements 2 and 5. +define void @test6(ptr %ptr) { +; CHECK-LABEL: define void @test6( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[PTR]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[GEP4]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD47:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[GAPFILL28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld4 = load i32, ptr %gep4, align 16 + %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld6 = load i32, ptr %gep6, align 4 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} + +; Load elements 0, 1, 3, 4 and 7, elements 2, 5, and 6 will be filled +define void @test7(ptr %ptr) { +; CHECK-LABEL: define void @test7( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[PTR]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD05:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL7:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD38:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[GEP4]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD49:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[GAPFILL210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[GAPFILL411:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD712:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16 + %ld4 = load i32, ptr %gep4, align 16 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} + +; Load elements 0, 1, 3, 5, 6, and 7. Elements 2 and 4 will be filled. +; Element 4 will be created and well-aligned because of its +; distance from the first load. 
+define void @test8(ptr %ptr) { +; CHECK-LABEL: define void @test8( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[PTR]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 12 +; CHECK-NEXT: [[GAPFILLGEP1:%.*]] = getelementptr i8, ptr [[GEP3]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[GAPFILLGEP1]], <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[GAPFILL27:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[LD58:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: ret void +; + %ld0 = load i32, ptr %ptr, align 16 + %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4 + %ld1 = load i32, ptr %gep1, align 4 + %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12 + %ld3 = load i32, ptr %gep3, align 4 + + %gep5 = getelementptr inbounds i8, ptr %ptr, i32 20 + %ld5 = load i32, ptr %gep5, align 16 + %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24 + %ld6 = load i32, ptr %gep6, align 4 + %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28 + %ld7 = load i32, ptr %gep7, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll index 11063dfeca54f..abe15c00c494b 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/many_loads_stores.ll @@ -1,9 +1,10 @@ ; This is an end-to-end test that checks that LSV succeeds at vectorizing a ; large program with many loads. 
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s > %t -; RUN: grep 'load i8' < %t | count 18 -; RUN: grep 'load <2 x i8>' < %t | count 9 +; RUN: grep 'load i8' < %t | count 12 +; RUN: grep 'load <2 x i8>' < %t | count 3 ; RUN: grep 'load <4 x i8>' < %t | count 27 +; RUN: grep 'call <4 x i8> @llvm.masked.load.v4i8.p1.*<4 x i1> ' < %t | count 6 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll new file mode 100644 index 0000000000000..a9e9cf674c72e --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll @@ -0,0 +1,541 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=infer-alignment,load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S -o - %s | FileCheck %s + +; POSITIVE TESTS + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @singleGap(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGap( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) align 32 [[OUT]], <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @singleGapDouble(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapDouble( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p1(<4 x double> , ptr addrspace(1) align 32 [[OUT]], <4 x i1> ) +; CHECK-NEXT: ret void +; + store double 1.0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store double 2.0, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store double 4.0, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 3, filling the gaps with generated stores of elements 1 and 2 +define void @multipleGaps(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @multipleGaps( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) align 32 [[OUT]], <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 3, 4, 7, filling the gaps with generated stores of elements 1, 2, 5, 6 +define void @multipleGaps8xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @multipleGaps8xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) align 32 [[OUT]], <8 x i1> ) +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 2, ptr addrspace(1) %getElem3, align 4 + %getElem4 
= getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 4, ptr addrspace(1) %getElem4, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i32 8, ptr addrspace(1) %getElem7, align 4 + ret void +} + +; store elements 0, 1, 2, 3, 5, 6, 7, filling the gap with a generated store of element 4, +; resulting in two 4xi64 stores with the second one led by a gap filled store. +define void @singleGapLongerChain(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapLongerChain( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store <4 x i64> , ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GAPFILLGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[GETELEM3]], i64 8 +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) align 32 [[GAPFILLGEP]], <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i64 3, ptr addrspace(1) %getElem2, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 40 + store i64 6, ptr addrspace(1) %getElem5, align 8 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 48 + store i64 7, ptr addrspace(1) %getElem6, align 8 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 56 + store i64 8, ptr addrspace(1) %getElem7, align 8 + ret void +} + +; store elements 0, 1, and 3, filling the gap with a generated store of element 2 +define void @vectorElements(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @vectorElements( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) align 32 [[OUT]], <8 x i1> ) +; CHECK-NEXT: ret void +; + store <2 x i32> , ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store <2 x i32> , ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store <2 x i32> , ptr addrspace(1) %getElem3, align 8 + ret void +} + +; store elements 0, 1, 3. 2 should not end up filled because 8xi64 is not legal. 
+define void @vectorElements64(ptr addrspace(1) %in) { +; CHECK-LABEL: define void @vectorElements64( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <4 x i64> , ptr addrspace(1) [[IN]], align 32 +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48 +; CHECK-NEXT: store <2 x i64> , ptr addrspace(1) [[GETELEM1]], align 16 +; CHECK-NEXT: ret void +; + store <2 x i64> , ptr addrspace(1) %in, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + store <2 x i64> , ptr addrspace(1) %getElem1, align 16 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48 + store <2 x i64> , ptr addrspace(1) %getElem3, align 16 + ret void +} + +; store elements 0, 1, 2, extending element 3 +define void @extendStores(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStores( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) align 32 [[OUT]], <4 x i1> ) +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i64 3, ptr addrspace(1) %getElem2, align 8 + ret void +} + +; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7 +define void @extendStores8xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStores8xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> , ptr addrspace(1) align 32 [[OUT]], <8 x i1> ) +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 2, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 4, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 5, ptr addrspace(1) %getElem4, align 4 + ret void +} + +; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7 +define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendStoresFromLoads8xi32( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[IN]], <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> 
[[TMP3]], i32 [[LOAD27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) align 32 [[OUT]], <8 x i1> ) +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr addrspace(1) %in, align 32 + %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4 + %loadGetElem2 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load2 = load i32, ptr addrspace(1) %loadGetElem2, align 4 + %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4 + %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4 + + store i32 %load0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 %load1, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 %load2, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 %load3, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 %load4, ptr addrspace(1) %getElem4, align 4 + ret void +} + +; store elements 0, 1, 3, 4, gap fill element 2, extend elements 5, 6, 7 +define void @extendAndGapFillStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @extendAndGapFillStoresFromLoads8xi32( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[IN]], <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 poison, i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x 
i32> [[TMP9]], ptr addrspace(1) align 32 [[OUT]], <8 x i1> ) +; CHECK-NEXT: ret void +; + %load0 = load i32, ptr addrspace(1) %in, align 32 + %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4 + %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4 + %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4 + + store i32 %load0, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 %load1, ptr addrspace(1) %getElem1, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 %load3, ptr addrspace(1) %getElem3, align 4 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i32 %load4, ptr addrspace(1) %getElem4, align 4 + ret void +} + + +; NEGATIVE TESTS + +; Wrong address space, no gap filling +define void @singleGapWrongAddrSpace(ptr addrspace(3) %out) { +; CHECK-LABEL: define void @singleGapWrongAddrSpace( +; CHECK-SAME: ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i64> , ptr addrspace(3) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[OUT]], i32 24 +; CHECK-NEXT: store i64 4, ptr addrspace(3) [[GETELEM3]], align 8 +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(3) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 8 + store i64 2, ptr addrspace(3) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 24 + store i64 4, ptr addrspace(3) %getElem3, align 8 + ret void +} + +; Not enough alignment for masked store, but we still vectorize the smaller vector +define void @singleGapMisaligned(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGapMisaligned( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i64> , ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i64 4, ptr addrspace(1) [[GETELEM3]], align 8 +; CHECK-NEXT: ret void +; + store i64 1, ptr addrspace(1) %out, align 16 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i64 2, ptr addrspace(1) %getElem1, align 8 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i64 4, ptr addrspace(1) %getElem3, align 8 + ret void +} + +; Not enough bytes to meet the minimum masked store size for the target +define void @singleGap4xi32(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @singleGap4xi32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GETELEM2]], align 8 +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i32 4, ptr addrspace(1) %getElem3, align 4 + ret void +} + +; store elements 0, 1, 2, 5, 6, 7. 
3 and 4 don't get filled because the heuristic +; only fills 2-element gaps that are in the middle of a multiple of 4 +define void @gapInWrongLocation(ptr addrspace(1) %out) { +; CHECK-LABEL: define void @gapInWrongLocation( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i32 3, ptr addrspace(1) [[GETELEM2]], align 8 +; CHECK-NEXT: [[GETELEM5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20 +; CHECK-NEXT: store i32 5, ptr addrspace(1) [[GETELEM5]], align 4 +; CHECK-NEXT: [[GETELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GETELEM6]], align 8 +; CHECK-NEXT: ret void +; + store i32 1, ptr addrspace(1) %out, align 32 + %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i32 2, ptr addrspace(1) %getElem1, align 4 + %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i32 3, ptr addrspace(1) %getElem2, align 4 + %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20 + store i32 5, ptr addrspace(1) %getElem5, align 4 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i32 6, ptr addrspace(1) %getElem6, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i32 7, ptr addrspace(1) %getElem7, align 4 + ret void +} + +; This test has 32-bytes of i8s with a 2-element gap in the middle of each 4-byte chunk. +; i8s are not supported by masked stores on the target, so the stores will not be vectorized. +; The loads, on the other hand, get gap filled. +define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @cantMaski8( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p1(ptr addrspace(1) align 32 [[IN]], <32 x i1> , <32 x i8> poison) +; CHECK-NEXT: [[LOAD031:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL32:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL233:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD334:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD435:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4 +; CHECK-NEXT: [[GAPFILL436:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5 +; CHECK-NEXT: [[GAPFILL637:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6 +; CHECK-NEXT: [[LOAD738:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7 +; CHECK-NEXT: [[LOAD839:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8 +; CHECK-NEXT: [[GAPFILL840:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9 +; CHECK-NEXT: [[GAPFILL1041:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10 +; CHECK-NEXT: [[LOAD1142:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11 +; CHECK-NEXT: [[LOAD1243:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12 +; CHECK-NEXT: [[GAPFILL1244:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13 +; CHECK-NEXT: [[GAPFILL1445:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14 +; CHECK-NEXT: [[LOAD1546:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15 +; CHECK-NEXT: [[LOAD1647:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16 +; CHECK-NEXT: [[GAPFILL1648:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17 +; CHECK-NEXT: [[GAPFILL1849:%.*]] = extractelement <32 x i8> [[TMP1]], 
i32 18 +; CHECK-NEXT: [[LOAD1950:%.*]] = extractelement <32 x i8> [[TMP1]], i32 19 +; CHECK-NEXT: [[LOAD2051:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20 +; CHECK-NEXT: [[GAPFILL2052:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21 +; CHECK-NEXT: [[GAPFILL2253:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22 +; CHECK-NEXT: [[LOAD2354:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23 +; CHECK-NEXT: [[LOAD2455:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24 +; CHECK-NEXT: [[GAPFILL2456:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25 +; CHECK-NEXT: [[GAPFILL2657:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26 +; CHECK-NEXT: [[LOAD2758:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27 +; CHECK-NEXT: [[LOAD2859:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28 +; CHECK-NEXT: [[GAPFILL2860:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29 +; CHECK-NEXT: [[GAPFILL3061:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30 +; CHECK-NEXT: [[LOAD3162:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31 +; CHECK-NEXT: store i8 [[LOAD031]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[OUTELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 3 +; CHECK-NEXT: store i8 [[LOAD334]], ptr addrspace(1) [[OUTELEM3]], align 1 +; CHECK-NEXT: [[OUTELEM4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 4 +; CHECK-NEXT: store i8 [[LOAD435]], ptr addrspace(1) [[OUTELEM4]], align 4 +; CHECK-NEXT: [[OUTELEM7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 7 +; CHECK-NEXT: store i8 [[LOAD738]], ptr addrspace(1) [[OUTELEM7]], align 1 +; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i8 [[LOAD839]], ptr addrspace(1) [[OUTELEM8]], align 8 +; CHECK-NEXT: [[OUTELEM11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 11 +; CHECK-NEXT: store i8 [[LOAD1142]], ptr addrspace(1) [[OUTELEM11]], align 1 +; CHECK-NEXT: [[OUTELEM12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 12 +; CHECK-NEXT: store i8 [[LOAD1243]], ptr addrspace(1) [[OUTELEM12]], align 4 +; CHECK-NEXT: [[OUTELEM15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 15 +; CHECK-NEXT: store i8 [[LOAD1546]], ptr addrspace(1) [[OUTELEM15]], align 1 +; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16 +; CHECK-NEXT: store i8 [[LOAD1647]], ptr addrspace(1) [[OUTELEM16]], align 16 +; CHECK-NEXT: [[OUTELEM19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 19 +; CHECK-NEXT: store i8 [[LOAD1950]], ptr addrspace(1) [[OUTELEM19]], align 1 +; CHECK-NEXT: [[OUTELEM20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20 +; CHECK-NEXT: store i8 [[LOAD2051]], ptr addrspace(1) [[OUTELEM20]], align 4 +; CHECK-NEXT: [[OUTELEM23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 23 +; CHECK-NEXT: store i8 [[LOAD2354]], ptr addrspace(1) [[OUTELEM23]], align 1 +; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i8 [[LOAD2455]], ptr addrspace(1) [[OUTELEM24]], align 8 +; CHECK-NEXT: [[OUTELEM27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 27 +; CHECK-NEXT: store i8 [[LOAD2758]], ptr addrspace(1) [[OUTELEM27]], align 1 +; CHECK-NEXT: [[OUTELEM28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 28 +; CHECK-NEXT: store i8 [[LOAD2859]], ptr addrspace(1) [[OUTELEM28]], align 4 +; CHECK-NEXT: [[OUTELEM31:%.*]] = getelementptr inbounds i8, 
ptr addrspace(1) [[OUT]], i32 31 +; CHECK-NEXT: store i8 [[LOAD3162]], ptr addrspace(1) [[OUTELEM31]], align 1 +; CHECK-NEXT: ret void +; + %load0 = load i8, ptr addrspace(1) %in, align 32 + %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 3 + %load3 = load i8, ptr addrspace(1) %getElem3, align 1 + %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4 + %load4 = load i8, ptr addrspace(1) %getElem4, align 4 + %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 7 + %load7 = load i8, ptr addrspace(1) %getElem7, align 1 + %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load8 = load i8, ptr addrspace(1) %getElem8, align 8 + %getElem11 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 11 + %load11 = load i8, ptr addrspace(1) %getElem11, align 1 + %getElem12 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12 + %load12 = load i8, ptr addrspace(1) %getElem12, align 4 + %getElem15 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 15 + %load15 = load i8, ptr addrspace(1) %getElem15, align 1 + %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load16 = load i8, ptr addrspace(1) %getElem16, align 16 + %getElem19 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 19 + %load19 = load i8, ptr addrspace(1) %getElem19, align 1 + %getElem20 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 20 + %load20 = load i8, ptr addrspace(1) %getElem20, align 4 + %getElem23 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 23 + %load23 = load i8, ptr addrspace(1) %getElem23, align 1 + %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %load24 = load i8, ptr addrspace(1) %getElem24, align 8 + %getElem27 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 27 + %load27 = load i8, ptr addrspace(1) %getElem27, align 1 + %getElem28 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 28 + %load28 = load i8, ptr addrspace(1) %getElem28, align 4 + %getElem31 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 31 + %load31 = load i8, ptr addrspace(1) %getElem31, align 1 + + store i8 %load0, ptr addrspace(1) %out, align 32 + %outElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 3 + store i8 %load3, ptr addrspace(1) %outElem3, align 1 + %outElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4 + store i8 %load4, ptr addrspace(1) %outElem4, align 4 + %outElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 7 + store i8 %load7, ptr addrspace(1) %outElem7, align 1 + %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i8 %load8, ptr addrspace(1) %outElem8, align 8 + %outElem11 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 11 + store i8 %load11, ptr addrspace(1) %outElem11, align 1 + %outElem12 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12 + store i8 %load12, ptr addrspace(1) %outElem12, align 4 + %outElem15 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 15 + store i8 %load15, ptr addrspace(1) %outElem15, align 1 + %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i8 %load16, ptr addrspace(1) %outElem16, align 16 + %outElem19 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 19 + store i8 %load19, ptr addrspace(1) %outElem19, align 1 + %outElem20 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20 + store i8 %load20, ptr addrspace(1) %outElem20, align 4 + %outElem23 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 23 + store i8 %load23, ptr addrspace(1) 
%outElem23, align 1 + %outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i8 %load24, ptr addrspace(1) %outElem24, align 8 + %outElem27 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 27 + store i8 %load27, ptr addrspace(1) %outElem27, align 1 + %outElem28 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28 + store i8 %load28, ptr addrspace(1) %outElem28, align 4 + %outElem31 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 31 + store i8 %load31, ptr addrspace(1) %outElem31, align 1 + + ret void +} + +; This test has 32-bytes of i16s with a 2-element gap in the middle of each 4-element chunk. +; i16s are not supported by masked stores on the target, so the stores will not be vectorized. +; The loads, on the other hand, get gap filled. +define void @cantMaski16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; CHECK-LABEL: define void @cantMaski16( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1) align 32 [[IN]], <16 x i1> , <16 x i16> poison) +; CHECK-NEXT: [[LOAD015:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[GAPFILL16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[GAPFILL217:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[LOAD318:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[LOAD419:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[GAPFILL420:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[GAPFILL621:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[LOAD722:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: [[LOAD823:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8 +; CHECK-NEXT: [[GAPFILL824:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9 +; CHECK-NEXT: [[GAPFILL1025:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10 +; CHECK-NEXT: [[LOAD1126:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11 +; CHECK-NEXT: [[LOAD1227:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12 +; CHECK-NEXT: [[GAPFILL1228:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13 +; CHECK-NEXT: [[GAPFILL1429:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14 +; CHECK-NEXT: [[LOAD1530:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15 +; CHECK-NEXT: store i16 [[LOAD015]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: [[OUTELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 6 +; CHECK-NEXT: store i16 [[LOAD318]], ptr addrspace(1) [[OUTELEM6]], align 2 +; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8 +; CHECK-NEXT: store i16 [[LOAD419]], ptr addrspace(1) [[OUTELEM8]], align 8 +; CHECK-NEXT: [[OUTELEM14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 14 +; CHECK-NEXT: store i16 [[LOAD722]], ptr addrspace(1) [[OUTELEM14]], align 2 +; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16 +; CHECK-NEXT: store i16 [[LOAD823]], ptr addrspace(1) [[OUTELEM16]], align 16 +; CHECK-NEXT: [[OUTELEM22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 22 +; CHECK-NEXT: store i16 [[LOAD1126]], ptr addrspace(1) [[OUTELEM22]], align 2 +; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24 +; CHECK-NEXT: store i16 [[LOAD1227]], ptr addrspace(1) [[OUTELEM24]], align 8 +; CHECK-NEXT: [[OUTELEM30:%.*]] = getelementptr inbounds i8, ptr 
addrspace(1) [[OUT]], i32 30 +; CHECK-NEXT: store i16 [[LOAD1530]], ptr addrspace(1) [[OUTELEM30]], align 2 +; CHECK-NEXT: ret void +; + %load0 = load i16, ptr addrspace(1) %in, align 32 + %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 6 + %load3 = load i16, ptr addrspace(1) %getElem6, align 2 + %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8 + %load4 = load i16, ptr addrspace(1) %getElem8, align 8 + %getElem14 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 14 + %load7 = load i16, ptr addrspace(1) %getElem14, align 2 + %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16 + %load8 = load i16, ptr addrspace(1) %getElem16, align 16 + %getElem22 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 22 + %load11 = load i16, ptr addrspace(1) %getElem22, align 2 + %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24 + %load12 = load i16, ptr addrspace(1) %getElem24, align 8 + %getElem30 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 30 + %load15 = load i16, ptr addrspace(1) %getElem30, align 2 + + store i16 %load0, ptr addrspace(1) %out, align 32 + %outElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 6 + store i16 %load3, ptr addrspace(1) %outElem6, align 2 + %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8 + store i16 %load4, ptr addrspace(1) %outElem8, align 8 + %outElem14 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 14 + store i16 %load7, ptr addrspace(1) %outElem14, align 2 + %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16 + store i16 %load8, ptr addrspace(1) %outElem16, align 16 + %outElem22 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 22 + store i16 %load11, ptr addrspace(1) %outElem22, align 2 + %outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24 + store i16 %load12, ptr addrspace(1) %outElem24, align 8 + %outElem30 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 30 + store i16 %load15, ptr addrspace(1) %outElem30, align 2 + + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll index 2d3c289c2a12b..47c9d9a88d958 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll @@ -1,8 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s ; Vectorize and emit valid code (Issue #54896). 
 define void @int8x3a2(ptr nocapture align 2 %ptr) {
+; CHECK-LABEL: define void @int8x3a2(
+; CHECK-SAME: ptr align 2 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[PTR0]], align 2
+; CHECK-NEXT: [[L01:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L2:%.*]] = load i8, ptr [[PTR2]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> poison, i8 [[L2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> [[TMP2]], i8 [[L12]], i32 1
+; CHECK-NEXT: store <2 x i8> [[TMP3]], ptr [[PTR0]], align 2
+; CHECK-NEXT: store i8 [[L01]], ptr [[PTR2]], align 2
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -17,14 +32,24 @@ define void @int8x3a2(ptr nocapture align 2 %ptr) {
 ret void
-; CHECK-LABEL: @int8x3a2
-; CHECK-DAG: load <2 x i8>
-; CHECK-DAG: load i8
-; CHECK-DAG: store <2 x i8>
-; CHECK-DAG: store i8
 }
 define void @int8x3a4(ptr nocapture align 4 %ptr) {
+; CHECK-LABEL: define void @int8x3a4(
+; CHECK-SAME: ptr align 4 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 4 [[PTR0]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i8> poison)
+; CHECK-NEXT: [[L01:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L23:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> poison, i8 [[L23]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> [[TMP2]], i8 [[L12]], i32 1
+; CHECK-NEXT: store <2 x i8> [[TMP3]], ptr [[PTR0]], align 4
+; CHECK-NEXT: store i8 [[L01]], ptr [[PTR2]], align 2
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -33,20 +58,52 @@ define void @int8x3a4(ptr nocapture align 4 %ptr) {
 %l1 = load i8, ptr %ptr1, align 1
 %l2 = load i8, ptr %ptr2, align 2
- store i8 %l2, ptr %ptr0, align 2
+ store i8 %l2, ptr %ptr0, align 4
 store i8 %l1, ptr %ptr1, align 1
- store i8 %l0, ptr %ptr2, align 4
+ store i8 %l0, ptr %ptr2, align 2
 ret void
-; CHECK-LABEL: @int8x3a4
-; CHECK: load <2 x i8>
-; CHECK: load i8
-; CHECK: store <2 x i8>
-; CHECK: store i8
 }
 define void @int8x12a4(ptr nocapture align 4 %ptr) {
+; CHECK-LABEL: define void @int8x12a4(
+; CHECK-SAME: ptr align 4 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr [[PTR]], i64 4
+; CHECK-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr [[PTR]], i64 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[PTR0]], align 4
+; CHECK-NEXT: [[L01:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L23:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[L34:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PTR4]], align 4
+; CHECK-NEXT: [[L45:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L56:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[L67:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[L78:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[PTR8]], align 4
+; CHECK-NEXT: [[L89:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[L910:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[LA11:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
+; CHECK-NEXT: [[LB12:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> poison, i8 [[LB12]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[LA11]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[L910]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8> [[TMP6]], i8 [[L89]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP7]], ptr [[PTR0]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> poison, i8 [[L78]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[L67]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i8> [[TMP9]], i8 [[L56]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP10]], i8 [[L45]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP11]], ptr [[PTR4]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8> poison, i8 [[L34]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[L23]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[L12]], i32 2
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[L01]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP15]], ptr [[PTR8]], align 4
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -88,17 +145,58 @@ define void @int8x12a4(ptr nocapture align 4 %ptr) {
 ret void
-; CHECK-LABEL: @int8x12a4
-; CHECK: load <4 x i8>
-; CHECK: load <4 x i8>
-; CHECK: load <4 x i8>
-; CHECK: store <4 x i8>
-; CHECK: store <4 x i8>
-; CHECK: store <4 x i8>
 }
 define void @int8x16a4(ptr nocapture align 4 %ptr) {
+; CHECK-LABEL: define void @int8x16a4(
+; CHECK-SAME: ptr align 4 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR4:%.*]] = getelementptr i8, ptr [[PTR]], i64 4
+; CHECK-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr [[PTR]], i64 8
+; CHECK-NEXT: [[PTRC:%.*]] = getelementptr i8, ptr [[PTR]], i64 12
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[PTR0]], align 4
+; CHECK-NEXT: [[L01:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L23:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[L34:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PTR4]], align 4
+; CHECK-NEXT: [[L45:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L56:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[L67:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[L78:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[PTR8]], align 4
+; CHECK-NEXT: [[L89:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[L910:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[LA11:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
+; CHECK-NEXT: [[LB12:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PTRC]], align 4
+; CHECK-NEXT: [[LC13:%.*]] = extractelement <4 x i8> [[TMP4]], i32 0
+; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i8> [[TMP4]], i32 1
+; CHECK-NEXT: [[LE15:%.*]] = extractelement <4 x i8> [[TMP4]], i32 2
+; CHECK-NEXT: [[LF16:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> poison, i8 [[LF16]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[LE15]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8> [[TMP6]], i8 [[LD14]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP7]], i8 [[LC13]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP8]], ptr [[PTRC]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> poison, i8 [[LB12]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i8> [[TMP9]], i8 [[LA11]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP10]], i8 [[L910]], i32 2
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[L89]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP12]], ptr [[PTR0]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> poison, i8 [[L78]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[L67]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[L56]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[L45]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP16]], ptr [[PTR4]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> poison, i8 [[L34]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[L23]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[L12]], i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[L01]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP20]], ptr [[PTR8]], align 4
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -152,18 +250,32 @@ define void @int8x16a4(ptr nocapture align 4 %ptr) {
 ret void
-; CHECK-LABEL: @int8x16a4
-; CHECK: load <4 x i8>
-; CHECK: load <4 x i8>
-; CHECK: load <4 x i8>
-; CHECK: load <4 x i8>
-; CHECK: store <4 x i8>
-; CHECK: store <4 x i8>
-; CHECK: store <4 x i8>
-; CHECK: store <4 x i8>
 }
 define void @int8x8a8(ptr nocapture align 8 %ptr) {
+; CHECK-LABEL: define void @int8x8a8(
+; CHECK-SAME: ptr align 8 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8
+; CHECK-NEXT: [[L01:%.*]] = extractelement <8 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <8 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L23:%.*]] = extractelement <8 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[L34:%.*]] = extractelement <8 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[L45:%.*]] = extractelement <8 x i8> [[TMP1]], i32 4
+; CHECK-NEXT: [[L56:%.*]] = extractelement <8 x i8> [[TMP1]], i32 5
+; CHECK-NEXT: [[L67:%.*]] = extractelement <8 x i8> [[TMP1]], i32 6
+; CHECK-NEXT: [[L78:%.*]] = extractelement <8 x i8> [[TMP1]], i32 7
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[L78]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[L67]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[L56]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[L45]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[L34]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[L23]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[L12]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[L01]], i32 7
+; CHECK-NEXT: store <8 x i8> [[TMP9]], ptr [[PTR0]], align 8
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -193,12 +305,43 @@ define void @int8x8a8(ptr nocapture align 8 %ptr) {
 ret void
-; CHECK-LABEL: @int8x8a8
-; CHECK: load <8 x i8>
-; CHECK: store <8 x i8>
 }
 define void @int8x12a8(ptr nocapture align 8 %ptr) {
+; CHECK-LABEL: define void @int8x12a8(
+; CHECK-SAME: ptr align 8 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr [[PTR]], i64 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8
+; CHECK-NEXT: [[L01:%.*]] = extractelement <8 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <8 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L23:%.*]] = extractelement <8 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[L34:%.*]] = extractelement <8 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[L45:%.*]] = extractelement <8 x i8> [[TMP1]], i32 4
+; CHECK-NEXT: [[L56:%.*]] = extractelement <8 x i8> [[TMP1]], i32 5
+; CHECK-NEXT: [[L67:%.*]] = extractelement <8 x i8> [[TMP1]], i32 6
+; CHECK-NEXT: [[L78:%.*]] = extractelement <8 x i8> [[TMP1]], i32 7
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PTR8]], align 8
+; CHECK-NEXT: [[L89:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L910:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[LA11:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[LB12:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[LB12]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[LA11]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[L910]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[L89]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[L78]], i32 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[L67]], i32 5
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[L56]], i32 6
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[L45]], i32 7
+; CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[PTR0]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> poison, i8 [[L34]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[L23]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[L12]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[L01]], i32 3
+; CHECK-NEXT: store <4 x i8> [[TMP14]], ptr [[PTR8]], align 8
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -240,15 +383,52 @@ define void @int8x12a8(ptr nocapture align 8 %ptr) {
 ret void
-; CHECK-LABEL: @int8x12a8
-; CHECK-DAG: load <8 x i8>
-; CHECK-DAG: load <4 x i8>
-; CHECK-DAG: store <8 x i8>
-; CHECK-DAG: store <4 x i8>
 }
 define void @int8x16a8(ptr nocapture align 8 %ptr) {
+; CHECK-LABEL: define void @int8x16a8(
+; CHECK-SAME: ptr align 8 captures(none) [[PTR:%.*]]) {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR8:%.*]] = getelementptr i8, ptr [[PTR]], i64 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8
+; CHECK-NEXT: [[L01:%.*]] = extractelement <8 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[L12:%.*]] = extractelement <8 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[L23:%.*]] = extractelement <8 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[L34:%.*]] = extractelement <8 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[L45:%.*]] = extractelement <8 x i8> [[TMP1]], i32 4
+; CHECK-NEXT: [[L56:%.*]] = extractelement <8 x i8> [[TMP1]], i32 5
+; CHECK-NEXT: [[L67:%.*]] = extractelement <8 x i8> [[TMP1]], i32 6
+; CHECK-NEXT: [[L78:%.*]] = extractelement <8 x i8> [[TMP1]], i32 7
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[PTR8]], align 8
+; CHECK-NEXT: [[L89:%.*]] = extractelement <8 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[L910:%.*]] = extractelement <8 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[LA11:%.*]] = extractelement <8 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[LB12:%.*]] = extractelement <8 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[LC13:%.*]] = extractelement <8 x i8> [[TMP2]], i32 4
+; CHECK-NEXT: [[LD14:%.*]] = extractelement <8 x i8> [[TMP2]], i32 5
+; CHECK-NEXT: [[LE15:%.*]] = extractelement <8 x i8> [[TMP2]], i32 6
+; CHECK-NEXT: [[LF16:%.*]] = extractelement <8 x i8> [[TMP2]], i32 7
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[LF16]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[LE15]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[LD14]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[LC13]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[LB12]], i32 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[LA11]], i32 5
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[L910]], i32 6
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[L89]], i32 7
+; CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[PTR0]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> poison, i8 [[L78]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[L67]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[L56]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[L45]], i32 3
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[L34]], i32 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[L23]], i32 5
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[L12]], i32 6
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[L01]], i32 7
+; CHECK-NEXT: store <8 x i8> [[TMP18]], ptr [[PTR8]], align 8
+; CHECK-NEXT: ret void
+;
 %ptr0 = getelementptr i8, ptr %ptr, i64 0
 %ptr1 = getelementptr i8, ptr %ptr, i64 1
 %ptr2 = getelementptr i8, ptr %ptr, i64 2
@@ -302,9 +482,4 @@ define void @int8x16a8(ptr nocapture align 8 %ptr) {
 ret void
-; CHECK-LABEL: @int8x16a8
-; CHECK: load <8 x i8>
-; CHECK: load <8 x i8>
-; CHECK: store <8 x i8>
-; CHECK: store <8 x i8>
 }