-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[VPlan] Compute cost of intrinsics directly for VPReplicateRecipe (NFCI). #154617
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) Changes: Handle intrinsic calls in VPReplicateRecipe::computeCost. There are some pseudo intrinsics for which the computed cost is known zero, so we handle those up front. Depends on #154291 (included in PR). Full diff: https://github.com/llvm/llvm-project/pull/154617.diff — 1 file affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f8fde0500b77a..a55ebdb60f939 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1673,18 +1673,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.set(this, V);
}
-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+ ArrayRef<const VPValue *> Operands,
+ const VPRecipeWithIRFlags &R,
+ ElementCount VF,
+ VPCostContext &Ctx) {
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
- for (const auto &[Idx, Op] : enumerate(operands())) {
+ for (const auto &[Idx, Op] : enumerate(Operands)) {
auto *V = Op->getUnderlyingValue();
if (!V) {
- if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+ if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
Arguments.push_back(UI->getArgOperand(Idx));
continue;
}
@@ -1694,21 +1698,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+ Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+ Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
- for (unsigned I = 0; I != getNumOperands(); ++I)
- ParamTys.push_back(
- toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+ for (const VPValue *Op : Operands) {
+ ParamTys.push_back(VF.isVector()
+ ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+ : Ctx.Types.inferScalarType(Op));
+ }
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
- FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+ FastMathFlags FMF =
+ R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
- VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+ ID, RetTy, Arguments, ParamTys, FMF,
+ dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ SmallVector<const VPValue *> ArgOps(operands());
+ return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
+}
+
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
@@ -3002,23 +3016,73 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// instruction cost.
return 0;
case Instruction::Call: {
- if (!isSingleScalar()) {
- // TODO: Handle remaining call costs here as well.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
- break;
- }
-
auto *CalledFn =
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
- if (CalledFn->isIntrinsic())
- break;
+ SmallVector<const VPValue *> ArgOps(drop_end(operands()));
SmallVector<Type *, 4> Tys;
- for (VPValue *ArgOp : drop_end(operands()))
+ for (const VPValue *ArgOp : ArgOps)
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+ if (CalledFn->isIntrinsic())
+ // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+ // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+ switch (CalledFn->getIntrinsicID()) {
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::sideeffect:
+ case Intrinsic::pseudoprobe:
+ case Intrinsic::experimental_noalias_scope_decl: {
+ assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx) == 0 && "pseudo-intrinsic must have zero cost");
+ return InstructionCost(0);
+ }
+ default:
+ break;
+ }
+
Type *ResultTy = Ctx.Types.inferScalarType(this);
- return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ InstructionCost ScalarCallCost =
+ Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ if (isSingleScalar()) {
+ if (CalledFn->isIntrinsic())
+ ScalarCallCost = std::min(
+ ScalarCallCost,
+ getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx));
+ return ScalarCallCost;
+ }
+
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+ // Compute the cost of scalarizing the result and operands if needed.
+ InstructionCost ScalarizationCost = 0;
+ if (VF.isVector()) {
+ if (!ResultTy->isVoidTy()) {
+ for (Type *VectorTy : getContainedTypes(toVectorizedTy(ResultTy, VF))) {
+ ScalarizationCost += Ctx.TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert=*/true,
+ /*Extract=*/false, Ctx.CostKind);
+ }
+ }
+ // Skip operands that do not require extraction/scalarization and do not
+ // incur any overhead.
+ SmallVector<Type *> Tys;
+ SmallPtrSet<const VPValue *, 4> UniqueOperands;
+ for (auto *Op : ArgOps) {
+ if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ !UniqueOperands.insert(Op).second)
+ continue;
+ Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF));
+ }
+ ScalarizationCost +=
+ Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind);
+ }
+
+ return ScalarCallCost * VF.getFixedValue() + ScalarizationCost;
}
case Instruction::Add:
case Instruction::Sub:
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
a1801fd
to
672b04b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
Refactor to prepare for #154617.
…CI). Handle intrinsic calls in VPReplicateRecipe::computeCost. There are some pseudo intrinsics for which the computed cost is known zero, so we handle those up front. Depends on llvm#154291.
672b04b
to
bddd21b
Compare
… (NFC). Refactor to prepare for llvm/llvm-project#154617.
…eRecipe (NFCI). (#154617) Handle intrinsic calls in VPReplicateRecipe::computeCost. There are some pseudo intrinsics for which the computed cost is known zero, so we handle those up front. Depends on llvm/llvm-project#154291. PR: llvm/llvm-project#154617
Handle intrinsic calls in VPReplicateRecipe::computeCost. There are some
pseudo intrinsics for which the computed cost is known zero,
so we handle those up front.
Depends on #154291. (included
in PR)