Skip to content

Commit 878d79e

Browse files
committed
[VPlan] Explicitly replicate VPInstructions by VF.
Extend replicateByVF added in #142433 (aa24029) to also explicitly unroll replicating VPInstructions. Now the only remaining case where we replicate for all lanes is VPReplicateRecipes in replicate regions.
1 parent a325391 commit 878d79e

File tree

9 files changed

+110
-117
lines changed

9 files changed

+110
-117
lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
333333
LastLane = 0;
334334
}
335335

336+
assert(IsSingleScalar && "must be a single-scalar at this point");
337+
// We need to construct the vector value for a single-scalar value by
338+
// broadcasting the scalar to all lanes.
336339
auto *LastInst = cast<Instruction>(get(Def, LastLane));
337340
// Set the insert point after the last scalarized instruction or after the
338341
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
@@ -343,27 +346,8 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
343346
: std::next(BasicBlock::iterator(LastInst));
344347
Builder.SetInsertPoint(&*NewIP);
345348

346-
// However, if we are vectorizing, we need to construct the vector values.
347-
// If the value is known to be uniform after vectorization, we can just
348-
// broadcast the scalar value corresponding to lane zero. Otherwise, we
349-
// construct the vector values using insertelement instructions. Since the
350-
// resulting vectors are stored in State, we will only generate the
351-
// insertelements once.
352-
Value *VectorValue = nullptr;
353-
if (IsSingleScalar) {
354-
VectorValue = GetBroadcastInstrs(ScalarValue);
355-
set(Def, VectorValue);
356-
} else {
357-
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
358-
assert(isa<VPInstruction>(Def) &&
359-
"Explicit BuildVector recipes must have"
360-
"handled packing for non-VPInstructions.");
361-
// Initialize packing with insertelements to start from poison.
362-
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
363-
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
364-
VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane);
365-
set(Def, VectorValue);
366-
}
349+
Value *VectorValue = GetBroadcastInstrs(ScalarValue);
350+
set(Def, VectorValue);
367351
Builder.restoreIP(OldIP);
368352
return VectorValue;
369353
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -897,6 +897,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
897897
return R && classof(R);
898898
}
899899

900+
virtual VPRecipeWithIRFlags *clone() override = 0;
901+
900902
void execute(VPTransformState &State) override = 0;
901903

902904
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
@@ -1040,13 +1042,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10401042
VScale,
10411043
};
10421044

1043-
private:
1044-
typedef unsigned char OpcodeTy;
1045-
OpcodeTy Opcode;
1046-
1047-
/// An optional name that can be used for the generated IR instruction.
1048-
const std::string Name;
1049-
10501045
/// Returns true if this VPInstruction generates scalar values for all lanes.
10511046
/// Most VPInstructions generate a single value per part, either vector or
10521047
/// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
@@ -1055,6 +1050,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10551050
/// underlying ingredient.
10561051
bool doesGeneratePerAllLanes() const;
10571052

1053+
private:
1054+
typedef unsigned char OpcodeTy;
1055+
OpcodeTy Opcode;
1056+
1057+
/// An optional name that can be used for the generated IR instruction.
1058+
const std::string Name;
1059+
10581060
/// Returns true if we can generate a scalar for the first lane only if
10591061
/// needed.
10601062
bool canGenerateScalarForFirstLane() const;
@@ -1064,11 +1066,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
10641066
/// existing value is returned rather than a generated one.
10651067
Value *generate(VPTransformState &State);
10661068

1067-
/// Utility methods serving execute(): generates a scalar single instance of
1068-
/// the modeled instruction for a given lane. \returns the scalar generated
1069-
/// value for lane \p Lane.
1070-
Value *generatePerLane(VPTransformState &State, const VPLane &Lane);
1071-
10721069
#if !defined(NDEBUG)
10731070
/// Return the number of operands determined by the opcode of the
10741071
/// VPInstruction. Returns -1u if the number of operands cannot be determined

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -525,16 +525,6 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
525525
}
526526
}
527527

528-
Value *VPInstruction::generatePerLane(VPTransformState &State,
529-
const VPLane &Lane) {
530-
IRBuilderBase &Builder = State.Builder;
531-
532-
assert(getOpcode() == VPInstruction::PtrAdd &&
533-
"only PtrAdd opcodes are supported for now");
534-
return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
535-
State.get(getOperand(1), Lane), Name);
536-
}
537-
538528
/// Create a conditional branch using \p Cond branching to the successors of \p
539529
/// VPBB. Note that the first successor is always forward (i.e. not created yet)
540530
/// while the second successor may already have been created (if it is a header
@@ -1123,24 +1113,13 @@ void VPInstruction::execute(VPTransformState &State) {
11231113
"Set flags not supported for the provided opcode");
11241114
if (hasFastMathFlags())
11251115
State.Builder.setFastMathFlags(getFastMathFlags());
1126-
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1127-
(vputils::onlyFirstLaneUsed(this) ||
1128-
isVectorToScalar() || isSingleScalar());
1129-
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
1130-
if (GeneratesPerAllLanes) {
1131-
for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
1132-
Lane != NumLanes; ++Lane) {
1133-
Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
1134-
assert(GeneratedValue && "generatePerLane must produce a value");
1135-
State.set(this, GeneratedValue, VPLane(Lane));
1136-
}
1137-
return;
1138-
}
1139-
11401116
Value *GeneratedValue = generate(State);
11411117
if (!hasResult())
11421118
return;
11431119
assert(GeneratedValue && "generate must produce a value");
1120+
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1121+
(vputils::onlyFirstLaneUsed(this) ||
1122+
isVectorToScalar() || isSingleScalar());
11441123
assert((((GeneratedValue->getType()->isVectorTy() ||
11451124
GeneratedValue->getType()->isStructTy()) ==
11461125
!GeneratesPerFirstLaneOnly) ||
@@ -1213,6 +1192,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
12131192
case VPInstruction::Broadcast:
12141193
case VPInstruction::ReductionStartVector:
12151194
return true;
1195+
case VPInstruction::BuildStructVector:
1196+
case VPInstruction::BuildVector:
1197+
return getNumOperands() > 1;
12161198
case VPInstruction::PtrAdd:
12171199
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
12181200
case VPInstruction::WidePtrAdd:

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3354,34 +3354,40 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
33543354
vp_depth_first_shallow(Plan.getEntry()));
33553355
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
33563356
vp_depth_first_shallow(LoopRegion->getEntry()));
3357-
// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
3358-
// excluding ones in replicate regions. Those are not materialized explicitly
3359-
// yet. Those vector users are still handled in VPReplicateRegion::execute(),
3360-
// via shouldPack().
3357+
// Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3358+
// VPInstructions, excluding ones in replicate regions. Those are not
3359+
// materialized explicitly yet. Those vector users are still handled in
3360+
// VPReplicateRegion::execute(), via shouldPack().
33613361
// TODO: materialize build vectors for replicating recipes in replicating
33623362
// regions.
33633363
// TODO: materialize build vectors for VPInstructions.
33643364
for (VPBasicBlock *VPBB :
33653365
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
33663366
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3367-
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
3368-
auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
3367+
auto *DefR = dyn_cast<VPRecipeWithIRFlags>(&R);
3368+
if (!DefR || !isa<VPReplicateRecipe, VPInstruction>(DefR))
3369+
continue;
3370+
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
33693371
VPRegionBlock *ParentRegion =
33703372
cast<VPRecipeBase>(U)->getParent()->getParent();
3371-
return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
3373+
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
33723374
};
3373-
if (!RepR || RepR->isSingleScalar() ||
3374-
none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
3375+
if ((isa<VPReplicateRecipe>(DefR) &&
3376+
cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3377+
(isa<VPInstruction>(DefR) &&
3378+
!cast<VPInstruction>(DefR)->doesGeneratePerAllLanes()) ||
3379+
vputils::onlyFirstLaneUsed(DefR) ||
3380+
none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
33753381
continue;
33763382

3377-
Type *ScalarTy = TypeInfo.inferScalarType(RepR);
3383+
Type *ScalarTy = TypeInfo.inferScalarType(DefR);
33783384
unsigned Opcode = ScalarTy->isStructTy()
33793385
? VPInstruction::BuildStructVector
33803386
: VPInstruction::BuildVector;
3381-
auto *BuildVector = new VPInstruction(Opcode, {RepR});
3382-
BuildVector->insertAfter(RepR);
3387+
auto *BuildVector = new VPInstruction(Opcode, {DefR});
3388+
BuildVector->insertAfter(DefR);
33833389

3384-
RepR->replaceUsesWithIf(
3390+
DefR->replaceUsesWithIf(
33853391
BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
33863392
VPUser &U, unsigned) {
33873393
return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ struct VPlanTransforms {
120120
/// Explicitly unroll \p Plan by \p UF.
121121
static void unrollByUF(VPlan &Plan, unsigned UF);
122122

123-
/// Replace each VPReplicateRecipe outside on any replicate region in \p Plan
124-
/// with \p VF single-scalar recipes.
123+
/// Replace each VPReplicateRecipe and replicating VPInstruction outside of
124+
/// any replicate region in \p Plan with \p VF single-scalar recipes.
125125
/// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby
126126
/// dissolving the latter.
127127
static void replicateByVF(VPlan &Plan, ElementCount VF);

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -464,15 +464,15 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
464464
VPlanTransforms::removeDeadRecipes(Plan);
465465
}
466466

467-
/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p
468-
/// Def2LaneDefs to look up scalar definitions for operands of \RepR.
469-
static VPReplicateRecipe *
467+
/// Create a single-scalar clone of \p DefR for lane \p Lane. Use \p
468+
/// Def2LaneDefs to look up scalar definitions for operands of \p DefR.
469+
static VPRecipeWithIRFlags *
470470
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
471-
VPReplicateRecipe *RepR, VPLane Lane,
471+
VPRecipeWithIRFlags *DefR, VPLane Lane,
472472
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
473473
// Collect the operands at Lane, creating extracts as needed.
474474
SmallVector<VPValue *> NewOps;
475-
for (VPValue *Op : RepR->operands()) {
475+
for (VPValue *Op : DefR->operands()) {
476476
// If Op is a definition that has been unrolled, directly use the clone for
477477
// the corresponding lane.
478478
auto LaneDefs = Def2LaneDefs.find(Op);
@@ -502,11 +502,19 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
502502
NewOps.push_back(Ext);
503503
}
504504

505-
auto *New =
506-
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
507-
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
508-
New->transferFlags(*RepR);
509-
New->insertBefore(RepR);
505+
VPRecipeWithIRFlags *New;
506+
if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
507+
New =
508+
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
509+
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
510+
} else {
511+
New = DefR->clone();
512+
for (const auto &[Idx, Op] : enumerate(NewOps)) {
513+
New->setOperand(Idx, Op);
514+
}
515+
}
516+
New->transferFlags(*DefR);
517+
New->insertBefore(DefR);
510518
return New;
511519
}
512520

@@ -531,41 +539,46 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
531539
SmallVector<VPRecipeBase *> ToRemove;
532540
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
533541
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
534-
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
535-
if (!RepR || RepR->isSingleScalar())
542+
auto *DefR = dyn_cast<VPRecipeWithIRFlags>(&R);
543+
if (!DefR || !isa<VPInstruction, VPReplicateRecipe>(DefR))
544+
continue;
545+
if ((isa<VPReplicateRecipe>(DefR) &&
546+
cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
547+
(isa<VPInstruction>(DefR) &&
548+
!cast<VPInstruction>(DefR)->doesGeneratePerAllLanes()))
536549
continue;
537550

538-
VPBuilder Builder(RepR);
539-
if (RepR->getNumUsers() == 0) {
540-
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
541-
vputils::isSingleScalar(RepR->getOperand(1))) {
551+
VPBuilder Builder(DefR);
552+
if (DefR->getNumUsers() == 0) {
553+
if (isa<StoreInst>(DefR->getUnderlyingInstr()) &&
554+
vputils::isSingleScalar(DefR->getOperand(1))) {
542555
// Stores to invariant addresses need to store the last lane only.
543-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
556+
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane::getLastLaneForVF(VF),
544557
Def2LaneDefs);
545558
} else {
546-
// Create single-scalar version of RepR for all lanes.
559+
// Create single-scalar version of DefR for all lanes.
547560
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
548-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
561+
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
549562
}
550-
RepR->eraseFromParent();
563+
DefR->eraseFromParent();
551564
continue;
552565
}
553-
/// Create single-scalar version of RepR for all lanes.
566+
/// Create single-scalar version of DefR for all lanes.
554567
SmallVector<VPValue *> LaneDefs;
555568
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
556569
LaneDefs.push_back(
557-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
570+
cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));
558571

559-
Def2LaneDefs[RepR] = LaneDefs;
572+
Def2LaneDefs[DefR] = LaneDefs;
560573
/// Users that only demand the first lane can use the definition for lane
561574
/// 0.
562-
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
563-
return U.onlyFirstLaneUsed(RepR);
575+
DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
576+
return U.onlyFirstLaneUsed(DefR);
564577
});
565578

566-
// Update each build vector user that currently has RepR as its only
579+
// Update each build vector user that currently has DefR as its only
567580
// operand, to have all LaneDefs as its operands.
568-
for (VPUser *U : to_vector(RepR->users())) {
581+
for (VPUser *U : to_vector(DefR->users())) {
569582
auto *VPI = dyn_cast<VPInstruction>(U);
570583
if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
571584
VPI->getOpcode() != VPInstruction::BuildStructVector))
@@ -577,7 +590,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
577590
for (VPValue *LaneDef : drop_begin(LaneDefs))
578591
VPI->addOperand(LaneDef);
579592
}
580-
ToRemove.push_back(RepR);
593+
ToRemove.push_back(DefR);
581594
}
582595
}
583596
for (auto *R : reverse(ToRemove))

llvm/test/Transforms/LoopVectorize/pointer-induction.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ define void @a(ptr readnone %b) {
3333
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
3434
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP14]]
3535
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr null, i64 [[TMP17]]
36+
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
37+
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x ptr> [[TMP21]], ptr [[NEXT_GEP2]], i32 1
38+
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x ptr> [[TMP22]], ptr [[NEXT_GEP3]], i32 2
39+
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x ptr> [[TMP23]], ptr [[NEXT_GEP4]], i32 3
3640
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 -1
3741
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
3842
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
@@ -649,9 +653,6 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
649653
; STRIDED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], [[TMP8]]
650654
; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP1]]
651655
; STRIDED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], [[TMP10]]
652-
; STRIDED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[TMP5]]
653-
; STRIDED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr null, i64 [[TMP7]]
654-
; STRIDED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr null, i64 [[TMP9]]
655656
; STRIDED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr null, i64 [[TMP11]]
656657
; STRIDED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]]
657658
; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8

0 commit comments

Comments
 (0)