-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[VP] Detect truncated and shifted EVLs during expansion #154334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-llvm-transforms Author: Luke Lau (lukel97) Changes: This PR aims to help VP intrinsics on AArch64 (i.e. #154327), where the EVL operand needs to be "expanded" into the mask operand because SVE doesn't have the notion of EVL. If the loop vectorizer tries to use an exact EVL that matches a scalable vector type's element count, in theory it should be something like `(mul i32 vscale, n)`. ExpandVectorPredication tries to detect this, and if it does then it skips folding the EVL into the mask since the EVL covers every element anyway. In practice though vscale will be truncated from i64, and because it's a known power of 2 the multiply will be canonicalized to a shl. So it will really look like `(trunc (shl i64 vscale, n) to i32)`. Full diff: https://github.com/llvm/llvm-project/pull/154334.diff — 3 Files Affected:
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 2ab652ca258c6..ec4dc787747ca 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2193,6 +2193,13 @@ m_TruncOrSelf(const OpTy &Op) {
return m_CombineOr(m_Trunc(Op), Op);
}
+template <typename OpTy>
+inline match_combine_or<NoWrapTrunc_match<OpTy, TruncInst::NoUnsignedWrap>,
+ OpTy>
+m_NUWTruncOrSelf(const OpTy &Op) {
+ return m_CombineOr(m_NUWTrunc(Op), Op);
+}
+
/// Matches SExt.
template <typename OpTy>
inline CastInst_match<OpTy, SExtInst> m_SExt(const OpTy &Op) {
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 23a4d1b5c615e..22c1479174e2c 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -613,8 +613,12 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const {
if (EC.isScalable()) {
// Compare vscale patterns
uint64_t VScaleFactor;
- if (match(VLParam, m_Mul(m_VScale(), m_ConstantInt(VScaleFactor))))
+ if (match(VLParam,
+ m_NUWTruncOrSelf(m_Mul(m_VScale(), m_ConstantInt(VScaleFactor)))))
return VScaleFactor >= EC.getKnownMinValue();
+ if (match(VLParam,
+ m_NUWTruncOrSelf(m_Shl(m_VScale(), m_ConstantInt(VScaleFactor)))))
+ return 1 << VScaleFactor >= EC.getKnownMinValue();
return (EC.getKnownMinValue() == 1) && match(VLParam, m_VScale());
}
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll
new file mode 100644
index 0000000000000..8dd8ae153ae58
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=pre-isel-intrinsic-lowering -expandvp-override-evl-transform=Convert -expandvp-override-mask-transform=Legal -S < %s | FileCheck %s
+
+define <vscale x 2 x i64> @unknown_evl(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m, i32 %evl) {
+; CHECK-LABEL: define <vscale x 2 x i64> @unknown_evl(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]], i32 [[EVL:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 [[EVL]])
+; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M]]
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 2
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[TMP2]], i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[ADD]]
+;
+ %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %add
+}
+
+define <2 x i64> @exact_evl_fixed(<2 x i64> %x, <2 x i64> %y, <2 x i1> %m) {
+; CHECK-LABEL: define <2 x i64> @exact_evl_fixed(
+; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i1> [[M:%.*]]) {
+; CHECK-NEXT: [[ADD:%.*]] = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> poison, <2 x i64> poison, <2 x i1> [[M]], i32 2)
+; CHECK-NEXT: ret <2 x i64> [[ADD]]
+;
+ %add = call <2 x i64> @llvm.vp.add(<2 x i64> poison, <2 x i64> poison, <2 x i1> %m, i32 2)
+ ret <2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_mul(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_mul(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[EVL:%.*]] = mul i32 [[VSCALE]], 2
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[ADD]]
+;
+ %vscale = call i32 @llvm.vscale()
+ %evl = mul i32 %vscale, 2
+ %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_shl(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_shl(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[EVL:%.*]] = shl i32 [[VSCALE]], 1
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[ADD]]
+;
+ %vscale = call i32 @llvm.vscale()
+ %evl = shl i32 %vscale, 1
+ %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_mul_trunc(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_mul_trunc(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[SHL:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT: [[EVL:%.*]] = trunc nuw i64 [[SHL]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[ADD]]
+;
+ %vscale = call i64 @llvm.vscale()
+ %shl = mul i64 %vscale, 2
+ %evl = trunc nuw i64 %shl to i32
+ %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %add
+}
+
+
+define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[VSCALE]], 1
+; CHECK-NEXT: [[EVL:%.*]] = trunc nuw i64 [[SHL]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[ADD]]
+;
+ %vscale = call i64 @llvm.vscale()
+ %shl = shl i64 %vscale, 1
+ %evl = trunc nuw i64 %shl to i32
+ %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc_no_nuw(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc_no_nuw(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[VSCALE]], 1
+; CHECK-NEXT: [[EVL:%.*]] = trunc i64 [[SHL]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 [[EVL]])
+; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M]]
+; CHECK-NEXT: [[VSCALE1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE1]], 2
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[TMP2]], i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[ADD]]
+;
+ %vscale = call i64 @llvm.vscale()
+ %shl = shl i64 %vscale, 1
+ %evl = trunc i64 %shl to i32
+ %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %add
+}
|
llvm/lib/IR/IntrinsicInst.cpp
Outdated
@@ -613,8 +613,12 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const {
if (EC.isScalable()) {
// Compare vscale patterns
uint64_t VScaleFactor;
if (match(VLParam, m_Mul(m_VScale(), m_ConstantInt(VScaleFactor))))
if (match(VLParam,
m_NUWTruncOrSelf(m_Mul(m_VScale(), m_ConstantInt(VScaleFactor)))))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to check for NUW on the Mul too?
This PR aims to help VP intrinsics on AArch64 (i.e. #154327), where the EVL operand needs to be "expanded" into the mask operand because SVE doesn't have the notion of EVL.
If the loop vectorizer tries to use an exact EVL that matches a scalable vector type's element count, in theory it should be something like `(mul i32 vscale, n)`. ExpandVectorPredication tries to detect this, and if it does then it skips folding the EVL into the mask since the EVL covers every element anyway.
In practice though vscale will be truncated from i64, and because it's a known power of 2 the multiply will be canonicalized to a shl. So it will really look like `(trunc (shl i64 vscale, n) to i32)`.
.