Skip to content

Conversation

lukel97
Copy link
Contributor

@lukel97 lukel97 commented Aug 19, 2025

This PR aims to help VP intrinsics on AArch64 (i.e. #154327), where the EVL operand needs to be "expanded" into the mask operand because SVE doesn't have the notion of EVL.

If the loop vectorizer tries to use an exact EVL that matches a scalable vector type's element count, in theory it should be something like (mul i32 vscale, n).

ExpandVectorPredication tries to detect this, and if it does then it skips folding the EVL into the mask since the EVL covers every element anyway.

In practice, though, vscale will be truncated from i64, and because the multiplier n is a known power of 2, the multiply will be canonicalized to a shl. So the EVL will really look like (trunc (shl i64 vscale, log2(n)) to i32).

@llvmbot
Copy link
Member

llvmbot commented Aug 19, 2025

@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-llvm-transforms

Author: Luke Lau (lukel97)

Changes

This PR aims to help VP intrinsics on AArch64 (i.e. #154327), where the EVL operand needs to be "expanded" into the mask operand because SVE doesn't have the notion of EVL.

If the loop vectorizer tries to use an exact EVL that matches a scalable vector type's element count, in theory it should be something like (mul i32 vscale, n).

ExpandVectorPredication tries to detect this, and if it does then it skips folding the EVL into the mask since the EVL covers every element anyway.

In practice, though, vscale will be truncated from i64, and because the multiplier n is a known power of 2, the multiply will be canonicalized to a shl. So the EVL will really look like (trunc (shl i64 vscale, log2(n)) to i32).


Full diff: https://github.com/llvm/llvm-project/pull/154334.diff

3 Files Affected:

  • (modified) llvm/include/llvm/IR/PatternMatch.h (+7)
  • (modified) llvm/lib/IR/IntrinsicInst.cpp (+5-1)
  • (added) llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll (+107)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 2ab652ca258c6..ec4dc787747ca 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2193,6 +2193,13 @@ m_TruncOrSelf(const OpTy &Op) {
   return m_CombineOr(m_Trunc(Op), Op);
 }
 
+template <typename OpTy>
+inline match_combine_or<NoWrapTrunc_match<OpTy, TruncInst::NoUnsignedWrap>,
+                        OpTy>
+m_NUWTruncOrSelf(const OpTy &Op) {
+  return m_CombineOr(m_NUWTrunc(Op), Op);
+}
+
 /// Matches SExt.
 template <typename OpTy>
 inline CastInst_match<OpTy, SExtInst> m_SExt(const OpTy &Op) {
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 23a4d1b5c615e..22c1479174e2c 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -613,8 +613,12 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const {
   if (EC.isScalable()) {
     // Compare vscale patterns
     uint64_t VScaleFactor;
-    if (match(VLParam, m_Mul(m_VScale(), m_ConstantInt(VScaleFactor))))
+    if (match(VLParam,
+              m_NUWTruncOrSelf(m_Mul(m_VScale(), m_ConstantInt(VScaleFactor)))))
       return VScaleFactor >= EC.getKnownMinValue();
+    if (match(VLParam,
+              m_NUWTruncOrSelf(m_Shl(m_VScale(), m_ConstantInt(VScaleFactor)))))
+      return 1 << VScaleFactor >= EC.getKnownMinValue();
     return (EC.getKnownMinValue() == 1) && match(VLParam, m_VScale());
   }
 
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll
new file mode 100644
index 0000000000000..8dd8ae153ae58
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-convert-evl.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=pre-isel-intrinsic-lowering -expandvp-override-evl-transform=Convert -expandvp-override-mask-transform=Legal -S < %s | FileCheck %s
+
+define <vscale x 2 x i64> @unknown_evl(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m, i32 %evl) {
+; CHECK-LABEL: define <vscale x 2 x i64> @unknown_evl(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]], i32 [[EVL:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 [[EVL]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 2
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[TMP2]], i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[ADD]]
+;
+  %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %add
+}
+
+define <2 x i64> @exact_evl_fixed(<2 x i64> %x, <2 x i64> %y, <2 x i1> %m) {
+; CHECK-LABEL: define <2 x i64> @exact_evl_fixed(
+; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i1> [[M:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> poison, <2 x i64> poison, <2 x i1> [[M]], i32 2)
+; CHECK-NEXT:    ret <2 x i64> [[ADD]]
+;
+  %add = call <2 x i64> @llvm.vp.add(<2 x i64> poison, <2 x i64> poison, <2 x i1> %m, i32 2)
+  ret <2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_mul(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_mul(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[EVL:%.*]] = mul i32 [[VSCALE]], 2
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[ADD]]
+;
+  %vscale = call i32 @llvm.vscale()
+  %evl = mul i32 %vscale, 2
+  %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_shl(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_shl(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[EVL:%.*]] = shl i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[ADD]]
+;
+  %vscale = call i32 @llvm.vscale()
+  %evl = shl i32 %vscale, 1
+  %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_mul_trunc(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_mul_trunc(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[SHL:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[EVL:%.*]] = trunc nuw i64 [[SHL]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[ADD]]
+;
+  %vscale = call i64 @llvm.vscale()
+  %shl = mul i64 %vscale, 2
+  %evl = trunc nuw i64 %shl to i32
+  %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %add
+}
+
+
+define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[VSCALE]], 1
+; CHECK-NEXT:    [[EVL:%.*]] = trunc nuw i64 [[SHL]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[ADD]]
+;
+  %vscale = call i64 @llvm.vscale()
+  %shl = shl i64 %vscale, 1
+  %evl = trunc nuw i64 %shl to i32
+  %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %add
+}
+
+define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc_no_nuw(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %m) {
+; CHECK-LABEL: define <vscale x 2 x i64> @exact_evl_vscale_shl_trunc_no_nuw(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]], <vscale x 2 x i64> [[Y:%.*]], <vscale x 2 x i1> [[M:%.*]]) {
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[VSCALE]], 1
+; CHECK-NEXT:    [[EVL:%.*]] = trunc i64 [[SHL]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 [[EVL]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M]]
+; CHECK-NEXT:    [[VSCALE1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE1]], 2
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> [[TMP2]], i32 [[SCALABLE_SIZE]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[ADD]]
+;
+  %vscale = call i64 @llvm.vscale()
+  %shl = shl i64 %vscale, 1
+  %evl = trunc i64 %shl to i32
+  %add = call <vscale x 2 x i64> @llvm.vp.add(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i1> %m, i32 %evl)
+  ret <vscale x 2 x i64> %add
+}

@@ -613,8 +613,12 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const {
   if (EC.isScalable()) {
     // Compare vscale patterns
     uint64_t VScaleFactor;
-    if (match(VLParam, m_Mul(m_VScale(), m_ConstantInt(VScaleFactor))))
+    if (match(VLParam,
+              m_NUWTruncOrSelf(m_Mul(m_VScale(), m_ConstantInt(VScaleFactor)))))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check for NUW on the Mul too?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants