Skip to content

Conversation

jhuber6
Copy link
Contributor

@jhuber6 jhuber6 commented Aug 29, 2025

Summary:
The masked load / store LLVM intrinsics take an argument for the
alignment. If the user is pessimistic about alignment they can provide a
value of 1 for an unaligned load. This patch updates infer-alignment to
increase the alignment value of the alignment argument if it is known
greater than the provided one.

Ignoring the gather / scatter versions for now since they contain many
pointers.

@jhuber6 jhuber6 requested a review from nikic as a code owner August 29, 2025 16:36
@llvmbot llvmbot added llvm:instcombine Covers the InstCombine, InstSimplify and AggressiveInstCombine passes llvm:transforms labels Aug 29, 2025
@llvmbot
Copy link
Member

llvmbot commented Aug 29, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Joseph Huber (jhuber6)

Changes

Summary:
The masked load / store LLVM intrinsics take an argument for the
alignment. If the user is pessimistic about alignment they can provide a
value of 1 for an unaligned load. This patch updates instcombine to
increase the alignment value of the alignment argument if it is known
greater than the provided one.

Ignoring the gather / scatter versions for now since they contain many
pointers.


Full diff: https://github.com/llvm/llvm-project/pull/156057.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (+34-14)
  • (modified) llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll (+1-1)
  • (modified) llvm/test/Transforms/InstCombine/masked_intrinsics.ll (+31)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 42b65dde67255..7e50e55ae24c8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -288,8 +288,11 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
 // * Narrow width by halfs excluding zero/undef lanes
 Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
   Value *LoadPtr = II.getArgOperand(0);
-  const Align Alignment =
-      cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+  Align Alignment = cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+
+  Align KnownAlign = getKnownAlignment(LoadPtr, DL, &II, &AC, &DT);
+  if (Alignment < KnownAlign)
+    Alignment = KnownAlign;
 
   // If the mask is all ones or undefs, this is a plain vector load of the 1st
   // argument.
@@ -310,6 +313,15 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
     return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
   }
 
+  // Update the alignment if the known value is greater than the provided one.
+  if (cast<ConstantInt>(II.getArgOperand(1))->getAlignValue() < Alignment) {
+    SmallVector<Value *> Args(II.arg_begin(), II.arg_end());
+    Args[1] = Builder.getInt32(Alignment.value());
+    CallInst *CI = Builder.CreateCall(II.getCalledFunction(), Args);
+    CI->copyMetadata(II);
+    return CI;
+  }
+
   return nullptr;
 }
 
@@ -317,33 +329,41 @@ Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
 // * Single constant active lane -> store
 // * Narrow width by halfs excluding zero/undef lanes
 Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
-  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
-  if (!ConstMask)
-    return nullptr;
+  Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+
+  Align KnownAlign = getKnownAlignment(II.getArgOperand(1), DL, &II, &AC, &DT);
+  if (Alignment < KnownAlign)
+    Alignment = KnownAlign;
 
   // If the mask is all zeros, this instruction does nothing.
-  if (ConstMask->isNullValue())
+  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+  if (ConstMask && ConstMask->isNullValue())
     return eraseInstFromFunction(II);
 
   // If the mask is all ones, this is a plain vector store of the 1st argument.
-  if (ConstMask->isAllOnesValue()) {
+  if (ConstMask && ConstMask->isAllOnesValue()) {
     Value *StorePtr = II.getArgOperand(1);
-    Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
     StoreInst *S =
         new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
     S->copyMetadata(II);
     return S;
   }
 
-  if (isa<ScalableVectorType>(ConstMask->getType()))
+  if (ConstMask && isa<ScalableVectorType>(ConstMask->getType()))
     return nullptr;
 
   // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
-  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
-  APInt PoisonElts(DemandedElts.getBitWidth(), 0);
-  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts,
-                                            PoisonElts))
-    return replaceOperand(II, 0, V);
+  if (ConstMask) {
+    APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
+    APInt PoisonElts(DemandedElts.getBitWidth(), 0);
+    if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts,
+                                              PoisonElts))
+      return replaceOperand(II, 0, V);
+  }
+
+  // Update the alignment if the known value is greater than the provided one.
+  if (cast<ConstantInt>(II.getArgOperand(2))->getAlignValue() < Alignment)
+    return replaceOperand(II, 2, Builder.getInt32(Alignment.value()));
 
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll b/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
index 918ea605a10bf..6ba52c178b8d4 100644
--- a/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
+++ b/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
@@ -7,7 +7,7 @@
 define void @combine_masked_load_store_from_constant_array(ptr %ptr) {
 ; CHECK-LABEL: @combine_masked_load_store_from_constant_array(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 0, i32 10)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nonnull @contant_int_array, i32 8, <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nonnull @contant_int_array, i32 16, <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP2]], ptr [[PTR:%.*]], i32 1, <vscale x 2 x i1> [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
index 8f7683419a82a..6c1168e6c1c70 100644
--- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
@@ -439,3 +439,34 @@ define <2 x i64> @negative_gather_v2i64_uniform_ptrs_no_all_active_mask(ptr %src
 declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
 
+; Alignment tests
+
+define <2 x i32> @unaligned_load(<2 x i1> %mask, ptr %ptr) {
+; CHECK-LABEL: @unaligned_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[PTR:%.*]], i64 64) ]
+; CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[PTR]], i32 64, <2 x i1> [[MASK:%.*]], <2 x i32> poison)
+; CHECK-NEXT:    ret <2 x i32> [[MASKED_LOAD]]
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %ptr, i64 64) ]
+  %masked_load = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %ptr, i32 1, <2 x i1> %mask, <2 x i32> poison)
+  ret <2 x i32> %masked_load
+}
+
+define void @unaligned_store(<2 x i1> %mask, <2 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: @unaligned_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[PTR:%.*]], i64 64) ]
+; CHECK-NEXT:    tail call void @llvm.masked.store.v2i32.p0(<2 x i32> [[VAL:%.*]], ptr [[PTR]], i32 64, <2 x i1> [[MASK:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %ptr, i64 64) ]
+  tail call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %ptr, i32 1, <2 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.assume(i1)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)

@jhuber6 jhuber6 requested a review from arsenm August 29, 2025 17:07
jhuber6 added a commit to jhuber6/llvm-project that referenced this pull request Aug 29, 2025
Summary:
Right now these enforce alignment, which isn't convenient for the user
on platforms that support unaligned accesses. The options are to either
permit passing the alignment manually, or just assume it's unaligned
unless the user specifies it.

I've added llvm#156057 which should
make the requested alignment show up on the intrinsic if the user
passed `__builtin_assume_aligned`, however that's only with
optimizations. This shouldn't cause issues unless the backend
categorically decides to reject an unaligned access.
Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should happen in the InferAlignment pass instead.

@nikic
Copy link
Contributor

nikic commented Aug 29, 2025

As a side note, we should really replace this argument with an align attribute on the pointer parameter...

@jhuber6
Copy link
Contributor Author

jhuber6 commented Aug 29, 2025

As a side note, we should really replace this argument with an align attribute on the pointer parameter...

Yeah, I was wondering why it was needed since the compress and expand variants don't use it. I'm guessing that would need some auto-upgrade handling or something.

…s if known

Summary:
The masked load / store LLVM intrinsics take an argument for the
alignment. If the user is pessimistic about alignment they can provide a
value of `1` for an unaligned load. This patch updates infer-alignment to
increase the alignment value of the alignment argument if it is known
greater than the provided one.

Ignoring the gather / scatter versions for now since they contain many
pointers.
@jhuber6 jhuber6 changed the title [InstCombine] Increase alignment in masked load / store intrinsics if known [InferAlignment] Increase alignment in masked load / store intrinsics if known Aug 29, 2025
@jhuber6
Copy link
Contributor Author

jhuber6 commented Aug 29, 2025

This should happen in the InferAlignment pass instead.

Done.

@nikic
Copy link
Contributor

nikic commented Aug 30, 2025

As a side note, we should really replace this argument with an align attribute on the pointer parameter...

Yeah, I was wondering why it was needed since the compress and expand variants don't use it. I'm guessing that would need some auto-upgrade handling or something.

Right. We've done this change for the memcpy etc intrinsics in the past:

case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset: {
// We have to make sure that the call signature is what we're expecting.
// We only want to change the old signatures by removing the alignment arg:
// @llvm.mem[cpy|move]...(i8*, i8*, i[32|i64], i32, i1)
// -> @llvm.mem[cpy|move]...(i8*, i8*, i[32|i64], i1)
// @llvm.memset...(i8*, i8, i[32|64], i32, i1)
// -> @llvm.memset...(i8*, i8, i[32|64], i1)
// Note: i8*'s in the above can be any pointer type
if (CI->arg_size() != 5) {
DefaultCase();
return;
}
// Remove alignment argument (3), and add alignment attributes to the
// dest/src pointers.
Value *Args[4] = {CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(4)};
NewCall = Builder.CreateCall(NewFn, Args);
AttributeList OldAttrs = CI->getAttributes();
AttributeList NewAttrs = AttributeList::get(
C, OldAttrs.getFnAttrs(), OldAttrs.getRetAttrs(),
{OldAttrs.getParamAttrs(0), OldAttrs.getParamAttrs(1),
OldAttrs.getParamAttrs(2), OldAttrs.getParamAttrs(4)});
NewCall->setAttributes(NewAttrs);
auto *MemCI = cast<MemIntrinsic>(NewCall);
// All mem intrinsics support dest alignment.
const ConstantInt *Align = cast<ConstantInt>(CI->getArgOperand(3));
MemCI->setDestAlignment(Align->getMaybeAlignValue());
// Memcpy/Memmove also support source alignment.
if (auto *MTI = dyn_cast<MemTransferInst>(MemCI))
MTI->setSourceAlignment(Align->getMaybeAlignValue());
break;
}

Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM with one very petty minor - cheers!

// TODO: Also handle memory intrinsics.
return false;

IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(style) auto *II = dyn_cast<IntrinsicInst>(I);

@jhuber6 jhuber6 merged commit abda8be into llvm:main Sep 2, 2025
9 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 2, 2025

LLVM Buildbot has detected a new failure on builder cross-project-tests-sie-ubuntu-dwarf5 running on doug-worker-1b while building llvm at step 6 "test-build-unified-tree-check-cross-project".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/163/builds/25722

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-cross-project) failure: test (failure)
******************** TEST 'cross-project-tests :: debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
clang++ -O0 -glldb -std=gnu++11 /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp -o /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/projects/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/Output/float_range_no_arg.cpp.tmp # RUN: at line 10
+ clang++ -O0 -glldb -std=gnu++11 /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp -o /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/projects/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/Output/float_range_no_arg.cpp.tmp
"/usr/bin/python3.10" "/home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/dexter.py" test --fail-lt 1.0 -w --debugger lldb-dap --lldb-executable "/home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/bin/lldb-dap" --binary /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/projects/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/Output/float_range_no_arg.cpp.tmp -- /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp | /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/bin/FileCheck /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp # RUN: at line 11
+ /usr/bin/python3.10 /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/dexter.py test --fail-lt 1.0 -w --debugger lldb-dap --lldb-executable /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/bin/lldb-dap --binary /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/projects/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/Output/float_range_no_arg.cpp.tmp -- /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp
+ /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/build/bin/FileCheck /home/buildbot/buildbot-root/cross-project-tests-sie-ubuntu-dwarf5/llvm-project/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp


****************************************


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
llvm:instcombine Covers the InstCombine, InstSimplify and AggressiveInstCombine passes llvm:transforms
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants