-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[IndVarSimplify] Allow predicateLoopExit on some loops with local writes #155901
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[IndVarSimplify] Allow predicateLoopExit on some loops with local writes #155901
Conversation
Created using spr 1.3.4
Created using spr 1.3.4
CI error is preexisting |
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms Author: Florian Mayer (fmayer) Changes: This is important to optimize patterns that frequently appear with bounds checks,
which gets roughly turned into
Alive2: https://alive2.llvm.org/ce/z/NREqBS. I did a stage2-check-all for both normal builds and builds with -DBOOTSTRAP_CMAKE_C[XX]_FLAGS="-fsanitize=array-bounds -fsanitize-trap=all". I also ran some Google-internal tests with -fsanitize=array-bounds; everything passes. Patch is 32.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155901.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index c32731185afd0..dc14bfe857a76 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -53,6 +53,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@@ -117,6 +118,10 @@ static cl::opt<bool>
LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
cl::desc("Predicate conditions in read only loops"));
+static cl::opt<bool> LoopPredicationTraps(
+ "indvars-predicate-loop-traps", cl::Hidden, cl::init(true),
+ cl::desc("Predicate conditions that trap in loops with only local writes"));
+
static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
cl::desc("Allow widening of indvars to eliminate s/zext"));
@@ -1816,11 +1821,31 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// suggestions on how to improve this? I can obviously bail out for outer
// loops, but that seems less than ideal. MemorySSA can find memory writes,
// is that enough for *all* side effects?
+ bool HasLocalSideEffects = false;
for (BasicBlock *BB : L->blocks())
for (auto &I : *BB)
// TODO:isGuaranteedToTransfer
- if (I.mayHaveSideEffects())
- return false;
+ if (I.mayHaveSideEffects()) {
+ if (!LoopPredicationTraps)
+ return false;
+ HasLocalSideEffects = true;
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ // The local could have leaked out of the function, so we need to
+ // consider atomic operations as effects.
+ // Because we need to preserve the relative order of volatile
+ // accesses, turn off this optimization if we see any of them.
+ // TODO:
+ // We could be smarter about volatile, and check whether the
+ // reordering is valid.
+ // We also could be smarter about atomic, and check whether the
+ // local has leaked.
+ if (SI->isAtomic() || SI->isVolatile() ||
+ findAllocaForValue(SI->getPointerOperand(), false) == nullptr)
+ return false;
+ } else {
+ return false;
+ }
+ }
bool Changed = false;
// Finally, do the actual predication for all predicatable blocks. A couple
@@ -1840,6 +1865,34 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ if (HasLocalSideEffects) {
+ BasicBlock *Unreachable = nullptr;
+ BasicBlock *InLoop = nullptr;
+ for (BasicBlock *Succ : BI->successors()) {
+ if (isa<UnreachableInst>(Succ->getTerminator()))
+ Unreachable = Succ;
+ else if (L->contains(Succ))
+ InLoop = Succ;
+ }
+ // Exit BB which have one branch back into the loop and another one to
+ // a trap can still be optimized, because local side effects cannot
+ // be observed in the exit case (the trap). We could be smarter about
+ // this, but for now lets pattern match common cases that directly trap.
+ if (Unreachable == nullptr || InLoop == nullptr)
+ return Changed;
+ if (llvm::any_of(*Unreachable, [](Instruction &I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() != Intrinsic::trap &&
+ II->getIntrinsicID() != Intrinsic::ubsantrap)
+ return true;
+ } else if (!isa<UnreachableInst>(I)) {
+ return true;
+ }
+ return false;
+ })) {
+ return Changed;
+ }
+ }
Value *NewCond;
if (ExitCount == ExactBTC) {
NewCond = L->contains(BI->getSuccessor(0)) ?
diff --git a/llvm/test/Analysis/ScalarEvolution/unreachable-exit.ll b/llvm/test/Analysis/ScalarEvolution/unreachable-exit.ll
new file mode 100644
index 0000000000000..c44d5de1503eb
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/unreachable-exit.ll
@@ -0,0 +1,536 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=indvars < %s | FileCheck %s
+
+source_filename = "/usr/local/google/home/fmayer/loop/src3.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress uwtable
+define dso_local void @foo(i32 noundef %block_size) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @foo(
+; CHECK-SAME: i32 noundef [[BLOCK_SIZE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [1024 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [1025 x i8], align 16
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCK_SIZE]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 1024)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 1024, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[BAR_ARR]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+ %foo_arr = alloca [1024 x i8], align 16
+ %bar_arr = alloca [1025 x i8], align 16
+ call void @llvm.lifetime.start.p0(ptr nonnull %foo_arr) #4
+ call void @llvm.lifetime.start.p0(ptr nonnull %bar_arr) #4
+ call void @x(ptr noundef nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr noundef nonnull %bar_arr)
+ call void @llvm.lifetime.end.p0(ptr nonnull %bar_arr) #4
+ call void @llvm.lifetime.end.p0(ptr nonnull %foo_arr) #4
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 1023
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %idxprom = zext nneg i32 %i.015 to i64
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1, !tbaa !5
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i64 %idxprom
+ store i8 %1, ptr %arrayidx7, align 1, !tbaa !5
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !8
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local void @foo_atomic(i32 noundef %block_size) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @foo_atomic(
+; CHECK-SAME: i32 noundef [[BLOCK_SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [1024 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [1025 x i8], align 16
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCK_SIZE]] to i64
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[BAR_ARR]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store atomic i8 [[TMP4]], ptr [[ARRAYIDX7]] unordered, align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8]]
+;
+entry:
+ %foo_arr = alloca [1024 x i8], align 16
+ %bar_arr = alloca [1025 x i8], align 16
+ call void @llvm.lifetime.start.p0(ptr nonnull %foo_arr) #4
+ call void @llvm.lifetime.start.p0(ptr nonnull %bar_arr) #4
+ call void @x(ptr noundef nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr noundef nonnull %bar_arr)
+ call void @llvm.lifetime.end.p0(ptr nonnull %bar_arr) #4
+ call void @llvm.lifetime.end.p0(ptr nonnull %foo_arr) #4
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 1023
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %idxprom = zext nneg i32 %i.015 to i64
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1, !tbaa !5
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i64 %idxprom
+ store atomic i8 %1, ptr %arrayidx7 unordered, align 1, !tbaa !5
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !8
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local void @foo_volatile(i32 noundef %block_size) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @foo_volatile(
+; CHECK-SAME: i32 noundef [[BLOCK_SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [1024 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [1025 x i8], align 16
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCK_SIZE]] to i64
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[BAR_ARR]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8]]
+;
+entry:
+ %foo_arr = alloca [1024 x i8], align 16
+ %bar_arr = alloca [1025 x i8], align 16
+ call void @llvm.lifetime.start.p0(ptr nonnull %foo_arr) #4
+ call void @llvm.lifetime.start.p0(ptr nonnull %bar_arr) #4
+ call void @x(ptr noundef nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr noundef nonnull %bar_arr)
+ call void @llvm.lifetime.end.p0(ptr nonnull %bar_arr) #4
+ call void @llvm.lifetime.end.p0(ptr nonnull %foo_arr) #4
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 1023
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %idxprom = zext nneg i32 %i.015 to i64
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1, !tbaa !5
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i64 %idxprom
+ store volatile i8 %1, ptr %arrayidx7, align 1, !tbaa !5
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !8
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local void @foo_call(i32 noundef %block_size) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @foo_call(
+; CHECK-SAME: i32 noundef [[BLOCK_SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [1024 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [1025 x i8], align 16
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCK_SIZE]] to i64
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr noundef nonnull [[BAR_ARR]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[BAR_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[FOO_ARR]]) #[[ATTR6]]
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[INDVARS_IV]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: call void @x(ptr null)
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1, !tbaa [[TBAA5]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLE...
[truncated]
|
This is important to optimize patterns that frequently appear with
bounds checks:
which gets roughly turned into
Motivating example: https://github.com/google/boringssl/blob/main/crypto/fipsmodule/hmac/hmac.cc.inc#L138
I hand-verified the assembly and confirmed that this optimization removes the check in the loop.
This also allowed the loop to be vectorized.
Alive2: https://alive2.llvm.org/ce/z/NREqBS
I did a stage2-check-all for both normal builds and builds with
-DBOOTSTRAP_CMAKE_C[XX]_FLAGS="-fsanitize=array-bounds -fsanitize-trap=all".
I also ran some Google-internal tests with -fsanitize=array-bounds.
Everything passes.