Skip to content

Conversation

ergawy
Copy link
Member

@ergawy ergawy commented Sep 4, 2025

@llvmbot
Copy link
Member

llvmbot commented Sep 4, 2025

@llvm/pr-subscribers-mlir-llvm
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-flang-openmp

Author: Kareem Ergawy (ergawy)

Changes

Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.


Full diff: https://github.com/llvm/llvm-project/pull/156837.diff

2 Files Affected:

  • (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+3)
  • (added) mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir (+85)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3d5e487c8990f..fe00a2a5696dc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3506,6 +3506,8 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
         return AfterIP.takeError();
       if (!Builder.GetInsertBlock())
         return ReductionFunc;
+
+      Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint());
       Builder.CreateStore(Reduced, LHSPtr);
     }
   }
@@ -3750,6 +3752,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
           RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
       if (!AfterIP)
         return AfterIP.takeError();
+      Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint());
       Builder.CreateStore(Reduced, LHS, false);
     }
   }
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
new file mode 100644
index 0000000000000..aaf06d2d0e0c2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Verifies that the IR builder can handle reductions with multi-block combiner
+// regions on the GPU.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+  llvm.func @bar() {}
+  llvm.func @baz() {}
+
+  omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    omp.yield(%arg1 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    llvm.call @bar() : () -> ()
+    llvm.br ^bb3
+
+  ^bb3:  // pred: ^bb1
+    llvm.call @baz() : () -> ()
+    omp.yield(%arg0 : !llvm.ptr)
+  }
+  llvm.func @foo_() {
+    %c1 = llvm.mlir.constant(1 : i64) : i64
+    %10 = llvm.alloca %c1 x !llvm.array<5 x f32> {bindc_name = "x"} : (i64) -> !llvm.ptr<5>
+    %11 = llvm.addrspacecast %10 : !llvm.ptr<5> to !llvm.ptr
+    %74 = omp.map.info var_ptr(%11 : !llvm.ptr, !llvm.array<5 x f32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"}
+    omp.target map_entries(%74 -> %arg0 : !llvm.ptr) {
+      %c1_2 = llvm.mlir.constant(1 : i32) : i32
+      %c10 = llvm.mlir.constant(10 : i32) : i32
+      omp.teams reduction(byref @add_reduction_byref_box_5xf32 %arg0 -> %arg2 : !llvm.ptr) {
+        omp.parallel {
+          omp.distribute {
+            omp.wsloop {
+              omp.loop_nest (%arg5) : i32 = (%c1_2) to (%c10) inclusive step (%c1_2) {
+                omp.yield
+              }
+            } {omp.composite}
+          } {omp.composite}
+          omp.terminator
+        } {omp.composite}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK:      call void @__kmpc_parallel_51({{.*}}, i32 1, i32 -1, i32 -1,
+// CHECK-SAME:   ptr @[[PAR_OUTLINED:.*]], ptr null, ptr %2, i64 1)
+
+// CHECK: define internal void @[[PAR_OUTLINED]]{{.*}} {
+// CHECK:   .omp.reduction.then:
+// CHECK:     br label %omp.reduction.nonatomic.body
+
+// CHECK:   omp.reduction.nonatomic.body:
+// CHECK:     call void @bar()
+// CHECK:     br label %[[BODY_2ND_BB:.*]]
+
+// CHECK:   [[BODY_2ND_BB]]:
+// CHECK:     call void @baz()
+// CHECK:     br label %[[CONT_BB:.*]]
+
+// CHECK:   [[CONT_BB]]:
+// CHECK:     br label %.omp.reduction.done
+// CHECK: }
+
+// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 {
+// CHECK:     br label %omp.reduction.nonatomic.body
+
+// CHECK:   [[BODY_2ND_BB:.*]]:
+// CHECK:     call void @baz()
+// CHECK:     br label %omp.region.cont
+
+
+// CHECK: omp.reduction.nonatomic.body:
+// CHECK:   call void @bar()
+// CHECK:     br label %[[BODY_2ND_BB]]
+
+// CHECK: }

@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_7_reduce_device branch from 6987182 to 9a7ae05 Compare September 8, 2025 12:00
@ergawy ergawy force-pushed the users/ergawy/omp_device_array_reds branch from 49715c2 to 3c3f326 Compare September 8, 2025 12:00
ergawy added a commit that referenced this pull request Sep 8, 2025
…ide values (#155754)

Following up on #154483, this PR introduces further refactoring to
extract some shared utils between OpenMP lowering and `do concurrent`
conversion pass. In particular, this PR extracts 2 utils that handle
mapping or cloning values used inside target regions but defined
outside.

Later `do concurrent` PR(s) will also use these utils.

PR stack:
- #155754 ◀️
- #155987
- #155992
- #155993
- #156589
- #156610
- #156837
… GPU

Fixes a bug related to insertion points when inlining multi-block
combiner reduction regions. The IP at the end of the inlined region was
not used resulting in emitting BBs with multiple terminators.
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_7_reduce_device branch from 9a7ae05 to 6d02115 Compare September 8, 2025 12:36
@ergawy ergawy force-pushed the users/ergawy/omp_device_array_reds branch from 3c3f326 to 8467dbc Compare September 8, 2025 12:36
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 8, 2025
… clone outside values (#155754)

Following up on #154483, this PR introduces further refactoring to
extract some shared utils between OpenMP lowering and `do concurrent`
conversion pass. In particular, this PR extracts 2 utils that handle
mapping or cloning values used inside target regions but defined
outside.

Later `do concurrent` PR(s) will also use these utils.

PR stack:
- llvm/llvm-project#155754 ◀️
- llvm/llvm-project#155987
- llvm/llvm-project#155992
- llvm/llvm-project#155993
- llvm/llvm-project#156589
- llvm/llvm-project#156610
- llvm/llvm-project#156837
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants