[flang][OpenMP] Support multi-block reduction combiner regions on the GPU #156837

ergawy · 2025-09-04T09:40:16Z

Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.

PR stack:

… GPU Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.

llvmbot · 2025-09-04T09:40:50Z

@llvm/pr-subscribers-mlir-llvm
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-flang-openmp

Author: Kareem Ergawy (ergawy)

Changes

Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.

Full diff: https://github.com/llvm/llvm-project/pull/156837.diff

2 Files Affected:

(modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+3)
(added) mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir (+85)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3d5e487c8990f..fe00a2a5696dc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3506,6 +3506,8 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
         return AfterIP.takeError();
       if (!Builder.GetInsertBlock())
         return ReductionFunc;
+
+      Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint());
       Builder.CreateStore(Reduced, LHSPtr);
     }
   }
@@ -3750,6 +3752,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
           RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
       if (!AfterIP)
         return AfterIP.takeError();
+      Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint());
       Builder.CreateStore(Reduced, LHS, false);
     }
   }
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
new file mode 100644
index 0000000000000..aaf06d2d0e0c2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Verifies that the IR builder can handle reductions with multi-block combiner
+// regions on the GPU.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+  llvm.func @bar() {}
+  llvm.func @baz() {}
+
+  omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    omp.yield(%arg1 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    llvm.call @bar() : () -> ()
+    llvm.br ^bb3
+
+  ^bb3:  // pred: ^bb1
+    llvm.call @baz() : () -> ()
+    omp.yield(%arg0 : !llvm.ptr)
+  }
+  llvm.func @foo_() {
+    %c1 = llvm.mlir.constant(1 : i64) : i64
+    %10 = llvm.alloca %c1 x !llvm.array<5 x f32> {bindc_name = "x"} : (i64) -> !llvm.ptr<5>
+    %11 = llvm.addrspacecast %10 : !llvm.ptr<5> to !llvm.ptr
+    %74 = omp.map.info var_ptr(%11 : !llvm.ptr, !llvm.array<5 x f32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"}
+    omp.target map_entries(%74 -> %arg0 : !llvm.ptr) {
+      %c1_2 = llvm.mlir.constant(1 : i32) : i32
+      %c10 = llvm.mlir.constant(10 : i32) : i32
+      omp.teams reduction(byref @add_reduction_byref_box_5xf32 %arg0 -> %arg2 : !llvm.ptr) {
+        omp.parallel {
+          omp.distribute {
+            omp.wsloop {
+              omp.loop_nest (%arg5) : i32 = (%c1_2) to (%c10) inclusive step (%c1_2) {
+                omp.yield
+              }
+            } {omp.composite}
+          } {omp.composite}
+          omp.terminator
+        } {omp.composite}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK:      call void @__kmpc_parallel_51({{.*}}, i32 1, i32 -1, i32 -1,
+// CHECK-SAME:   ptr @[[PAR_OUTLINED:.*]], ptr null, ptr %2, i64 1)
+
+// CHECK: define internal void @[[PAR_OUTLINED]]{{.*}} {
+// CHECK:   .omp.reduction.then:
+// CHECK:     br label %omp.reduction.nonatomic.body
+
+// CHECK:   omp.reduction.nonatomic.body:
+// CHECK:     call void @bar()
+// CHECK:     br label %[[BODY_2ND_BB:.*]]
+
+// CHECK:   [[BODY_2ND_BB]]:
+// CHECK:     call void @baz()
+// CHECK:     br label %[[CONT_BB:.*]]
+
+// CHECK:   [[CONT_BB]]:
+// CHECK:     br label %.omp.reduction.done
+// CHECK: }
+
+// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 {
+// CHECK:     br label %omp.reduction.nonatomic.body
+
+// CHECK:   [[BODY_2ND_BB:.*]]:
+// CHECK:     call void @baz()
+// CHECK:     br label %omp.region.cont
+
+
+// CHECK: omp.reduction.nonatomic.body:
+// CHECK:   call void @bar()
+// CHECK:     br label %[[BODY_2ND_BB]]
+
+// CHECK: }

[flang][OpenMP] Support multi-block reduction combiner regions on the…

49715c2

… GPU Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.

llvmbot added mlir:llvm mlir flang:openmp clang:openmp OpenMP related changes to Clang labels Sep 4, 2025

This was referenced Sep 4, 2025

[flang][OpenMP] do concurrent: support reduce on device #156610

Open

[flang][OpenMP] do concurrent: support local on device #156589

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][OpenMP] Support multi-block reduction combiner regions on the GPU #156837

[flang][OpenMP] Support multi-block reduction combiner regions on the GPU #156837

ergawy commented Sep 4, 2025 •

edited

Loading

Uh oh!

llvmbot commented Sep 4, 2025 •

edited

Loading

Uh oh!

Uh oh!

[flang][OpenMP] Support multi-block reduction combiner regions on the GPU #156837

Are you sure you want to change the base?

[flang][OpenMP] Support multi-block reduction combiner regions on the GPU #156837

Conversation

ergawy commented Sep 4, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Sep 4, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

ergawy commented Sep 4, 2025 •

edited

Loading

llvmbot commented Sep 4, 2025 •

edited

Loading