diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b1ae3cf6525b8..4c078a45628a9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3824,10 +3824,23 @@ let Visibility = [ClangOption, CC1Option, FC1Option, FlangOption] in { let Group = f_Group in { def fopenmp_target_debug_EQ : Joined<["-"], "fopenmp-target-debug=">; -def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">; -def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">; -def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">; -def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">; +def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">, + HelpText<"Allow enforcement to ensure there are enough teams to cover the " + "loop iteration space. It may ignore environment variables. " + "If the fopenmp-assume-teams-oversubscription and " + "fopenmp-assume-threads-oversubscription flags are set, Flang may " + "generate more optimized OpenMP kernels for target teams distribute " + "parallel do pragmas.">; +def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">, + HelpText<"Assume threads oversubscription. 
If the " + "fopenmp-assume-teams-oversubscription and " + "fopenmp-assume-threads-oversubscription flags are set, Flang may " + "generate more optimized OpenMP kernels for target teams distribute " + "parallel do pragmas.">; +def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">, + HelpText<"Do not assume teams oversubscription.">; +def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, + HelpText<"Do not assume threads oversubscription.">; def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag>; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 1050e3d8b08dd..49078e4162ebc 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1075,11 +1075,13 @@ class OpenMPIRBuilder { /// preheader of the loop. /// \param LoopType Information about type of loop worksharing. /// It corresponds to type of loop workshare OpenMP pragma. + /// \param NoLoop If true, no-loop code is generated. /// /// \returns Point where to insert code after the workshare construct. InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, - omp::WorksharingLoopType LoopType); + omp::WorksharingLoopType LoopType, + bool NoLoop); /// Modifies the canonical loop to be a statically-scheduled workshare loop. /// @@ -1199,6 +1201,7 @@ class OpenMPIRBuilder { /// present. /// \param LoopType Information about type of loop worksharing. /// It corresponds to type of loop workshare OpenMP pragma. + /// \param NoLoop If true, no-loop code is generated. /// /// \returns Point where to insert code after the workshare construct. 
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop( @@ -1209,7 +1212,8 @@ class OpenMPIRBuilder { bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false, bool HasOrderedClause = false, omp::WorksharingLoopType LoopType = - omp::WorksharingLoopType::ForStaticLoop); + omp::WorksharingLoopType::ForStaticLoop, + bool NoLoop = false); /// Tile a loop nest. /// diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index c0e956840f989..bec5abb45041f 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4955,7 +4955,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, - Function &LoopBodyFn) { + Function &LoopBodyFn, bool NoLoop) { Type *TripCountTy = TripCount->getType(); Module &M = OMPBuilder->M; IRBuilder<> &Builder = OMPBuilder->Builder; @@ -4984,7 +4984,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, if (LoopType == WorksharingLoopType::DistributeForStaticLoop) { RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); } - RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0)); + RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop)); Builder.CreateCall(RTLFn, RealArgs); } @@ -4992,7 +4992,7 @@ static void workshareLoopTargetCallback( OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted, - WorksharingLoopType LoopType) { + WorksharingLoopType LoopType, bool NoLoop) { IRBuilder<> &Builder = OMPIRBuilder->Builder; BasicBlock *Preheader = CLI->getPreheader(); Value *TripCount = CLI->getTripCount(); @@ -5039,17 +5039,16 @@ static void workshareLoopTargetCallback( OutlinedFnCallInstruction->eraseFromParent(); createTargetLoopWorkshareCall(OMPIRBuilder, 
LoopType, Preheader, Ident, - LoopBodyArg, TripCount, OutlinedFn); + LoopBodyArg, TripCount, OutlinedFn, NoLoop); for (auto &ToBeDeletedItem : ToBeDeleted) ToBeDeletedItem->eraseFromParent(); CLI->invalidate(); } -OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, - InsertPointTy AllocaIP, - WorksharingLoopType LoopType) { +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( + DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, + WorksharingLoopType LoopType, bool NoLoop) { uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); @@ -5132,7 +5131,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, OI.PostOutlineCB = [=, ToBeDeletedVec = std::move(ToBeDeleted)](Function &OutlinedFn) { workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec, - LoopType); + LoopType, NoLoop); }; addOutlineInfo(std::move(OI)); return CLI->getAfterIP(); @@ -5143,9 +5142,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop( bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, - WorksharingLoopType LoopType) { + WorksharingLoopType LoopType, bool NoLoop) { if (Config.isTargetDevice()) - return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType); + return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop); OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType( SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier, HasNonmonotonicModifier, HasOrderedClause); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index c080c3fac87d4..e0cd06805ab40 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ 
b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -230,6 +230,7 @@ def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">; def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>; def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>; def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>; +def TargetRegionFlagsNoLoop : I32BitEnumAttrCaseBit<"no_loop", 3>; def TargetRegionFlags : OpenMP_BitEnumAttr< "TargetRegionFlags", @@ -237,7 +238,8 @@ def TargetRegionFlags : OpenMP_BitEnumAttr< TargetRegionFlagsNone, TargetRegionFlagsGeneric, TargetRegionFlagsSpmd, - TargetRegionFlagsTripCount + TargetRegionFlagsTripCount, + TargetRegionFlagsNoLoop ]>; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 6e43f28e8d93d..1e10dd114b30a 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2106,6 +2106,29 @@ Operation *TargetOp::getInnermostCapturedOmpOp() { }); } +/// Check if we can promote SPMD kernel to No-Loop kernel +static bool canPromoteToNoLoop(Operation *capturedOp, TeamsOp teamsOp, + WsloopOp *wsLoopOp) { + // num_teams clause can break no-loop teams/threads assumption + if (teamsOp.getNumTeamsUpper()) + return false; + // reduction kernels are slower in no-loop mode + if (teamsOp.getNumReductionVars()) + return false; + if (wsLoopOp->getNumReductionVars()) + return false; + // check if the user allows the promotion of kernels to no-loop mode + OffloadModuleInterface offloadMod = + capturedOp->getParentOfType(); + if (!offloadMod) + return false; + auto ompFlags = offloadMod.getFlags(); + if (!ompFlags) + return false; + return ompFlags.getAssumeTeamsOversubscription() && + ompFlags.getAssumeThreadsOversubscription(); +} + TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { // A non-null captured op is only valid 
if it resides inside of a TargetOp // and is the result of calling getInnermostCapturedOmpOp() on it. @@ -2134,7 +2157,8 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { // Detect target-teams-distribute-parallel-wsloop[-simd]. if (numWrappers == 2) { - if (!isa<WsloopOp>(innermostWrapper)) + WsloopOp *wsloopOp = dyn_cast<WsloopOp>(innermostWrapper); + if (!wsloopOp) return TargetRegionFlags::generic; innermostWrapper = std::next(innermostWrapper); @@ -2145,12 +2169,17 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { if (!isa_and_present<ParallelOp>(parallelOp)) return TargetRegionFlags::generic; - Operation *teamsOp = parallelOp->getParentOp(); - if (!isa_and_present<TeamsOp>(teamsOp)) + TeamsOp teamsOp = dyn_cast<TeamsOp>(parallelOp->getParentOp()); + if (!teamsOp) return TargetRegionFlags::generic; - if (teamsOp->getParentOp() == targetOp.getOperation()) - return TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + if (teamsOp->getParentOp() == targetOp.getOperation()) { + TargetRegionFlags result = + TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + if (canPromoteToNoLoop(capturedOp, teamsOp, wsloopOp)) + result = result | TargetRegionFlags::no_loop; + return result; + } } // Detect target-teams-distribute[-simd] and target-teams-loop. 
else if (isa<DistributeOp, LoopOp>(innermostWrapper)) { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6694de8383534..d67d5eb741543 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2590,13 +2590,27 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, } builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin()); + + bool noLoopMode = false; + omp::TargetOp targetOp = wsloopOp->getParentOfType<omp::TargetOp>(); + if (targetOp) { + Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp(); + omp::TargetRegionFlags kernelFlags = + targetOp.getKernelExecFlags(targetCapturedOp); + if (omp::bitEnumContainsAll(kernelFlags, + omp::TargetRegionFlags::spmd | + omp::TargetRegionFlags::no_loop) && + !omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic)) + noLoopMode = true; + } + llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP = ompBuilder->applyWorkshareLoop( ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier, convertToScheduleKind(schedule), chunk, isSimd, scheduleMod == omp::ScheduleModifier::monotonic, scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered, - workshareLoopType); + workshareLoopType, noLoopMode); if (failed(handleError(wsloopIP, opInst))) return failure(); @@ -5365,6 +5379,12 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, ? 
llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD : llvm::omp::OMP_TGT_EXEC_MODE_GENERIC : llvm::omp::OMP_TGT_EXEC_MODE_SPMD; + if (omp::bitEnumContainsAll(kernelFlags, + omp::TargetRegionFlags::spmd | + omp::TargetRegionFlags::no_loop) && + !omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic)) + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; + attrs.MinTeams = minTeamsVal; attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 59a2cc3f27aca..653104ce883d1 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -800,10 +800,6 @@ template class StaticLoopChunker { // If we know we have more threads than iterations we can indicate that to // avoid an outer loop. - if (config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - if (OneIterationPerThread) ASSERT(NumThreads >= NumIters, "Broken assumption"); @@ -851,10 +847,6 @@ template class StaticLoopChunker { // If we know we have more blocks than iterations we can indicate that to // avoid an outer loop. - if (config::getAssumeTeamsOversubscription()) { - OneIterationPerThread = true; - } - if (OneIterationPerThread) ASSERT(NumBlocks >= NumIters, "Broken assumption"); @@ -914,11 +906,6 @@ template class StaticLoopChunker { // If we know we have more threads (across all blocks) than iterations we // can indicate that to avoid an outer loop. - if (config::getAssumeTeamsOversubscription() & - config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - if (OneIterationPerThread) ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); diff --git a/offload/test/offloading/fortran/target-no-loop.f90 b/offload/test/offloading/fortran/target-no-loop.f90 new file mode 100644 index 0000000000000..dd2bf7c2196b6 --- /dev/null +++ b/offload/test/offloading/fortran/target-no-loop.f90 @@ -0,0 +1,43 @@ +! 
Check if the first OpenMP GPU kernel is promoted to no-loop mode. +! The second cannot be promoted due to the limit on the number of teams. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +program main + use omp_lib + implicit none + integer :: i + integer :: array(1024), errors = 0 + array = 1 + + !$omp target teams distribute parallel do + do i = 1, 1024 + array(i) = i + end do + + do i = 1, 1024 + if ( array( i) .ne. (i) ) then + errors = errors + 1 + end if + end do + + !$omp target teams distribute parallel do num_teams(3) + do i = 1, 1024 + array(i) = i + end do + + do i = 1, 1024 + if ( array( i) .ne. (i) ) then + errors = errors + 1 + end if + end do + + print *,"number of errors: ", errors + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: number of errors: 0 +