-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[OpenMP][Offload] Add SPMD-No-Loop mode to OpenMP offload runtime #154105
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMP][Offload] Add SPMD-No-Loop mode to OpenMP offload runtime #154105
Conversation
Kernels that are marked as SPMD-No-Loop should be launched with a sufficient number of teams and threads to cover the loop iteration space. No-Loop mode is described in the RFC: https://discourse.llvm.org/t/rfc-no-loop-mode-for-openmp-gpu-kernels/87517/
@llvm/pr-subscribers-offload @llvm/pr-subscribers-flang-openmp Author: Dominik Adamski (DominikAdamski) Changes: Kernels that are marked as SPMD-No-Loop should be launched with a sufficient number of teams and threads to cover the loop iteration space. No-Loop mode is described in the RFC: https://discourse.llvm.org/t/rfc-no-loop-mode-for-openmp-gpu-kernels/87517/ Full diff: https://github.com/llvm/llvm-project/pull/154105.diff 3 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
index 3ae447b14f320..c41b4d1e9844c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h
@@ -23,7 +23,8 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
OMP_TGT_EXEC_MODE_GENERIC_SPMD =
- OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
+ OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP = 1 << 2 | OMP_TGT_EXEC_MODE_SPMD
};
} // end namespace omp
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6f..47e72147b1cc3 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -431,6 +431,8 @@ struct GenericKernelTy {
return "Generic";
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return "Generic-SPMD";
+ case OMP_TGT_EXEC_MODE_SPMD_NO_LOOP:
+ return "SPMD-No-Loop";
}
llvm_unreachable("Unknown execution mode!");
}
@@ -468,7 +470,8 @@ struct GenericKernelTy {
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
- /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+ /// Indicate if the kernel works in Generic SPMD, Generic, No-Loop
+ /// or SPMD mode.
bool isGenericSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -483,6 +486,10 @@ struct GenericKernelTy {
bool isBareMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
}
+ bool isNoLoopMode() const {
+ return KernelEnvironment.Configuration.ExecMode ==
+ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP;
+ }
/// The kernel name.
std::string Name;
@@ -1152,6 +1159,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
+ /// Return value of OMP_TEAMS_THREAD_LIMIT environment variable
+ int32_t getOMPTeamsThreadLimit() const { return OMP_TeamsThreadLimit; }
+
private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c06c35e1e6a5b..72d75010d9657 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -640,6 +640,18 @@ uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
if (ThreadLimitClause[0] > 0 && isGenericMode())
ThreadLimitClause[0] += GenericDevice.getWarpSize();
+ // Honor OMP_TEAMS_THREAD_LIMIT environment variable and
+ // num_threads/thread_limit clause for NoLoop kernel types.
+ int32_t TeamsThreadLimitEnvVar = GenericDevice.getOMPTeamsThreadLimit();
+ uint16_t ConstWGSize = GenericDevice.getDefaultNumThreads();
+ if (isNoLoopMode()) {
+ if (TeamsThreadLimitEnvVar > 0)
+ return std::min(static_cast<int32_t>(ConstWGSize),
+ TeamsThreadLimitEnvVar);
+ if ((ThreadLimitClause[0] > 0) && (ThreadLimitClause[0] != (uint32_t)-1))
+ return std::min(static_cast<uint32_t>(ConstWGSize), ThreadLimitClause[0]);
+ return ConstWGSize;
+ }
return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
? ThreadLimitClause[0]
: PreferredNumThreads);
@@ -662,6 +674,16 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+ const auto getNumGroupsFromThreadsAndTripCount =
+ [](const uint64_t TripCount, const uint32_t NumThreads) {
+ return ((TripCount - 1) / NumThreads) + 1;
+ };
+ if (isNoLoopMode()) {
+ return LoopTripCount > 0
+ ? getNumGroupsFromThreadsAndTripCount(LoopTripCount, NumThreads)
+ : 1;
+ }
+
uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
if (LoopTripCount > 0) {
|
PR for changes in the device RTL: #151959 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Did you miss the one in the DeviceRTL?
Please see #151959. I am planning to add support for No-Loop mode for Fortran OpenMP kernels first. |
I meant
|
Added |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
@@ -640,6 +640,18 @@ uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, | |||
if (ThreadLimitClause[0] > 0 && isGenericMode()) | |||
ThreadLimitClause[0] += GenericDevice.getWarpSize(); | |||
|
|||
// Honor OMP_TEAMS_THREAD_LIMIT environment variable and |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Taking a step back, what is the reason for special-casing getNumThreads handling for No-Loop kernels? I believe the existing logic (involving MaxNumThreads and PreferredNumThreads) handles both thread-related envars and OpenMP clauses. Is there a test case for No-Loop that does not work with the existing logic?
The primary change required for No-Loop kernels is making sure that the grid size is appropriate and that is ensured by the change in getNumBlocks. I am wondering whether this special handling in getNumThreads can be removed altogether.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a test case for No-Loop that does not work with the existing logic?
I haven't found one, so I removed the unnecessary code.
if (TeamsThreadLimitEnvVar > 0) | ||
return std::min(static_cast<int32_t>(ConstWGSize), | ||
TeamsThreadLimitEnvVar); | ||
if ((ThreadLimitClause[0] > 0) && (ThreadLimitClause[0] != (uint32_t)-1)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if ((ThreadLimitClause[0] > 0) && (ThreadLimitClause[0] != (uint32_t)-1)) | |
if ((ThreadLimitClause[0] > 0) && (ThreadLimitClause[0] != ~0U)) |
@@ -662,6 +674,16 @@ uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, | |||
return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); | |||
} | |||
|
|||
const auto getNumGroupsFromThreadsAndTripCount = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the point of doing a lambda here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I removed the lambda.
@@ -1167,6 +1174,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { | |||
/// deallocated by the allocator. | |||
llvm::SmallVector<DeviceImageTy *> LoadedImages; | |||
|
|||
/// Return value of OMP_TEAMS_THREAD_LIMIT environment variable | |||
int32_t getOMPTeamsThreadLimit() const { return OMP_TeamsThreadLimit; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not used anymore.
if (isNoLoopMode()) { | ||
return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (isNoLoopMode()) { | |
return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1; | |
} | |
if (isNoLoopMode()) | |
return LoopTripCount > 0 ? (((LoopTripCount - 1) / NumThreads) + 1) : 1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Kernels that are marked as SPMD-No-Loop should be launched with a sufficient number of teams and threads to cover the loop iteration space.
No-Loop mode is described in the RFC:
https://discourse.llvm.org/t/rfc-no-loop-mode-for-openmp-gpu-kernels/87517/