[Offload] Implement olMemFill #154102
Conversation
Force-pushed from f4ec35b to 0c1583c
@callumfare I'm currently looking at supporting AMD for this, but I think it makes sense to merge this as is (after reviews) and I'll have that change be a separate MR. If only so we don't have to juggle repo permissions.
Force-pushed from 0c1583c to f96bdec
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-offload

Author: Callum Fare (callumfare)

Changes

Implement olMemFill to support filling device memory with arbitrary length patterns. AMDGPU support will be added in a follow-up PR.

Full diff: https://github.com/llvm/llvm-project/pull/154102.diff

11 Files Affected:
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 5f7158588bc77..82f942b2e06c5 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -63,3 +63,23 @@ def : Function {
];
let returns = [];
}
+
+def : Function {
+ let name = "olMemFill";
+ let desc = "Fill memory with copies of the given pattern";
+ let details = [
+ "Filling with patterns larger than 4 bytes may be less performant",
+ "The destination pointer and queue must be associated with the same device",
+ "The fill size must be a multiple of the pattern size",
+ ];
+ let params = [
+ Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+ Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
+ Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
+ Param<"void*", "PatternPtr", "", PARAM_IN>,
+ Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
+ ];
+ let returns = [
+ Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
+ ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 1c9dfc69d445a..0b6363ab6ffbf 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -656,6 +656,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
return Error::success();
}
+Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
+ void *PatternPtr, size_t FillSize) {
+ return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
+ Queue->AsyncInfo);
+}
+
Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
size_t ProgDataSize, ol_program_handle_t *Program) {
// Make a copy of the program binary in case it is released by the caller.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 83280fe0a49c9..949f23278277c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2583,6 +2583,30 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ hsa_status_t Status;
+
+ // We can use hsa_amd_memory_fill for this size, but it's not async so the
+ // queue needs to be synchronized first
+ if (PatternSize == 4) {
+ if (AsyncInfoWrapper.hasQueue())
+ if (auto Err = synchronize(AsyncInfoWrapper))
+ return Err;
+ Status = hsa_amd_memory_fill(TgtPtr, *(uint32_t *)(PatternPtr),
+ Size / PatternSize);
+
+ if (auto Err =
+ Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
+ return Err;
+ } else {
+ // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
+ // memory and copying to the device in one go.
+ return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+ }
+
+ return Plugin::success();
+ }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6f..b2145979ae599 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -957,6 +957,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ /// Fill data on the device with a pattern from the host
+ Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size, __tgt_async_info *AsyncInfo);
+ virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
+ int64_t PatternSize, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Run the kernel associated with \p EntryPtr
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c06c35e1e6a5b..fb672c9782a8b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
return Err;
}
+Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
+ int64_t PatternSize, int64_t Size,
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+ auto Err =
+ dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs,
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 361a781e8f9b6..e8da25bc1d155 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
DLWRAP(cuMemcpyHtoD, 3)
DLWRAP(cuMemcpyHtoDAsync, 4)
+DLWRAP(cuMemsetD8Async, 4)
+DLWRAP(cuMemsetD16Async, 4)
+DLWRAP(cuMemsetD32Async, 4)
+DLWRAP(cuMemsetD2D8Async, 6)
+DLWRAP(cuMemsetD2D16Async, 6)
+DLWRAP(cuMemsetD2D32Async, 6)
+
DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
DLWRAP(cuMemFreeAsync, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8b..93496d95327f3 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -321,6 +321,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
+CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+ CUstream);
+CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+ CUstream);
+CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+ CUstream);
+
CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a99357a3adeaa..70020a6581a5c 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -844,6 +844,58 @@ struct CUDADeviceTy : public GenericDeviceTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ if (auto Err = setContext())
+ return Err;
+
+ CUstream Stream;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ CUresult Res;
+ size_t N = Size / PatternSize;
+ if (PatternSize == 1) {
+ Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, *((const uint8_t *)PatternPtr),
+ N, Stream);
+ } else if (PatternSize == 2) {
+ Res = cuMemsetD16Async((CUdeviceptr)TgtPtr,
+ *((const uint16_t *)PatternPtr), N, Stream);
+ } else if (PatternSize == 4) {
+ Res = cuMemsetD32Async((CUdeviceptr)TgtPtr,
+ *((const uint32_t *)PatternPtr), N, Stream);
+ } else {
+ // For larger patterns we can do a series of strided fills to copy the
+ // pattern efficiently
+ int64_t MemsetSize = PatternSize % 4u == 0u ? 4u
+ : PatternSize % 2u == 0u ? 2u
+ : 1u;
+
+ int64_t NumberOfSteps = PatternSize / MemsetSize;
+ int64_t Pitch = NumberOfSteps * MemsetSize;
+ int64_t Height = Size / PatternSize;
+
+ for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
+ if (MemsetSize == 4) {
+ Res = cuMemsetD2D32Async(
+ (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+ *((const uint32_t *)PatternPtr + Step), 1u, Height, Stream);
+ } else if (MemsetSize == 2) {
+ Res = cuMemsetD2D16Async(
+ (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+ *((const uint16_t *)PatternPtr + Step), 1u, Height, Stream);
+ } else {
+ Res = cuMemsetD2D8Async((CUdeviceptr)TgtPtr + Step * MemsetSize,
+ Pitch, *((const uint8_t *)PatternPtr + Step),
+ 1u, Height, Stream);
+ }
+ }
+ }
+
+ return Plugin::check(Res, "error in cuMemset: %s");
+ }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
if (auto Err = setContext())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 25443fd1ac0b3..0286fe216b2dc 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -302,6 +302,22 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ if (PatternSize == 1) {
+ std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
+ } else {
+ for (unsigned int Step = 0; Step < Size; Step += PatternSize) {
+ auto *Dst =
+ reinterpret_cast<void *>(reinterpret_cast<char *>(TgtPtr) + Step);
+ std::memcpy(Dst, PatternPtr, PatternSize);
+ }
+ }
+
+ return Plugin::success();
+ }
+
/// All functions are already synchronous. No need to do anything on this
/// synchronization function.
Error synchronizeImpl(__tgt_async_info &AsyncInfo,
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index b25db7022e9d7..58c9b89d1ed0d 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -24,6 +24,7 @@ add_offload_unittest("kernel"
add_offload_unittest("memory"
memory/olMemAlloc.cpp
+ memory/olMemFill.cpp
memory/olMemFree.cpp
memory/olMemcpy.cpp)
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
new file mode 100644
index 0000000000000..1b0bafa202080
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -0,0 +1,134 @@
+//===------- Offload API tests - olMemFill --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olMemFillTest = OffloadQueueTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
+
+TEST_P(olMemFillTest, Success8) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint8_t Pattern = 0x42;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success16) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint16_t Pattern = 0x4242;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success32) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint32_t Pattern = 0xDEADBEEF;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLarge) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ struct PatternT {
+ uint64_t A;
+ uint64_t B;
+ } Pattern{UINT64_MAX, UINT64_MAX};
+
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+ constexpr size_t Size = 17 * 64;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ struct __attribute__((packed)) PatternT {
+ uint64_t A;
+ uint64_t B;
+ uint8_t C;
+ } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].C, 255);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, InvalidPatternSize) {
+ constexpr size_t Size = 1025;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint16_t Pattern = 0x4242;
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+ olMemFree(Alloc);
+}
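To make the large-pattern path in the CUDA plugin above easier to follow: each strided 2D memset writes one MemsetSize-byte chunk of the pattern into every PatternSize-byte row of the destination, so all the steps together reproduce a full pattern fill. Below is a minimal host-side C++ sketch of the same index math; it is an illustration only, does not call CUDA, and simply borrows the variable names from the plugin code.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Host-side illustration of the strided decomposition used for large patterns:
// each step writes one MemsetSize-byte chunk of the pattern into every
// PatternSize-byte row, so all steps together produce a full pattern fill.
static void stridedFill(uint8_t *Dst, const uint8_t *Pattern,
                        size_t PatternSize, size_t Size) {
  size_t MemsetSize = PatternSize % 4 == 0 ? 4 : PatternSize % 2 == 0 ? 2 : 1;
  size_t NumberOfSteps = PatternSize / MemsetSize;
  size_t Height = Size / PatternSize; // number of pattern repetitions
  for (size_t Step = 0; Step < NumberOfSteps; ++Step)
    for (size_t Row = 0; Row < Height; ++Row)
      std::memcpy(Dst + Row * PatternSize + Step * MemsetSize,
                  Pattern + Step * MemsetSize, MemsetSize);
}

int main() {
  const uint8_t Pattern[6] = {1, 2, 3, 4, 5, 6}; // MemsetSize resolves to 2
  std::vector<uint8_t> Got(6 * 8), Want(6 * 8);
  stridedFill(Got.data(), Pattern, sizeof(Pattern), Got.size());
  for (size_t I = 0; I < Want.size(); I += sizeof(Pattern))
    std::memcpy(Want.data() + I, Pattern, sizeof(Pattern));
  assert(Got == Want); // the strided fill matches a naive repeated memcpy
  return 0;
}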
I remember there being some weird stuff with the DMA engine on AMDGPU, maybe @jplehr remembers.
This has been fixed with ROCm 6.3
What happens with the tests when run on AMDGPU? Am I right to assume that they will fail?
I'm less concerned about right now; I mostly wonder about what happens once we run the tests in the buildbots.
Yes, they'll fail as-is. @RossBrunton is going to finish implementing AMDGPU support in a follow-up PR (I don't have the ability to test on AMD hardware right now). I can wait to merge this until that's ready so there's only a small window where the tests are failing.
Also, I think in future we could implement a way to mark certain tests as known failures on certain backends so they can be GTEST_SKIP'd. We have similar functionality in Unified Runtime.
Indeed, I think it would be preferable to have that ability moving forward.
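A minimal sketch of what such a known-failure guard could look like, assuming a hypothetical backend query; neither getCurrentBackend nor the Backend enum exist in Offload today, they stand in for whatever detection mechanism a follow-up would add.

#include <gtest/gtest.h>

// Hypothetical backend query; Offload does not currently expose this helper.
enum class Backend { CUDA, AMDGPU, Host };
static Backend getCurrentBackend() { return Backend::AMDGPU; } // stub for illustration

TEST(olMemFillKnownFailures, SkipUnsupportedBackend) {
  // Record the test as skipped on backends where the feature is known to be
  // missing, instead of letting it fail and turn the bots red.
  if (getCurrentBackend() == Backend::AMDGPU)
    GTEST_SKIP() << "olMemFill is not implemented for this backend yet";
  // ... the real test body would run here on supported backends ...
}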
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/11995. Here is the relevant piece of the build log for reference.
Fix regression introduced by #154102 - the way offload-tblgen handles names has changed
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/204/builds/19570. Here is the relevant piece of the build log for reference.
Regression fixed in #154947
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/203/builds/20758. Here is the relevant piece of the build log for reference.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/205/builds/19547. Here is the relevant piece of the build log for reference.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/140/builds/29354. Here is the relevant piece of the build log for reference.
Implement olMemFill to support filling device memory with arbitrary length patterns. AMDGPU support will be added in a follow-up PR.
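For reference, a minimal usage sketch of the new entry point, mirroring the unit tests added in this PR. Error checking is omitted for brevity, and the Device and Queue handles are assumed to have been obtained through the usual liboffload setup.

#include <OffloadAPI.h>
#include <cstdint>

// Fill a managed allocation with a repeated 4-byte pattern and wait for it.
// Device and Queue are assumed to be valid handles created elsewhere.
void fillExample(ol_device_handle_t Device, ol_queue_handle_t Queue) {
  constexpr size_t Size = 1024;
  void *Alloc = nullptr;
  olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc);

  // FillSize (Size here) must be a multiple of PatternSize, otherwise
  // OL_ERRC_INVALID_SIZE is returned.
  uint32_t Pattern = 0xDEADBEEF;
  olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size);

  // The fill is enqueued on Queue; synchronize before reading the memory.
  olSyncQueue(Queue);

  olMemFree(Alloc);
}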