-
Notifications
You must be signed in to change notification settings - Fork 24.9k
Move XPUEvent to c10 #158336
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
guangyey
wants to merge
17
commits into
gh/guangyey/171/base
Choose a base branch
from
gh/guangyey/171/head
base: gh/guangyey/171/base
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Move XPUEvent to c10 #158336
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
c79176d
Update
guangyey b939ff1
Update
guangyey 496422a
Update
guangyey f40b545
Update
guangyey c423d96
Update
guangyey 00f0721
Update
guangyey 8a3c701
Update
guangyey ea9f4ac
Update
guangyey e974c59
Update
guangyey c96631d
Update
guangyey d2f9e62
Update
guangyey 2747427
Update
guangyey bb9c43e
Update
guangyey 4f0b958
Update
guangyey acb8d1e
Update
guangyey 5a015dc
Update
guangyey fe221f5
Update
guangyey File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,191 +1,3 @@ | ||
#pragma once | ||
#include <ATen/xpu/XPUContext.h> | ||
|
||
#include <optional> | ||
|
||
namespace at::xpu { | ||
|
||
/* | ||
* XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are | ||
* constructed lazily when first recorded. It has a device, and this device is | ||
* acquired from the first recording stream. Later streams that record the event | ||
* must match the same device. | ||
* | ||
* Currently, XPUEvent does NOT support to export an inter-process event from | ||
* another process via inter-process communication(IPC). So it means that | ||
* inter-process communication for event handles between different processes is | ||
* not available. This could impact some applications that rely on cross-process | ||
* synchronization and communication. | ||
*/ | ||
struct TORCH_XPU_API XPUEvent { | ||
// Constructors | ||
XPUEvent(bool enable_timing = false) noexcept | ||
: enable_timing_{enable_timing} {} | ||
|
||
~XPUEvent() { | ||
if (isCreated()) { | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_deletion( | ||
at::kXPU, reinterpret_cast<uintptr_t>(event_.get())); | ||
} | ||
} | ||
} | ||
|
||
XPUEvent(const XPUEvent&) = delete; | ||
XPUEvent& operator=(const XPUEvent&) = delete; | ||
|
||
XPUEvent(XPUEvent&& other) = default; | ||
XPUEvent& operator=(XPUEvent&& other) = default; | ||
|
||
operator sycl::event&() const { | ||
return event(); | ||
} | ||
|
||
std::optional<at::Device> device() const { | ||
if (isCreated()) { | ||
return at::Device(at::kXPU, device_index_); | ||
} else { | ||
return std::nullopt; | ||
} | ||
} | ||
|
||
inline bool isCreated() const { | ||
return (event_.get() != nullptr); | ||
} | ||
|
||
DeviceIndex device_index() const { | ||
return device_index_; | ||
} | ||
|
||
sycl::event& event() const { | ||
return *event_; | ||
} | ||
|
||
bool query() const { | ||
using namespace sycl::info; | ||
if (!isCreated()) { | ||
return true; | ||
} | ||
|
||
return event().get_info<event::command_execution_status>() == | ||
event_command_status::complete; | ||
} | ||
|
||
void record() { | ||
record(getCurrentXPUStream()); | ||
} | ||
|
||
void recordOnce(const XPUStream& stream) { | ||
if (!isCreated()) { | ||
record(stream); | ||
} | ||
} | ||
|
||
void record(const XPUStream& stream) { | ||
if (!isCreated()) { | ||
device_index_ = stream.device_index(); | ||
assignEvent(stream.queue()); | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_creation( | ||
at::kXPU, reinterpret_cast<uintptr_t>(event_.get())); | ||
} | ||
} else { | ||
TORCH_CHECK( | ||
device_index_ == stream.device_index(), | ||
"Event device ", | ||
device_index_, | ||
" does not match recording stream's device ", | ||
stream.device_index(), | ||
"."); | ||
reassignEvent(stream.queue()); | ||
} | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_record( | ||
at::kXPU, | ||
reinterpret_cast<uintptr_t>(event_.get()), | ||
reinterpret_cast<uintptr_t>(&stream.queue())); | ||
} | ||
} | ||
|
||
void block(const XPUStream& stream) { | ||
if (isCreated()) { | ||
std::vector<sycl::event> event_list{event()}; | ||
// Make this stream wait until event_ is completed. | ||
stream.queue().ext_oneapi_submit_barrier(event_list); | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_wait( | ||
at::kXPU, | ||
reinterpret_cast<uintptr_t>(event_.get()), | ||
reinterpret_cast<uintptr_t>(&stream.queue())); | ||
} | ||
} | ||
} | ||
|
||
double elapsed_time(const XPUEvent& other) const { | ||
TORCH_CHECK( | ||
isCreated() && other.isCreated(), | ||
"Both events must be recorded before calculating elapsed time."); | ||
TORCH_CHECK( | ||
query() && other.query(), | ||
"Both events must be completed before calculating elapsed time."); | ||
TORCH_CHECK( | ||
enable_timing_ && other.enable_timing_, | ||
"Both events must be created with argument 'enable_timing=True'."); | ||
|
||
#if SYCL_COMPILER_VERSION < 20250000 | ||
TORCH_CHECK_NOT_IMPLEMENTED( | ||
false, | ||
"elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); | ||
#endif | ||
|
||
using namespace sycl::info::event_profiling; | ||
// Block until both of the recorded events are completed. | ||
uint64_t end_time_ns = other.event().get_profiling_info<command_end>(); | ||
uint64_t start_time_ns = event().get_profiling_info<command_end>(); | ||
// Return the eplased time in milliseconds. | ||
return 1e-6 * | ||
(static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns)); | ||
} | ||
|
||
void synchronize() const { | ||
if (isCreated()) { | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_synchronization( | ||
at::kXPU, reinterpret_cast<uintptr_t>(event_.get())); | ||
} | ||
event().wait_and_throw(); | ||
} | ||
} | ||
|
||
private: | ||
void assignEvent(sycl::queue& queue) { | ||
#if SYCL_COMPILER_VERSION >= 20250000 | ||
if (enable_timing_) { | ||
event_ = std::make_unique<sycl::event>( | ||
sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); | ||
} else { | ||
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier()); | ||
} | ||
#else | ||
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier()); | ||
#endif | ||
} | ||
|
||
void reassignEvent(sycl::queue& queue) { | ||
event_.reset(); | ||
assignEvent(queue); | ||
} | ||
|
||
bool enable_timing_ = false; | ||
DeviceIndex device_index_ = -1; | ||
// Only need to track the last event, as events in an in-order queue are | ||
// executed sequentially. | ||
std::unique_ptr<sycl::event> event_; | ||
}; | ||
|
||
} // namespace at::xpu | ||
#include <c10/xpu/XPUEvent.h> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
#pragma once | ||
#include <c10/xpu/XPUStream.h> | ||
|
||
namespace c10::xpu { | ||
|
||
/* | ||
* XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are | ||
* constructed lazily when first recorded. It has a device, and this device is | ||
* acquired from the first recording stream. Later streams that record the event | ||
* must match the same device. | ||
* | ||
* Currently, XPUEvent does NOT support to export an inter-process event from | ||
* another process via inter-process communication(IPC). So it means that | ||
* inter-process communication for event handles between different processes is | ||
* not available. This could impact some applications that rely on cross-process | ||
* synchronization and communication. | ||
*/ | ||
struct XPUEvent { | ||
// Constructors | ||
XPUEvent(bool enable_timing = false) noexcept | ||
: enable_timing_{enable_timing} {} | ||
|
||
~XPUEvent() { | ||
if (isCreated()) { | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_deletion( | ||
c10::kXPU, reinterpret_cast<uintptr_t>(event_.get())); | ||
} | ||
} | ||
} | ||
|
||
XPUEvent(const XPUEvent&) = delete; | ||
XPUEvent& operator=(const XPUEvent&) = delete; | ||
|
||
XPUEvent(XPUEvent&& other) = default; | ||
XPUEvent& operator=(XPUEvent&& other) = default; | ||
|
||
operator sycl::event&() const { | ||
return event(); | ||
} | ||
|
||
std::optional<c10::Device> device() const { | ||
if (isCreated()) { | ||
return c10::Device(c10::kXPU, device_index_); | ||
} else { | ||
return std::nullopt; | ||
} | ||
} | ||
|
||
inline bool isCreated() const { | ||
return (event_.get() != nullptr); | ||
} | ||
|
||
DeviceIndex device_index() const { | ||
return device_index_; | ||
} | ||
|
||
sycl::event& event() const { | ||
return *event_; | ||
} | ||
|
||
bool query() const { | ||
using namespace sycl::info; | ||
if (!isCreated()) { | ||
return true; | ||
} | ||
|
||
return event().get_info<event::command_execution_status>() == | ||
event_command_status::complete; | ||
} | ||
|
||
void record() { | ||
record(getCurrentXPUStream()); | ||
} | ||
|
||
void recordOnce(const XPUStream& stream) { | ||
if (!isCreated()) { | ||
record(stream); | ||
} | ||
} | ||
|
||
void record(const XPUStream& stream) { | ||
if (!isCreated()) { | ||
device_index_ = stream.device_index(); | ||
assignEvent(stream.queue()); | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_creation( | ||
c10::kXPU, reinterpret_cast<uintptr_t>(event_.get())); | ||
} | ||
} else { | ||
TORCH_CHECK( | ||
device_index_ == stream.device_index(), | ||
"Event device ", | ||
device_index_, | ||
" does not match recording stream's device ", | ||
stream.device_index(), | ||
"."); | ||
reassignEvent(stream.queue()); | ||
} | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_record( | ||
c10::kXPU, | ||
reinterpret_cast<uintptr_t>(event_.get()), | ||
reinterpret_cast<uintptr_t>(&stream.queue())); | ||
} | ||
} | ||
|
||
void block(const XPUStream& stream) { | ||
if (isCreated()) { | ||
std::vector<sycl::event> event_list{event()}; | ||
// Make this stream wait until event_ is completed. | ||
stream.queue().ext_oneapi_submit_barrier(event_list); | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_wait( | ||
c10::kXPU, | ||
reinterpret_cast<uintptr_t>(event_.get()), | ||
reinterpret_cast<uintptr_t>(&stream.queue())); | ||
} | ||
} | ||
} | ||
|
||
double elapsed_time(const XPUEvent& other) const { | ||
TORCH_CHECK( | ||
isCreated() && other.isCreated(), | ||
"Both events must be recorded before calculating elapsed time."); | ||
TORCH_CHECK( | ||
query() && other.query(), | ||
"Both events must be completed before calculating elapsed time."); | ||
TORCH_CHECK( | ||
enable_timing_ && other.enable_timing_, | ||
"Both events must be created with argument 'enable_timing=True'."); | ||
|
||
#if SYCL_COMPILER_VERSION < 20250000 | ||
TORCH_CHECK_NOT_IMPLEMENTED( | ||
false, | ||
"elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); | ||
#endif | ||
|
||
using namespace sycl::info::event_profiling; | ||
// Block until both of the recorded events are completed. | ||
uint64_t end_time_ns = other.event().get_profiling_info<command_end>(); | ||
uint64_t start_time_ns = event().get_profiling_info<command_end>(); | ||
// Return the eplased time in milliseconds. | ||
return 1e-6 * | ||
(static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns)); | ||
} | ||
|
||
void synchronize() const { | ||
if (isCreated()) { | ||
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); | ||
if (C10_UNLIKELY(interp)) { | ||
(*interp)->trace_gpu_event_synchronization( | ||
c10::kXPU, reinterpret_cast<uintptr_t>(event_.get())); | ||
} | ||
event().wait_and_throw(); | ||
} | ||
} | ||
|
||
private: | ||
void assignEvent(sycl::queue& queue) { | ||
#if SYCL_COMPILER_VERSION >= 20250000 | ||
if (enable_timing_) { | ||
event_ = std::make_unique<sycl::event>( | ||
sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); | ||
} else { | ||
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier()); | ||
} | ||
#else | ||
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier()); | ||
#endif | ||
} | ||
|
||
void reassignEvent(sycl::queue& queue) { | ||
event_.reset(); | ||
assignEvent(queue); | ||
} | ||
|
||
bool enable_timing_ = false; | ||
DeviceIndex device_index_ = -1; | ||
// Only need to track the last event, as events in an in-order queue are | ||
// executed sequentially. | ||
std::unique_ptr<sycl::event> event_; | ||
}; | ||
|
||
} // namespace c10::xpu |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Already included in
<ATen/xpu/XPUContext.h>