Skip to content

Commit 200bd22

Browse files
committed
Move XPUEvent to c10
ghstack-source-id: 31a4793 Pull Request resolved: #158336
1 parent 354b477 commit 200bd22

File tree

3 files changed

+191
-189
lines changed

3 files changed

+191
-189
lines changed

aten/src/ATen/xpu/XPUEvent.h

Lines changed: 1 addition & 189 deletions
Original file line numberDiff line numberDiff line change
@@ -1,191 +1,3 @@
11
#pragma once
22
#include <ATen/xpu/XPUContext.h>
3-
4-
#include <optional>
5-
6-
namespace at::xpu {
7-
8-
/*
9-
* XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are
10-
* constructed lazily when first recorded. It has a device, and this device is
11-
* acquired from the first recording stream. Later streams that record the event
12-
* must match the same device.
13-
*
14-
* Currently, XPUEvent does NOT support to export an inter-process event from
15-
* another process via inter-process comunication(IPC). So it means that
16-
* inter-process communication for event handles between different processes is
17-
* not available. This could impact some applications that rely on cross-process
18-
* synchronization and communication.
19-
*/
20-
struct TORCH_XPU_API XPUEvent {
21-
// Constructors
22-
XPUEvent(bool enable_timing = false) noexcept
23-
: enable_timing_{enable_timing} {}
24-
25-
~XPUEvent() {
26-
if (isCreated()) {
27-
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
28-
if (C10_UNLIKELY(interp)) {
29-
(*interp)->trace_gpu_event_deletion(
30-
at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
31-
}
32-
}
33-
}
34-
35-
XPUEvent(const XPUEvent&) = delete;
36-
XPUEvent& operator=(const XPUEvent&) = delete;
37-
38-
XPUEvent(XPUEvent&& other) = default;
39-
XPUEvent& operator=(XPUEvent&& other) = default;
40-
41-
operator sycl::event&() const {
42-
return event();
43-
}
44-
45-
std::optional<at::Device> device() const {
46-
if (isCreated()) {
47-
return at::Device(at::kXPU, device_index_);
48-
} else {
49-
return std::nullopt;
50-
}
51-
}
52-
53-
inline bool isCreated() const {
54-
return (event_.get() != nullptr);
55-
}
56-
57-
DeviceIndex device_index() const {
58-
return device_index_;
59-
}
60-
61-
sycl::event& event() const {
62-
return *event_;
63-
}
64-
65-
bool query() const {
66-
using namespace sycl::info;
67-
if (!isCreated()) {
68-
return true;
69-
}
70-
71-
return event().get_info<event::command_execution_status>() ==
72-
event_command_status::complete;
73-
}
74-
75-
void record() {
76-
record(getCurrentXPUStream());
77-
}
78-
79-
void recordOnce(const XPUStream& stream) {
80-
if (!isCreated()) {
81-
record(stream);
82-
}
83-
}
84-
85-
void record(const XPUStream& stream) {
86-
if (!isCreated()) {
87-
device_index_ = stream.device_index();
88-
assignEvent(stream.queue());
89-
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
90-
if (C10_UNLIKELY(interp)) {
91-
(*interp)->trace_gpu_event_creation(
92-
at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
93-
}
94-
} else {
95-
TORCH_CHECK(
96-
device_index_ == stream.device_index(),
97-
"Event device ",
98-
device_index_,
99-
" does not match recording stream's device ",
100-
stream.device_index(),
101-
".");
102-
reassignEvent(stream.queue());
103-
}
104-
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
105-
if (C10_UNLIKELY(interp)) {
106-
(*interp)->trace_gpu_event_record(
107-
at::kXPU,
108-
reinterpret_cast<uintptr_t>(event_.get()),
109-
reinterpret_cast<uintptr_t>(&stream.queue()));
110-
}
111-
}
112-
113-
void block(const XPUStream& stream) {
114-
if (isCreated()) {
115-
std::vector<sycl::event> event_list{event()};
116-
// Make this stream wait until event_ is completed.
117-
stream.queue().ext_oneapi_submit_barrier(event_list);
118-
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
119-
if (C10_UNLIKELY(interp)) {
120-
(*interp)->trace_gpu_event_wait(
121-
at::kXPU,
122-
reinterpret_cast<uintptr_t>(event_.get()),
123-
reinterpret_cast<uintptr_t>(&stream.queue()));
124-
}
125-
}
126-
}
127-
128-
double elapsed_time(const XPUEvent& other) const {
129-
TORCH_CHECK(
130-
isCreated() && other.isCreated(),
131-
"Both events must be recorded before calculating elapsed time.");
132-
TORCH_CHECK(
133-
query() && other.query(),
134-
"Both events must be completed before calculating elapsed time.");
135-
TORCH_CHECK(
136-
enable_timing_ && other.enable_timing_,
137-
"Both events must be created with argument 'enable_timing=True'.");
138-
139-
#if SYCL_COMPILER_VERSION < 20250000
140-
TORCH_CHECK_NOT_IMPLEMENTED(
141-
false,
142-
"elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer.");
143-
#endif
144-
145-
using namespace sycl::info::event_profiling;
146-
// Block until both of the recorded events are completed.
147-
uint64_t end_time_ns = other.event().get_profiling_info<command_end>();
148-
uint64_t start_time_ns = event().get_profiling_info<command_end>();
149-
// Return the eplased time in milliseconds.
150-
return 1e-6 *
151-
(static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns));
152-
}
153-
154-
void synchronize() const {
155-
if (isCreated()) {
156-
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
157-
if (C10_UNLIKELY(interp)) {
158-
(*interp)->trace_gpu_event_synchronization(
159-
at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
160-
}
161-
event().wait_and_throw();
162-
}
163-
}
164-
165-
private:
166-
void assignEvent(sycl::queue& queue) {
167-
#if SYCL_COMPILER_VERSION >= 20250000
168-
if (enable_timing_) {
169-
event_ = std::make_unique<sycl::event>(
170-
sycl::ext::oneapi::experimental::submit_profiling_tag(queue));
171-
} else {
172-
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
173-
}
174-
#else
175-
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
176-
#endif
177-
}
178-
179-
void reassignEvent(sycl::queue& queue) {
180-
event_.reset();
181-
assignEvent(queue);
182-
}
183-
184-
bool enable_timing_ = false;
185-
DeviceIndex device_index_ = -1;
186-
// Only need to track the last event, as events in an in-order queue are
187-
// executed sequentially.
188-
std::unique_ptr<sycl::event> event_;
189-
};
190-
191-
} // namespace at::xpu
3+
#include <c10/xpu/XPUEvent.h>

c10/xpu/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ set(C10_XPU_HEADERS
2424
XPUCachingAllocator.h
2525
XPUDeviceProp.h
2626
XPUException.h
27+
XPUEvent.h
2728
XPUFunctions.h
2829
XPUMacros.h
2930
XPUStream.h

c10/xpu/XPUEvent.h

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#pragma once
2+
#include <c10/xpu/XPUStream.h>
3+
4+
namespace c10::xpu {
5+
6+
/*
7+
* XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are
8+
* constructed lazily when first recorded. It has a device, and this device is
9+
* acquired from the first recording stream. Later streams that record the event
10+
* must match the same device.
11+
*
12+
* Currently, XPUEvent does NOT support to export an inter-process event from
13+
* another process via inter-process communication(IPC). So it means that
14+
* inter-process communication for event handles between different processes is
15+
* not available. This could impact some applications that rely on cross-process
16+
* synchronization and communication.
17+
*/
18+
struct TORCH_XPU_API XPUEvent {
19+
// Constructors
20+
XPUEvent(bool enable_timing = false) noexcept
21+
: enable_timing_{enable_timing} {}
22+
23+
~XPUEvent() {
24+
if (isCreated()) {
25+
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
26+
if (C10_UNLIKELY(interp)) {
27+
(*interp)->trace_gpu_event_deletion(
28+
c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
29+
}
30+
}
31+
}
32+
33+
XPUEvent(const XPUEvent&) = delete;
34+
XPUEvent& operator=(const XPUEvent&) = delete;
35+
36+
XPUEvent(XPUEvent&& other) = default;
37+
XPUEvent& operator=(XPUEvent&& other) = default;
38+
39+
operator sycl::event&() const {
40+
return event();
41+
}
42+
43+
std::optional<c10::Device> device() const {
44+
if (isCreated()) {
45+
return c10::Device(c10::kXPU, device_index_);
46+
} else {
47+
return std::nullopt;
48+
}
49+
}
50+
51+
inline bool isCreated() const {
52+
return (event_.get() != nullptr);
53+
}
54+
55+
DeviceIndex device_index() const {
56+
return device_index_;
57+
}
58+
59+
sycl::event& event() const {
60+
return *event_;
61+
}
62+
63+
bool query() const {
64+
using namespace sycl::info;
65+
if (!isCreated()) {
66+
return true;
67+
}
68+
69+
return event().get_info<event::command_execution_status>() ==
70+
event_command_status::complete;
71+
}
72+
73+
void record() {
74+
record(getCurrentXPUStream());
75+
}
76+
77+
void recordOnce(const XPUStream& stream) {
78+
if (!isCreated()) {
79+
record(stream);
80+
}
81+
}
82+
83+
void record(const XPUStream& stream) {
84+
if (!isCreated()) {
85+
device_index_ = stream.device_index();
86+
assignEvent(stream.queue());
87+
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
88+
if (C10_UNLIKELY(interp)) {
89+
(*interp)->trace_gpu_event_creation(
90+
c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
91+
}
92+
} else {
93+
TORCH_CHECK(
94+
device_index_ == stream.device_index(),
95+
"Event device ",
96+
device_index_,
97+
" does not match recording stream's device ",
98+
stream.device_index(),
99+
".");
100+
reassignEvent(stream.queue());
101+
}
102+
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
103+
if (C10_UNLIKELY(interp)) {
104+
(*interp)->trace_gpu_event_record(
105+
c10::kXPU,
106+
reinterpret_cast<uintptr_t>(event_.get()),
107+
reinterpret_cast<uintptr_t>(&stream.queue()));
108+
}
109+
}
110+
111+
void block(const XPUStream& stream) {
112+
if (isCreated()) {
113+
std::vector<sycl::event> event_list{event()};
114+
// Make this stream wait until event_ is completed.
115+
stream.queue().ext_oneapi_submit_barrier(event_list);
116+
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
117+
if (C10_UNLIKELY(interp)) {
118+
(*interp)->trace_gpu_event_wait(
119+
c10::kXPU,
120+
reinterpret_cast<uintptr_t>(event_.get()),
121+
reinterpret_cast<uintptr_t>(&stream.queue()));
122+
}
123+
}
124+
}
125+
126+
double elapsed_time(const XPUEvent& other) const {
127+
TORCH_CHECK(
128+
isCreated() && other.isCreated(),
129+
"Both events must be recorded before calculating elapsed time.");
130+
TORCH_CHECK(
131+
query() && other.query(),
132+
"Both events must be completed before calculating elapsed time.");
133+
TORCH_CHECK(
134+
enable_timing_ && other.enable_timing_,
135+
"Both events must be created with argument 'enable_timing=True'.");
136+
137+
#if SYCL_COMPILER_VERSION < 20250000
138+
TORCH_CHECK_NOT_IMPLEMENTED(
139+
false,
140+
"elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer.");
141+
#endif
142+
143+
using namespace sycl::info::event_profiling;
144+
// Block until both of the recorded events are completed.
145+
uint64_t end_time_ns = other.event().get_profiling_info<command_end>();
146+
uint64_t start_time_ns = event().get_profiling_info<command_end>();
147+
// Return the eplased time in milliseconds.
148+
return 1e-6 *
149+
(static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns));
150+
}
151+
152+
void synchronize() const {
153+
if (isCreated()) {
154+
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
155+
if (C10_UNLIKELY(interp)) {
156+
(*interp)->trace_gpu_event_synchronization(
157+
c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
158+
}
159+
event().wait_and_throw();
160+
}
161+
}
162+
163+
private:
164+
void assignEvent(sycl::queue& queue) {
165+
#if SYCL_COMPILER_VERSION >= 20250000
166+
if (enable_timing_) {
167+
event_ = std::make_unique<sycl::event>(
168+
sycl::ext::oneapi::experimental::submit_profiling_tag(queue));
169+
} else {
170+
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
171+
}
172+
#else
173+
event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
174+
#endif
175+
}
176+
177+
void reassignEvent(sycl::queue& queue) {
178+
event_.reset();
179+
assignEvent(queue);
180+
}
181+
182+
bool enable_timing_ = false;
183+
DeviceIndex device_index_ = -1;
184+
// Only need to track the last event, as events in an in-order queue are
185+
// executed sequentially.
186+
std::unique_ptr<sycl::event> event_;
187+
};
188+
189+
} // namespace c10::xpu

0 commit comments

Comments
 (0)