|
1 | 1 | #pragma once
|
2 | 2 | #include <ATen/xpu/XPUContext.h>
|
3 |
| - |
4 |
| -#include <optional> |
5 |
| - |
6 |
| -namespace at::xpu { |
7 |
| - |
8 |
| -/* |
9 |
| - * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are |
10 |
| - * constructed lazily when first recorded. It has a device, and this device is |
11 |
| - * acquired from the first recording stream. Later streams that record the event |
12 |
| - * must match the same device. |
13 |
| - * |
14 |
| - * Currently, XPUEvent does NOT support to export an inter-process event from |
15 |
| - * another process via inter-process comunication(IPC). So it means that |
16 |
| - * inter-process communication for event handles between different processes is |
17 |
| - * not available. This could impact some applications that rely on cross-process |
18 |
| - * synchronization and communication. |
19 |
| - */ |
20 |
| -struct TORCH_XPU_API XPUEvent { |
21 |
| - // Constructors |
22 |
| - XPUEvent(bool enable_timing = false) noexcept |
23 |
| - : enable_timing_{enable_timing} {} |
24 |
| - |
25 |
| - ~XPUEvent() { |
26 |
| - if (isCreated()) { |
27 |
| - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); |
28 |
| - if (C10_UNLIKELY(interp)) { |
29 |
| - (*interp)->trace_gpu_event_deletion( |
30 |
| - at::kXPU, reinterpret_cast<uintptr_t>(event_.get())); |
31 |
| - } |
32 |
| - } |
33 |
| - } |
34 |
| - |
35 |
| - XPUEvent(const XPUEvent&) = delete; |
36 |
| - XPUEvent& operator=(const XPUEvent&) = delete; |
37 |
| - |
38 |
| - XPUEvent(XPUEvent&& other) = default; |
39 |
| - XPUEvent& operator=(XPUEvent&& other) = default; |
40 |
| - |
41 |
| - operator sycl::event&() const { |
42 |
| - return event(); |
43 |
| - } |
44 |
| - |
45 |
| - std::optional<at::Device> device() const { |
46 |
| - if (isCreated()) { |
47 |
| - return at::Device(at::kXPU, device_index_); |
48 |
| - } else { |
49 |
| - return std::nullopt; |
50 |
| - } |
51 |
| - } |
52 |
| - |
53 |
| - inline bool isCreated() const { |
54 |
| - return (event_.get() != nullptr); |
55 |
| - } |
56 |
| - |
57 |
| - DeviceIndex device_index() const { |
58 |
| - return device_index_; |
59 |
| - } |
60 |
| - |
61 |
| - sycl::event& event() const { |
62 |
| - return *event_; |
63 |
| - } |
64 |
| - |
65 |
| - bool query() const { |
66 |
| - using namespace sycl::info; |
67 |
| - if (!isCreated()) { |
68 |
| - return true; |
69 |
| - } |
70 |
| - |
71 |
| - return event().get_info<event::command_execution_status>() == |
72 |
| - event_command_status::complete; |
73 |
| - } |
74 |
| - |
75 |
| - void record() { |
76 |
| - record(getCurrentXPUStream()); |
77 |
| - } |
78 |
| - |
79 |
| - void recordOnce(const XPUStream& stream) { |
80 |
| - if (!isCreated()) { |
81 |
| - record(stream); |
82 |
| - } |
83 |
| - } |
84 |
| - |
85 |
| - void record(const XPUStream& stream) { |
86 |
| - if (!isCreated()) { |
87 |
| - device_index_ = stream.device_index(); |
88 |
| - assignEvent(stream.queue()); |
89 |
| - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); |
90 |
| - if (C10_UNLIKELY(interp)) { |
91 |
| - (*interp)->trace_gpu_event_creation( |
92 |
| - at::kXPU, reinterpret_cast<uintptr_t>(event_.get())); |
93 |
| - } |
94 |
| - } else { |
95 |
| - TORCH_CHECK( |
96 |
| - device_index_ == stream.device_index(), |
97 |
| - "Event device ", |
98 |
| - device_index_, |
99 |
| - " does not match recording stream's device ", |
100 |
| - stream.device_index(), |
101 |
| - "."); |
102 |
| - reassignEvent(stream.queue()); |
103 |
| - } |
104 |
| - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); |
105 |
| - if (C10_UNLIKELY(interp)) { |
106 |
| - (*interp)->trace_gpu_event_record( |
107 |
| - at::kXPU, |
108 |
| - reinterpret_cast<uintptr_t>(event_.get()), |
109 |
| - reinterpret_cast<uintptr_t>(&stream.queue())); |
110 |
| - } |
111 |
| - } |
112 |
| - |
113 |
| - void block(const XPUStream& stream) { |
114 |
| - if (isCreated()) { |
115 |
| - std::vector<sycl::event> event_list{event()}; |
116 |
| - // Make this stream wait until event_ is completed. |
117 |
| - stream.queue().ext_oneapi_submit_barrier(event_list); |
118 |
| - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); |
119 |
| - if (C10_UNLIKELY(interp)) { |
120 |
| - (*interp)->trace_gpu_event_wait( |
121 |
| - at::kXPU, |
122 |
| - reinterpret_cast<uintptr_t>(event_.get()), |
123 |
| - reinterpret_cast<uintptr_t>(&stream.queue())); |
124 |
| - } |
125 |
| - } |
126 |
| - } |
127 |
| - |
128 |
| - double elapsed_time(const XPUEvent& other) const { |
129 |
| - TORCH_CHECK( |
130 |
| - isCreated() && other.isCreated(), |
131 |
| - "Both events must be recorded before calculating elapsed time."); |
132 |
| - TORCH_CHECK( |
133 |
| - query() && other.query(), |
134 |
| - "Both events must be completed before calculating elapsed time."); |
135 |
| - TORCH_CHECK( |
136 |
| - enable_timing_ && other.enable_timing_, |
137 |
| - "Both events must be created with argument 'enable_timing=True'."); |
138 |
| - |
139 |
| -#if SYCL_COMPILER_VERSION < 20250000 |
140 |
| - TORCH_CHECK_NOT_IMPLEMENTED( |
141 |
| - false, |
142 |
| - "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); |
143 |
| -#endif |
144 |
| - |
145 |
| - using namespace sycl::info::event_profiling; |
146 |
| - // Block until both of the recorded events are completed. |
147 |
| - uint64_t end_time_ns = other.event().get_profiling_info<command_end>(); |
148 |
| - uint64_t start_time_ns = event().get_profiling_info<command_end>(); |
149 |
| - // Return the eplased time in milliseconds. |
150 |
| - return 1e-6 * |
151 |
| - (static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns)); |
152 |
| - } |
153 |
| - |
154 |
| - void synchronize() const { |
155 |
| - if (isCreated()) { |
156 |
| - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); |
157 |
| - if (C10_UNLIKELY(interp)) { |
158 |
| - (*interp)->trace_gpu_event_synchronization( |
159 |
| - at::kXPU, reinterpret_cast<uintptr_t>(event_.get())); |
160 |
| - } |
161 |
| - event().wait_and_throw(); |
162 |
| - } |
163 |
| - } |
164 |
| - |
165 |
| - private: |
166 |
| - void assignEvent(sycl::queue& queue) { |
167 |
| -#if SYCL_COMPILER_VERSION >= 20250000 |
168 |
| - if (enable_timing_) { |
169 |
| - event_ = std::make_unique<sycl::event>( |
170 |
| - sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); |
171 |
| - } else { |
172 |
| - event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier()); |
173 |
| - } |
174 |
| -#else |
175 |
| - event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier()); |
176 |
| -#endif |
177 |
| - } |
178 |
| - |
179 |
| - void reassignEvent(sycl::queue& queue) { |
180 |
| - event_.reset(); |
181 |
| - assignEvent(queue); |
182 |
| - } |
183 |
| - |
184 |
| - bool enable_timing_ = false; |
185 |
| - DeviceIndex device_index_ = -1; |
186 |
| - // Only need to track the last event, as events in an in-order queue are |
187 |
| - // executed sequentially. |
188 |
| - std::unique_ptr<sycl::event> event_; |
189 |
| -}; |
190 |
| - |
191 |
| -} // namespace at::xpu |
| 3 | +#include <c10/xpu/XPUEvent.h> |
0 commit comments