From c79176d5f95eac5463a5ee3a35cd1b6ecacac82d Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" Date: Tue, 15 Jul 2025 15:07:36 +0000 Subject: [PATCH] Update [ghstack-poisoned] --- aten/src/ATen/xpu/XPUEvent.h | 190 +---------------------------------- c10/xpu/CMakeLists.txt | 1 + c10/xpu/XPUEvent.h | 189 ++++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 189 deletions(-) create mode 100644 c10/xpu/XPUEvent.h diff --git a/aten/src/ATen/xpu/XPUEvent.h b/aten/src/ATen/xpu/XPUEvent.h index ededd6ebf4f1..f33fd70ac061 100644 --- a/aten/src/ATen/xpu/XPUEvent.h +++ b/aten/src/ATen/xpu/XPUEvent.h @@ -1,191 +1,3 @@ #pragma once #include - -#include - -namespace at::xpu { - -/* - * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are - * constructed lazily when first recorded. It has a device, and this device is - * acquired from the first recording stream. Later streams that record the event - * must match the same device. - * - * Currently, XPUEvent does NOT support to export an inter-process event from - * another process via inter-process comunication(IPC). So it means that - * inter-process communication for event handles between different processes is - * not available. This could impact some applications that rely on cross-process - * synchronization and communication. - */ -struct TORCH_XPU_API XPUEvent { - // Constructors - XPUEvent(bool enable_timing = false) noexcept - : enable_timing_{enable_timing} {} - - ~XPUEvent() { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_deletion( - at::kXPU, reinterpret_cast(event_.get())); - } - } - } - - XPUEvent(const XPUEvent&) = delete; - XPUEvent& operator=(const XPUEvent&) = delete; - - XPUEvent(XPUEvent&& other) = default; - XPUEvent& operator=(XPUEvent&& other) = default; - - operator sycl::event&() const { - return event(); - } - - std::optional device() const { - if (isCreated()) { - return at::Device(at::kXPU, device_index_); - } else { - return std::nullopt; - } - } - - inline bool isCreated() const { - return (event_.get() != nullptr); - } - - DeviceIndex device_index() const { - return device_index_; - } - - sycl::event& event() const { - return *event_; - } - - bool query() const { - using namespace sycl::info; - if (!isCreated()) { - return true; - } - - return event().get_info() == - event_command_status::complete; - } - - void record() { - record(getCurrentXPUStream()); - } - - void recordOnce(const XPUStream& stream) { - if (!isCreated()) { - record(stream); - } - } - - void record(const XPUStream& stream) { - if (!isCreated()) { - device_index_ = stream.device_index(); - assignEvent(stream.queue()); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_creation( - at::kXPU, reinterpret_cast(event_.get())); - } - } else { - TORCH_CHECK( - device_index_ == stream.device_index(), - "Event device ", - device_index_, - " does not match recording stream's device ", - stream.device_index(), - "."); - reassignEvent(stream.queue()); - } - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_record( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - - void block(const XPUStream& stream) { - if (isCreated()) { - std::vector event_list{event()}; - // Make this stream wait until event_ is completed. - stream.queue().ext_oneapi_submit_barrier(event_list); - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_wait( - at::kXPU, - reinterpret_cast(event_.get()), - reinterpret_cast(&stream.queue())); - } - } - } - - double elapsed_time(const XPUEvent& other) const { - TORCH_CHECK( - isCreated() && other.isCreated(), - "Both events must be recorded before calculating elapsed time."); - TORCH_CHECK( - query() && other.query(), - "Both events must be completed before calculating elapsed time."); - TORCH_CHECK( - enable_timing_ && other.enable_timing_, - "Both events must be created with argument 'enable_timing=True'."); - -#if SYCL_COMPILER_VERSION < 20250000 - TORCH_CHECK_NOT_IMPLEMENTED( - false, - "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); -#endif - - using namespace sycl::info::event_profiling; - // Block until both of the recorded events are completed. - uint64_t end_time_ns = other.event().get_profiling_info(); - uint64_t start_time_ns = event().get_profiling_info(); - // Return the eplased time in milliseconds. - return 1e-6 * - (static_cast(end_time_ns) - static_cast(start_time_ns)); - } - - void synchronize() const { - if (isCreated()) { - const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); - if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_synchronization( - at::kXPU, reinterpret_cast(event_.get())); - } - event().wait_and_throw(); - } - } - - private: - void assignEvent(sycl::queue& queue) { -#if SYCL_COMPILER_VERSION >= 20250000 - if (enable_timing_) { - event_ = std::make_unique( - sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); - } else { - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); - } -#else - event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); -#endif - } - - void reassignEvent(sycl::queue& queue) { - event_.reset(); - assignEvent(queue); - } - - bool enable_timing_ = false; - DeviceIndex device_index_ = -1; - // Only need to track the last event, as events in an in-order queue are - // executed sequentially. - std::unique_ptr event_; -}; - -} // namespace at::xpu +#include diff --git a/c10/xpu/CMakeLists.txt b/c10/xpu/CMakeLists.txt index 95b9f031c3e9..c2fa65ba35e7 100644 --- a/c10/xpu/CMakeLists.txt +++ b/c10/xpu/CMakeLists.txt @@ -24,6 +24,7 @@ set(C10_XPU_HEADERS XPUCachingAllocator.h XPUDeviceProp.h XPUException.h + XPUEvent.h XPUFunctions.h XPUMacros.h XPUStream.h diff --git a/c10/xpu/XPUEvent.h b/c10/xpu/XPUEvent.h new file mode 100644 index 000000000000..e7e959c84873 --- /dev/null +++ b/c10/xpu/XPUEvent.h @@ -0,0 +1,189 @@ +#pragma once +#include + +namespace c10::xpu { + +/* + * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are + * constructed lazily when first recorded. It has a device, and this device is + * acquired from the first recording stream. Later streams that record the event + * must match the same device. + * + * Currently, XPUEvent does NOT support to export an inter-process event from + * another process via inter-process communication(IPC). So it means that + * inter-process communication for event handles between different processes is + * not available. This could impact some applications that rely on cross-process + * synchronization and communication. + */ +struct TORCH_XPU_API XPUEvent { + // Constructors + XPUEvent(bool enable_timing = false) noexcept + : enable_timing_{enable_timing} {} + + ~XPUEvent() { + if (isCreated()) { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion( + c10::kXPU, reinterpret_cast(event_.get())); + } + } + } + + XPUEvent(const XPUEvent&) = delete; + XPUEvent& operator=(const XPUEvent&) = delete; + + XPUEvent(XPUEvent&& other) = default; + XPUEvent& operator=(XPUEvent&& other) = default; + + operator sycl::event&() const { + return event(); + } + + std::optional device() const { + if (isCreated()) { + return c10::Device(c10::kXPU, device_index_); + } else { + return std::nullopt; + } + } + + inline bool isCreated() const { + return (event_.get() != nullptr); + } + + DeviceIndex device_index() const { + return device_index_; + } + + sycl::event& event() const { + return *event_; + } + + bool query() const { + using namespace sycl::info; + if (!isCreated()) { + return true; + } + + return event().get_info() == + event_command_status::complete; + } + + void record() { + record(getCurrentXPUStream()); + } + + void recordOnce(const XPUStream& stream) { + if (!isCreated()) { + record(stream); + } + } + + void record(const XPUStream& stream) { + if (!isCreated()) { + device_index_ = stream.device_index(); + assignEvent(stream.queue()); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_creation( + c10::kXPU, reinterpret_cast(event_.get())); + } + } else { + TORCH_CHECK( + device_index_ == stream.device_index(), + "Event device ", + device_index_, + " does not match recording stream's device ", + stream.device_index(), + "."); + reassignEvent(stream.queue()); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record( + c10::kXPU, + reinterpret_cast(event_.get()), + reinterpret_cast(&stream.queue())); + } + } + + void block(const XPUStream& stream) { + if (isCreated()) { + std::vector event_list{event()}; + // Make this stream wait until event_ is completed. + stream.queue().ext_oneapi_submit_barrier(event_list); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait( + c10::kXPU, + reinterpret_cast(event_.get()), + reinterpret_cast(&stream.queue())); + } + } + } + + double elapsed_time(const XPUEvent& other) const { + TORCH_CHECK( + isCreated() && other.isCreated(), + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && other.query(), + "Both events must be completed before calculating elapsed time."); + TORCH_CHECK( + enable_timing_ && other.enable_timing_, + "Both events must be created with argument 'enable_timing=True'."); + +#if SYCL_COMPILER_VERSION < 20250000 + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "elapsed_time of XPUEvent requires PyTorch to be built with SYCL compiler version 2025.0.0 or newer."); +#endif + + using namespace sycl::info::event_profiling; + // Block until both of the recorded events are completed. + uint64_t end_time_ns = other.event().get_profiling_info(); + uint64_t start_time_ns = event().get_profiling_info(); + // Return the eplased time in milliseconds. + return 1e-6 * + (static_cast(end_time_ns) - static_cast(start_time_ns)); + } + + void synchronize() const { + if (isCreated()) { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization( + c10::kXPU, reinterpret_cast(event_.get())); + } + event().wait_and_throw(); + } + } + + private: + void assignEvent(sycl::queue& queue) { +#if SYCL_COMPILER_VERSION >= 20250000 + if (enable_timing_) { + event_ = std::make_unique( + sycl::ext::oneapi::experimental::submit_profiling_tag(queue)); + } else { + event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); + } +#else + event_ = std::make_unique(queue.ext_oneapi_submit_barrier()); +#endif + } + + void reassignEvent(sycl::queue& queue) { + event_.reset(); + assignEvent(queue); + } + + bool enable_timing_ = false; + DeviceIndex device_index_ = -1; + // Only need to track the last event, as events in an in-order queue are + // executed sequentially. + std::unique_ptr event_; +}; + +} // namespace c10::xpu