From ae2d8f41b48326c23a166207ec92e25f30da2b0e Mon Sep 17 00:00:00 2001
From: FFFrog
Date: Fri, 18 Jul 2025 20:21:59 +0800
Subject: [PATCH] Update

[ghstack-poisoned]
---
 .ci/docker/requirements-docs.txt              |   4 +
 docs/source/conf.py                           |   4 +
 docs/source/notes/accelerator.md              | 274 ++++++++++++++++++
 .../torch_openreg/csrc/aten/OpenRegExtra.cpp  |   6 +
 .../csrc/aten/OpenRegMinimal.cpp              |   6 +
 .../torch_openreg/csrc/aten/native/Extra.cpp  |   4 +
 6 files changed, 298 insertions(+)
 create mode 100644 docs/source/notes/accelerator.md

diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt
index 54e9dbdfca26..dedc087e351a 100644
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@@ -15,6 +15,10 @@ sphinxext-opengraph==0.9.1
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.9.1
 
+sphinx-tabs==3.4.7
+#Description: This is used to generate PyTorch docs
+#Pinned versions: 3.4.7
+
 sphinx_sitemap==2.6.0
 #Description: This is used to generate sitemap for PyTorch docs
 #Pinned versions: 2.6.0
diff --git a/docs/source/conf.py b/docs/source/conf.py
index acb2b088af72..37859299efbc 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -66,6 +66,7 @@
     "sphinx.ext.linkcode",
     "sphinxcontrib.mermaid",
     "sphinx_sitemap",
+    "sphinx_tabs.tabs",
 ]
 
 myst_enable_extensions = [
@@ -82,6 +83,9 @@
 ]
 
 sitemap_url_scheme = "{link}"
+# todo options
+todo_include_todos = True
+
 # build the templated autosummary files
 autosummary_generate = True
 numpydoc_show_class_members = False
diff --git a/docs/source/notes/accelerator.md b/docs/source/notes/accelerator.md
new file mode 100644
index 000000000000..c4ceef3931d2
--- /dev/null
+++ b/docs/source/notes/accelerator.md
@@ -0,0 +1,274 @@
```{eval-rst}
.. role:: hidden
   :class: hidden-section
```

# Extending PyTorch with New Accelerators

## Background

The PrivateUse1-based third-party device integration mechanism has become the official path for integrating new devices into PyTorch. Keeping this mechanism easy to use is crucial for enriching PyTorch's hardware ecosystem.

To help third-party device developers integrate new backends efficiently, this article walks through the integration of each typical PyTorch module in a modular fashion. It is accompanied by a streamlined reference implementation, the official [torch_openreg][OpenReg URL] backend, so that developers can get started quickly while avoiding common pitfalls.

This document is intended for the following readers:

* Developers who wish to integrate an accelerator backend into PyTorch;
* Developers interested in the inner workings of typical PyTorch modules.

---

## Operator Registration

PyTorch provides multiple ways to register and invoke operators at both the Python and C++ levels, along with a set of supporting tools for querying dispatch information and locating issues quickly. The following sections describe these operator registration capabilities in detail.

### Tools

#### Commands

PyTorch provides a set of inspection helpers prefixed with `torch._C._dispatch_` around its dispatcher. You can list all related interfaces with the following command:

```Shell
python -c 'import torch; print("\n".join([x for x in dir(torch._C) if x.startswith("_dispatch_")]))'

...
_dispatch_dump
_dispatch_dump_table
_dispatch_has_kernel
_dispatch_has_kernel_for_any_dispatch_key
_dispatch_has_kernel_for_dispatch_key
_dispatch_isTensorSubclassLike
_dispatch_is_alias_key
_dispatch_is_included_in_alias
_dispatch_is_main_interpreter
_dispatch_kernel_for_dispatch_key_is_fallthrough
_dispatch_key_for_device
_dispatch_key_name
_dispatch_key_parse
_dispatch_key_set
...
```

Here are explanations of several commonly used commands:

* `torch._C._dispatch_key_set`:

  Displays the DispatchKeySet of a given Tensor, with priority increasing from left to right.

  ```Python
  >>> import torch
  >>> a = torch.randn(3,3,device="cuda")
  >>> torch._C._dispatch_key_set(a)
  'DispatchKeySet(CUDA, ADInplaceOrView, AutogradCUDA, AutocastCUDA)'
  ```

* `torch._C._dispatch_dump_table`:

  Queries the support status of a given operator across the different dispatch keys, making it easy to locate the corresponding kernel implementation.

  ```Python
  >>> import torch
  >>> print(torch._C._dispatch_dump_table("aten::add.Tensor"))
  ...
  CPU: registered at ./build/aten/src/ATen/RegisterCPU_0.cpp:1309 [kernel]
  CUDA: registered at ./build/aten/src/ATen/RegisterCUDA_0.cpp:2420 [kernel]
  HIP: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  MPS: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  IPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  XPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  HPU: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  VE: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  MTIA: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  MAIA: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  PrivateUse1: registered at ./build/aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional_0.cpp:1373 [default backend kernel]
  ...
  ```

#### Environment Variables

PyTorch also provides several dispatcher-related environment variables that help with understanding the dispatch flow and quickly locating issues.

* TORCH_SHOW_DISPATCH_TRACE

  Prints a detailed trace of internal dispatching during PyTorch execution, showing each operator call and redispatch together with the dispatch key that handles it.

  ```Bash
  export TORCH_SHOW_DISPATCH_TRACE=1
  ```

  ```Python
  >>> import torch
  >>> a = torch.randn(3,3)
   [call] op=[aten::randn], key=[BackendSelect]
   [redispatch] op=[aten::randn], key=[CPU]
   [call] op=[aten::empty.memory_format], key=[BackendSelect]
   [redispatch] op=[aten::empty.memory_format], key=[CPU]
   [call] op=[aten::normal_], key=[CPU]
  ```

### Registration

::::{tabs}

:::{tab} C++

1. Scenario One

   This is the most common scenario for operator implementation. PyTorch ships with many built-in operators, whose namespaces (mainly `aten` and `c10d`), schemas, and concrete implementations for backends such as CPU and CUDA are already defined. Our task is to provide implementations of these built-in operators for the new device, as sketched below.
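   In essence, this means binding a device-specific kernel to an existing `aten` schema on the `PrivateUse1` dispatch key. The following is only a minimal sketch of that pattern: `my_empty_memory_format` is an illustrative placeholder (its definition would live in the backend, typically allocating storage through the backend's own allocator); the actual OpenReg code is shown in the included snippets right after it.

   ```cpp
   #include <ATen/ATen.h>
   #include <torch/library.h>

   // Device-specific kernel for aten::empty.memory_format. The definition is
   // provided elsewhere by the backend (for example, built on top of
   // at::detail::empty_generic with the backend's allocator).
   at::Tensor my_empty_memory_format(
       c10::IntArrayRef size,
       std::optional<c10::ScalarType> dtype_opt,
       std::optional<c10::Layout> layout_opt,
       std::optional<c10::Device> device_opt,
       std::optional<bool> pin_memory_opt,
       std::optional<c10::MemoryFormat> memory_format_opt);

   // Bind the kernel to the built-in schema on the PrivateUse1 dispatch key.
   TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
     m.impl("empty.memory_format", my_empty_memory_format);
   }
   ```

   The `TORCH_LIBRARY_IMPL(aten, PrivateUse1, ...)` block in the OpenReg sources below registers the whole minimal operator set in exactly this way.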
   ```{eval-rst}
   .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
      :language: c++
      :start-after: LITERALINCLUDE START: EMPTY.MEMORY_FORMAT
      :end-before: LITERALINCLUDE END: EMPTY.MEMORY_FORMAT
   ```

   ```{eval-rst}
   .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
      :language: c++
      :start-after: LITERALINCLUDE START: TORCH_LIBRARY_IMPL
      :end-before: LITERALINCLUDE END: TORCH_LIBRARY_IMPL
      :emphasize-lines: 2
      :linenos:
   ```

   This registers `wrapper_empty_memory_format`, the implementation for the new device, to the `aten::empty.memory_format` operator on the `PrivateUse1` dispatch key.

2. Scenario Two

   For built-in PyTorch operators, a `STUB`-based registration method is supported in addition to the method shown in Scenario One. It is essentially built on top of Scenario One, but adds a level of indirection that improves code reuse across devices and allows further dispatching at a finer granularity (for example, on CPU feature capabilities).

   ```{eval-rst}
   .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
      :language: c++
      :start-after: LITERALINCLUDE START: STUB
      :end-before: LITERALINCLUDE END: STUB
      :linenos:
   ```

   ```{todo}
   List of operators that can be registered via `STUB`
   ```

3. Scenario Three

   Besides its built-in operator definitions, PyTorch also supports user-defined operators, generally in two forms:

   * Adding custom operators to a new namespace:

     ```{todo}
     TODO(including forward and backward)
     ```

   * Extending existing namespaces with custom operators:

     ```{todo}
     TODO(including forward and backward)
     ```

4. Scenario Four

   Besides registering forward and backward implementations separately on the `PrivateUse1` and `AutogradPrivateUse1` dispatch keys, PyTorch also supports a more convenient approach based on `torch::autograd::Function`.

   ```{eval-rst}
   .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
      :language: c++
      :start-after: LITERALINCLUDE START: TORCH.AUTOGRAD.FUNCTION Part1
      :end-before: LITERALINCLUDE END: TORCH.AUTOGRAD.FUNCTION Part1
      :linenos:
   ```

   ```{eval-rst}
   .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
      :language: c++
      :start-after: LITERALINCLUDE START: TORCH.AUTOGRAD.FUNCTION Part2
      :end-before: LITERALINCLUDE END: TORCH.AUTOGRAD.FUNCTION Part2
      :linenos:
   ```

   ```{eval-rst}
   .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
      :language: c++
      :start-after: LITERALINCLUDE START: TORCH.AUTOGRAD.FUNCTION
      :end-before: LITERALINCLUDE END: TORCH.AUTOGRAD.FUNCTION
      :emphasize-lines: 2,7
      :linenos:
   ```

5. Scenario Five

   PyTorch provides a fallback mechanism that lets unsupported operators fall back to CPU execution. This is crucial for accelerator backends that are still under development, ensuring functional correctness at the cost of performance.

   * Per-operator fallback (a sketch follows below)

     ```{todo}
     TODO
     ```
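     A minimal sketch of the per-operator variant, assuming the backend reuses the same kind of boxed CPU-fallback helper as in the global example below; the operator `sub.Tensor` is chosen purely for illustration:

     ```cpp
     #include <ATen/native/CPUFallback.h>
     #include <torch/library.h>

     // Boxed kernel: forwards the call, with all of its arguments on the
     // stack, to the CPU implementation and copies the results back.
     static void my_cpu_fallback(
         const c10::OperatorHandle& op,
         torch::jit::Stack* stack) {
       at::native::cpu_fallback(op, stack);
     }

     TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
       // Only this operator falls back to CPU; all other operators keep
       // using the kernels registered for PrivateUse1.
       m.impl(
           "sub.Tensor",
           torch::CppFunction::makeFromBoxedFunction<&my_cpu_fallback>());
     }
     ```

     Registering the fallback per operator keeps the rest of the backend on its native kernels while a known-missing operator still works correctly.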
   * Global fallback

     ```{eval-rst}
     .. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
        :language: c++
        :start-after: LITERALINCLUDE START: FALLBACK GLOBAL
        :end-before: LITERALINCLUDE END: FALLBACK GLOBAL
        :linenos:
     ```

     This registers a global fallback: every operator that has no kernel registered for the new backend falls back to CPU execution.

6. Scenario Six

   ```{todo}
   * Meta registration
   * Overriding default implementations
   * Fallthrough
   * ATen operator set
   * ...
   ```

:::

:::{tab} Python

TODO

:::

::::

### Minimum Set of Operators to Support

To help developers prioritize their work, we provide the minimal set of operators below. Implementing these operators is enough to bring up basic operator functionality on a new backend.

| Operator Name | Dispatch Key | Description |
| :---: | :---: | :---: |
| empty.memory_format | PrivateUse1 | |
| empty_strided | PrivateUse1 | |
| as_strided | PrivateUse1 | |
| resize_ | PrivateUse1 | |
| _reshape_alias | PrivateUse1 | |
| _copy_from | PrivateUse1 | |
| _copy_from_and_resize | PrivateUse1 | |
| _local_scalar_dense | PrivateUse1 | |
| set_.source_Tensor | PrivateUse1 | |
| set_.source_Storage | PrivateUse1 | |
| set_.source_Storage_storage_offset | PrivateUse1 | |
| view | PrivateUse1 | |
| fallback | PrivateUse1 | |

```{todo}
Add/remove operators above to ensure the minimal set list is reliable and accurate
```

## Autocast

## Autoload

## Memory Management

## Custom Storage

## ...

[OpenReg URL]: https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg "OpenReg URL"
diff --git a/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp b/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
index 3d8525697cc8..b515350bd74f 100644
--- a/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
+++ b/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegExtra.cpp
@@ -7,6 +7,7 @@
 
 namespace at::openreg {
 
+// START my snippet
 at::Tensor wrapper_quantize_per_tensor(
     const at::Tensor& self,
     double scale,
@@ -15,6 +16,7 @@ at::Tensor wrapper_quantize_per_tensor(
   return at::native::quantize_per_tensor_openreg(
       self, scale, zero_point, dtype);
 }
+// END my snippet
 
 int64_t wrapper__fused_sdp_choice(
     const at::Tensor& query,
@@ -112,6 +114,7 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
 
 } // namespace at::openreg
 
+// LITERALINCLUDE START: TORCH.AUTOGRAD.FUNCTION
 namespace at::openreg {
 TORCH_LIBRARY(openreg, m) {
   m.def("custom_autograd_fn_returns_self(Tensor input)-> Tensor");
@@ -126,7 +129,9 @@ TORCH_LIBRARY_IMPL(openreg, AutogradPrivateUse1, m) {
       "custom_autograd_fn_aliasing", &at::native::custom_autograd_fn_aliasing);
 }
 } // namespace at::openreg
+// LITERALINCLUDE END: TORCH.AUTOGRAD.FUNCTION
 
+// LITERALINCLUDE START: STUB
 namespace at::native {
 REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &abs_kernel_openreg);
 REGISTER_PRIVATEUSE1_DISPATCH(
@@ -136,3 +141,4 @@ REGISTER_PRIVATEUSE1_DISPATCH(
     _fused_sdp_choice_stub,
     &_fused_sdp_choice_openreg);
 } // namespace at::native
+// LITERALINCLUDE END: STUB
diff --git a/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp b/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
index fe75cdaea8b2..6958cf8b88fb 100644
--- a/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
+++ b/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/OpenRegMinimal.cpp
@@ -7,6 +7,7 @@
 
 namespace at::openreg {
 
+// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT
 at::Tensor wrapper_empty_memory_format(
     c10::IntArrayRef size,
     std::optional<c10::ScalarType> dtype_opt,
@@ -22,6 +23,7 @@ at::Tensor wrapper_empty_memory_format(
       pin_memory_opt,
       memory_format_opt);
 }
+// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT
 
 at::Tensor wrapper_empty_strided(
     c10::IntArrayRef size,
@@ -97,6 +99,7 @@ at::Tensor wrapper_view(const at::Tensor& self, c10::SymIntArrayRef size) {
   return at::native::view_openreg(self, size);
 }
 
+// LITERALINCLUDE START: TORCH_LIBRARY_IMPL
 TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("empty.memory_format", wrapper_empty_memory_format);
   m.impl("empty_strided", wrapper_empty_strided);
@@ -113,7 +116,9 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
       wrapper_set_source_Storage_storage_offsetset_);
   m.impl("view", wrapper_view);
 }
+// LITERALINCLUDE END: TORCH_LIBRARY_IMPL
 
+// LITERALINCLUDE START: FALLBACK GLOBAL
 void wrapper_cpu_fallback(
     const c10::OperatorHandle& op,
     torch::jit::Stack* stack) {
@@ -124,5 +129,6 @@ TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
   m.fallback(
       torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
 }
+// LITERALINCLUDE END: FALLBACK GLOBAL
 
 } // namespace at::openreg
diff --git a/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp b/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
index 741d14803539..e0695475f3ee 100644
--- a/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
+++ b/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/aten/native/Extra.cpp
@@ -211,6 +211,7 @@ struct CustomAutogradFnReturnsSelf
   }
 };
 
+// LITERALINCLUDE START: TORCH.AUTOGRAD.FUNCTION Part1
 struct CustomAutogradFnAliasing
     : public torch::autograd::Function<CustomAutogradFnAliasing> {
   static at::Tensor forward(
@@ -225,14 +226,17 @@ struct CustomAutogradFnAliasing
     return {grad_output[0] * 0.5};
   }
 };
+// LITERALINCLUDE END: TORCH.AUTOGRAD.FUNCTION Part1
 
 } // namespace
 
 at::Tensor custom_autograd_fn_returns_self(at::Tensor x) {
   return CustomAutogradFnReturnsSelf::apply(x);
 }
 
+// LITERALINCLUDE START: TORCH.AUTOGRAD.FUNCTION Part2
 at::Tensor custom_autograd_fn_aliasing(at::Tensor x) {
   return CustomAutogradFnAliasing::apply(x);
 }
+// LITERALINCLUDE END: TORCH.AUTOGRAD.FUNCTION Part2
 } // namespace at::native