
Commit ffe7bea

[INTEL_HPU] add HPU stack kernel (#1872)
1 parent 7a8cc78 commit ffe7bea

File tree

2 files changed: 190 additions & 0 deletions
Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "kernels/funcs.h"
#include "kernels/hpu_funcs.h"
#include "kernels/hpu_operator.h"
#include "utils/utils.h"

namespace custom_kernel {

class Stack : public HpuFusedOperator {
 public:
  explicit Stack(synDataType dtype)
      : HpuFusedOperator("stack"), dtype_(dtype) {}

  void AddNode(ConvertTensors& ct, unsigned params) {
    auto inputs = ct.GetTensors();
    auto outputs = ct.GetTensors(false);

    std::vector<synTensor> syn_inputs;
    for (size_t i = 0; i < inputs.size(); i++) {
      syn_inputs.push_back(createTensorFromCT(&ct, i));
    }

    auto concat_dims = outputs[0].dims;

    // Merge concat_dims[reduce_dim] and concat_dims[reduce_dim + 1] (the
    // stack axis and its inner neighbor) into one dimension, so the concat
    // output can later be reshaped into the stacked shape.
    auto reduce_dim = concat_dims.size() - 2 - params;
    concat_dims[reduce_dim] *= concat_dims[reduce_dim + 1];
    concat_dims.erase(concat_dims.begin() + reduce_dim + 1);

    std::vector<synTensor> outputs_concat;
    auto concated = createTensorNoPresist("concat", dtype_, concat_dims);
    outputs_concat.push_back(concated);

    synConcatenateParams concatParams;
    concatParams.axis = params;
    AddNodeConcat(syn_inputs, outputs_concat, concatParams, guid_ + "concat");

    std::vector<synTensor> syn_outputs;
    auto stacked = createTensorFromCT(&ct, 0, false);
    syn_outputs.push_back(stacked);

    AddNodeReshape(outputs_concat, syn_outputs, guid_ + "reshape");
  }

 protected:
  synDataType dtype_;
};

template <typename T, typename Context>
void StackKernel(const Context& dev_ctx,
                 const std::vector<const phi::DenseTensor*>& x,
                 int axis,
                 phi::DenseTensor* y) {
  dev_ctx.template Alloc<T>(y);

  ConvertTensors ct;
  for (size_t i = 0; i < x.size(); i++) {
    ct.Add(x[i]);
  }
  ct.Add(y, false);

  // Normalize a possibly negative axis, then flip it into Synapse's
  // reversed dimension order.
  axis = CanonicalAxis(static_cast<int64_t>(axis),
                       static_cast<int64_t>(x[0]->dims().size()));
  axis = static_cast<int64_t>(x[0]->dims().size()) - 1 - axis;
  unsigned params = static_cast<unsigned>(axis);

  std::vector<DIMS> inputs_dims = ct.GetDims();
  OpCacheOperator op_info;
  op_info.prepareOpInfo<T, unsigned>("StackKernel", inputs_dims, &params);
  auto recipe = op_info.GetRecipe();

  if (recipe == nullptr) {
    Stack op(op_info.datatype_);
    op.AddNode(ct, params);
    op.Compile();
    op_info.setOp(op);
    recipe = op_info.GetRecipe();
  }

  RecipeRunner runner(recipe);
  auto tensors = ct.GetDeviceAddr();
  runner.Run(reinterpret_cast<C_Stream>(dev_ctx.stream()), tensors);
}

}  // namespace custom_kernel

PD_REGISTER_PLUGIN_KERNEL(stack,
                          intel_hpu,
                          ALL_LAYOUT,
                          custom_kernel::StackKernel,
                          float,
                          int64_t,
                          phi::dtype::float16,
                          phi::dtype::bfloat16,
                          phi::dtype::float8_e4m3fn) {}
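
The kernel lowers stack to a single concat followed by a reshape: the inputs are concatenated along the stack axis, and the merged dimension is then split back into (num_inputs, original_dim). A minimal numpy sketch of that equivalence (illustration only, not part of the commit; numpy's concatenate/reshape stand in for the Synapse concat and reshape nodes, and the reversed Synapse axis order is omitted):

import numpy as np

# Stack four tensors of shape (5, 6, 7) along axis=1.
xs = [np.random.rand(5, 6, 7) for _ in range(4)]
axis = 1

# Reference result.
stacked = np.stack(xs, axis=axis)         # shape (5, 4, 6, 7)

# The kernel's lowering: concatenate along the stack axis, then reshape.
# Concat along axis=1 lays the inputs out block by block as (5, 24, 7);
# viewing that merged axis as (4, 6) recovers the stacked layout.
concat = np.concatenate(xs, axis=axis)    # shape (5, 24, 7)
reshaped = concat.reshape(stacked.shape)  # shape (5, 4, 6, 7)

assert np.array_equal(stacked, reshaped)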
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest

import numpy as np
import paddle
from tests.op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float

import os

intel_hpus_module_id = os.environ.get("FLAGS_selected_intel_hpus", 0)


class TestStackOpBf16(OpTest):
    def initDefaultParameters(self):
        self.num_inputs = 4
        self.input_dim = (5, 6, 7)
        self.axis = 0

    def initParameters(self):
        pass

    def get_x_names(self):
        x_names = []
        for i in range(self.num_inputs):
            x_names.append("x{}".format(i))
        return x_names

    def setUp(self):
        self.initDefaultParameters()
        self.initParameters()
        self.op_type = "stack"
        self.set_hpu()
        self.init_dtype()
        self.x = []
        self.y = []
        for i in range(self.num_inputs):
            self.x.append(
                convert_float_to_uint16(
                    np.random.random(size=self.input_dim).astype(np.float32)
                )
            )

        tmp = []
        x_names = self.get_x_names()
        for i in range(self.num_inputs):
            tmp.append((x_names[i], self.x[i]))

        self.inputs = {"X": tmp}
        for i in self.x:
            self.y.append(convert_uint16_to_float(i))
        self.outputs = {"Y": np.stack(self.y, axis=self.axis)}
        self.attrs = {"axis": self.axis}

    def set_hpu(self):
        self.__class__.use_custom_device = True
        self.__class__.no_need_check_grad = True
        self.place = paddle.CustomPlace("intel_hpu", int(intel_hpus_module_id))

    def init_dtype(self):
        self.dtype = np.float32

    def test_check_output(self):
        self.check_output_with_place(self.place)


if __name__ == "__main__":
    unittest.main()
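
Beyond the OpTest case above, a hand-run smoke test along these lines should hit the new kernel. This is a hypothetical sketch, assuming a PaddleCustomDevice build with the intel_hpu plugin installed and an HPU visible as device 0:

import numpy as np
import paddle

# Assumes the intel_hpu plugin is installed and device 0 is available.
paddle.set_device("intel_hpu:0")

xs = [
    paddle.to_tensor(np.random.rand(5, 6, 7).astype("float32"))
    for _ in range(4)
]
out = paddle.stack(xs, axis=0)  # dispatched to custom_kernel::StackKernel

print(out.shape)  # [4, 5, 6, 7]
np.testing.assert_allclose(
    out.numpy(), np.stack([x.numpy() for x in xs], axis=0)
)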
