Skip to content

Commit 568bf25

Browse files
committed
Update base for Update on "[inductor][triton] support profile_scratch launcher arg"
This adds support for Triton after triton-lang/triton#7258 landed. triton-lang/triton#7258 adds a new argument to all the Triton kernels - a profile_scratch argument, similar to global_scratch. This PR updates the static cuda launcher and the AOTI kernel callers to pass in these arguments when calling the Triton kernel. Tests: #159158. I also verified these tests locally with triton 3.2, 3.3, and 3.4. Fixes: * static_cuda_launcher (test/repro: `python tools/dynamo/verify_dynamo.py`) * AOTI calling logic (test/repro: `TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_linalg_vander_cuda_float32`) cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben Differential Revision: [D79825121](https://our.internmc.facebook.com/intern/diff/D79825121) [ghstack-poisoned]
2 parents 90fb4c0 + 195b5c2 commit 568bf25

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

63 files changed

+1177
-870
lines changed

.ci/docker/common/install_cpython.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,9 @@ function do_cpython_build {
6666
ln -s pip3 ${prefix}/bin/pip
6767
fi
6868
# install setuptools since python 3.12 is required to use distutils
69-
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
70-
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
69+
# packaging is needed to create symlink since wheel no longer provides needed information
70+
${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
71+
local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
7172
ln -sf ${prefix} /opt/python/${abi_tag}
7273
}
7374

.ci/docker/common/install_xpu.sh

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,27 @@ function install_ubuntu() {
3434

3535
# The xpu-smi packages
3636
apt-get install -y flex bison xpu-smi
37-
# Compute and Media Runtimes
38-
apt-get install -y \
39-
intel-opencl-icd intel-level-zero-gpu level-zero \
40-
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
41-
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
42-
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
43-
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
44-
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
45-
apt-get install -y intel-ocloc
37+
38+
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
39+
# Compute and Media Runtimes
40+
apt-get install -y \
41+
intel-opencl-icd intel-level-zero-gpu level-zero \
42+
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
43+
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
44+
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
45+
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
46+
# Development Packages
47+
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
48+
else # rolling driver
49+
apt-get install -y \
50+
intel-opencl-icd libze-intel-gpu1 libze1 \
51+
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
52+
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
53+
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
54+
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
55+
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
4656
fi
47-
# Development Packages
48-
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
57+
4958
# Install Intel Support Packages
5059
apt-get install -y ${XPU_PACKAGES}
5160

@@ -130,11 +139,11 @@ function install_sles() {
130139

131140
}
132141

133-
# Default use GPU driver LTS releases
134-
XPU_DRIVER_VERSION="/lts/2350"
135-
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
136-
# Use GPU driver rolling releases
137-
XPU_DRIVER_VERSION=""
142+
# Default use GPU driver rolling releases
143+
XPU_DRIVER_VERSION=""
144+
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
145+
# Use GPU driver LTS releases
146+
XPU_DRIVER_VERSION="/lts/2350"
138147
fi
139148

140149
# Default use Intel® oneAPI Deep Learning Essentials 2025.0

.ci/docker/requirements-ci.txt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ lark==0.12.0
6363
#Pinned versions: 0.12.0
6464
#test that import:
6565

66-
librosa>=0.6.2 ; python_version < "3.11"
67-
librosa==0.10.2 ; python_version == "3.12"
66+
librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
67+
librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
6868
#Description: A python package for music and audio analysis
6969
#Pinned versions: >=0.6.2
7070
#test that import: test_spectral_ops.py
71+
#librosa depends on numba; disable it for s390x while numba is disabled too
7172

7273
#mkl #this breaks linux-bionic-rocm4.5-py3.7
7374
#Description: Intel oneAPI Math Kernel Library
@@ -110,14 +111,15 @@ ninja==1.11.1.3
110111
#Pinned versions: 1.11.1.3
111112
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
112113

113-
numba==0.49.0 ; python_version < "3.9"
114-
numba==0.55.2 ; python_version == "3.9"
115-
numba==0.55.2 ; python_version == "3.10"
116-
numba==0.60.0 ; python_version == "3.12"
114+
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
115+
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
116+
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
117+
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
117118
#Description: Just-In-Time Compiler for Numerical Functions
118119
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
119120
#test that import: test_numba_integration.py
120121
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
122+
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
121123

122124
#numpy
123125
#Description: Provides N-dimensional arrays and linear algebra
@@ -307,7 +309,7 @@ pytest-cpp==2.3.0
307309
#Pinned versions: 2.3.0
308310
#test that import:
309311

310-
z3-solver==4.15.1.0
312+
z3-solver==4.15.1.0 ; platform_machine != "s390x"
311313
#Description: The Z3 Theorem Prover Project
312314
#Pinned versions:
313315
#test that import:

.github/workflows/pull.yml

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -304,30 +304,6 @@ jobs:
304304
]}
305305
secrets: inherit
306306

307-
linux-jammy-py3_9-clang9-xla-build:
308-
name: linux-jammy-py3_9-clang9-xla
309-
uses: ./.github/workflows/_linux-build.yml
310-
needs: get-label-type
311-
with:
312-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
313-
build-environment: linux-jammy-py3.9-clang9-xla
314-
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
315-
test-matrix: |
316-
{ include: [
317-
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
318-
]}
319-
secrets: inherit
320-
321-
linux-jammy-py3_9-clang9-xla-test:
322-
name: linux-jammy-py3_9-clang9-xla
323-
uses: ./.github/workflows/_linux-test.yml
324-
needs: linux-jammy-py3_9-clang9-xla-build
325-
with:
326-
build-environment: linux-jammy-py3.9-clang9-xla
327-
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
328-
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
329-
secrets: inherit
330-
331307
linux-jammy-cpu-py3_10-gcc11-bazel-test:
332308
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
333309
uses: ./.github/workflows/_bazel-build-test.yml

.github/workflows/unstable.yml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ concurrency:
1212
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
1313
cancel-in-progress: true
1414

15-
permissions: read-all
15+
permissions:
16+
id-token: write
17+
contents: read
1618

1719
jobs:
1820
# There must be at least one job here to satisfy GitHub action workflow syntax
@@ -51,3 +53,27 @@ jobs:
5153
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
5254
curr_branch: ${{ github.head_ref || github.ref_name }}
5355
curr_ref_type: ${{ github.ref_type }}
56+
57+
linux-jammy-py3_9-clang9-xla-build:
58+
name: linux-jammy-py3_9-clang9-xla
59+
uses: ./.github/workflows/_linux-build.yml
60+
needs: get-label-type
61+
with:
62+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
63+
build-environment: linux-jammy-py3.9-clang9-xla
64+
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
65+
test-matrix: |
66+
{ include: [
67+
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
68+
]}
69+
secrets: inherit
70+
71+
linux-jammy-py3_9-clang9-xla-test:
72+
name: linux-jammy-py3_9-clang9-xla
73+
uses: ./.github/workflows/_linux-test.yml
74+
needs: linux-jammy-py3_9-clang9-xla-build
75+
with:
76+
build-environment: linux-jammy-py3.9-clang9-xla
77+
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
78+
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
79+
secrets: inherit

aten/src/ATen/DeviceAccelerator.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <c10/core/CachingDeviceAllocator.h>
43
#include <c10/core/DeviceType.h>
54
#include <c10/macros/Macros.h>
65

@@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
7372
// original device index that was active before the change.
7473
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
7574

76-
TORCH_API inline void emptyCache() {
77-
const auto device_type = getAccelerator(true).value();
78-
at::getDeviceAllocator(device_type)->emptyCache();
79-
}
80-
81-
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
82-
c10::DeviceIndex device_index) {
83-
const auto device_type = getAccelerator(true).value();
84-
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
85-
}
86-
87-
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
88-
const auto device_type = getAccelerator(true).value();
89-
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
90-
}
91-
92-
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
93-
const auto device_type = getAccelerator(true).value();
94-
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
95-
}
96-
9775
} // namespace at::accelerator
9876

9977
namespace at {

aten/src/ATen/cuda/CUDAGraph.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <ATen/cuda/CUDAGraph.h>
33
#include <ATen/cuda/Exceptions.h>
44
#include <ATen/Functions.h>
5+
#include <c10/cuda/CUDACachingAllocator.h>
56
#include <c10/cuda/CUDAFunctions.h>
67

78
#include <cstddef>

aten/src/ATen/cuda/CUDAGraph.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
#include <ATen/Tensor.h>
44
#include <c10/core/Device.h>
5-
#include <c10/cuda/CUDACachingAllocator.h>
65
#include <c10/cuda/CUDAGraphsC10Utils.h>
76
#include <c10/cuda/CUDAStream.h>
87
#include <c10/util/flat_hash_map.h>

aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,14 +333,14 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
333333
weight.scalar_type() == at::ScalarType::Float ||
334334
weight.scalar_type() == at::ScalarType::Half,
335335
"'embedding_bag_byte_prepack' only support float32 or float16.");
336-
const auto weight_sizes = weight.sizes();
337-
const auto cols_dim = weight_sizes.size() - 1;
338-
const int32_t embedding_cols = static_cast<int32_t>(weight_sizes[cols_dim]);
336+
const auto weight_sizes = weight.sym_sizes();
337+
const auto cols_dim = weight.ndimension() - 1;
338+
const auto embedding_cols = weight_sizes[cols_dim];
339339
// Add 8 bytes per column to store FP32 scale and zero_point per row.
340-
const int32_t output_columns = static_cast<int32_t>(embedding_cols + 2 * sizeof(float));
340+
const auto output_columns = embedding_cols + 2 * sizeof(float);
341341

342342
// Adjust output dimensions to account for FP32 scale and zero_points.
343-
std::vector<int64_t> output_shape = weight_sizes.vec();
343+
auto output_shape = weight_sizes.vec();
344344
output_shape.at(cols_dim) = output_columns;
345345
at::SymDimVector output_shape_vec(output_shape);
346346

c10/core/CachingDeviceAllocator.cpp

Lines changed: 0 additions & 10 deletions
This file was deleted.

0 commit comments

Comments (0)