Skip to content

Commit 568bf25

Browse files
committed
Update base for Update on "[inductor][triton] support profile_scratch launcher arg"
This adds support for Triton after triton-lang/triton#7258 landed. triton-lang/triton#7258 adds a new argument to all the Triton kernels - a profile_scratch argument, similar to global_scratch. This PR updates the static cuda launcher and the AOTI kernel callers to pass in these arguments when calling the Triton kernel. Tests: #159158. I also verified these tests locally with triton 3.2, 3.3, and 3.4. Fixes: * static_cuda_launcher (test/repro: `python tools/dynamo/verify_dynamo.py`) * AOTI calling logic (test/repro: `TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_linalg_vander_cuda_float32`) cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben Differential Revision: [D79825121](https://our.internmc.facebook.com/intern/diff/D79825121) [ghstack-poisoned]
2 parents 90fb4c0 + 195b5c2 commit 568bf25

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

63 files changed

+1177
-870
lines changed

.ci/docker/common/install_cpython.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,9 @@ function do_cpython_build {
6666
ln -s pip3 ${prefix}/bin/pip
6767
fi
6868
# install setuptools since python 3.12 is required to use distutils
69-
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
70-
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
69+
# packaging is needed to create symlink since wheel no longer provides needed information
70+
${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
71+
local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
7172
ln -sf ${prefix} /opt/python/${abi_tag}
7273
}
7374

.ci/docker/common/install_xpu.sh

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,27 @@ function install_ubuntu() {
3434

3535
# The xpu-smi packages
3636
apt-get install -y flex bison xpu-smi
37-
# Compute and Media Runtimes
38-
apt-get install -y \
39-
intel-opencl-icd intel-level-zero-gpu level-zero \
40-
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
41-
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
42-
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
43-
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
44-
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
45-
apt-get install -y intel-ocloc
37+
38+
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
39+
# Compute and Media Runtimes
40+
apt-get install -y \
41+
intel-opencl-icd intel-level-zero-gpu level-zero \
42+
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
43+
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
44+
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
45+
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
46+
# Development Packages
47+
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
48+
else # rolling driver
49+
apt-get install -y \
50+
intel-opencl-icd libze-intel-gpu1 libze1 \
51+
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
52+
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
53+
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
54+
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
55+
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
4656
fi
47-
# Development Packages
48-
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
57+
4958
# Install Intel Support Packages
5059
apt-get install -y ${XPU_PACKAGES}
5160

@@ -130,11 +139,11 @@ function install_sles() {
130139

131140
}
132141

133-
# Default use GPU driver LTS releases
134-
XPU_DRIVER_VERSION="/lts/2350"
135-
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
136-
# Use GPU driver rolling releases
137-
XPU_DRIVER_VERSION=""
142+
# Default use GPU driver rolling releases
143+
XPU_DRIVER_VERSION=""
144+
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
145+
# Use GPU driver LTS releases
146+
XPU_DRIVER_VERSION="/lts/2350"
138147
fi
139148

140149
# Default use Intel® oneAPI Deep Learning Essentials 2025.0

.ci/docker/requirements-ci.txt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ lark==0.12.0
6363
#Pinned versions: 0.12.0
6464
#test that import:
6565

66-
librosa>=0.6.2 ; python_version < "3.11"
67-
librosa==0.10.2 ; python_version == "3.12"
66+
librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
67+
librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
6868
#Description: A python package for music and audio analysis
6969
#Pinned versions: >=0.6.2
7070
#test that import: test_spectral_ops.py
71+
#librosa depends on numba; disable it for s390x while numba is disabled too
7172

7273
#mkl #this breaks linux-bionic-rocm4.5-py3.7
7374
#Description: Intel oneAPI Math Kernel Library
@@ -110,14 +111,15 @@ ninja==1.11.1.3
110111
#Pinned versions: 1.11.1.3
111112
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
112113

113-
numba==0.49.0 ; python_version < "3.9"
114-
numba==0.55.2 ; python_version == "3.9"
115-
numba==0.55.2 ; python_version == "3.10"
116-
numba==0.60.0 ; python_version == "3.12"
114+
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
115+
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
116+
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
117+
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
117118
#Description: Just-In-Time Compiler for Numerical Functions
118119
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
119120
#test that import: test_numba_integration.py
120121
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
122+
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
121123

122124
#numpy
123125
#Description: Provides N-dimensional arrays and linear algebra
@@ -307,7 +309,7 @@ pytest-cpp==2.3.0
307309
#Pinned versions: 2.3.0
308310
#test that import:
309311

310-
z3-solver==4.15.1.0
312+
z3-solver==4.15.1.0 ; platform_machine != "s390x"
311313
#Description: The Z3 Theorem Prover Project
312314
#Pinned versions:
313315
#test that import:

.github/workflows/pull.yml

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -304,30 +304,6 @@ jobs:
304304
]}
305305
secrets: inherit
306306

307-
linux-jammy-py3_9-clang9-xla-build:
308-
name: linux-jammy-py3_9-clang9-xla
309-
uses: ./.github/workflows/_linux-build.yml
310-
needs: get-label-type
311-
with:
312-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
313-
build-environment: linux-jammy-py3.9-clang9-xla
314-
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
315-
test-matrix: |
316-
{ include: [
317-
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
318-
]}
319-
secrets: inherit
320-
321-
linux-jammy-py3_9-clang9-xla-test:
322-
name: linux-jammy-py3_9-clang9-xla
323-
uses: ./.github/workflows/_linux-test.yml
324-
needs: linux-jammy-py3_9-clang9-xla-build
325-
with:
326-
build-environment: linux-jammy-py3.9-clang9-xla
327-
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
328-
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
329-
secrets: inherit
330-
331307
linux-jammy-cpu-py3_10-gcc11-bazel-test:
332308
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
333309
uses: ./.github/workflows/_bazel-build-test.yml

.github/workflows/unstable.yml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ concurrency:
1212
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
1313
cancel-in-progress: true
1414

15-
permissions: read-all
15+
permissions:
16+
id-token: write
17+
contents: read
1618

1719
jobs:
1820
# There must be at least one job here to satisfy GitHub action workflow syntax
@@ -51,3 +53,27 @@ jobs:
5153
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
5254
curr_branch: ${{ github.head_ref || github.ref_name }}
5355
curr_ref_type: ${{ github.ref_type }}
56+
57+
linux-jammy-py3_9-clang9-xla-build:
58+
name: linux-jammy-py3_9-clang9-xla
59+
uses: ./.github/workflows/_linux-build.yml
60+
needs: get-label-type
61+
with:
62+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
63+
build-environment: linux-jammy-py3.9-clang9-xla
64+
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
65+
test-matrix: |
66+
{ include: [
67+
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
68+
]}
69+
secrets: inherit
70+
71+
linux-jammy-py3_9-clang9-xla-test:
72+
name: linux-jammy-py3_9-clang9-xla
73+
uses: ./.github/workflows/_linux-test.yml
74+
needs: linux-jammy-py3_9-clang9-xla-build
75+
with:
76+
build-environment: linux-jammy-py3.9-clang9-xla
77+
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
78+
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
79+
secrets: inherit

aten/src/ATen/DeviceAccelerator.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <c10/core/CachingDeviceAllocator.h>
43
#include <c10/core/DeviceType.h>
54
#include <c10/macros/Macros.h>
65

@@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
7372
// original device index that was active before the change.
7473
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
7574

76-
TORCH_API inline void emptyCache() {
77-
const auto device_type = getAccelerator(true).value();
78-
at::getDeviceAllocator(device_type)->emptyCache();
79-
}
80-
81-
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
82-
c10::DeviceIndex device_index) {
83-
const auto device_type = getAccelerator(true).value();
84-
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
85-
}
86-
87-
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
88-
const auto device_type = getAccelerator(true).value();
89-
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
90-
}
91-
92-
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
93-
const auto device_type = getAccelerator(true).value();
94-
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
95-
}
96-
9775
} // namespace at::accelerator
9876

9977
namespace at {

aten/src/ATen/cuda/CUDAGraph.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <ATen/cuda/CUDAGraph.h>
33
#include <ATen/cuda/Exceptions.h>
44
#include <ATen/Functions.h>
5+
#include <c10/cuda/CUDACachingAllocator.h>
56
#include <c10/cuda/CUDAFunctions.h>
67

78
#include <cstddef>

aten/src/ATen/cuda/CUDAGraph.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
#include <ATen/Tensor.h>
44
#include <c10/core/Device.h>
5-
#include <c10/cuda/CUDACachingAllocator.h>
65
#include <c10/cuda/CUDAGraphsC10Utils.h>
76
#include <c10/cuda/CUDAStream.h>
87
#include <c10/util/flat_hash_map.h>

aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,14 +333,14 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
333333
weight.scalar_type() == at::ScalarType::Float ||
334334
weight.scalar_type() == at::ScalarType::Half,
335335
"'embedding_bag_byte_prepack' only support float32 or float16.");
336-
const auto weight_sizes = weight.sizes();
337-
const auto cols_dim = weight_sizes.size() - 1;
338-
const int32_t embedding_cols = static_cast<int32_t>(weight_sizes[cols_dim]);
336+
const auto weight_sizes = weight.sym_sizes();
337+
const auto cols_dim = weight.ndimension() - 1;
338+
const auto embedding_cols = weight_sizes[cols_dim];
339339
// Add 8 bytes per column to store FP32 scale and zero_point per row.
340-
const int32_t output_columns = static_cast<int32_t>(embedding_cols + 2 * sizeof(float));
340+
const auto output_columns = embedding_cols + 2 * sizeof(float);
341341

342342
// Adjust output dimensions to account for FP32 scale and zero_points.
343-
std::vector<int64_t> output_shape = weight_sizes.vec();
343+
auto output_shape = weight_sizes.vec();
344344
output_shape.at(cols_dim) = output_columns;
345345
at::SymDimVector output_shape_vec(output_shape);
346346

c10/core/CachingDeviceAllocator.cpp

Lines changed: 0 additions & 10 deletions
This file was deleted.

0 commit comments

Comments (0)