Skip to content

Commit 10ce9c5

Browse files
committed
Update on "[inductor] add lowering for repeat_interleave.Tensor with output size specified (#147160)"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben [ghstack-poisoned]
2 parents b2965b9 + fc60ce8 commit 10ce9c5

File tree

78 files changed

+2317
-990
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

78 files changed

+2317
-990
lines changed

.ci/docker/common/install_cuda.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ function install_nvshmem {
6868
# download, unpack, install
6969
wget -q "${url}"
7070
tar xf "${filename}.tar.gz"
71-
cp -a "libnvshmem/include/"* /usr/local/include/
72-
cp -a "libnvshmem/lib/"* /usr/local/lib/
71+
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
72+
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
7373

7474
# cleanup
7575
cd ..

.ci/pytorch/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ fi
176176

177177
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
178178
# memory to build and will OOM
179-
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
179+
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
180180
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
181181
fi
182182

.ci/wheel/build_wheel.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,6 @@ retry brew install libomp
192192
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
193193
export USE_DISTRIBUTED=1
194194

195-
if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
196-
export CMAKE_OSX_ARCHITECTURES=arm64
197-
fi
198195
export USE_MKLDNN=OFF
199196
export USE_QNNPACK=OFF
200197
export BUILD_TEST=OFF

.github/scripts/generate_ci_workflows.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ class BinaryBuildWorkflow:
5959
is_scheduled: str = ""
6060
branches: str = "nightly"
6161
# Mainly for macos
62-
cross_compile_arm64: bool = False
6362
macos_runner: str = "macos-14-xlarge"
6463
use_split_build: bool = False
6564
# Mainly used for libtorch builds
@@ -338,7 +337,6 @@ class OperatingSystem:
338337
generate_binary_build_matrix.RELEASE,
339338
libtorch_variants=["shared-with-deps"],
340339
),
341-
cross_compile_arm64=False,
342340
macos_runner="macos-14-xlarge",
343341
ciflow_config=CIFlowConfig(
344342
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@@ -351,7 +349,6 @@ class OperatingSystem:
351349
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
352350
OperatingSystem.MACOS_ARM64
353351
),
354-
cross_compile_arm64=False,
355352
macos_runner="macos-14-xlarge",
356353
ciflow_config=CIFlowConfig(
357354
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},

.github/templates/macos_binary_build_workflow.yml.j2

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,6 @@ env:
4747
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
4848
PR_NUMBER: ${{ github.event.pull_request.number }}
4949
SKIP_ALL_TESTS: 0
50-
{%- if cross_compile_arm64 %}
51-
CROSS_COMPILE_ARM64: 1
52-
{% endif %}
5350
!{{ common.concurrency(build_environment) }}
5451

5552
jobs:

.github/workflows/pull.yml

Lines changed: 5 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -254,36 +254,6 @@ jobs:
254254
timeout-minutes: 600
255255
secrets: inherit
256256

257-
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
258-
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
259-
uses: ./.github/workflows/_linux-build.yml
260-
needs: get-label-type
261-
with:
262-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
263-
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
264-
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
265-
cuda-arch-list: '7.5'
266-
test-matrix: |
267-
{ include: [
268-
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
269-
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
270-
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
271-
]}
272-
secrets: inherit
273-
274-
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
275-
name: linux-jammy-cuda12.8-py3.10-gcc11-test
276-
uses: ./.github/workflows/_linux-test.yml
277-
needs:
278-
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
279-
- target-determination
280-
with:
281-
timeout-minutes: 360
282-
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
283-
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
284-
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
285-
secrets: inherit
286-
287257
linux-jammy-cuda12_8-py3_10-gcc11-build:
288258
name: linux-jammy-cuda12.8-py3.10-gcc11
289259
uses: ./.github/workflows/_linux-build.yml
@@ -292,14 +262,18 @@ jobs:
292262
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
293263
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
294264
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
295-
cuda-arch-list: 8.9
265+
cuda-arch-list: '7.5 8.9'
296266
test-matrix: |
297267
{ include: [
298268
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
299269
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
300270
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
301271
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
302272
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
273+
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
274+
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
275+
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
276+
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
303277
]}
304278
secrets: inherit
305279

@@ -429,31 +403,6 @@ jobs:
429403
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
430404
secrets: inherit
431405

432-
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
433-
name: cuda12.8-py3.10-gcc9-sm75
434-
uses: ./.github/workflows/_linux-build.yml
435-
needs: get-label-type
436-
with:
437-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
438-
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
439-
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
440-
cuda-arch-list: '7.5'
441-
test-matrix: |
442-
{ include: [
443-
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
444-
]}
445-
secrets: inherit
446-
447-
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
448-
name: cuda12.8-py3.10-gcc9-sm75
449-
uses: ./.github/workflows/_linux-test.yml
450-
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
451-
with:
452-
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
453-
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
454-
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
455-
secrets: inherit
456-
457406
linux-jammy-xpu-2025_1-py3_9-build:
458407
name: linux-jammy-xpu-2025.1-py3.9
459408
uses: ./.github/workflows/_linux-build.yml

aten/src/ATen/mps/EmptyTensor.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ TensorBase empty_mps(
4343
int64_t nelements = c10::multiply_integers(size);
4444
auto dtype = dtype_or_default(dtype_opt);
4545
TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED);
46-
TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer");
4746

4847

4948
auto dtype_meta = scalarTypeToTypeMeta(dtype);

aten/src/ATen/mps/MPSDevice.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,7 @@ namespace at::mps {
1818

1919
// Helper enum to check if a MPSGraph op is supported in a given macOS version
2020
enum class MacOSVersion : uint32_t {
21-
MACOS_VER_13_1_PLUS = 0,
22-
MACOS_VER_13_2_PLUS,
23-
MACOS_VER_13_3_PLUS,
24-
MACOS_VER_14_0_PLUS,
25-
MACOS_VER_14_4_PLUS,
21+
MACOS_VER_14_4_PLUS = 0,
2622
MACOS_VER_15_0_PLUS,
2723
MACOS_VER_15_1_PLUS,
2824
MACOS_VER_15_2_PLUS,

aten/src/ATen/mps/MPSDevice.mm

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
3232

3333
MPSDevice::MPSDevice() : _mtl_device(nil) {
3434
// Check that MacOS 13.0+ version of MPS framework is available
35-
// Create the MPSGraph and check method introduced in 13.0
35+
// Create the MPSGraph and check method introduced in 14.0
3636
// which is used by MPS backend.
3737
id mpsCD = NSClassFromString(@"MPSGraph");
3838

39-
if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) {
39+
if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) {
4040
return;
4141
}
4242

@@ -66,24 +66,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
6666
isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}];
6767
}
6868
};
69-
static bool _macos_13_1_plus = is_os_version_at_least(13, 1);
70-
static bool _macos_13_2_plus = is_os_version_at_least(13, 2);
71-
static bool _macos_13_3_plus = is_os_version_at_least(13, 3);
72-
static bool _macos_14_0_plus = is_os_version_at_least(14, 0);
7369
static bool _macos_14_4_plus = is_os_version_at_least(14, 4);
7470
static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
7571
static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
7672
static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
7773

7874
switch (version) {
79-
case MacOSVersion::MACOS_VER_13_1_PLUS:
80-
return _macos_13_1_plus;
81-
case MacOSVersion::MACOS_VER_13_2_PLUS:
82-
return _macos_13_2_plus;
83-
case MacOSVersion::MACOS_VER_13_3_PLUS:
84-
return _macos_13_3_plus;
85-
case MacOSVersion::MACOS_VER_14_0_PLUS:
86-
return _macos_14_0_plus;
8775
case MacOSVersion::MACOS_VER_14_4_PLUS:
8876
return _macos_14_4_plus;
8977
case MacOSVersion::MACOS_VER_15_0_PLUS:

aten/src/ATen/mps/MPSHooks.mm

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,27 +34,15 @@
3434
case 14:
3535
switch (minor) {
3636
case 0:
37-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS);
37+
return true;
3838
case 4:
3939
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
4040
default:
4141
TORCH_WARN("Can't check whether running on 14.", minor, "+ returning one for 14.4+");
4242
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
4343
}
4444
case 13:
45-
switch (minor) {
46-
case 0:
47-
return true;
48-
case 1:
49-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS);
50-
case 2:
51-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS);
52-
case 3:
53-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
54-
default:
55-
TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+");
56-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
57-
}
45+
return true;
5846
default:
5947
TORCH_WARN("Checking for unexpected MacOS ", major, ".", minor, " returning false");
6048
return false;

0 commit comments

Comments
 (0)