Skip to content

Commit 132abd4

Browse files
committed
Merge branch 'refs/heads/main' into moksiucik_torchrun_xpu
2 parents 00e96f7 + d35b27d commit 132abd4

File tree

159 files changed

+3027
-4081
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

159 files changed

+3027
-4081
lines changed

.ci/docker/common/install_triton.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,5 +103,5 @@ fi
103103
# It depends on torch and triton. We don't want to install
104104
# triton and torch from production on Docker CI images
105105
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
106-
pip_install helion==0.0.10 --no-deps
106+
pip_install helion --no-deps
107107
fi

.ci/manywheel/build_rocm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
194194
ROCBLAS_LIB_DST=lib/rocblas/library
195195
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
196196
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
197-
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
197+
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
198198

199199
# hipblaslt library files
200200
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library

.ci/pytorch/test.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,8 @@ test_perf_for_dashboard() {
627627
device=cuda_a10g
628628
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
629629
device=cuda_h100
630+
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
631+
device=cuda_b200
630632
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
631633
device=rocm
632634
fi
@@ -801,6 +803,16 @@ test_dynamo_benchmark() {
801803
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
802804
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
803805
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
806+
# TODO (huydhn): Just smoke test some sample models
807+
if [[ "${TEST_CONFIG}" == *b200* ]]; then
808+
if [[ "${suite}" == "huggingface" ]]; then
809+
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
810+
elif [[ "${suite}" == "timm_models" ]]; then
811+
export TORCHBENCH_ONLY_MODELS="inception_v3"
812+
elif [[ "${suite}" == "torchbench" ]]; then
813+
export TORCHBENCH_ONLY_MODELS="hf_Bert"
814+
fi
815+
fi
804816
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
805817
else
806818
if [[ "${TEST_CONFIG}" == *cpu* ]]; then

.github/ci_commit_pins/audio.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
bf305f538005f2e900f8850ed57146024a8bc559
1+
9b57c7bd5ad4db093c5bb31c802df9f04d933ac9

.github/ci_commit_pins/vllm.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
ca9e2be3ed6320b51f52f536595cd24e254f8bb2
1+
6a39ba85fe0f2fff9494b5eccea717c93510c230

.github/workflows/_linux-test.yml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ jobs:
9696
steps:
9797
- name: Setup SSH (Click me for login details)
9898
uses: pytorch/test-infra/.github/actions/setup-ssh@main
99-
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
99+
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
100100
with:
101101
github-secret: ${{ secrets.GITHUB_TOKEN }}
102102
instructions: |
@@ -109,15 +109,15 @@ jobs:
109109
no-sudo: true
110110

111111
- name: Setup Python
112-
if: matrix.runner == 'B200'
112+
if: contains(matrix.runner, 'b200')
113113
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
114114
with:
115115
python-version: '3.12'
116116
cache: pip
117117

118118
- name: Setup Linux
119119
uses: ./.github/actions/setup-linux
120-
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
120+
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
121121

122122
- name: configure aws credentials
123123
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@@ -128,7 +128,7 @@ jobs:
128128
aws-region: us-east-1
129129

130130
- name: Login to Amazon ECR
131-
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
131+
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
132132
id: login-ecr
133133
continue-on-error: true
134134
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@@ -166,17 +166,17 @@ jobs:
166166
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
167167
with:
168168
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
169-
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
169+
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
170170

171171
- name: Setup GPU_FLAG for docker run
172172
id: setup-gpu-flag
173173
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
174-
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
174+
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
175175

176176
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
177177
id: setup-sscache-port-flag
178178
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
179-
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
179+
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
180180

181181
- name: Lock NVIDIA A100 40GB Frequency
182182
run: |
@@ -277,8 +277,8 @@ jobs:
277277
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
278278
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
279279
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
280-
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
281-
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
280+
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
281+
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
282282
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
283283
DOCKER_IMAGE: ${{ inputs.docker-image }}
284284
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@@ -403,7 +403,7 @@ jobs:
403403
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
404404

405405
- name: Authenticate with AWS
406-
if: ${{ matrix.runner == 'B200' }}
406+
if: ${{ contains(matrix.runner, 'b200') }}
407407
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
408408
with:
409409
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
name: inductor-perf-b200
2+
3+
on:
4+
schedule:
5+
- cron: 0 7 * * 1-6
6+
- cron: 0 7 * * 0
7+
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
8+
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
9+
workflow_dispatch:
10+
inputs:
11+
training:
12+
description: Run training (on by default)?
13+
required: false
14+
type: boolean
15+
default: true
16+
inference:
17+
description: Run inference (on by default)?
18+
required: false
19+
type: boolean
20+
default: true
21+
default:
22+
description: Run inductor_default?
23+
required: false
24+
type: boolean
25+
default: false
26+
dynamic:
27+
description: Run inductor_dynamic_shapes?
28+
required: false
29+
type: boolean
30+
default: false
31+
cppwrapper:
32+
description: Run inductor_cpp_wrapper?
33+
required: false
34+
type: boolean
35+
default: false
36+
cudagraphs:
37+
description: Run inductor_cudagraphs?
38+
required: false
39+
type: boolean
40+
default: true
41+
freezing_cudagraphs:
42+
description: Run inductor_cudagraphs with freezing for inference?
43+
required: false
44+
type: boolean
45+
default: false
46+
aotinductor:
47+
description: Run aot_inductor for inference?
48+
required: false
49+
type: boolean
50+
default: false
51+
maxautotune:
52+
description: Run inductor_max_autotune?
53+
required: false
54+
type: boolean
55+
default: false
56+
benchmark_configs:
57+
description: The list of configs used the benchmark
58+
required: false
59+
type: string
60+
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
61+
62+
concurrency:
63+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
64+
cancel-in-progress: true
65+
66+
permissions:
67+
id-token: write
68+
contents: read
69+
70+
jobs:
71+
get-label-type:
72+
name: get-label-type
73+
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
74+
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
75+
with:
76+
triggering_actor: ${{ github.triggering_actor }}
77+
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
78+
curr_branch: ${{ github.head_ref || github.ref_name }}
79+
curr_ref_type: ${{ github.ref_type }}
80+
opt_out_experiments: lf
81+
82+
build:
83+
name: cuda12.8-py3.10-gcc9-sm100
84+
uses: ./.github/workflows/_linux-build.yml
85+
needs: get-label-type
86+
with:
87+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
88+
# Use a bigger runner here because CUDA_ARCH 10.0 is only built for B200
89+
# or newer GPUs, so it doesn't benefit much from existing compiler cache
90+
# from trunk. Also use a memory-intensive runner here because memory is
91+
# usually the bottleneck
92+
runner: linux.12xlarge.memory
93+
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
94+
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
95+
cuda-arch-list: '10.0'
96+
test-matrix: |
97+
{ include: [
98+
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
99+
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
100+
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
101+
]}
102+
selected-test-configs: ${{ inputs.benchmark_configs }}
103+
build-additional-packages: "vision audio fbgemm torchao"
104+
secrets: inherit
105+
106+
test-periodically:
107+
name: cuda12.8-py3.10-gcc9-sm100
108+
uses: ./.github/workflows/_linux-test.yml
109+
needs: build
110+
if: github.event.schedule == '0 7 * * 1-6'
111+
with:
112+
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
113+
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
114+
docker-image: ${{ needs.build.outputs.docker-image }}
115+
test-matrix: ${{ needs.build.outputs.test-matrix }}
116+
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
117+
timeout-minutes: 720
118+
disable-monitor: false
119+
monitor-log-interval: 15
120+
monitor-data-collect-interval: 4
121+
secrets: inherit
122+
123+
test-weekly:
124+
name: cuda12.8-py3.10-gcc9-sm100
125+
uses: ./.github/workflows/_linux-test.yml
126+
needs: build
127+
if: github.event.schedule == '0 7 * * 0'
128+
with:
129+
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
130+
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
131+
docker-image: ${{ needs.build.outputs.docker-image }}
132+
test-matrix: ${{ needs.build.outputs.test-matrix }}
133+
timeout-minutes: 1440
134+
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
135+
disable-monitor: false
136+
monitor-log-interval: 15
137+
monitor-data-collect-interval: 4
138+
secrets: inherit
139+
140+
test:
141+
name: cuda12.8-py3.10-gcc9-sm100
142+
uses: ./.github/workflows/_linux-test.yml
143+
needs: build
144+
with:
145+
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
146+
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
147+
docker-image: ${{ needs.build.outputs.docker-image }}
148+
test-matrix: ${{ needs.build.outputs.test-matrix }}
149+
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
150+
timeout-minutes: 720
151+
disable-monitor: false
152+
monitor-log-interval: 15
153+
monitor-data-collect-interval: 4
154+
secrets: inherit

.github/workflows/update-viablestrict.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
with:
2424
repository: pytorch/pytorch
2525
stable-branch: viable/strict
26-
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
26+
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
2727
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
2828
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
2929
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

aten/src/ATen/CMakeLists.txt

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ if(USE_ROCM)
439439
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
440440
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
441441
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
442+
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
442443
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
443444
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
444445
_pytorch_rocm_generate_ck_conf()
@@ -703,21 +704,17 @@ if(USE_MPS)
703704
if(CAN_COMPILE_METAL)
704705
foreach(SHADER ${native_mps_metal})
705706
cmake_path(GET SHADER STEM TGT_STEM)
706-
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
707-
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
707+
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
708708
list(APPEND AIR_BASIC ${TGT_BASIC})
709-
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
710-
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
711-
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
709+
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
712710
endforeach()
713711
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
714-
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
715712
add_custom_command(
716713
COMMAND echo "// $$(date)" > metallib_dummy.cpp
717-
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
714+
DEPENDS kernels_basic.metallib
718715
OUTPUT metallib_dummy.cpp
719716
COMMENT "Updating metallibs timestamp")
720-
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
717+
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
721718
else()
722719
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
723720
foreach(SHADER ${native_mps_metal})

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1634,6 +1634,9 @@ bool use_fast_accum) {
16341634
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
16351635
const bool a_is_2d = mat_a.dim() == 2;
16361636
const bool b_is_2d = mat_b.dim() == 2;
1637+
if (!a_is_2d || !b_is_2d) {
1638+
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
1639+
}
16371640
TORCH_CHECK(
16381641
mat_a.size(-1) % 16 == 0,
16391642
"Expected trailing dimension of mat_a to be divisible by 16 ",
@@ -1716,6 +1719,9 @@ std::optional<c10::ScalarType> out_dtype) {
17161719
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
17171720
const bool a_is_2d = mat_a.dim() == 2;
17181721
const bool b_is_2d = mat_b.dim() == 2;
1722+
if (!a_is_2d || !b_is_2d) {
1723+
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
1724+
}
17191725

17201726
// check that the strides are valid, the fn will throw an error if not
17211727
check_valid_strides_and_return_transposed(mat_a);

0 commit comments

Comments
 (0)