
Commit 99fc79b

Update
[ghstack-poisoned]
2 parents: 4c7cd95 + 2a20732 (commit 99fc79b)

File tree

848 files changed (+19579, -5094 lines)

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-v2.27.3-1
+v2.27.5-1

.ci/docker/ci_commit_pins/triton.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-c8757738a7418249896224430ce84888e8ecdd79
+ae848267bebc65c6181e8cc5e64a6357d2679260

.ci/docker/common/install_cuda.sh

Lines changed: 48 additions & 3 deletions

@@ -10,6 +10,8 @@ else
     arch_path='sbsa'
 fi
 
+NVSHMEM_VERSION=3.3.9
+
 function install_cuda {
     version=$1
     runfile=$2
@@ -40,13 +42,52 @@ function install_cudnn {
     rm -rf tmp_cudnn
 }
 
+function install_nvshmem {
+    cuda_major_version=$1   # e.g. "12"
+    nvshmem_version=$2      # e.g. "3.3.9"
+
+    case "${arch_path}" in
+        sbsa)
+            dl_arch="aarch64"
+            ;;
+        x86_64)
+            dl_arch="x64"
+            ;;
+        *)
+            dl_arch="${arch}"
+            ;;
+    esac
+
+    tmpdir="tmp_nvshmem"
+    mkdir -p "${tmpdir}" && cd "${tmpdir}"
+
+    # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
+    filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
+    url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
+
+    # download, unpack, install
+    wget -q "${url}"
+    tar xf "${filename}.tar.gz"
+    cp -a "libnvshmem/include/"* /usr/local/include/
+    cp -a "libnvshmem/lib/"* /usr/local/lib/
+
+    # cleanup
+    cd ..
+    rm -rf "${tmpdir}"
+
+    echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed."
+}
+
+
 function install_126 {
     CUDNN_VERSION=9.10.2.21
-    echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
+    echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
     install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux
 
     install_cudnn 12 $CUDNN_VERSION
 
+    install_nvshmem 12 $NVSHMEM_VERSION
+
     CUDA_VERSION=12.6 bash install_nccl.sh
 
     CUDA_VERSION=12.6 bash install_cusparselt.sh
@@ -56,13 +97,15 @@ function install_126 {
 
 function install_129 {
     CUDNN_VERSION=9.10.2.21
-    echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
+    echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
     # install CUDA 12.9.1 in the same container
     install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux
 
     # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
     install_cudnn 12 $CUDNN_VERSION
 
+    install_nvshmem 12 $NVSHMEM_VERSION
+
     CUDA_VERSION=12.9 bash install_nccl.sh
 
     CUDA_VERSION=12.9 bash install_cusparselt.sh
@@ -106,13 +149,15 @@ function prune_126 {
 
 function install_128 {
     CUDNN_VERSION=9.8.0.87
-    echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
+    echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
     # install CUDA 12.8.1 in the same container
     install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
 
     # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
     install_cudnn 12 $CUDNN_VERSION
 
+    install_nvshmem 12 $NVSHMEM_VERSION
+
     CUDA_VERSION=12.8 bash install_nccl.sh
 
     CUDA_VERSION=12.8 bash install_cusparselt.sh
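For reference, a minimal sketch (not part of the commit) of the artifact URL the new install_nvshmem helper resolves on an x86_64 image with the values pinned above (CUDA major version 12, nvSHMEM 3.3.9). The variable names mirror the function; the echo only prints the URL instead of downloading it.

# Illustration only: expand the download URL install_nvshmem would fetch.
arch_path="x86_64"            # assumed host architecture for this example
dl_arch="x64"                 # mapping taken from the case statement above
cuda_major_version="12"
nvshmem_version="3.3.9"
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
echo "https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# -> .../nvshmem/3.3.9/builds/cuda12/txz/agnostic/x64/libnvshmem_cuda12-linux-x86_64-3.3.9.tar.gz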

.ci/docker/common/install_xpu.sh

Lines changed: 4 additions & 8 deletions

@@ -56,14 +56,10 @@ function install_ubuntu() {
 
 function install_rhel() {
     . /etc/os-release
-    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
-            echo "RHEL version ${VERSION_ID} not supported"
-            exit
-        fi
-    elif [[ "${ID}" == "almalinux" ]]; then
-        # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
-        VERSION_ID="8.8"
+
+    if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        echo "RHEL version ${VERSION_ID} not supported"
+        exit
     fi
 
     dnf install -y 'dnf-command(config-manager)'

.ci/docker/requirements-ci.txt

Lines changed: 1 addition & 1 deletion

@@ -383,6 +383,6 @@ cmake==4.0.0
 tlparse==0.3.30
 #Description: required for log parsing
 
-cuda-bindings>=12.0,<13.0
+cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
 #Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
 #test that import: test_cuda.py
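The added environment marker is standard PEP 508 syntax: pip evaluates platform_machine against the running interpreter and simply skips the requirement on s390x hosts. A minimal sketch of installing the same pinned requirement by hand (assuming a recent pip; the pin itself comes from the file above):

# Installs cuda-bindings on x86_64/aarch64 hosts; on an s390x machine pip
# skips it, reporting (roughly) that the markers don't match the environment.
pip install 'cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"'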

.ci/docker/triton_version.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-3.3.1
+3.4.0

.ci/manywheel/build_cuda.sh

Lines changed: 9 additions & 7 deletions

@@ -51,20 +51,22 @@ else
 fi
 
 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
+EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
 
-TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
-    12.8|12.9)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+    #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
+    12.8)
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0"
+        ;;
+    12.9)
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
         # WAR to resolve the ld error in libtorch build with CUDA 12.9
-        if [[ "$DESIRED_CUDA" == "cu129" && "$PACKAGE_TYPE" == "libtorch" ]]; then
+        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
             TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
         fi
         ;;
     12.6)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
         ;;
     *)
         echo "unknown cuda version $CUDA_VERSION"

.ci/pytorch/build.sh

Lines changed: 2 additions & 4 deletions

@@ -393,10 +393,8 @@ else
   # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
   # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
   # 16 CPUs
-  if [ -z "$MAX_JOBS_OVERRIDE" ]; then
-    MAX_JOBS=$(nproc --ignore=4)
-    export MAX_JOBS
-  fi
+  MAX_JOBS=$(nproc --ignore=4)
+  export MAX_JOBS
 
   # NB: Install outside of source directory (at the same level as the root
   # pytorch folder) so that it doesn't get cleaned away prior to docker push.
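For context, nproc --ignore=N reports the available CPUs minus N, so on the 16-CPU 4xlarge mentioned in the comment the build is now always capped at 12 jobs; the MAX_JOBS_OVERRIDE escape hatch is gone. A quick check, not part of the commit:

# On a 16-CPU runner this prints 12, the value MAX_JOBS is now set to unconditionally.
nproc --ignore=4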

.ci/pytorch/common-build.sh

Lines changed: 7 additions & 0 deletions

@@ -13,6 +13,13 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
 fi
 
 if which sccache > /dev/null; then
+  # Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise
+  # sccache will complain about invalid bucket configuration
+  if [[ -z "${SCCACHE_BUCKET:-}" ]]; then
+    unset SCCACHE_BUCKET
+    unset SCCACHE_REGION
+  fi
+
   # Save sccache logs to file
   sccache --stop-server > /dev/null 2>&1 || true
   rm -f ~/sccache_error.log || true
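The guard matters because CI can export these variables as empty strings, and per the comment in the diff an exported-but-empty SCCACHE_BUCKET still reads as a (broken) bucket configuration to sccache. A small sketch of the distinction, with hypothetical values not taken from the commit:

# An exported-but-empty variable is still "set" from the tool's point of view.
export SCCACHE_BUCKET=""
[[ -z "${SCCACHE_BUCKET:-}" ]] && echo "empty -> unsetting"   # same test as the new guard
unset SCCACHE_BUCKET SCCACHE_REGION                           # what the snippet above does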

.ci/pytorch/test.sh

Lines changed: 48 additions & 50 deletions

@@ -11,6 +11,8 @@ export TERM=vt100
 
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
+# shellcheck source=./common-build.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
 
 # Do not change workspace permissions for ROCm and s390x CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
@@ -163,8 +165,6 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
   # setting PYTHON_TEST_EXTRA_OPTION
   export PYTHON_TEST_EXTRA_OPTION="--xpu"
-  # Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585
-  sudo rm -rf /opt/cache
 fi
 
 if [[ "$TEST_CONFIG" == *crossref* ]]; then
@@ -333,9 +333,9 @@ test_h100_distributed() {
 test_h100_symm_mem() {
   # symmetric memory test
   time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  time TORCH_SYMMMEM=NCCL python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
   assert_git_not_dirty
 }
 
@@ -368,6 +368,16 @@ test_dynamo_wrapped_shard() {
   assert_git_not_dirty
 }
 
+test_einops() {
+  pip install einops==0.6.1
+  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
+  pip install einops==0.7.0
+  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
+  pip install einops==0.8.1
+  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
+  assert_git_not_dirty
+}
+
 
 test_inductor_distributed() {
   # Smuggle a few multi-gpu tests here so that we don't have to request another large node
@@ -426,14 +436,21 @@ test_inductor_aoti() {
     python3 tools/amd_build/build_amd.py
   fi
   if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-    BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop
+    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
     # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
-    LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
+    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
   else
-    BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
+    BUILD_COMMAND=(python setup.py develop)
+    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
   fi
+
+  # aoti cmake custom command requires `torch` to be installed
+  # initialize the cmake build cache and install torch
+  /usr/bin/env "${BUILD_COMMAND[@]}"
+  # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
+  /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
+
+  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
 }
 
 test_inductor_cpp_wrapper_shard() {
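
The switch to BUILD_COMMAND/TEST_ENVS arrays leans on env(1): leading VAR=value words in the array become environment assignments for the command that follows, which lets the same array be reused for the initial build, the CMAKE_FRESH=1 rebuild, and the test invocation. A minimal sketch of the pattern, using a hypothetical variable not taken from the script:

# env applies leading NAME=value words to the command after them.
CMD=(MY_FLAG=1 env)                           # hypothetical command array
/usr/bin/env "${CMD[@]}" | grep '^MY_FLAG='   # prints MY_FLAG=1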
@@ -446,47 +463,26 @@ test_inductor_cpp_wrapper_shard() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
 
-  if [[ "$1" -eq "2" ]]; then
-    # For now, manually put the opinfo tests in shard 2, and all other tests in
-    # shard 1. Run all CPU tests, as well as specific GPU tests triggering past
-    # bugs, for now.
-    python test/run_test.py \
-      --include inductor/test_torchinductor_opinfo \
-      -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
-      --verbose
-    exit
-  fi
-
   # Run certain inductor unit tests with cpp wrapper. In the end state, we
   # should be able to run all the inductor unit tests with cpp_wrapper.
+  #
+  # TODO: I'm pretty sure that "TestInductorOpInfoCPU" is not a valid filter,
+  # but change that in another PR to more accurately monitor the increased CI
+  # usage.
+  python test/run_test.py \
+    --include inductor/test_torchinductor_opinfo \
+    -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
+    --shard "$1" "$NUM_TEST_SHARDS" \
+    --verbose
   python test/run_test.py \
     --include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \
+    --shard "$1" "$NUM_TEST_SHARDS" \
+    --verbose
+  python test/run_test.py --inductor \
+    --include test_torch \
+    -k 'take' \
+    --shard "$1" "$NUM_TEST_SHARDS" \
     --verbose
-  python test/run_test.py --inductor --include test_torch -k 'take' --verbose
-
-  # Run inductor benchmark tests with cpp wrapper.
-  # Skip benchmark tests if it's in rerun-disabled-mode.
-  if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then
-    echo "skip dynamo benchmark tests for rerun-disabled-test"
-  else
-    echo "run dynamo benchmark tests with cpp wrapper"
-    python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
-      --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
-      --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_timm_training.csv"
-
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_torchbench_inference.csv"
-  fi
 }
 
 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@@ -1698,11 +1694,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
     PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
   fi
 elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
-  install_torchaudio cuda
   install_torchvision
-  checkout_install_torchbench hf_T5 llama moco
   PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
-  test_inductor_aoti
+  if [[ "$SHARD_NUMBER" -eq "1" ]]; then
+    test_inductor_aoti
+  fi
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
   install_torchvision
   test_inductor_shard "${SHARD_NUMBER}"
@@ -1711,6 +1707,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
       test_inductor_distributed
     fi
   fi
+elif [[ "${TEST_CONFIG}" == *einops* ]]; then
+  test_einops
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
   install_torchvision
   test_dynamo_wrapped_shard "${SHARD_NUMBER}"
@@ -1760,7 +1758,7 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
   test_python_smoke
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
-elif [[ "${TEST_CONFIG}" == test_h100_symm_mem ]]; then
+elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
   test_h100_symm_mem
 else
   install_torchvision
