Commit 1943aac

Update on "[PP] Support OVERLAP_F_B computation type"
Some changes to validation code and the visualizer to support a new computation type that will be used in DualPipeV (see #159591). The IR looks like:

```
[0F0, 0F1, 0F2, 0F3, 0F4, 0F5, 0F6, 7F0, 7I0, 7W0, 7F1, 7I1, 7W1, 7F2, 7I2, 7W2, 7F3, (0F7;7B3)OVERLAP_F_B, (7F4;0B0)OVERLAP_F_B, (0F8;7B4)OVERLAP_F_B, (7F5;0B1)OVERLAP_F_B, (0F9;7B5)OVERLAP_F_B, (7F6;0B2)OVERLAP_F_B, 7B6, (7F7;0B3)OVERLAP_F_B, 7B7, (7F8;0B4)OVERLAP_F_B, 7B8, (7F9;0B5)OVERLAP_F_B, 7B9, 0I6, 0W6, 0I7, 0W7, 0I8, 0W8, 0I9, 0W9]
[1F0, 1F1, 1F2, 1F3, 1F4, 6F0, 1F5, 6F1, 6I0, 6W0, 6F2, 6I1, 6W1, 6F3, (1F6;6B2)OVERLAP_F_B, (6F4;1B0)OVERLAP_F_B, (1F7;6B3)OVERLAP_F_B, (6F5;1B1)OVERLAP_F_B, (1F8;6B4)OVERLAP_F_B, (6F6;1B2)OVERLAP_F_B, (1F9;6B5)OVERLAP_F_B, (6F7;1B3)OVERLAP_F_B, 6B6, (6F8;1B4)OVERLAP_F_B, 6B7, (6F9;1B5)OVERLAP_F_B, 6B8, 1B6, 6I9, 1I7, 6W9, 1I8, 1W7, 1I9, 1W8, 1W9]
[2F0, 2F1, 2F2, 5F0, 2F3, 5F1, 2F4, 5F2, 5I0, 5W0, 5F3, (2F5;5B1)OVERLAP_F_B, (5F4;2B0)OVERLAP_F_B, (2F6;5B2)OVERLAP_F_B, (5F5;2B1)OVERLAP_F_B, (2F7;5B3)OVERLAP_F_B, (5F6;2B2)OVERLAP_F_B, (2F8;5B4)OVERLAP_F_B, (5F7;2B3)OVERLAP_F_B, (2F9;5B5)OVERLAP_F_B, (5F8;2B4)OVERLAP_F_B, 5B6, (5F9;2B5)OVERLAP_F_B, 5B7, 2B6, 5B8, 2I7, 5I9, 2I8, 2W7, 2I9, 5W9, 2W8, 2W9]
[3F0, 4F0, 3F1, 4F1, 3F2, 4F2, 3F3, 4F3, 3F4, 4B0, (4F4;3B0)OVERLAP_F_B, (3F5;4B1)OVERLAP_F_B, (4F5;3B1)OVERLAP_F_B, (3F6;4B2)OVERLAP_F_B, (4F6;3B2)OVERLAP_F_B, (3F7;4B3)OVERLAP_F_B, (4F7;3B3)OVERLAP_F_B, (3F8;4B4)OVERLAP_F_B, (4F8;3B4)OVERLAP_F_B, (3F9;4B5)OVERLAP_F_B, (4F9;3B5)OVERLAP_F_B, 4B6, 3B6, 4B7, 3B7, 4I8, 3I8, 4I9, 3I9, 4W8, 3W8, 4W9, 3W9]
```

In this PR, schedule execution will just treat OVERLAP_F_B as two separate operations, F and B (so there is no actual overlap). The next step is to allow users to plug in a custom function that defines what this operation does.

https://github.com/pytorch/pytorch/blob/814629043a0c31441bc3749204c97f1e24fa3462/torch/distributed/pipelining/schedules.py#L1205-L1216

cc awgu wanchaol fegin fduwjj wz337 wconstab d4l3k pragupta

[ghstack-poisoned]
2 parents 8146290 + 012117c · commit 1943aac
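To make the "two separate operations" behavior described in the commit message concrete, here is a minimal, hypothetical sketch of how an `(xFi;yBj)OVERLAP_F_B` entry can be expanded during execution. The `Action` class and `expand` helper below are illustrative stand-ins, not the actual types in `torch/distributed/pipelining/schedules.py`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class Action:
    # Illustrative stand-in for the schedule IR's actions: "0F7" means
    # stage 0, Forward, microbatch 7; "7B3" means stage 7, Backward, mb 3.
    stage: int
    computation: str  # "F", "B", or "OVERLAP_F_B"
    microbatch: int
    sub_actions: Optional[tuple["Action", ...]] = None  # set for OVERLAP_F_B

def expand(action: Action) -> list[Action]:
    # Behavior in this PR: OVERLAP_F_B is just its F and B parts run
    # back-to-back; a follow-up will let users plug in a fused/custom
    # implementation of the combined step.
    if action.computation == "OVERLAP_F_B":
        return list(action.sub_actions)
    return [action]

# (0F7;7B3)OVERLAP_F_B from rank 0's schedule above:
overlap = Action(0, "OVERLAP_F_B", 7,
                 sub_actions=(Action(0, "F", 7), Action(7, "B", 3)))
assert [a.computation for a in expand(overlap)] == ["F", "B"]
```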

File tree

136 files changed: +6696 −4775 lines changed


.ci/docker/requirements-docs.txt

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@@ -50,7 +50,7 @@ IPython==8.12.0
 #Pinned versions: 8.12.0
 
 myst-nb==0.17.2
-#Description: This is used to generate PyTorch functorch and torch.compile docs
+#Description: This is used to generate PyTorch functorch and torch.compile docs.
 #Pinned versions: 0.17.2
 
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
```

.github/merge_rules.yaml

Lines changed: 4 additions & 0 deletions

```diff
@@ -488,6 +488,10 @@
     - torch/_dynamo/**
     - torch/csrc/dynamo/**
     - test/dynamo/**
+    - test/dynamo_expected_failures/**
+    - test/dynamo_skips/**
+    - test/inductor_expected_failures/**
+    - test/inductor_skips/**
   approved_by:
     - guilhermeleobas
   mandatory_checks_name:
```

.github/workflows/docker-builds.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -76,7 +76,8 @@ jobs:
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-          pytorch-linux-jammy-py3-clang12-executorch,
+          # Executorch pin needs update
+          # pytorch-linux-jammy-py3-clang12-executorch,
           pytorch-linux-jammy-py3.12-triton-cpu
         ]
         include:
```

.github/workflows/nightly.yml

Lines changed: 5 additions & 4 deletions

```diff
@@ -75,10 +75,11 @@ jobs:
             repo-owner: pytorch
             branch: main
             pin-folder: .github/ci_commit_pins
-          - repo-name: executorch
-            repo-owner: pytorch
-            branch: main
-            pin-folder: .ci/docker/ci_commit_pins
+          # executorch jobs are disabled since it needs some manual work for the hash update
+          # - repo-name: executorch
+          #   repo-owner: pytorch
+          #   branch: main
+          #   pin-folder: .ci/docker/ci_commit_pins
           - repo-name: triton
             repo-owner: triton-lang
             branch: main
```

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -434,6 +434,7 @@ jobs:
     secrets: inherit
 
   linux-jammy-py3-clang12-executorch-build:
+    if: false # Docker build needs pin update
     name: linux-jammy-py3-clang12-executorch
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
```

CMakeLists.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -564,7 +564,7 @@ if(MSVC)
   set(CMAKE_NINJA_CMCLDEPS_RC OFF)
   if(MSVC_Z7_OVERRIDE)
     # CMake set debug flags to use /Z7
-    set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
+    set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
   endif()
   foreach(
     flag_var
```

CODEOWNERS

Lines changed: 5 additions & 1 deletion

```diff
@@ -14,7 +14,6 @@
 /torch/csrc/autograd/ @albanD @soulitzer
 /torch/autograd/ @albanD @soulitzer
 /tools/autograd/ @albanD @soulitzer
-/torch/header_only_apis.txt @janeyx99
 /torch/nn/ @albanD @jbschlosser @mikaylagawarecki
 /torch/optim/ @albanD @janeyx99
 /test/test_public_bindings.py @albanD
@@ -196,3 +195,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed
 /torch/utils/_cxx_pytree.py @XuehaiPan
 /torch/utils/pytree/ @XuehaiPan
 /torch/_dynamo/polyfills/pytree.py @XuehaiPan
+
+# Relating to libtorch ABI
+/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
+/torch/headeronly/ @janeyx99
+/torch/header_only_apis.txt @janeyx99
```

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
   }
 
   bool pinned_use_background_threads() override {
-    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
         pinned_use_background_threads();
   }
 
```
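Side note on the change above: `pinned_use_background_threads` now comes from the CUDA-specific allocator config rather than the generic accelerator config. A minimal usage sketch, assuming (as an assumption, per the PyTorch CUDA allocator docs) that the `PYTORCH_CUDA_ALLOC_CONF` settings string accepts this key:

```python
import os

# Assumed key name; must be set before the CUDA caching allocator initializes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pinned_use_background_threads:True"

import torch

# Pinned host allocation; with the option enabled, processing of pinned
# blocks may be serviced by background threads instead of the caller.
x = torch.empty(1 << 20, pin_memory=True)
```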

aten/src/ATen/native/cpu/int8mm_kernel.cpp

Lines changed: 16 additions & 16 deletions

```diff
@@ -367,27 +367,27 @@ void int8pack_mm_kernel_(
   auto* C_data = C.data_ptr<T>();
   const auto* S_data = scales.const_data_ptr<T>();
 
-  int M = A.size(0);
-  int N = B.size(0);
-  int K = A.size(1);
-  int lda = A.stride(0);
-  constexpr int BLOCK_M = 4;
-  constexpr int BLOCK_N = 4;
-
-  const int MB = (M + BLOCK_M - 1) / BLOCK_M;
-  const int NB = (N + BLOCK_N - 1) / BLOCK_N;
-
-  at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
-    int mb{0}, nb{0};
+  int64_t M = A.size(0);
+  int64_t N = B.size(0);
+  int64_t K = A.size(1);
+  int64_t lda = A.stride(0);
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 4;
+
+  const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;
+  const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;
+
+  at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
+    int64_t mb{0}, nb{0};
     data_index_init(begin, mb, MB, nb, NB);
 
     for (const auto i : c10::irange(begin, end)) {
       (void)i;
 
-      int mb_start = mb * BLOCK_M;
-      int mb_size = std::min(BLOCK_M, M - mb_start);
-      int nb_start = nb * BLOCK_N;
-      int nb_size = std::min(BLOCK_N, N - nb_start);
+      int64_t mb_start = mb * BLOCK_M;
+      int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
 
       const auto* A_ptr = A_data + mb_start * lda;
       const auto* B_ptr = B_data + nb_start * K;
```
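The int → int64_t widening above guards against 32-bit overflow in the index arithmetic, e.g. `mb_start * lda` when computing `A_ptr`. A quick arithmetic check with hypothetical shapes (chosen only to show the overflow, not taken from the kernel):

```python
# Hypothetical large matmul: M x K matrix A, contiguous rows.
M, K = 100_000, 32_768
lda = K                    # row stride of a contiguous M x K matrix
offset = (M - 1) * lda     # element offset of A's last row
print(offset)              # 3276767232
print(offset > 2**31 - 1)  # True -> would wrap around in 32-bit int math
```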

aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu

Lines changed: 2 additions & 2 deletions

```diff
@@ -526,7 +526,7 @@ namespace {
 
 
   // we are dealing with packed tensor here. max index is the same as numel.
-  // TODO: to really support input tensor large enought to go beyond int32,
+  // TODO: to really support input tensor large enough to go beyond int32,
   // we will need to restrict out shared memory usage and adjust the launch
   // config;
   AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max());
@@ -681,7 +681,7 @@ namespace {
   const dim3 grid(grid_x, grid_y, grid_z);
 
   // we are dealing with packed tensor here. max index is the same as numel.
-  // TODO: to really support input tensor large enought to go beyond int32,
+  // TODO: to really support input tensor large enough to go beyond int32,
   // we will need to restrict out shared memory usage and adjust the launch
   // config;
   AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max());
```
