Commit 233e622

Update
[ghstack-poisoned]
2 parents: 4b821de + 5d82068

125 files changed: +5403 additions, -2452 deletions

.ci/pytorch/common_utils.sh

Lines changed: 1 addition & 1 deletion
@@ -245,7 +245,7 @@ function install_torchrec_and_fbgemm() {
   if [ "${found_whl}" == "0" ]; then
     git clone --recursive https://github.com/pytorch/fbgemm
     pushd fbgemm/fbgemm_gpu
-    git checkout "${fbgemm_commit}"
+    git checkout "${fbgemm_commit}" --recurse-submodules
     python setup.py bdist_wheel \
       --build-variant=rocm \
       -DHIP_ROOT_DIR="${ROCM_PATH}" \

.github/scripts/generate_binary_build_matrix.py

Lines changed: 6 additions & 1 deletion
@@ -193,7 +193,7 @@ def arch_type(arch_version: str) -> str:
     "cpu": "libtorch-cxx11-builder:cpu",
 }
 
-FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
+FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
 
 
 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:

@@ -315,6 +315,11 @@ def generate_wheels_matrix(
             # TODO: Enable python 3.13t on cpu-s390x
             if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
                 continue
+            # TODO: Enable python 3.14 on non linux OSes
+            if os != "linux" and (
+                python_version == "3.14" or python_version == "3.14t"
+            ):
+                continue
 
             if use_split_build and (
                 arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"

.github/workflows/generated-linux-binary-manywheel-nightly.yml

Lines changed: 1226 additions & 0 deletions
Generated file; diff not rendered by default.

.github/workflows/inductor-periodic.yml

Lines changed: 15 additions & 15 deletions
@@ -81,21 +81,21 @@ jobs:
     sync-tag: rocm-build
     test-matrix: |
       { include: [
-        { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
-        { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
       ]}
     secrets: inherit

README.md

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ conda install pkg-config libuv
 pip install mkl-static mkl-include
 # Add these packages if torch.distributed is needed.
 # Distributed package support on Windows is a prototype feature and is subject to changes.
-conda install -c conda-forge libuv=1.39
+conda install -c conda-forge libuv
 ```
 
 #### Install PyTorch

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
   }
 
   bool pinned_use_background_threads() override {
-    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
        pinned_use_background_threads();
  }
 
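
Note: both the old CUDA-specific config class and the new backend-generic c10::CachingAllocator::AcceleratorAllocatorConfig expose the same pinned_use_background_threads() query, so the host-allocator behavior is unchanged. A minimal sketch of reading the flag from C++ follows; the header path and the env-var spelling in the comment are assumptions on my part, not something this diff shows.

#include <c10/core/AllocatorConfig.h>  // assumed location of AcceleratorAllocatorConfig

// Returns whether pinned (page-locked) host blocks should be freed by
// background threads. The flag is usually driven by an allocator settings
// string, e.g. PYTORCH_CUDA_ALLOC_CONF="pinned_use_background_threads:True"
// (assumption -- consult the allocator docs for the exact spelling).
bool pinned_frees_on_background_threads() {
  return c10::CachingAllocator::AcceleratorAllocatorConfig::
      pinned_use_background_threads();
}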

aten/src/ATen/native/ComparisonUtils.cpp

Lines changed: 23 additions & 0 deletions
@@ -24,6 +24,29 @@ static void _assert_match(const O& original, const C& compared, const std::strin
   }
 }
 
+template<>
+void _assert_match<c10::Device, std::optional<c10::Device>>(
+    const c10::Device& original,
+    const std::optional<c10::Device>& compared,
+    const std::string& name) {
+  if (compared) {
+    const c10::Device& expected = compared.value();
+    if (original.type() != expected.type()) {
+      std::stringstream msg;
+      msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
+      throw std::runtime_error(msg.str());
+    }
+
+    // If the expected device doesn't have an index (e.g., just "cuda"),
+    // or if both devices have the same index, consider them equal
+    if (expected.has_index() && original.has_index() && expected.index() != original.index()) {
+      std::stringstream msg;
+      msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
+      throw std::runtime_error(msg.str());
+    }
+  }
+}
+
 void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional<c10::ScalarType> dtype, std::optional<c10::Device> device, std::optional<c10::Layout> layout) {
   _assert_match(tensor.sym_sizes(), sizes, "sizes");
   _assert_match(tensor.sym_strides(), strides, "strides");
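
The effect of the new specialization is that an expected device without an index (plain "cuda") matches a tensor on any CUDA device, while "cuda:1" only matches index 1. A small standalone illustration of the same rule, written against c10::Device for this page; the devices_match helper is hypothetical and not part of the change.

#include <c10/core/Device.h>

// Hypothetical helper mirroring the rule in the new
// _assert_match<c10::Device, std::optional<c10::Device>> specialization:
// device types must agree; indices only matter when both sides carry one.
static bool devices_match(const c10::Device& actual, const c10::Device& expected) {
  if (actual.type() != expected.type()) {
    return false;  // e.g. CPU vs CUDA never match
  }
  // An index-less "cuda" matches "cuda:0", "cuda:1", ...; a mismatch is only
  // reported when both devices specify an index and the indices differ.
  return !(expected.has_index() && actual.has_index() &&
           expected.index() != actual.index());
}

// Example: devices_match(c10::Device("cuda:0"), c10::Device("cuda"))   -> true
//          devices_match(c10::Device("cuda:0"), c10::Device("cuda:1")) -> false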

aten/src/ATen/native/cudnn/BatchNorm.cpp

Lines changed: 66 additions & 16 deletions
@@ -28,6 +28,22 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
   TORCH_CHECK(false, "cudnn_batch_norm: ATen not compiled with cuDNN support");
 }
 
+std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
+    const Tensor& input,
+    const Tensor& weight,
+    const std::optional<Tensor>& bias,
+    const std::optional<Tensor>& running_mean,
+    const std::optional<Tensor>& running_var,
+    bool training,
+    double exponential_average_factor,
+    double epsilon,
+    Tensor& out,
+    Tensor& save_mean,
+    Tensor& save_var,
+    Tensor& reserve) {
+  AT_ERROR("cudnn_batch_norm_out: ATen not compiled with cuDNN support");
+}
+
 std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
     const Tensor& input,
     const Tensor& grad_output,

@@ -120,15 +136,24 @@ size_t _get_cudnn_batch_norm_reserve_space_size(
   return reserve_size;
 }
 
-std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
+// Param `reserve` is a placeholder, just passing an empty tensor.
+// usage:
+//   auto reserve = torch::empty({0}, torch::device(torch::kCUDA));
+//   at::native::cudnn_batch_norm_out(..., epsilon, output, save_mean, save_var,
+//   reserve);
+std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
     const Tensor& input_t,
     const Tensor& weight_t,
     const std::optional<Tensor>& bias_t_opt,
     const std::optional<Tensor>& running_mean_t_opt,
    const std::optional<Tensor>& running_var_t_opt,
     bool training,
     double exponential_average_factor,
-    double epsilon) {
+    double epsilon,
+    Tensor& output_t,
+    Tensor& save_mean,
+    Tensor& save_var,
+    Tensor& reserve) {
   // See [Note: hacky wrapper removal for optional tensor]
   c10::MaybeOwned<Tensor> bias_t_maybe_owned =
       at::borrow_from_optional_tensor(bias_t_opt);

@@ -168,9 +193,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
   cudnnBatchNormMode_t mode = getCudnnBatchNormMode(
       training, input->suggest_memory_format(), input->dim());
 
-  auto output_t =
-      at::empty_like(*input, input->options(), input->suggest_memory_format());
-
   TensorArg output{output_t, "output", 0};
 
   auto handle = getCudnnHandle();

@@ -182,15 +204,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
 
   Constant one(dataType, 1);
   Constant zero(dataType, 0);
-  Tensor save_mean, save_var;
-
-  Tensor reserve;
 
   if (training) {
-    int64_t num_features = input_t.size(1);
-    save_mean = at::empty({num_features}, weight_t.options());
-    save_var = at::empty({num_features}, weight_t.options());
-
     auto op = CUDNN_BATCHNORM_OPS_BN;
     size_t workspace_size;
     AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(

@@ -238,9 +253,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
         reserve_size));
   } else {
     reserve = at::empty({0}, input->options().dtype(kByte));
-    // This keeps a consistent output with native_batch_norm
-    save_mean = at::empty({0}, weight_t.options());
-    save_var = at::empty({0}, weight_t.options());
     AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
         handle,
         mode,

@@ -261,10 +273,48 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
   // save_mean and save_var can be undefined
   // If this causes problems, we can initialize them to empty tensors
   // of the correct type
-  return std::tuple<Tensor, Tensor, Tensor, Tensor>{
+  return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>{
       output_t, save_mean, save_var, reserve};
 }
 
+std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const std::optional<Tensor>& bias_t_opt,
+    const std::optional<Tensor>& running_mean_t_opt,
+    const std::optional<Tensor>& running_var_t_opt,
+    bool training,
+    double exponential_average_factor,
+    double epsilon) {
+  auto output_t = at::empty_like(
+      input_t, input_t.options(), input_t.suggest_memory_format());
+  Tensor save_mean, save_var, reserve;
+
+  if (training) {
+    int64_t num_features = input_t.size(1);
+    save_mean = at::empty({num_features}, weight_t.options());
+    save_var = at::empty({num_features}, weight_t.options());
+  } else {
+    // This keeps a consistent output with native_batch_norm
+    save_mean = at::empty({0}, weight_t.options());
+    save_var = at::empty({0}, weight_t.options());
+  }
+
+  return cudnn_batch_norm_out(
+      input_t,
+      weight_t,
+      bias_t_opt,
+      running_mean_t_opt,
+      running_var_t_opt,
+      training,
+      exponential_average_factor,
+      epsilon,
+      output_t,
+      save_mean,
+      save_var,
+      reserve);
+}
+
 // NB: CuDNN only implements the backward algorithm for batchnorm
 // in training mode (evaluation mode batchnorm has a different algorithm),
 // which is why this doesn't accept a 'training' parameter.
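
The diff itself carries a short usage note for the new out-variant; below is a slightly fuller sketch of how a caller might pre-allocate the four output buffers and invoke it, assuming a CUDA build with cuDNN. The shapes and the 0.1/1e-5 constants are illustrative only, and the header declaring at::native::cudnn_batch_norm_out is not shown in this commit, so treat the call site as an assumption.

#include <ATen/ATen.h>

void run_cudnn_batch_norm_out_example() {
  at::Tensor input = at::randn({8, 32, 16, 16},
                               at::device(at::kCUDA).dtype(at::kFloat));
  at::Tensor weight = at::ones({32}, input.options());
  at::Tensor bias = at::zeros({32}, input.options());
  at::Tensor running_mean = at::zeros({32}, input.options());
  at::Tensor running_var = at::ones({32}, input.options());

  // Pre-allocate every output the out-variant writes into.
  at::Tensor output = at::empty_like(input, input.options(),
                                     input.suggest_memory_format());
  at::Tensor save_mean = at::empty({32}, weight.options());
  at::Tensor save_var = at::empty({32}, weight.options());
  // `reserve` is a placeholder per the comment in the diff; an empty tensor works.
  at::Tensor reserve = at::empty({0}, input.options().dtype(at::kByte));

  at::native::cudnn_batch_norm_out(
      input, weight, bias, running_mean, running_var,
      /*training=*/true, /*exponential_average_factor=*/0.1, /*epsilon=*/1e-5,
      output, save_mean, save_var, reserve);
}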

aten/src/ATen/native/mps/kernels/Pooling.h

Lines changed: 11 additions & 0 deletions
@@ -48,3 +48,14 @@ struct PoolingBackwardParams {
   ::c10::metal::array<idx_type_t, N> grad_output_strides;
   ::c10::metal::array<idx_type_t, N> indices_strides;
 };
+
+template <unsigned N = 5, typename idx_type_t = int32_t>
+struct MaxUnpoolingParams {
+  int32_t dims;
+  int32_t pooling_dims;
+  ::c10::metal::array<idx_type_t, N> input_sizes;
+  ::c10::metal::array<idx_type_t, N> input_strides;
+  ::c10::metal::array<idx_type_t, N> output_sizes;
+  ::c10::metal::array<idx_type_t, N> output_strides;
+  ::c10::metal::array<idx_type_t, N> indices_strides;
+};
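
MaxUnpoolingParams mirrors the existing pooling parameter structs: fixed-size arrays of sizes and strides that the host fills from tensor metadata and hands to the Metal kernel as one constant buffer. A rough host-side sketch of how such a struct could be populated, using std::array as a stand-in for ::c10::metal::array and a hypothetical fill_params helper, neither of which is part of this commit.

#include <array>
#include <cstdint>
#include <vector>

// Illustrative analog of MaxUnpoolingParams; the real struct is shared between
// host code and the Metal kernel.
template <unsigned N = 5, typename idx_type_t = int32_t>
struct MaxUnpoolingParamsHost {
  int32_t dims;
  int32_t pooling_dims;
  std::array<idx_type_t, N> input_sizes;
  std::array<idx_type_t, N> input_strides;
  std::array<idx_type_t, N> output_sizes;
  std::array<idx_type_t, N> output_strides;
  std::array<idx_type_t, N> indices_strides;
};

// Hypothetical helper: copy per-dimension sizes/strides (e.g. from
// Tensor::sizes()/strides()) into the fixed-size arrays the kernel expects.
MaxUnpoolingParamsHost<> fill_params(
    const std::vector<int64_t>& in_sizes,
    const std::vector<int64_t>& in_strides,
    const std::vector<int64_t>& out_sizes,
    const std::vector<int64_t>& out_strides,
    const std::vector<int64_t>& idx_strides,
    int32_t pooling_dims) {
  MaxUnpoolingParamsHost<> p{};
  p.dims = static_cast<int32_t>(in_sizes.size());
  p.pooling_dims = pooling_dims;
  for (size_t i = 0; i < in_sizes.size() && i < 5; ++i) {
    p.input_sizes[i] = static_cast<int32_t>(in_sizes[i]);
    p.input_strides[i] = static_cast<int32_t>(in_strides[i]);
    p.output_sizes[i] = static_cast<int32_t>(out_sizes[i]);
    p.output_strides[i] = static_cast<int32_t>(out_strides[i]);
    p.indices_strides[i] = static_cast<int32_t>(idx_strides[i]);
  }
  return p;
}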
