Commit 1943aac

Update on "[PP] Support OVERLAP_F_B computation type"
Some changes to validation code and the visualizer to support a new computation type that will be used in DualPipeV (see #159591). The IR looks like:

```
[0F0, 0F1, 0F2, 0F3, 0F4, 0F5, 0F6, 7F0, 7I0, 7W0, 7F1, 7I1, 7W1, 7F2, 7I2, 7W2, 7F3, (0F7;7B3)OVERLAP_F_B, (7F4;0B0)OVERLAP_F_B, (0F8;7B4)OVERLAP_F_B, (7F5;0B1)OVERLAP_F_B, (0F9;7B5)OVERLAP_F_B, (7F6;0B2)OVERLAP_F_B, 7B6, (7F7;0B3)OVERLAP_F_B, 7B7, (7F8;0B4)OVERLAP_F_B, 7B8, (7F9;0B5)OVERLAP_F_B, 7B9, 0I6, 0W6, 0I7, 0W7, 0I8, 0W8, 0I9, 0W9]
[1F0, 1F1, 1F2, 1F3, 1F4, 6F0, 1F5, 6F1, 6I0, 6W0, 6F2, 6I1, 6W1, 6F3, (1F6;6B2)OVERLAP_F_B, (6F4;1B0)OVERLAP_F_B, (1F7;6B3)OVERLAP_F_B, (6F5;1B1)OVERLAP_F_B, (1F8;6B4)OVERLAP_F_B, (6F6;1B2)OVERLAP_F_B, (1F9;6B5)OVERLAP_F_B, (6F7;1B3)OVERLAP_F_B, 6B6, (6F8;1B4)OVERLAP_F_B, 6B7, (6F9;1B5)OVERLAP_F_B, 6B8, 1B6, 6I9, 1I7, 6W9, 1I8, 1W7, 1I9, 1W8, 1W9]
[2F0, 2F1, 2F2, 5F0, 2F3, 5F1, 2F4, 5F2, 5I0, 5W0, 5F3, (2F5;5B1)OVERLAP_F_B, (5F4;2B0)OVERLAP_F_B, (2F6;5B2)OVERLAP_F_B, (5F5;2B1)OVERLAP_F_B, (2F7;5B3)OVERLAP_F_B, (5F6;2B2)OVERLAP_F_B, (2F8;5B4)OVERLAP_F_B, (5F7;2B3)OVERLAP_F_B, (2F9;5B5)OVERLAP_F_B, (5F8;2B4)OVERLAP_F_B, 5B6, (5F9;2B5)OVERLAP_F_B, 5B7, 2B6, 5B8, 2I7, 5I9, 2I8, 2W7, 2I9, 5W9, 2W8, 2W9]
[3F0, 4F0, 3F1, 4F1, 3F2, 4F2, 3F3, 4F3, 3F4, 4B0, (4F4;3B0)OVERLAP_F_B, (3F5;4B1)OVERLAP_F_B, (4F5;3B1)OVERLAP_F_B, (3F6;4B2)OVERLAP_F_B, (4F6;3B2)OVERLAP_F_B, (3F7;4B3)OVERLAP_F_B, (4F7;3B3)OVERLAP_F_B, (3F8;4B4)OVERLAP_F_B, (4F8;3B4)OVERLAP_F_B, (3F9;4B5)OVERLAP_F_B, (4F9;3B5)OVERLAP_F_B, 4B6, 3B6, 4B7, 3B7, 4I8, 3I8, 4I9, 3I9, 4W8, 3W8, 4W9, 3W9]
```

In this PR, schedule execution will just treat OVERLAP_F_B as two separate operations, F and B (so there is no actual overlap). The next step is to allow users to plug in a custom function that defines what this operation does.

https://github.com/pytorch/pytorch/blob/814629043a0c31441bc3749204c97f1e24fa3462/torch/distributed/pipelining/schedules.py#L1205-L1216

cc awgu wanchaol fegin fduwjj wz337 wconstab d4l3k pragupta

[ghstack-poisoned]
2 parents 8146290 + 012117c · commit 1943aac
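To make the "two separate operations" behavior described in the commit message concrete, here is a minimal, hypothetical sketch of how an `(xFi;yBj)OVERLAP_F_B` entry can be expanded during execution. The `Action` class and `expand` helper below are illustrative stand-ins, not the actual types in `torch/distributed/pipelining/schedules.py`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class Action:
    # Illustrative stand-in for the schedule IR's actions: "0F7" means
    # stage 0, Forward, microbatch 7; "7B3" means stage 7, Backward, mb 3.
    stage: int
    computation: str  # "F", "B", or "OVERLAP_F_B"
    microbatch: int
    sub_actions: Optional[tuple["Action", ...]] = None  # set for OVERLAP_F_B

def expand(action: Action) -> list[Action]:
    # Behavior in this PR: OVERLAP_F_B is just its F and B parts run
    # back-to-back; a follow-up will let users plug in a fused/custom
    # implementation of the combined step.
    if action.computation == "OVERLAP_F_B":
        return list(action.sub_actions)
    return [action]

# (0F7;7B3)OVERLAP_F_B from rank 0's schedule above:
overlap = Action(0, "OVERLAP_F_B", 7,
                 sub_actions=(Action(0, "F", 7), Action(7, "B", 3)))
assert [a.computation for a in expand(overlap)] == ["F", "B"]
```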

File tree

136 files changed: +6696 −4775 lines changed


.ci/docker/requirements-docs.txt

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@@ -50,7 +50,7 @@ IPython==8.12.0
 #Pinned versions: 8.12.0
 
 myst-nb==0.17.2
-#Description: This is used to generate PyTorch functorch and torch.compile docs
+#Description: This is used to generate PyTorch functorch and torch.compile docs.
 #Pinned versions: 0.17.2
 
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
```

.github/merge_rules.yaml

Lines changed: 4 additions & 0 deletions

```diff
@@ -488,6 +488,10 @@
     - torch/_dynamo/**
     - torch/csrc/dynamo/**
     - test/dynamo/**
+    - test/dynamo_expected_failures/**
+    - test/dynamo_skips/**
+    - test/inductor_expected_failures/**
+    - test/inductor_skips/**
   approved_by:
     - guilhermeleobas
   mandatory_checks_name:
```

.github/workflows/docker-builds.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -76,7 +76,8 @@ jobs:
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-          pytorch-linux-jammy-py3-clang12-executorch,
+          # Executorch pin needs update
+          # pytorch-linux-jammy-py3-clang12-executorch,
           pytorch-linux-jammy-py3.12-triton-cpu
         ]
         include:
```

.github/workflows/nightly.yml

Lines changed: 5 additions & 4 deletions

```diff
@@ -75,10 +75,11 @@ jobs:
             repo-owner: pytorch
             branch: main
             pin-folder: .github/ci_commit_pins
-          - repo-name: executorch
-            repo-owner: pytorch
-            branch: main
-            pin-folder: .ci/docker/ci_commit_pins
+          # executorch jobs are disabled since it needs some manual work for the hash update
+          # - repo-name: executorch
+          #   repo-owner: pytorch
+          #   branch: main
+          #   pin-folder: .ci/docker/ci_commit_pins
           - repo-name: triton
             repo-owner: triton-lang
             branch: main
```

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -434,6 +434,7 @@ jobs:
     secrets: inherit
 
   linux-jammy-py3-clang12-executorch-build:
+    if: false # Docker build needs pin update
     name: linux-jammy-py3-clang12-executorch
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
```

CMakeLists.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -564,7 +564,7 @@ if(MSVC)
   set(CMAKE_NINJA_CMCLDEPS_RC OFF)
   if(MSVC_Z7_OVERRIDE)
     # CMake set debug flags to use /Z7
-    set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
+    set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
   endif()
   foreach(
     flag_var
```

CODEOWNERS

Lines changed: 5 additions & 1 deletion

```diff
@@ -14,7 +14,6 @@
 /torch/csrc/autograd/ @albanD @soulitzer
 /torch/autograd/ @albanD @soulitzer
 /tools/autograd/ @albanD @soulitzer
-/torch/header_only_apis.txt @janeyx99
 /torch/nn/ @albanD @jbschlosser @mikaylagawarecki
 /torch/optim/ @albanD @janeyx99
 /test/test_public_bindings.py @albanD
@@ -196,3 +195,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed
 /torch/utils/_cxx_pytree.py @XuehaiPan
 /torch/utils/pytree/ @XuehaiPan
 /torch/_dynamo/polyfills/pytree.py @XuehaiPan
+
+# Relating to libtorch ABI
+/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
+/torch/headeronly/ @janeyx99
+/torch/header_only_apis.txt @janeyx99
```

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
   }
 
   bool pinned_use_background_threads() override {
-    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
         pinned_use_background_threads();
   }
 
```
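Side note on the change above: `pinned_use_background_threads` now comes from the CUDA-specific allocator config rather than the generic accelerator config. A minimal usage sketch, assuming (as an assumption, per the PyTorch CUDA allocator docs) that the `PYTORCH_CUDA_ALLOC_CONF` settings string accepts this key:

```python
import os

# Assumed key name; must be set before the CUDA caching allocator initializes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pinned_use_background_threads:True"

import torch

# Pinned host allocation; with the option enabled, processing of pinned
# blocks may be serviced by background threads instead of the caller.
x = torch.empty(1 << 20, pin_memory=True)
```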

aten/src/ATen/native/cpu/int8mm_kernel.cpp

Lines changed: 16 additions & 16 deletions

```diff
@@ -367,27 +367,27 @@ void int8pack_mm_kernel_(
   auto* C_data = C.data_ptr<T>();
   const auto* S_data = scales.const_data_ptr<T>();
 
-  int M = A.size(0);
-  int N = B.size(0);
-  int K = A.size(1);
-  int lda = A.stride(0);
-  constexpr int BLOCK_M = 4;
-  constexpr int BLOCK_N = 4;
-
-  const int MB = (M + BLOCK_M - 1) / BLOCK_M;
-  const int NB = (N + BLOCK_N - 1) / BLOCK_N;
-
-  at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
-    int mb{0}, nb{0};
+  int64_t M = A.size(0);
+  int64_t N = B.size(0);
+  int64_t K = A.size(1);
+  int64_t lda = A.stride(0);
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 4;
+
+  const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;
+  const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;
+
+  at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
+    int64_t mb{0}, nb{0};
     data_index_init(begin, mb, MB, nb, NB);
 
     for (const auto i : c10::irange(begin, end)) {
       (void)i;
 
-      int mb_start = mb * BLOCK_M;
-      int mb_size = std::min(BLOCK_M, M - mb_start);
-      int nb_start = nb * BLOCK_N;
-      int nb_size = std::min(BLOCK_N, N - nb_start);
+      int64_t mb_start = mb * BLOCK_M;
+      int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
 
       const auto* A_ptr = A_data + mb_start * lda;
       const auto* B_ptr = B_data + nb_start * K;
```
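The int → int64_t widening above guards against 32-bit overflow in the index arithmetic, e.g. `mb_start * lda` when computing `A_ptr`. A quick arithmetic check with hypothetical shapes (chosen only to show the overflow, not taken from the kernel):

```python
# Hypothetical large matmul: M x K matrix A, contiguous rows.
M, K = 100_000, 32_768
lda = K                    # row stride of a contiguous M x K matrix
offset = (M - 1) * lda     # element offset of A's last row
print(offset)              # 3276767232
print(offset > 2**31 - 1)  # True -> would wrap around in 32-bit int math
```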

aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu

Lines changed: 2 additions & 2 deletions

```diff
@@ -526,7 +526,7 @@ namespace {
 
 
   // we are dealing with packed tensor here. max index is the same as numel.
-  // TODO: to really support input tensor large enought to go beyond int32,
+  // TODO: to really support input tensor large enough to go beyond int32,
   // we will need to restrict out shared memory usage and adjust the launch
   // config;
   AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max());
@@ -681,7 +681,7 @@ namespace {
   const dim3 grid(grid_x, grid_y, grid_z);
 
   // we are dealing with packed tensor here. max index is the same as numel.
-  // TODO: to really support input tensor large enought to go beyond int32,
+  // TODO: to really support input tensor large enough to go beyond int32,
   // we will need to restrict out shared memory usage and adjust the launch
   // config;
   AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max());
```
