Skip to content

Commit 10ce9c5

Browse files
committed
Update on "[inductor] add lowering for repeat_interleave.Tensor with output size specified (#147160)"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben [ghstack-poisoned]
2 parents b2965b9 + fc60ce8 commit 10ce9c5

File tree

78 files changed

+2317
-990
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

78 files changed

+2317
-990
lines changed

.ci/docker/common/install_cuda.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ function install_nvshmem {
6868
# download, unpack, install
6969
wget -q "${url}"
7070
tar xf "${filename}.tar.gz"
71-
cp -a "libnvshmem/include/"* /usr/local/include/
72-
cp -a "libnvshmem/lib/"* /usr/local/lib/
71+
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
72+
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
7373

7474
# cleanup
7575
cd ..

.ci/pytorch/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ fi
176176

177177
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
178178
# memory to build and will OOM
179-
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
179+
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
180180
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
181181
fi
182182

.ci/wheel/build_wheel.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,6 @@ retry brew install libomp
192192
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
193193
export USE_DISTRIBUTED=1
194194

195-
if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
196-
export CMAKE_OSX_ARCHITECTURES=arm64
197-
fi
198195
export USE_MKLDNN=OFF
199196
export USE_QNNPACK=OFF
200197
export BUILD_TEST=OFF

.github/scripts/generate_ci_workflows.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ class BinaryBuildWorkflow:
5959
is_scheduled: str = ""
6060
branches: str = "nightly"
6161
# Mainly for macos
62-
cross_compile_arm64: bool = False
6362
macos_runner: str = "macos-14-xlarge"
6463
use_split_build: bool = False
6564
# Mainly used for libtorch builds
@@ -338,7 +337,6 @@ class OperatingSystem:
338337
generate_binary_build_matrix.RELEASE,
339338
libtorch_variants=["shared-with-deps"],
340339
),
341-
cross_compile_arm64=False,
342340
macos_runner="macos-14-xlarge",
343341
ciflow_config=CIFlowConfig(
344342
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@@ -351,7 +349,6 @@ class OperatingSystem:
351349
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
352350
OperatingSystem.MACOS_ARM64
353351
),
354-
cross_compile_arm64=False,
355352
macos_runner="macos-14-xlarge",
356353
ciflow_config=CIFlowConfig(
357354
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},

.github/templates/macos_binary_build_workflow.yml.j2

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,6 @@ env:
4747
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
4848
PR_NUMBER: ${{ github.event.pull_request.number }}
4949
SKIP_ALL_TESTS: 0
50-
{%- if cross_compile_arm64 %}
51-
CROSS_COMPILE_ARM64: 1
52-
{% endif %}
5350
!{{ common.concurrency(build_environment) }}
5451

5552
jobs:

.github/workflows/pull.yml

Lines changed: 5 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -254,36 +254,6 @@ jobs:
254254
timeout-minutes: 600
255255
secrets: inherit
256256

257-
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
258-
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
259-
uses: ./.github/workflows/_linux-build.yml
260-
needs: get-label-type
261-
with:
262-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
263-
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
264-
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
265-
cuda-arch-list: '7.5'
266-
test-matrix: |
267-
{ include: [
268-
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
269-
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
270-
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
271-
]}
272-
secrets: inherit
273-
274-
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
275-
name: linux-jammy-cuda12.8-py3.10-gcc11-test
276-
uses: ./.github/workflows/_linux-test.yml
277-
needs:
278-
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
279-
- target-determination
280-
with:
281-
timeout-minutes: 360
282-
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
283-
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
284-
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
285-
secrets: inherit
286-
287257
linux-jammy-cuda12_8-py3_10-gcc11-build:
288258
name: linux-jammy-cuda12.8-py3.10-gcc11
289259
uses: ./.github/workflows/_linux-build.yml
@@ -292,14 +262,18 @@ jobs:
292262
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
293263
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
294264
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
295-
cuda-arch-list: 8.9
265+
cuda-arch-list: '7.5 8.9'
296266
test-matrix: |
297267
{ include: [
298268
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
299269
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
300270
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
301271
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
302272
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
273+
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
274+
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
275+
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
276+
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
303277
]}
304278
secrets: inherit
305279

@@ -429,31 +403,6 @@ jobs:
429403
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
430404
secrets: inherit
431405

432-
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
433-
name: cuda12.8-py3.10-gcc9-sm75
434-
uses: ./.github/workflows/_linux-build.yml
435-
needs: get-label-type
436-
with:
437-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
438-
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
439-
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
440-
cuda-arch-list: '7.5'
441-
test-matrix: |
442-
{ include: [
443-
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
444-
]}
445-
secrets: inherit
446-
447-
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
448-
name: cuda12.8-py3.10-gcc9-sm75
449-
uses: ./.github/workflows/_linux-test.yml
450-
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
451-
with:
452-
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
453-
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
454-
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
455-
secrets: inherit
456-
457406
linux-jammy-xpu-2025_1-py3_9-build:
458407
name: linux-jammy-xpu-2025.1-py3.9
459408
uses: ./.github/workflows/_linux-build.yml

aten/src/ATen/mps/EmptyTensor.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ TensorBase empty_mps(
4343
int64_t nelements = c10::multiply_integers(size);
4444
auto dtype = dtype_or_default(dtype_opt);
4545
TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED);
46-
TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer");
4746

4847

4948
auto dtype_meta = scalarTypeToTypeMeta(dtype);

aten/src/ATen/mps/MPSDevice.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,7 @@ namespace at::mps {
1818

1919
// Helper enum to check if a MPSGraph op is supported in a given macOS version
2020
enum class MacOSVersion : uint32_t {
21-
MACOS_VER_13_1_PLUS = 0,
22-
MACOS_VER_13_2_PLUS,
23-
MACOS_VER_13_3_PLUS,
24-
MACOS_VER_14_0_PLUS,
25-
MACOS_VER_14_4_PLUS,
21+
MACOS_VER_14_4_PLUS = 0,
2622
MACOS_VER_15_0_PLUS,
2723
MACOS_VER_15_1_PLUS,
2824
MACOS_VER_15_2_PLUS,

aten/src/ATen/mps/MPSDevice.mm

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
3232

3333
MPSDevice::MPSDevice() : _mtl_device(nil) {
3434
// Check that MacOS 13.0+ version of MPS framework is available
35-
// Create the MPSGraph and check method introduced in 13.0
35+
// Create the MPSGraph and check method introduced in 14.0
3636
// which is used by MPS backend.
3737
id mpsCD = NSClassFromString(@"MPSGraph");
3838

39-
if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) {
39+
if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) {
4040
return;
4141
}
4242

@@ -66,24 +66,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
6666
isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}];
6767
}
6868
};
69-
static bool _macos_13_1_plus = is_os_version_at_least(13, 1);
70-
static bool _macos_13_2_plus = is_os_version_at_least(13, 2);
71-
static bool _macos_13_3_plus = is_os_version_at_least(13, 3);
72-
static bool _macos_14_0_plus = is_os_version_at_least(14, 0);
7369
static bool _macos_14_4_plus = is_os_version_at_least(14, 4);
7470
static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
7571
static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
7672
static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
7773

7874
switch (version) {
79-
case MacOSVersion::MACOS_VER_13_1_PLUS:
80-
return _macos_13_1_plus;
81-
case MacOSVersion::MACOS_VER_13_2_PLUS:
82-
return _macos_13_2_plus;
83-
case MacOSVersion::MACOS_VER_13_3_PLUS:
84-
return _macos_13_3_plus;
85-
case MacOSVersion::MACOS_VER_14_0_PLUS:
86-
return _macos_14_0_plus;
8775
case MacOSVersion::MACOS_VER_14_4_PLUS:
8876
return _macos_14_4_plus;
8977
case MacOSVersion::MACOS_VER_15_0_PLUS:

aten/src/ATen/mps/MPSHooks.mm

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,27 +34,15 @@
3434
case 14:
3535
switch (minor) {
3636
case 0:
37-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS);
37+
return true;
3838
case 4:
3939
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
4040
default:
4141
TORCH_WARN("Can't check whether running on 14.", minor, "+ returning one for 14.4+");
4242
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS);
4343
}
4444
case 13:
45-
switch (minor) {
46-
case 0:
47-
return true;
48-
case 1:
49-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS);
50-
case 2:
51-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS);
52-
case 3:
53-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
54-
default:
55-
TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+");
56-
return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
57-
}
45+
return true;
5846
default:
5947
TORCH_WARN("Checking for unexpected MacOS ", major, ".", minor, " returning false");
6048
return false;

0 commit comments

Comments
 (0)