Skip to content

Commit 0516a50

Browse files
committed
Update
[ghstack-poisoned]
2 parents 5bfce27 + 7f7d905 commit 0516a50

File tree

73 files changed

+585
-1314
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+585
-1314
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ git submodule update --init --recursive
243243

244244
```bash
245245
conda install cmake ninja
246-
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
246+
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source” section above
247247
pip install -r requirements.txt
248248
```
249249

@@ -560,7 +560,7 @@ To learn more about making a contribution to Pytorch, please see our [Contributi
560560
561561
PyTorch is a community-driven project with several skillful engineers and researchers contributing to it.
562562
563-
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
563+
PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means.
564564
A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). <!-- codespell:ignore -->
565565
566566
Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch.

aten/src/ATen/core/ivalue.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ c10::TypePtr IValue::TagType<c10::Type>::get(const IValue& v) {
9797
return ComplexType::get();
9898
case Tag::Int:
9999
return IntType::get();
100+
case Tag::UInt:
101+
return IntType::get();
100102
case Tag::SymInt:
101103
return c10::SymIntType::get();
102104
case Tag::SymFloat:
@@ -320,6 +322,8 @@ IValue IValue::equals(const IValue& rhs) const {
320322
return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble();
321323
case Tag::Int:
322324
return rhs.isInt() && lhs.toInt() == rhs.toInt();
325+
case Tag::UInt:
326+
return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt();
323327
case Tag::SymInt:
324328
return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt();
325329
case Tag::SymFloat:
@@ -379,6 +383,8 @@ size_t IValue::hash(const IValue& v) {
379383
case Tag::Int:
380384
return c10::get_hash(v.payload.u.as_int);
381385
// NB: these are technically strict aliasing violations
386+
case Tag::UInt:
387+
return c10::get_hash(v.payload.u.as_int);
382388
case Tag::SymInt:
383389
return c10::get_hash(v.payload.u.as_int);
384390
case Tag::SymFloat:
@@ -806,6 +812,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
806812
return printComplex(out, v);
807813
} case IValue::Tag::Int:
808814
return out << v.toInt();
815+
case IValue::Tag::UInt:
816+
return out << v.toUInt();
809817
case IValue::Tag::SymInt:
810818
return out << v.toSymInt();
811819
case IValue::Tag::SymFloat:

aten/src/ATen/core/ivalue.h

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <c10/macros/Export.h>
1313
#include <c10/util/MaybeOwned.h>
1414
#include <c10/util/intrusive_ptr.h>
15+
#include <limits>
1516
#include <type_traits>
1617
#include <unordered_map>
1718
#include <unordered_set>
@@ -160,6 +161,7 @@ struct Capsule {
160161
_(Double) \
161162
_(ComplexDouble) \
162163
_(Int) \
164+
_(UInt) \
163165
_(SymInt) \
164166
_(SymFloat) \
165167
_(SymBool) \
@@ -653,6 +655,29 @@ struct TORCH_API IValue final {
653655
}
654656
}
655657

658+
// Unsigned
659+
IValue(uint64_t u) : tag( u <= std::numeric_limits<int64_t>::max() ? Tag::Int : Tag::UInt) {
660+
payload.u.as_uint = u;
661+
}
662+
663+
664+
// See Note [Meaning of HAS_u]
665+
// IValue type model closely follows that of c10::Scalar
666+
// Where all integers are upcast to 64-bit representation, and `as_int` is used as default
667+
// representation unless value could not be represented as signed int
668+
bool isUnsigned() const {
669+
return Tag::UInt == tag || (Tag::Int == tag && payload.u.as_int >= 0);
670+
}
671+
672+
uint64_t toUInt() const {
673+
if (isUnsigned()) {
674+
return payload.u.as_uint;
675+
} else {
676+
TORCH_INTERNAL_ASSERT(0, "expected unsigned int");
677+
}
678+
}
679+
680+
656681
// Bool
657682
IValue(bool b) : tag(Tag::Bool) {
658683
#if defined(__clang__) && defined(__x86_64__)
@@ -893,8 +918,14 @@ struct TORCH_API IValue final {
893918
} else {
894919
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
895920
s.isIntegral(false), "Unknown type in Scalar");
896-
tag = Tag::Int;
897-
payload.u.as_int = s.toLong();
921+
if (s.isUnsigned()) {
922+
const auto val = s.toUInt64();
923+
payload.u.as_uint = val;
924+
tag = val <= std::numeric_limits<int64_t>::max() ? Tag::Int : Tag::UInt;
925+
} else {
926+
payload.u.as_int = s.toLong();
927+
tag = Tag::Int;
928+
}
898929
}
899930
}
900931

@@ -918,6 +949,8 @@ struct TORCH_API IValue final {
918949
return toSymFloat();
919950
else if (isSymBool())
920951
return toSymBool();
952+
else if (isUnsigned())
953+
return toUInt();
921954
TORCH_CHECK(false, "IValue is not a Scalar");
922955
}
923956

@@ -1247,6 +1280,8 @@ struct TORCH_API IValue final {
12471280
return true;
12481281
case Tag::Int:
12491282
return false;
1283+
case Tag::UInt:
1284+
return false;
12501285
case Tag::SymInt:
12511286
return true;
12521287
case Tag::SymFloat:
@@ -1343,6 +1378,8 @@ struct TORCH_API IValue final {
13431378
union TriviallyCopyablePayload {
13441379
TriviallyCopyablePayload() : as_int(0) {}
13451380
int64_t as_int;
1381+
// See Note [Meaning of HAS_u]
1382+
uint64_t as_uint;
13461383
double as_double;
13471384
bool as_bool;
13481385
// Invariant: never nullptr; null state is represented as

aten/src/ATen/native/mps/kernels/LinearAlgebra.metal

Lines changed: 60 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,37 @@ kernel void matmul(
6868
}
6969
}
7070

71+
template <typename T>
72+
kernel void addmm(
73+
constant T* mat1Data [[buffer(0)]],
74+
constant T* mat2Data [[buffer(1)]],
75+
device T* outputData [[buffer(2)]],
76+
constant T* biasData [[buffer(3)]],
77+
constant array<c10::metal::opmath_t<T>, 2>& alpha_beta [[buffer(4)]],
78+
constant array<ulong2, 4>& strides [[buffer(5)]],
79+
constant uint3& sizes [[buffer(6)]],
80+
uint2 tid [[thread_position_in_threadgroup]],
81+
uint2 thread_id [[thread_position_in_grid]]) {
82+
threadgroup T A_tile[TILE_DIM][TILE_DIM];
83+
threadgroup T B_tile[TILE_DIM][TILE_DIM];
84+
85+
auto sum = matmul_inner<T>(
86+
mat1Data,
87+
mat2Data,
88+
reinterpret_cast<constant array<ulong2, 3>&>(strides),
89+
sizes,
90+
A_tile,
91+
B_tile,
92+
tid,
93+
thread_id);
94+
if (thread_id.y < sizes.x && thread_id.x < sizes.z) {
95+
auto bias =
96+
biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y];
97+
outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] =
98+
static_cast<T>(alpha_beta[0] * sum + alpha_beta[1] * bias);
99+
}
100+
}
101+
71102
template <typename T>
72103
kernel void naive_bmm(
73104
constant T* mat1Data [[buffer(0)]],
@@ -613,38 +644,42 @@ kernel void applyPivots(
613644
}
614645
}
615646

616-
#define INSTANTIATE_NAIVE_MM(DTYPE) \
617-
template [[host_name("matmul_" #DTYPE)]] kernel void matmul<DTYPE>( \
618-
constant DTYPE * mat1Data [[buffer(0)]], \
619-
constant DTYPE * mat2Data [[buffer(1)]], \
620-
device DTYPE * outputData [[buffer(2)]], \
621-
constant array<ulong2, 3> & strides [[buffer(3)]], \
622-
constant uint3 & sizes [[buffer(4)]], \
623-
uint2 tid [[thread_position_in_threadgroup]], \
624-
uint2 group_id [[threadgroup_position_in_grid]])
625-
626-
#define INSTANTIATE_NAIVE_BMM(DTYPE) \
647+
#define INSTANTIATE_MM_OPS(DTYPE) \
648+
template [[host_name("matmul_" #DTYPE)]] kernel void matmul<DTYPE>( \
649+
constant DTYPE * mat1Data [[buffer(0)]], \
650+
constant DTYPE * mat2Data [[buffer(1)]], \
651+
device DTYPE * outputData [[buffer(2)]], \
652+
constant array<ulong2, 3> & strides [[buffer(3)]], \
653+
constant uint3 & sizes [[buffer(4)]], \
654+
uint2 tid [[thread_position_in_threadgroup]], \
655+
uint2 group_id [[threadgroup_position_in_grid]]); \
627656
template [[host_name("naive_bmm_" #DTYPE)]] kernel void naive_bmm<DTYPE>( \
628657
constant DTYPE * mat1Data [[buffer(0)]], \
629658
constant DTYPE * mat2Data [[buffer(1)]], \
630659
device DTYPE * outputData [[buffer(2)]], \
631660
constant array<ulong, 9> & strides [[buffer(3)]], \
632661
constant uint4 & sizes [[buffer(4)]], \
633662
uint3 tid [[thread_position_in_threadgroup]], \
634-
uint3 group_id [[threadgroup_position_in_grid]])
663+
uint3 group_id [[threadgroup_position_in_grid]]); \
664+
template [[host_name("addmm_" #DTYPE)]] kernel void addmm<DTYPE>( \
665+
constant DTYPE * mat1Data [[buffer(0)]], \
666+
constant DTYPE * mat2Data [[buffer(1)]], \
667+
device DTYPE * outputData [[buffer(2)]], \
668+
constant DTYPE * biasData [[buffer(3)]], \
669+
constant array<c10::metal::opmath_t<DTYPE>, 2> & \
670+
alpha_beta [[buffer(4)]], \
671+
constant array<ulong2, 4> & strides [[buffer(5)]], \
672+
constant uint3 & sizes [[buffer(6)]], \
673+
uint2 tid [[thread_position_in_threadgroup]], \
674+
uint2 group_id [[threadgroup_position_in_grid]])
635675

636-
INSTANTIATE_NAIVE_MM(float);
637-
INSTANTIATE_NAIVE_MM(half);
638-
INSTANTIATE_NAIVE_MM(bfloat);
676+
INSTANTIATE_MM_OPS(float);
677+
INSTANTIATE_MM_OPS(half);
678+
INSTANTIATE_MM_OPS(bfloat);
639679

640680
// Integral MM
641-
INSTANTIATE_NAIVE_MM(short);
642-
INSTANTIATE_NAIVE_MM(int);
643-
INSTANTIATE_NAIVE_MM(long);
644-
INSTANTIATE_NAIVE_MM(char);
645-
INSTANTIATE_NAIVE_MM(uchar);
646-
INSTANTIATE_NAIVE_BMM(short);
647-
INSTANTIATE_NAIVE_BMM(int);
648-
INSTANTIATE_NAIVE_BMM(long);
649-
INSTANTIATE_NAIVE_BMM(char);
650-
INSTANTIATE_NAIVE_BMM(uchar);
681+
INSTANTIATE_MM_OPS(long);
682+
INSTANTIATE_MM_OPS(int);
683+
INSTANTIATE_MM_OPS(short);
684+
INSTANTIATE_MM_OPS(char);
685+
INSTANTIATE_MM_OPS(uchar);

aten/src/ATen/native/mps/operations/LinearAlgebra.mm

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,61 @@
112112
return output;
113113
}
114114

115+
Tensor& do_metal_addmm(const Tensor& self,
116+
const Tensor& other,
117+
Tensor& output,
118+
const Scalar& alpha,
119+
const Scalar& beta,
120+
const Tensor& bias) {
121+
if (beta.toDouble() == 0 && alpha.toDouble() == 1) {
122+
return do_metal_mm(self, other, output);
123+
}
124+
auto stream = getCurrentMPSStream();
125+
auto device = MPSDevice::getInstance()->device();
126+
auto matmulPSO = lib.getPipelineStateForFunc("addmm_" + mps::scalarToMetalTypeString(output));
127+
dispatch_sync_with_rethrow(stream->queue(), ^() {
128+
@autoreleasepool {
129+
getMPSProfiler().beginProfileKernel(matmulPSO, "addmm", {self, other});
130+
auto computeEncoder = stream->commandEncoder();
131+
[computeEncoder setComputePipelineState:matmulPSO];
132+
std::array<uint32_t, 3> sizes = {static_cast<uint32_t>(self.size(0)),
133+
static_cast<uint32_t>(self.size(1)),
134+
static_cast<uint32_t>(output.size(1))};
135+
std::array<int64_t, 8> strides = {self.stride(0),
136+
self.stride(1),
137+
other.stride(0),
138+
other.stride(1),
139+
output.stride(0),
140+
output.stride(1),
141+
bias.stride(0),
142+
bias.stride(1)};
143+
union {
144+
std::array<int64_t, 2> i64;
145+
std::array<int32_t, 2> i32;
146+
std::array<float, 2> f32;
147+
} alpha_beta;
148+
if (output.scalar_type() == kLong) {
149+
alpha_beta.i64 = {alpha.toLong(), beta.toLong()};
150+
} else if (c10::isIntegralType(output.scalar_type(), true)) {
151+
alpha_beta.i32 = {alpha.toInt(), beta.toInt()};
152+
} else {
153+
TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type()));
154+
alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()};
155+
}
156+
constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs
157+
uint32_t gridSizeX = (output.size(1) + TILE_DIM - 1) / TILE_DIM;
158+
uint32_t gridSizeY = (self.size(0) + TILE_DIM - 1) / TILE_DIM;
159+
160+
MTLSize threadsPerThreadgroup = MTLSizeMake(TILE_DIM, TILE_DIM, 1);
161+
MTLSize threadgroupsPerGrid = MTLSizeMake(gridSizeX, gridSizeY, 1);
162+
mtl_setArgs(computeEncoder, self, other, output, bias, alpha_beta.i64, strides, sizes);
163+
[computeEncoder dispatchThreadgroups:threadgroupsPerGrid threadsPerThreadgroup:threadsPerThreadgroup];
164+
getMPSProfiler().endProfileKernel(matmulPSO);
165+
}
166+
});
167+
return output;
168+
}
169+
115170
std::tuple<MPSGraphTensor*, MPSGraphTensor*, MPSGraphTensor*> do_mm(MPSGraph* graph,
116171
const Tensor& self,
117172
const Tensor& other) {
@@ -644,7 +699,6 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
644699

645700
TORCH_CHECK(output.is_mps());
646701
TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D");
647-
TORCH_CHECK(supportedFloatingOrComplexType(self), "MPS device does not support addmm for non-float input");
648702

649703
TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}};
650704
checkAllSameGPU(__func__, args);
@@ -671,6 +725,10 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
671725
return output;
672726
}
673727

728+
if (use_metal_mm(self, other, output)) {
729+
return do_metal_addmm(self, other, output, alpha, beta, *bias_);
730+
}
731+
674732
bool is_beta_non_zero = beta.toDouble() != 0.0;
675733

676734
struct CachedGraph : public mps::MPSCachedGraph {

aten/src/ATen/native/sparse/mps/kernels/Sparse.metal

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,4 +120,9 @@ kernel void coalesce_with_positions_kernel(
120120
INSTANTIATE_COALESCE_WITH_POSITIONS(float);
121121
INSTANTIATE_COALESCE_WITH_POSITIONS(half);
122122
INSTANTIATE_COALESCE_WITH_POSITIONS(bfloat);
123-
INSTANTIATE_COALESCE_WITH_POSITIONS(bool);
123+
INSTANTIATE_COALESCE_WITH_POSITIONS(bool);
124+
INSTANTIATE_COALESCE_WITH_POSITIONS(long);
125+
INSTANTIATE_COALESCE_WITH_POSITIONS(char);
126+
INSTANTIATE_COALESCE_WITH_POSITIONS(uchar);
127+
INSTANTIATE_COALESCE_WITH_POSITIONS(short);
128+
INSTANTIATE_COALESCE_WITH_POSITIONS(int);

c10/cuda/CUDAStream.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,6 @@ static void initSingleStream(int p, DeviceIndex device_index, int i) {
216216
// Creates the low and high priority stream pools for the specified device
217217
// Warning: only call once per device!
218218
static void initDeviceStreamState(DeviceIndex device_index) {
219-
// Switches to the requested device so streams are properly associated
220-
// with it.
221-
CUDAGuard device_guard{device_index};
222219
for (const auto i : c10::irange(kStreamsPerPool)) {
223220
for (const auto p : c10::irange(max_stream_priorities)) {
224221
initSingleStream(p, device_index, i);

0 commit comments

Comments
 (0)