
Commit 36b4744

Before this change, a Tensor contained a device pointer, and a TensorInfoManager data structure held a mapping from device pointer to XlaTensorInfo object. The TensorInfoManager also had to be an Allocator so that it could be informed when a Tensor was released.

After this change, a Tensor on an XlaDevice contains an XlaTensor object, the equivalent of the old XlaTensorInfo object. This has advantages and drawbacks:

+ We don't need yet another allocator wrapper, as there is no side-band data to manage.
+ No hashtable lookups are required.
- Because XlaLocalLaunchOp can run either on an XlaDevice or on a TF-classic device, we need some way to distinguish whether a Tensor is a TF-classic tensor (holding a device pointer) or an XlaTensor; we use a tagged pointer for this.

As part of this change, ShapedBuffers are allocated using the XLA backend's allocator directly, instead of the roundabout route where we:
- wrapped the XLA allocator in an XlaDeviceAllocator, and then
- wrapped the XlaDeviceAllocator in an XlaAllocator.

This leaves less to go wrong. Ideally we would use StreamExecutor's allocator here, but it is less useful than XLA's because it doesn't provide helpful OOM messages (it just returns nullptr).

PiperOrigin-RevId: 191048184
Parent: 17d05bb
12 files changed: +310 / -290 lines
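
Note: the new xla_tensor.h / xla_tensor.cc files are among the 12 changed files but are not reproduced in the excerpt below. The following is a rough, non-authoritative sketch of the XlaTensor interface as implied by its call sites in the diffs that follow; the member types, exact signatures, and includes are assumptions, not the committed header.

// Sketch only -- reconstructed from the call sites below (FromTensor,
// ToOpaquePointer, AllocateShapedBuffer, set_host_tensor, ...); not the
// committed xla_tensor.h. Includes and signatures are approximate.
#include <memory>

#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace tensorflow {

class XlaTensor {
 public:
  // Returns the XlaTensor backing `tensor`, or nullptr if `tensor` holds a
  // TF-classic device pointer instead (distinguished via a tagged pointer).
  static XlaTensor* FromTensor(const Tensor* tensor);

  // Encode/decode an XlaTensor as the opaque "buffer pointer" stored inside a
  // Tensor allocated by XlaDeviceAllocator.
  static void* ToOpaquePointer(XlaTensor* tensor);
  static XlaTensor* FromOpaquePointer(void* ptr);

  // Returns the device memory backing `tensor`; requires that the wrapped
  // XlaTensor already has a ShapedBuffer.
  static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor(
      const Tensor& tensor);

  // The device buffer is allocated lazily, on first use.
  bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; }
  Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
                              xla::LocalClient* client, int device_ordinal);

  // Optional host-side copy of the value, used so that arguments can be
  // treated as compile-time constants without a device-to-host copy.
  bool has_host_tensor() const { return host_tensor_ != nullptr; }
  const Tensor& host_tensor() const { return *host_tensor_; }
  void set_host_tensor(const Tensor& tensor) {
    host_tensor_.reset(new Tensor(tensor));
  }

 private:
  std::unique_ptr<xla::ShapedBuffer> shaped_buffer_;
  std::unique_ptr<Tensor> host_tensor_;
};

}  // namespace tensorflow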

tensorflow/compiler/jit/BUILD

Lines changed: 7 additions & 5 deletions
@@ -120,11 +120,13 @@ cc_library(
 )
 
 cc_library(
-    name = "xla_tensor_info",
-    srcs = ["xla_tensor_info.cc"],
-    hdrs = ["xla_tensor_info.h"],
+    name = "xla_tensor",
+    srcs = ["xla_tensor.cc"],
+    hdrs = ["xla_tensor.h"],
     deps = [
         ":common",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
@@ -154,7 +156,7 @@ cc_library(
         ":common",
         ":jit_compilation_passes",
         ":xla_launch_util",
-        ":xla_tensor_info",
+        ":xla_tensor",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
@@ -201,7 +203,7 @@ cc_library(
     deps = [
         ":common",
         ":xla_compilation_cache",
-        ":xla_tensor_info",
+        ":xla_tensor",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",

tensorflow/compiler/jit/kernels/xla_launch_op.cc

Lines changed: 21 additions & 10 deletions
@@ -116,11 +116,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   const XlaDevice::Metadata* metadata;
   Status s = XlaDevice::GetMetadata(ctx, &metadata);
-
-  XlaTensorInfoManager* tensor_info_manager = nullptr;
-  if (s.ok()) {
-    tensor_info_manager = &metadata->tensor_info_manager();
-  }
+  bool allocate_xla_tensors = s.ok();
 
   // Get the platform_id_ for XLA_* devices.
   if (platform_id_ == nullptr) {
@@ -134,16 +130,31 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
+  XlaAllocator local_xla_allocator(client->backend().platform(),
+                                   ctx->device()->GetAllocator({}));
+  xla::DeviceMemoryAllocator* xla_allocator;
+  // If we are on an XlaDevice, use the underlying XLA platform's allocator
+  // directly. We could use the StreamExecutor's allocator which may
+  // theoretically be more correct, but XLA returns a nice OOM message in a
+  // Status and StreamExecutor does not.
+  //
+  // Importantly we can't use ctx->device()->GetAllocator() as the allocator
+  // (which local_xla_allocator above uses) as on an XlaDevice, this is a
+  // dummy allocator that returns XlaTensor objects. The XlaCompiler needs a
+  // real allocator to allocate real buffers.
+  if (allocate_xla_tensors) {
+    xla_allocator = client->backend().memory_allocator();
+  } else {
+    xla_allocator = &local_xla_allocator;
+  }
 
   XlaCompiler::Options options;
   options.client = client;
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
-  options.device_allocator = &xla_allocator;
+  options.device_allocator = xla_allocator;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -159,14 +170,14 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "Executing XLA Computation...";
 
   XlaComputationLaunchContext launch_context(
-      num_resource_args_, client, &xla_allocator, tensor_info_manager);
+      num_resource_args_, client, xla_allocator, allocate_xla_tensors);
   launch_context.PopulateInputs(ctx, kernel, variables);
 
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
-  run_options.set_allocator(&xla_allocator);
+  run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();

tensorflow/compiler/jit/xla_compile_on_demand_op.cc

Lines changed: 8 additions & 11 deletions
@@ -51,12 +51,10 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   int64 num_resource_args = variables.size();
 
   xla::LocalClient* client = metadata.client();
-  XlaTensorInfoManager* tensor_info_manager = &metadata.tensor_info_manager();
 
   // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
   XlaComputationLaunchContext launch_context(
-      num_resource_args, client, &xla_allocator, tensor_info_manager);
+      num_resource_args, client, client->backend().memory_allocator(), true);
 
   launch_context.PopulateInputs(ctx, result, variables);
 
@@ -67,7 +65,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
-  run_options.set_allocator(&xla_allocator);
+  run_options.set_allocator(client->backend().memory_allocator());
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
 
   auto run_result = executable->Run(launch_context.arguments(), run_options);
@@ -106,25 +104,24 @@ Status XlaCompileOnDemandOp::Compile(
     OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
     const XlaCompiler::CompilationResult** result,
     xla::LocalExecutable** executable) {
-  XlaTensorInfoManager* tensor_info_manager = &metadata.tensor_info_manager();
-
   std::map<int, Tensor> constant_arguments;
   for (int64 i = 0; i < ctx->num_inputs(); ++i) {
     const Tensor& device_tensor = ctx->input(i);
-    if (const XlaTensorInfo* tensor_info =
-            tensor_info_manager->GetTensorInfo(device_tensor)) {
-      if (tensor_info->has_host_tensor() &&
+    if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) {
+      if (xla_tensor->has_host_tensor() &&
           ShouldArgumentBeConstant(&ctx->op_kernel(), i)) {
-        constant_arguments[i] = tensor_info->host_tensor();
+        constant_arguments[i] = xla_tensor->host_tensor();
       }
     }
     if (constant_arguments.count(i) == 0 &&
        MustArgumentBeConstant(&ctx->op_kernel(), i)) {
      // Slow path; the argument is not available as a host constant so we must
      // fetch it synchronously.
      Tensor host_tensor;
+      AllocatorAttributes attrs;
+      attrs.set_on_host(true);
      TF_RETURN_IF_ERROR(ctx->allocate_temp(
-          device_tensor.dtype(), device_tensor.shape(), &host_tensor));
+          device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
      Notification n;
      ctx->op_device_context()->CopyDeviceTensorToCPU(
          &device_tensor, "ConstantArgument",

tensorflow/compiler/jit/xla_device.cc

Lines changed: 8 additions & 20 deletions
@@ -100,7 +100,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      xla::MakeUnique<XlaDeviceAllocator>(backend, device_ordinal);
+      xla::MakeUnique<XlaDeviceAllocator>();
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -136,13 +136,11 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   return Status::OK();
 }
 
-XlaDevice::Metadata::Metadata(
-    int device_ordinal, se::Platform* platform, const DeviceType& device_type,
-    std::unique_ptr<XlaTensorInfoManager>* tensor_info_manager)
+XlaDevice::Metadata::Metadata(int device_ordinal, se::Platform* platform,
+                              const DeviceType& device_type)
     : device_ordinal_(device_ordinal),
       device_type_(device_type),
-      platform_(platform),
-      tensor_info_manager_(*tensor_info_manager) {}
+      platform_(platform) {}
 
 int XlaDevice::Metadata::device_ordinal() const { return device_ordinal_; }
 
@@ -157,12 +155,9 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
   return device_type_;
 }
 
-XlaTensorInfoManager& XlaDevice::Metadata::tensor_info_manager() const {
-  return *tensor_info_manager_;
-}
-
 /* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx,
                                            const Metadata** metadata) {
+  *metadata = nullptr;
   XlaDevice* xla_device =
       dynamic_cast<XlaDevice*>(ctx->device()->UnderlyingDevice());
   if (xla_device == nullptr) {
@@ -181,15 +176,11 @@ XlaDevice::XlaDevice(const SessionOptions& options,
                      const DeviceType& jit_device_name, se::Platform* platform,
                      bool transfer_as_literal)
     : LocalDevice(options, attrs),
-      xla_metadata_(
-          device_ordinal, platform, jit_device_name,
-          // Pass tensor_info_manager_ by reference as it is initialized lazily.
-          &tensor_info_manager_),
+      xla_metadata_(device_ordinal, platform, jit_device_name),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
      xla_allocator_(nullptr),
      platform_(platform),
-      tensor_info_manager_(nullptr),
      transfer_as_literal_(transfer_as_literal) {}
 
 XlaDevice::~XlaDevice() {}
@@ -215,7 +206,6 @@ Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) {
     xla::Backend* backend = client()->mutable_backend();
     xla_allocator_ = XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
         backend, device_ordinal_);
-    tensor_info_manager_.reset(new XlaTensorInfoManager(xla_allocator_));
   }
   return xla_allocator_;
 }
@@ -236,8 +226,7 @@ Status XlaDevice::FillContextMap(const Graph* graph,
   // Call GetAllocator for the side-effect of ensuring the allocator and
   // XlaTensorInfoManager is created.
   (void)GetAllocator({});
-  auto ctx = new XlaDeviceContext(stream, tensor_info_manager_.get(),
-                                  transfer_as_literal_);
+  auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
     ctx->Ref();
@@ -285,8 +274,7 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
   Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
   Notification n;
   TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-  XlaTransferManager manager(stream, tensor_info_manager_.get(),
-                             transfer_as_literal_);
+  XlaTransferManager manager(stream, client(), transfer_as_literal_);
   manager.CopyCPUTensorToDevice(&parsed, this, &copy,
                                 [&n, &status](const Status& s) {
                                   status = s;

tensorflow/compiler/jit/xla_device.h

Lines changed: 2 additions & 12 deletions
@@ -26,7 +26,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 
-#include "tensorflow/compiler/jit/xla_tensor_info.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -50,22 +50,19 @@ class XlaDevice : public LocalDevice {
   class Metadata {
    public:
     Metadata(int device_ordinal, perftools::gputools::Platform* platform,
-             const DeviceType& device_type,
-             std::unique_ptr<XlaTensorInfoManager>* tensor_info_manager);
+             const DeviceType& device_type);
 
     // The index of the device on this host.
     int device_ordinal() const;
 
     perftools::gputools::Platform* platform() const;
     xla::LocalClient* client() const;
     const DeviceType& jit_device_type() const;
-    XlaTensorInfoManager& tensor_info_manager() const;
 
    private:
     const int device_ordinal_;
     const DeviceType device_type_;
     perftools::gputools::Platform* platform_;  // Not owned.
-    std::unique_ptr<XlaTensorInfoManager>& tensor_info_manager_;
 
     TF_DISALLOW_COPY_AND_ASSIGN(Metadata);
   };
@@ -123,13 +120,6 @@ class XlaDevice : public LocalDevice {
   // copying back and forth between CPU and the device, and
   // computations enqueued by XLA.
   xla::Backend::StreamPtr stream_;
-  // Manages sideband data about tensors, in particular the on-device shape tree
-  // if the tensor requires multiple device buffers to represent (for example,
-  // tuple shapes).
-  // This is a unique_ptr because XlaTensorInfoManager is non-copy-constructible
-  // and we need to initialize this lazily (as we also lazily initialize the
-  // underlying allocator).
-  std::unique_ptr<XlaTensorInfoManager> tensor_info_manager_;
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   bool transfer_as_literal_;

tensorflow/compiler/jit/xla_device_context.cc

Lines changed: 32 additions & 29 deletions
@@ -27,37 +27,32 @@ namespace se = ::perftools::gputools;
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator(const xla::Backend* backend,
-                                       int device_ordinal)
-    : backend_(backend), device_ordinal_(device_ordinal) {}
-
+XlaDeviceAllocator::XlaDeviceAllocator() {}
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
 
 void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  se::DeviceMemoryBase dmem =
-      backend_->memory_allocator()
-          ->Allocate(device_ordinal_, num_bytes, /*retry_on_failure=*/false)
-          .ValueOrDie();
-  VLOG(2) << "Allocated XLA device tensor " << dmem.opaque() << "(" << num_bytes
-          << ")";
-  return dmem.opaque();
+  // We always return an empty XlaTensor object, encoded as an opaque tagged
+  // pointer. We can return an empty object and ignore num_bytes here because we
+  // have control over all of the uses of this device tensor, and can lazily
+  // allocate memory when used. This allows us to also know the shape of the
+  // allocated Tensor, which is useful if the device's tensor representation
+  // differs from the host.
+  return XlaTensor::ToOpaquePointer(new XlaTensor());
 }
 
 void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
-  se::DeviceMemoryBase dmem(ptr);
-  TF_CHECK_OK(backend_->memory_allocator()->Deallocate(device_ordinal_, &dmem));
-  VLOG(2) << "Deallocated XLA device tensor " << ptr;
+  delete XlaTensor::FromOpaquePointer(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
-XlaTransferManager::XlaTransferManager(
-    se::Stream* stream, XlaTensorInfoManager* tensor_info_manager,
-    bool transfer_as_literal)
+XlaTransferManager::XlaTransferManager(se::Stream* stream,
+                                       xla::LocalClient* client,
+                                       bool transfer_as_literal)
     : stream_(stream),
-      tensor_info_manager_(tensor_info_manager),
+      client_(client),
       transfer_as_literal_(transfer_as_literal) {}
 
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
@@ -74,9 +69,21 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
 
   void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
   const int64 total_bytes = cpu_tensor->TotalBytes();
-  void* dst_ptr = DMAHelper::base(device_tensor);
-  se::DeviceMemoryBase dev_dst_ptr(dst_ptr, total_bytes);
 
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+  CHECK(xla_tensor);
+  if (!xla_tensor->has_shaped_buffer()) {
+    Status s = xla_tensor->AllocateShapedBuffer(
+        device_tensor->dtype(), device_tensor->shape(), client_,
+        stream_->parent()->device_ordinal());
+    if (!s.ok()) {
+      done(s);
+      return;
+    }
+  }
+
+  se::DeviceMemoryBase dev_dst_ptr =
+      XlaTensor::DeviceMemoryFromTensor(*device_tensor);
   Status status;
   if (transfer_as_literal_) {
     status = xla::Unimplemented(
@@ -92,10 +99,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                    block_status.error_message().c_str());
     }
   }
-
-  XlaTensorInfo* tensor_info =
-      tensor_info_manager_->GetOrCreateTensorInfo(*device_tensor);
-  tensor_info->set_host_tensor(*cpu_tensor);
+  xla_tensor->set_host_tensor(*cpu_tensor);
 
   done(status);
   return;
@@ -119,8 +123,8 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
           << device_tensor->NumElements();
 
   const int64 total_bytes = cpu_tensor->TotalBytes();
-  void* src_ptr = const_cast<void*>(DMAHelper::base(device_tensor));
-  se::DeviceMemoryBase dev_src_ptr(src_ptr, total_bytes);
+  se::DeviceMemoryBase dev_src_ptr =
+      XlaTensor::DeviceMemoryFromTensor(*device_tensor);
   void* dst_ptr = DMAHelper::base(cpu_tensor);
 
   Status status;
@@ -147,10 +151,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   done(Status::OK());
 }
 
-XlaDeviceContext::XlaDeviceContext(se::Stream* stream,
-                                   XlaTensorInfoManager* tensor_info_manager,
+XlaDeviceContext::XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
                                    bool transfer_as_literal)
-    : manager_(stream, tensor_info_manager, transfer_as_literal) {}
+    : manager_(stream, client, transfer_as_literal) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
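
The XlaDeviceAllocator above now hands back an XlaTensor encoded as an opaque tagged pointer (XlaTensor::ToOpaquePointer / FromOpaquePointer), but xla_tensor.cc itself is not part of this excerpt. Below is a minimal, standalone sketch of one way such a tagged-pointer scheme can work; the kTag constant and the encoding details are illustrative assumptions, not the committed implementation.

// Illustrative sketch of a low-bit tagged pointer. Heap allocations are at
// least 2-byte aligned, so the least-significant bit of a valid XlaTensor* is
// zero and can be used to mark "this opaque buffer pointer is an XlaTensor"
// (as opposed to a plain TF-classic device pointer).
#include <cassert>
#include <cstdint>

class XlaTensor {
 public:
  static void* ToOpaquePointer(XlaTensor* tensor) {
    uintptr_t value = reinterpret_cast<uintptr_t>(tensor);
    assert((value & kTag) == 0);  // heap pointers have the low bit free
    return reinterpret_cast<void*>(value | kTag);
  }
  static XlaTensor* FromOpaquePointer(void* ptr) {
    uintptr_t value = reinterpret_cast<uintptr_t>(ptr);
    if ((value & kTag) == 0) return nullptr;  // not an XlaTensor
    return reinterpret_cast<XlaTensor*>(value & ~kTag);
  }

 private:
  static constexpr uintptr_t kTag = 0x1;
};

int main() {
  // Mirrors AllocateRaw/DeallocateRaw above: allocate, tag, later untag, free.
  void* opaque = XlaTensor::ToOpaquePointer(new XlaTensor());
  XlaTensor* tensor = XlaTensor::FromOpaquePointer(opaque);
  assert(tensor != nullptr);
  delete tensor;
  return 0;
}

The call sites in the diffs above (for example XlaTensor::FromTensor in xla_compile_on_demand_op.cc) rely on the nullptr result to tell a TF-classic tensor apart from an XlaTensor at runtime.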
