Commit c8cf6cf

Use safe_open in HF consolidation
Pull Request resolved: #159395

Use safe_open to read tensors for HF consolidation instead of reading the bytes with f.read(), which has worse performance.

ghstack-source-id: 300135655
@exported-using-ghexport

Differential Revision: [D79105491](https://our.internmc.facebook.com/intern/diff/D79105491/)
1 parent 34c3e5b commit c8cf6cf
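
For context, here is a minimal sketch (not part of the commit) of the two read paths the commit message refers to. The file name "model-00001.safetensors" and tensor name "layer.weight" are placeholders; the "before" path parses the safetensors header by hand, the "after" path goes through safetensors' safe_open and its private _tobytes helper, as the diff below does.

import json
import struct

from safetensors import safe_open
from safetensors.torch import _tobytes  # private helper used in the diff below

path = "model-00001.safetensors"  # placeholder shard file
name = "layer.weight"             # placeholder tensor FQN

# Before: read the raw bytes by hand. A safetensors file starts with an
# 8-byte little-endian header length, then a JSON header whose
# "data_offsets" are relative to the start of the data section.
with open(path, "rb") as f:
    header_len = struct.unpack("<Q", f.read(8))[0]
    header = json.loads(f.read(header_len))
    start, end = header[name]["data_offsets"]
    f.seek(8 + header_len + start)
    raw_before = f.read(end - start)

# After: let safe_open resolve the tensor by name and take its bytes.
with safe_open(path, framework="pt") as f:
    raw_after = _tobytes(f.get_tensor(name), name)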

File tree

3 files changed: +17 additions, −46 deletions

torch/distributed/checkpoint/_consolidate_hf_safetensors.py

Lines changed: 13 additions & 22 deletions
@@ -5,7 +5,6 @@
 import json
 import logging
 import math
-import mmap
 import os
 import struct
 import time
@@ -17,7 +16,6 @@
 from torch.distributed.checkpoint._hf_utils import (
     _gen_file_name,
     _get_dcp_custom_metadata,
-    _get_dtype,
     _get_safetensors_file_metadata,
     _metadata_fn,
     DATA_OFFSETS_KEY,
@@ -95,6 +93,9 @@ def _parse_input_metadata(
     Raises:
         ValueError: If no DCP custom metadata is found in a safetensors file
     """
+
+    from safetensors.torch import _getdtype  # type: ignore[import]
+
     # Dictionary to track the full size of each tensor across all shards
     fqn_to_size_mapping: dict[str, tuple[list[int], str]] = {}

@@ -133,7 +134,7 @@ def _parse_input_metadata(
             if fqn in output_data.fqn_data:
                 output_data.fqn_data[fqn] = _FqnData(
                     shape_in_file=tensor_size,
-                    dtype_size=torch.finfo(_get_dtype(dtype_str)).bits
+                    dtype_size=torch.finfo(_getdtype(dtype_str)).bits
                     // 8,  # Convert bits to bytes
                     dtype_str=dtype_str,
                 )
@@ -197,12 +198,7 @@ def _write_metadata(
         output_data.metadata_size = f.tell()


-def _read_tensor_data_mmap(
-    file_path: str,
-    start_offset: int,
-    end_offset: int,
-    metadata_size: int,
-) -> bytes:
+def _read_tensor_data(file_path: str, fqn: str) -> bytes:
     """
     Read tensor data from a safetensors file using memory mapping for efficiency.

@@ -215,12 +211,12 @@ def _read_tensor_data_mmap(
     Returns:
         Raw tensor data as bytes
     """
-    # Use mmap for efficient access
-    with open(file_path, "rb") as f:
-        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
-            absolute_start = metadata_size + start_offset
-            absolute_end = metadata_size + end_offset
-            return bytes(mm[absolute_start:absolute_end])
+
+    from safetensors import safe_open  # type: ignore[import]
+    from safetensors.torch import _tobytes  # type: ignore[import]
+
+    with safe_open(file_path, framework="pt") as f:
+        return _tobytes(f.get_tensor(fqn), fqn)


 def _process_output_file(
@@ -257,21 +253,16 @@ def _process_output_file(
         # Process each input safetensors file
        for safetensors_file in input_files_data.keys():
             file_metadata = input_files_data[safetensors_file].metadata
-            input_metadata_size = input_files_data[safetensors_file].metadata_size

             if tensor_fqn not in file_metadata.keys():
                 continue

             metadata = file_metadata[tensor_fqn]

-            data_offsets = metadata[DATA_OFFSETS_KEY]
-
             # Use memory mapping to read tensor data efficiently
-            data_to_write = _read_tensor_data_mmap(
+            data_to_write = _read_tensor_data(
                 safetensors_file,
-                data_offsets[0],
-                data_offsets[1],
-                input_metadata_size,
+                tensor_fqn,
             )

             # Get the offsets of this tensor shard within the full tensor
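
Taken out of the diff above, the new helper is small enough to show on its own. The sketch below is illustrative only: _read_tensor_data mirrors the function added in this file, while consolidate_one_tensor, its shard list, and the writable output stream are simplified stand-ins for the real consolidation loop (which writes each shard at its computed offset rather than appending).

from safetensors import safe_open
from safetensors.torch import _tobytes  # type: ignore[import]


def _read_tensor_data(file_path: str, fqn: str) -> bytes:
    # Same body as the function added above: let safe_open locate the tensor
    # by its fully qualified name and return its raw bytes.
    with safe_open(file_path, framework="pt") as f:
        return _tobytes(f.get_tensor(fqn), fqn)


def consolidate_one_tensor(shard_files: list[str], fqn: str, out) -> None:
    """Simplified stand-in: write each shard's bytes of `fqn` to `out` in order."""
    for shard_file in shard_files:
        out.write(_read_tensor_data(shard_file, fqn))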

torch/distributed/checkpoint/_hf_utils.py

Lines changed: 0 additions & 21 deletions
@@ -22,18 +22,6 @@
 DTYPE_KEY = "dtype"
 DATA_OFFSETS_KEY = "data_offsets"

-DTYPE_MAP = {
-    "F16": torch.float16,
-    "F32": torch.float32,
-    "F64": torch.float64,
-    "I8": torch.int8,
-    "U8": torch.uint8,
-    "I16": torch.int16,
-    "I32": torch.int32,
-    "I64": torch.int64,
-    "BF16": torch.bfloat16,
-}
-
 HF_DCP_VERSION: float = 1.0
 DCP_VERSION_KEY = "DCP_VERSION"
 DCP_SHARDING_INFO_KEY = "DCP_SHARDING_INFO"
@@ -91,15 +79,6 @@ def _get_safetensors_file_metadata(file_bytes: io.IOBase) -> tuple[Any, int]:
     return (metadata, header_len + NUM_BYTES_FOR_HEADER_LEN)


-def _get_dtype(dtype_str: str) -> torch.dtype:
-    try:
-        dtype = DTYPE_MAP[dtype_str]
-    except KeyError:
-        dtype = torch.get_default_dtype()
-
-    return dtype
-
-
 def _get_dcp_custom_metadata(metadata: Any) -> Optional[Any]:
     if DEFAULT_EXTRA_METADATA_KEY in metadata:
         custom_metadata = metadata[DEFAULT_EXTRA_METADATA_KEY]
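
The deleted DTYPE_MAP and _get_dtype are replaced by safetensors' own private _getdtype helper, which resolves the same dtype strings. A quick illustrative check (not part of the commit), assuming _getdtype accepts the safetensors dtype keys the removed map used:

import torch
from safetensors.torch import _getdtype  # type: ignore[import]

# Spot-check a few of the keys the removed DTYPE_MAP covered.
for key, expected in [("F16", torch.float16), ("F32", torch.float32),
                      ("BF16", torch.bfloat16), ("I64", torch.int64)]:
    assert _getdtype(key) is expected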

torch/distributed/checkpoint/hf_storage.py

Lines changed: 4 additions & 3 deletions
@@ -13,7 +13,6 @@
 from torch.distributed.checkpoint._fsspec_filesystem import FsspecReader, FsspecWriter
 from torch.distributed.checkpoint._hf_utils import (
     _gen_file_name,
-    _get_dtype,
     _get_safetensors_file_metadata,
     _HFStorageInfo,
     _metadata_fn,
@@ -282,6 +281,8 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
         return fut

     def read_metadata(self) -> Metadata:
+        from safetensors.torch import _getdtype  # type: ignore[import]
+
         state_dict_metadata: dict[str, TensorStorageMetadata] = {}
         storage_data: dict[MetadataIndex, _HFStorageInfo] = {}

@@ -314,7 +315,7 @@ def read_metadata(self) -> Metadata:
                 if key not in state_dict_metadata:
                     state_dict_metadata[key] = TensorStorageMetadata(
                         properties=TensorProperties(
-                            dtype=_get_dtype(val[DTYPE_KEY])
+                            dtype=_getdtype(val[DTYPE_KEY])
                         ),
                         size=torch.Size(
                             [
@@ -354,7 +355,7 @@ def read_metadata(self) -> Metadata:
                     offset=val[DATA_OFFSETS_KEY][0] + metadata_size,
                     length=val[DATA_OFFSETS_KEY][1] - val[DATA_OFFSETS_KEY][0],
                     shape=torch.Size(val[SHAPE_KEY]),
-                    dtype=_get_dtype(val[DTYPE_KEY]),
+                    dtype=_getdtype(val[DTYPE_KEY]),
                 )

         metadata = Metadata(
