
Commit 33e21ba

Fix: Unify nn.Module device placement across all quantizers and base class
This commit addresses multiple places where torch.nn.Module objects (including nn.Linear and PreTrainedModel) were incorrectly passed to the `move_to_device` function, which is designed for tensors and could raise AttributeError when given a module. The changes:

- In AWQQuantizer, GPTQQuantizer, and GGUFQuantizer, `nn.Linear` layer instances are now moved to the target device with `layer.to(device)`.
- In BaseQuantizer, `PreTrainedModel` instances are now moved to the primary device with `model.to(device)`.

All nn.Module device placements now use the standard `.to()` method, preventing these errors and giving consistent behavior across the library. This supersedes the earlier one-off fixes for QuantizedLinear by addressing the issue at every identified call site for the various module types.
1 parent 5b434ed commit 33e21ba
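
For context on why the old calls could break: a tensor-oriented helper typically touches Tensor-only attributes such as `.device`, which `nn.Module` does not expose, while `nn.Module.to()` handles the move itself. The sketch below is for illustration only; the helper body is hypothetical and is not the library's actual `move_to_device`.

import torch
import torch.nn as nn

def move_to_device(tensor: torch.Tensor, device: torch.device) -> torch.Tensor:
    # Hypothetical tensor-only helper, sketched for illustration.
    # Tensor has a `.device` attribute; nn.Module does not, so passing
    # a module here raises AttributeError before `.to()` is reached.
    if tensor.device != device:
        return tensor.to(device)
    return tensor

layer = nn.Linear(4, 4)
device = torch.device("cpu")

try:
    move_to_device(layer, device)          # old pattern: fails for modules
except AttributeError as exc:
    print(f"module through tensor helper: {exc}")

layer = layer.to(device)                   # fixed pattern: moves params and buffers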

File tree: 4 files changed (+4, −4 lines)


quantllm/quant/awq.py

Lines changed: 1 addition & 1 deletion
@@ -195,7 +195,7 @@ def _quantize_layer(
         quantized = quantized.to(target_device)
 
         # Ensure layer parameters are on the target_device for computation
-        layer = move_to_device(layer, target_device)
+        layer = layer.to(target_device)
 
         # Copy bias if exists, ensuring it's on the target device
         if layer.bias is not None:

quantllm/quant/gguf.py

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ def _quantize_layer(
         """Quantize a single layer to GGUF format with memory-efficient processing."""
         target_device = torch.device('cpu') if self.cpu_offload else self.device_manager.primary_device
 
-        layer = move_to_device(layer, target_device)
+        layer = layer.to(target_device)
 
         # Initialize quantized layer and move to target_device
         quantized = QuantizedLinear(

quantllm/quant/gptq.py

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ def _quantize_layer(self, layer: nn.Linear, H: torch.Tensor) -> QuantizedLinear:
         # Ensure tensors are on the correct device
         H = move_to_device(H, target_device)
         # Original layer's weights should be moved to target_device before processing
-        layer = move_to_device(layer, target_device)
+        layer = layer.to(target_device)
         W = layer.weight.data  # W is now on target_device
 
         # Initialize quantized layer

quantllm/quant/quantization_engine.py

Lines changed: 1 addition & 1 deletion
@@ -521,7 +521,7 @@ def _prepare_model_instance(self, model_to_prepare: PreTrainedModel, make_copy:
         prepared_model.eval()
         if self.device_manager.primary_device is not None:
             self.logger.log_info(f"Moving model to device: {self.device_manager.primary_device}")
-            prepared_model = move_to_device(prepared_model, self.device_manager.primary_device)
+            prepared_model = prepared_model.to(self.device_manager.primary_device)
 
         self.logger.log_info("Model preparation (copy, eval, device move) completed successfully.")
         return prepared_model
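
One semantic detail worth noting about the patched lines (standard PyTorch behavior, nothing library-specific): `Tensor.to()` is out-of-place and returns a new tensor when a conversion happens, whereas `nn.Module.to()` converts parameters and buffers in place and returns the module itself. Reassigning the result, as every `+` line above does, is therefore correct in both cases.

import torch
import torch.nn as nn

t = torch.zeros(2)
t64 = t.to(torch.float64)   # Tensor.to() returns a new tensor on conversion
print(t64 is t)             # False

m = nn.Linear(2, 2)
m64 = m.to(torch.float64)   # Module.to() converts in place and returns self
print(m64 is m)             # True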
