diff --git a/auto_round/compressors/shard_writer.py b/auto_round/compressors/shard_writer.py index 130a4b42b..3e0f63c47 100644 --- a/auto_round/compressors/shard_writer.py +++ b/auto_round/compressors/shard_writer.py @@ -142,6 +142,11 @@ def _flush_shard(self): if self.use_safetensors: from safetensors.torch import save_file + # Ensure tensors are contiguous in-place to avoid duplicating them in a separate dict, + # which can increase peak RAM usage during saving. + for k, v in list(self.current_shard_tensors.items()): + if isinstance(v, torch.Tensor) and not v.is_contiguous(): + self.current_shard_tensors[k] = v.contiguous() save_file(self.current_shard_tensors, tmp_path) else: torch.save(self.current_shard_tensors, tmp_path) diff --git a/test/test_cuda/requirements_vllm.txt b/test/test_cuda/requirements_vllm.txt index fbedb5bf7..b92578c9c 100644 --- a/test/test_cuda/requirements_vllm.txt +++ b/test/test_cuda/requirements_vllm.txt @@ -1,2 +1,3 @@ vllm lm_eval >= 0.4.10 +ray