From 5ebcef3eb05ef9dfe61146bdfb55dbcc51975231 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 15:06:06 +0800 Subject: [PATCH 1/6] stop torch compile for fp8 on A100 Signed-off-by: Xin He --- test/test_cpu/integrations/test_llmc_integration.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_cpu/integrations/test_llmc_integration.py b/test/test_cpu/integrations/test_llmc_integration.py index 23bd55d07..48a66ba75 100644 --- a/test/test_cpu/integrations/test_llmc_integration.py +++ b/test/test_cpu/integrations/test_llmc_integration.py @@ -54,6 +54,7 @@ w8a8_dynamic_recipe_modifier = AutoRoundModifier( ignore=["lm_head"], iters=0, + enable_torch_compile=False, config_groups={ "group_0": QuantizationScheme( targets=["Linear"], @@ -66,6 +67,7 @@ w8a8_static_recipe_modifier = AutoRoundModifier( ignore=["lm_head"], iters=0, + enable_torch_compile=False, config_groups={ "group_0": QuantizationScheme( targets=["Linear"], @@ -192,9 +194,9 @@ def test_oneshot_with_device_ids(tiny_tiny_llama_model_path, tmp_path): "recipe", [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier], ) -def test_rtn_oneshot(recipe, tmp_path): +def test_rtn_oneshot(recipe, tmp_path, tiny_tiny_llama_model_path): output = tmp_path / "oneshot_output" - model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + model = tiny_tiny_llama_model_path tokenizer = AutoTokenizer.from_pretrained(model) dataset = get_dataset( tokenizer=tokenizer, From 9e2c4034d1b047bbc42e912ccebb622943585c8b Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 19:51:00 +0800 Subject: [PATCH 2/6] fix test_auto_gptq.py Signed-off-by: Xin He --- auto_round/export/export_to_autogptq/export.py | 2 +- test/test_cuda/export/test_auto_gptq_format.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index acfed7772..ae8c12d81 100644 --- 
a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -184,7 +184,7 @@ def pack_layer(name, model, backend, device=None): if param_count == 2: qlayer.pack(layer, scale, device) else: - qlayer.pack(layer, scale, zero, None, device) + qlayer.pack(layer, scale, zero.float(), None, device) qlayer.to(orig_device) set_module(model, name, qlayer) # Note: release weight and bias explicitly, in case they are referenced elsewhere diff --git a/test/test_cuda/export/test_auto_gptq_format.py b/test/test_cuda/export/test_auto_gptq_format.py index 2e98ad71b..474e058d0 100644 --- a/test/test_cuda/export/test_auto_gptq_format.py +++ b/test/test_cuda/export/test_auto_gptq_format.py @@ -9,7 +9,7 @@ from auto_round import AutoRound -from ...envs import require_optimum +from ...envs import require_gptqmodel from ...helpers import eval_generated_prompt, get_model_path, get_tiny_model, transformers_version @@ -33,7 +33,7 @@ def _save_dir(self, tmp_path): yield shutil.rmtree(self.save_dir, ignore_errors=True) - @require_optimum + @require_gptqmodel def test_autogptq_format(self, tiny_opt_model_path): bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -52,6 +52,7 @@ def test_autogptq_format(self, tiny_opt_model_path): assert model is not None, "Loaded model should not be None." 
@pytest.mark.skip_ci(reason="Only tiny model is suggested") # skip this test in CI + @require_gptqmodel def test_autogptq_format_qsave_ignore_layers(self): model = AutoModelForCausalLM.from_pretrained(get_model_path("facebook/opt-125m")) From 102ca966a424ede1519d6b5c5e83cdde02b9dfd5 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 19:52:17 +0800 Subject: [PATCH 3/6] fix gguf Signed-off-by: Xin He --- test/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 5a500ed3d..f3761d66b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -18,7 +18,7 @@ try: Version(_gguf_mod.__version__) except Exception: - _gguf_mod.__version__ = "0.0.0" + _gguf_mod.__version__ = "0.10.0" except ImportError: pass From b9c839d6cfd9ae1ff4c9c790c9a84cb1ca7e83ff Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 19:52:26 +0800 Subject: [PATCH 4/6] fix auto-scheme Signed-off-by: Xin He --- test/test_cuda/algorithms/test_auto_scheme.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_cuda/algorithms/test_auto_scheme.py b/test/test_cuda/algorithms/test_auto_scheme.py index 133284bfc..526a33a31 100644 --- a/test/test_cuda/algorithms/test_auto_scheme.py +++ b/test/test_cuda/algorithms/test_auto_scheme.py @@ -263,7 +263,7 @@ def test_auto_scheme_export(self): model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16")) ar = AutoRound(model=model_name, scheme=scheme) - ar.quantize() + ar.quantize_and_save(output_dir=self.save_dir) evaluate_accuracy(self.save_dir, threshold=0.25) @pytest.mark.skip_ci(reason="The evaluation is time-consuming") @@ -271,5 +271,5 @@ def test_enable_torch_compile(self): model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True) - ar.quantize() + 
ar.quantize_and_save(output_dir=self.save_dir) evaluate_accuracy(self.save_dir, threshold=0.10) From 65d762fc52844d68f231833ac543f957d3150379 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 20:39:31 +0800 Subject: [PATCH 5/6] skip phi3.5 vision because it is no longer maintained. Signed-off-by: Xin He --- test/test_cuda/models/test_support_vlms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py index e927a3d64..58de4e072 100644 --- a/test/test_cuda/models/test_support_vlms.py +++ b/test/test_cuda/models/test_support_vlms.py @@ -84,6 +84,7 @@ def test_qwen2(self): shutil.rmtree(quantized_model_path, ignore_errors=True) @require_vlm_env + # This model is not updated for new transformers. https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39 @require_package_version_ut("transformers", "<4.54.0") def test_phi3(self): model_path = get_model_path("microsoft/Phi-3.5-vision-instruct") @@ -135,6 +136,8 @@ def test_phi3(self): shutil.rmtree(quantized_model_path, ignore_errors=True) @require_vlm_env + # This model is not updated for new transformers. 
https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39 + @require_package_version_ut("transformers", "<4.54.0") def test_phi3_vision_awq(self): model_path = get_model_path("microsoft/Phi-3.5-vision-instruct") ## test tune From c75686c9cf7eea631f221c585713bcda5254108e Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 24 Mar 2026 09:56:13 +0800 Subject: [PATCH 6/6] fix bug and optimize log Signed-off-by: Xin He --- auto_round/export/export_to_autogptq/export.py | 4 +++- auto_round/utils/missing_tensors.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index ae8c12d81..98fde7b73 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -179,12 +179,14 @@ def pack_layer(name, model, backend, device=None): zero = int(zero.flatten()[0]) else: layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero + if isinstance(zero, torch.Tensor) and zero.dtype == torch.bfloat16: + zero = zero.float() sig = inspect.signature(qlayer.pack) param_count = len(sig.parameters) if param_count == 2: qlayer.pack(layer, scale, device) else: - qlayer.pack(layer, scale, zero.float(), None, device) + qlayer.pack(layer, scale, zero, None, device) qlayer.to(orig_device) set_module(model, name, qlayer) # Note: release weight and bias explicitly, in case they are referenced elsewhere diff --git a/auto_round/utils/missing_tensors.py b/auto_round/utils/missing_tensors.py index 481bf7068..bc05dbe01 100644 --- a/auto_round/utils/missing_tensors.py +++ b/auto_round/utils/missing_tensors.py @@ -235,8 +235,7 @@ def _is_truly_missing(name: str) -> bool: parent_summary = compress_layer_names(list({name.rsplit(".", 1)[0] for name in missing_tensor_names})) logger.info( f"Found {len(missing_tensor_names)} tensor(s) in the source checkpoint that are " - f"absent from the saved output (e.g., MTP parameters). 
Copying them now...\n" - f" Layers: {parent_summary}" + f"absent from the saved output (e.g., MTP parameters): {parent_summary}. Copying them now...\n" ) # ------------------------------------------------------------------ #