From 5ebcef3eb05ef9dfe61146bdfb55dbcc51975231 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 15:06:06 +0800
Subject: [PATCH 1/6] stop torch compile for fp8 on A100

Signed-off-by: Xin He <xin3.he@intel.com>
---
 test/test_cpu/integrations/test_llmc_integration.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_cpu/integrations/test_llmc_integration.py b/test/test_cpu/integrations/test_llmc_integration.py
index 23bd55d07..48a66ba75 100644
--- a/test/test_cpu/integrations/test_llmc_integration.py
+++ b/test/test_cpu/integrations/test_llmc_integration.py
@@ -54,6 +54,7 @@
 w8a8_dynamic_recipe_modifier = AutoRoundModifier(
     ignore=["lm_head"],
     iters=0,
+    enable_torch_compile=False,
     config_groups={
         "group_0": QuantizationScheme(
             targets=["Linear"],
@@ -66,6 +67,7 @@
 w8a8_static_recipe_modifier = AutoRoundModifier(
     ignore=["lm_head"],
     iters=0,
+    enable_torch_compile=False,
     config_groups={
         "group_0": QuantizationScheme(
             targets=["Linear"],
@@ -192,9 +194,9 @@ def test_oneshot_with_device_ids(tiny_tiny_llama_model_path, tmp_path):
     "recipe",
     [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier],
 )
-def test_rtn_oneshot(recipe, tmp_path):
+def test_rtn_oneshot(recipe, tmp_path, tiny_tiny_llama_model_path):
     output = tmp_path / "oneshot_output"
-    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    model = tiny_tiny_llama_model_path
     tokenizer = AutoTokenizer.from_pretrained(model)
     dataset = get_dataset(
         tokenizer=tokenizer,

From 9e2c4034d1b047bbc42e912ccebb622943585c8b Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 19:51:00 +0800
Subject: [PATCH 2/6] fix test_auto_gptq.py

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/export/export_to_autogptq/export.py | 2 +-
 test/test_cuda/export/test_auto_gptq_format.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py
index acfed7772..ae8c12d81 100644
--- a/auto_round/export/export_to_autogptq/export.py
+++ b/auto_round/export/export_to_autogptq/export.py
@@ -184,7 +184,7 @@ def pack_layer(name, model, backend, device=None):
     if param_count == 2:
         qlayer.pack(layer, scale, device)
     else:
-        qlayer.pack(layer, scale, zero, None, device)
+        qlayer.pack(layer, scale, zero.float(), None, device)
     qlayer.to(orig_device)
     set_module(model, name, qlayer)
     # Note: release weight and bias explicitly, in case they are referenced elsewhere
diff --git a/test/test_cuda/export/test_auto_gptq_format.py b/test/test_cuda/export/test_auto_gptq_format.py
index 2e98ad71b..474e058d0 100644
--- a/test/test_cuda/export/test_auto_gptq_format.py
+++ b/test/test_cuda/export/test_auto_gptq_format.py
@@ -9,7 +9,7 @@
 
 from auto_round import AutoRound
 
-from ...envs import require_optimum
+from ...envs import require_gptqmodel
 from ...helpers import eval_generated_prompt, get_model_path, get_tiny_model, transformers_version
 
 
@@ -33,7 +33,7 @@ def _save_dir(self, tmp_path):
         yield
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
-    @require_optimum
+    @require_gptqmodel
     def test_autogptq_format(self, tiny_opt_model_path):
         bits, group_size, sym = 4, 128, False
         autoround = AutoRound(
@@ -52,6 +52,7 @@ def test_autogptq_format(self, tiny_opt_model_path):
         assert model is not None, "Loaded model should not be None."
 
     @pytest.mark.skip_ci(reason="Only tiny model is suggested")  # skip this test in CI
+    @require_gptqmodel
     def test_autogptq_format_qsave_ignore_layers(self):
         model = AutoModelForCausalLM.from_pretrained(get_model_path("facebook/opt-125m"))
 

From 102ca966a424ede1519d6b5c5e83cdde02b9dfd5 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 19:52:17 +0800
Subject: [PATCH 3/6] fix gguf

Signed-off-by: Xin He <xin3.he@intel.com>
---
 test/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conftest.py b/test/conftest.py
index 5a500ed3d..f3761d66b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -18,7 +18,7 @@
     try:
         Version(_gguf_mod.__version__)
     except Exception:
-        _gguf_mod.__version__ = "0.0.0"
+        _gguf_mod.__version__ = "0.10.0"
 except ImportError:
     pass
 

From b9c839d6cfd9ae1ff4c9c790c9a84cb1ca7e83ff Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 19:52:26 +0800
Subject: [PATCH 4/6] fix auto-scheme

Signed-off-by: Xin He <xin3.he@intel.com>
---
 test/test_cuda/algorithms/test_auto_scheme.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_cuda/algorithms/test_auto_scheme.py b/test/test_cuda/algorithms/test_auto_scheme.py
index 133284bfc..526a33a31 100644
--- a/test/test_cuda/algorithms/test_auto_scheme.py
+++ b/test/test_cuda/algorithms/test_auto_scheme.py
@@ -263,7 +263,7 @@ def test_auto_scheme_export(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16"))
         ar = AutoRound(model=model_name, scheme=scheme)
-        ar.quantize()
+        ar.quantize_and_save(output_dir=self.save_dir)
         evaluate_accuracy(self.save_dir, threshold=0.25)
 
     @pytest.mark.skip_ci(reason="The evaluation is time-consuming")
@@ -271,5 +271,5 @@ def test_enable_torch_compile(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True)
         ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True)
-        ar.quantize()
+        ar.quantize_and_save(output_dir=self.save_dir)
         evaluate_accuracy(self.save_dir, threshold=0.10)

From 65d762fc52844d68f231833ac543f957d3150379 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 20:39:31 +0800
Subject: [PATCH 5/6] skip phi3.5 vision tests because the model is no longer maintained

Signed-off-by: Xin He <xin3.he@intel.com>
---
 test/test_cuda/models/test_support_vlms.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py
index e927a3d64..58de4e072 100644
--- a/test/test_cuda/models/test_support_vlms.py
+++ b/test/test_cuda/models/test_support_vlms.py
@@ -84,6 +84,7 @@ def test_qwen2(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     @require_vlm_env
+    # This model has not been updated for newer transformers releases; see https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39
     @require_package_version_ut("transformers", "<4.54.0")
     def test_phi3(self):
         model_path = get_model_path("microsoft/Phi-3.5-vision-instruct")
@@ -135,6 +136,8 @@ def test_phi3(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     @require_vlm_env
+    # This model has not been updated for newer transformers releases; see https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39
+    @require_package_version_ut("transformers", "<4.54.0")
     def test_phi3_vision_awq(self):
         model_path = get_model_path("microsoft/Phi-3.5-vision-instruct")
         ## test tune

From c75686c9cf7eea631f221c585713bcda5254108e Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 24 Mar 2026 09:56:13 +0800
Subject: [PATCH 6/6] fix bug and optimize log

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/export/export_to_autogptq/export.py | 4 +++-
 auto_round/utils/missing_tensors.py            | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py
index ae8c12d81..98fde7b73 100644
--- a/auto_round/export/export_to_autogptq/export.py
+++ b/auto_round/export/export_to_autogptq/export.py
@@ -179,12 +179,14 @@ def pack_layer(name, model, backend, device=None):
             zero = int(zero.flatten()[0])
     else:
         layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero
+    if isinstance(zero, torch.Tensor) and zero.dtype == torch.bfloat16:
+        zero = zero.float()
     sig = inspect.signature(qlayer.pack)
     param_count = len(sig.parameters)
     if param_count == 2:
         qlayer.pack(layer, scale, device)
     else:
-        qlayer.pack(layer, scale, zero.float(), None, device)
+        qlayer.pack(layer, scale, zero, None, device)
     qlayer.to(orig_device)
     set_module(model, name, qlayer)
     # Note: release weight and bias explicitly, in case they are referenced elsewhere
diff --git a/auto_round/utils/missing_tensors.py b/auto_round/utils/missing_tensors.py
index 481bf7068..bc05dbe01 100644
--- a/auto_round/utils/missing_tensors.py
+++ b/auto_round/utils/missing_tensors.py
@@ -235,8 +235,7 @@ def _is_truly_missing(name: str) -> bool:
     parent_summary = compress_layer_names(list({name.rsplit(".", 1)[0] for name in missing_tensor_names}))
     logger.info(
         f"Found {len(missing_tensor_names)} tensor(s) in the source checkpoint that are "
-        f"absent from the saved output (e.g., MTP parameters). Copying them now...\n"
-        f"  Layers: {parent_summary}"
+        f"absent from the saved output (e.g., MTP parameters): {parent_summary}. Copying them now...\n"
     )
 
     # ------------------------------------------------------------------ #