2 changes: 2 additions & 0 deletions auto_round/export/export_to_autogptq/export.py

@@ -179,6 +179,8 @@ def pack_layer(name, model, backend, device=None):
         zero = int(zero.flatten()[0])
     else:
         layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero
+        if isinstance(zero, torch.Tensor) and zero.dtype == torch.bfloat16:
+            zero = zero.float()
     sig = inspect.signature(qlayer.pack)
     param_count = len(sig.parameters)
     if param_count == 2:
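Note on the cast above: zero-points produced while quantizing a bfloat16 model reach qlayer.pack() as bf16 tensors, and GPTQ-style pack routines commonly round-trip tensors through numpy, which has no bfloat16 dtype. A minimal sketch of the failure mode and the guard (standard torch/numpy behavior; that pack goes through numpy is an assumption about the kernel internals):

import torch

# A bf16 zero-point, as produced when quantizing a bfloat16 model.
zero = torch.tensor([8.0], dtype=torch.bfloat16)

try:
    zero.numpy()  # numpy has no bfloat16 dtype
except TypeError as e:
    print(e)  # "Got unsupported ScalarType BFloat16"

# The guard added in this PR: upcast to float32 first, after which
# numpy conversion and integer rounding behave as usual.
if isinstance(zero, torch.Tensor) and zero.dtype == torch.bfloat16:
    zero = zero.float()
print(zero.numpy())  # array([8.], dtype=float32)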
3 changes: 1 addition & 2 deletions auto_round/utils/missing_tensors.py

@@ -235,8 +235,7 @@ def _is_truly_missing(name: str) -> bool:
     parent_summary = compress_layer_names(list({name.rsplit(".", 1)[0] for name in missing_tensor_names}))
     logger.info(
         f"Found {len(missing_tensor_names)} tensor(s) in the source checkpoint that are "
-        f"absent from the saved output (e.g., MTP parameters). Copying them now...\n"
-        f" Layers: {parent_summary}"
+        f"absent from the saved output (e.g., MTP parameters): {parent_summary}. Copying them now...\n"
     )

     # ------------------------------------------------------------------ #
2 changes: 1 addition & 1 deletion test/conftest.py

@@ -18,7 +18,7 @@
     try:
         Version(_gguf_mod.__version__)
     except Exception:
-        _gguf_mod.__version__ = "0.0.0"
+        _gguf_mod.__version__ = "0.10.0"
 except ImportError:
     pass
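Note on the new fallback: when gguf ships without a parsable __version__, the shim above substitutes one so Version() parsing does not crash. Bumping the substitute from "0.0.0" to "0.10.0" matters because minimum-version gates compare against it; a "0.0.0" stub fails every gguf >= x.y check and silently skips those tests. A hypothetical gate illustrating the effect (the 0.10.0 threshold here is an assumption for illustration):

from packaging.version import Version
import gguf  # assumed installed, with the conftest shim applied

MIN_GGUF = Version("0.10.0")
if Version(gguf.__version__) < MIN_GGUF:
    print("skipping gguf tests")  # a "0.0.0" fallback always lands here
else:
    print("running gguf tests")   # the "0.10.0" fallback reaches here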
6 changes: 4 additions & 2 deletions test/test_cpu/integrations/test_llmc_integration.py

@@ -54,6 +54,7 @@
 w8a8_dynamic_recipe_modifier = AutoRoundModifier(
     ignore=["lm_head"],
     iters=0,
+    enable_torch_compile=False,
     config_groups={
         "group_0": QuantizationScheme(
             targets=["Linear"],
@@ -66,6 +67,7 @@
 w8a8_static_recipe_modifier = AutoRoundModifier(
     ignore=["lm_head"],
     iters=0,
+    enable_torch_compile=False,
     config_groups={
         "group_0": QuantizationScheme(
             targets=["Linear"],
@@ -192,9 +194,9 @@ def test_oneshot_with_device_ids(tiny_tiny_llama_model_path, tmp_path):
     "recipe",
     [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier],
 )
-def test_rtn_oneshot(recipe, tmp_path):
+def test_rtn_oneshot(recipe, tmp_path, tiny_tiny_llama_model_path):
     output = tmp_path / "oneshot_output"
-    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    model = tiny_tiny_llama_model_path
     tokenizer = AutoTokenizer.from_pretrained(model)
     dataset = get_dataset(
         tokenizer=tokenizer,
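Note on enable_torch_compile=False in the two recipes: disabling compilation in these CPU integration tests presumably avoids torch.compile's per-call warm-up cost, which dominates runtime on a tiny model and varies across torch versions. A self-contained illustration of that warm-up overhead (generic torch.compile usage, not AutoRound internals):

import time
import torch

def f(x):
    return torch.nn.functional.gelu(x) * 2

compiled = torch.compile(f)
x = torch.randn(4, 8)

t0 = time.perf_counter()
compiled(x)  # first call pays the compilation latency
t1 = time.perf_counter()
compiled(x)  # subsequent calls reuse the compiled kernel
t2 = time.perf_counter()
print(f"first call {t1 - t0:.3f}s, second call {t2 - t1:.6f}s")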
4 changes: 2 additions & 2 deletions test/test_cuda/algorithms/test_auto_scheme.py

@@ -263,13 +263,13 @@ def test_auto_scheme_export(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16"))
         ar = AutoRound(model=model_name, scheme=scheme)
-        ar.quantize()
+        ar.quantize_and_save(output_dir=self.save_dir)
         evaluate_accuracy(self.save_dir, threshold=0.25)

     @pytest.mark.skip_ci(reason="The evaluation is time-consuming")
     def test_enable_torch_compile(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True)
         ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True)
-        ar.quantize()
+        ar.quantize_and_save(output_dir=self.save_dir)
         evaluate_accuracy(self.save_dir, threshold=0.10)
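Note on the quantize() -> quantize_and_save() switch: evaluate_accuracy(self.save_dir, ...) loads an exported checkpoint from disk, but quantize() only transforms the model in memory, so the old tests evaluated a missing or stale directory. A condensed sketch of the two call patterns (arguments mirror the diff; constructor defaults are assumed):

from auto_round import AutoRound

ar = AutoRound(model="facebook/opt-125m", scheme="W4A16")

# In-memory only: nothing is written, so a later
# evaluate_accuracy(save_dir) has no checkpoint to load.
# ar.quantize()

# Quantize and export in one step; save_dir now holds a loadable model.
ar.quantize_and_save(output_dir="./tmp_autoround")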
5 changes: 3 additions & 2 deletions test/test_cuda/export/test_auto_gptq_format.py

@@ -9,7 +9,7 @@

 from auto_round import AutoRound

-from ...envs import require_optimum
+from ...envs import require_gptqmodel
 from ...helpers import eval_generated_prompt, get_model_path, get_tiny_model, transformers_version


@@ -33,7 +33,7 @@ def _save_dir(self, tmp_path):
         yield
         shutil.rmtree(self.save_dir, ignore_errors=True)

-    @require_optimum
+    @require_gptqmodel
     def test_autogptq_format(self, tiny_opt_model_path):
         bits, group_size, sym = 4, 128, False
         autoround = AutoRound(
@@ -52,6 +52,7 @@ def test_autogptq_format(self, tiny_opt_model_path):
         assert model is not None, "Loaded model should not be None."

     @pytest.mark.skip_ci(reason="Only tiny model is suggested")  # skip this test in CI
+    @require_gptqmodel
     def test_autogptq_format_qsave_ignore_layers(self):
         model = AutoModelForCausalLM.from_pretrained(get_model_path("facebook/opt-125m"))
3 changes: 3 additions & 0 deletions test/test_cuda/models/test_support_vlms.py

@@ -84,6 +84,7 @@ def test_qwen2(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)

     @require_vlm_env
     # This model is not updated for new transformers. https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39
+    @require_package_version_ut("transformers", "<4.54.0")
     def test_phi3(self):
         model_path = get_model_path("microsoft/Phi-3.5-vision-instruct")
@@ -135,6 +136,8 @@ def test_phi3(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)

     @require_vlm_env
+    # This model is not updated for new transformers. https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39
+    @require_package_version_ut("transformers", "<4.54.0")
     def test_phi3_vision_awq(self):
         model_path = get_model_path("microsoft/Phi-3.5-vision-instruct")
         ## test tune
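Note on the version pin: Phi-3.5-vision's remote code breaks on transformers >= 4.54.0 (see the linked discussion), so both tests are now gated. require_package_version_ut is the repo's own test helper; a sketch of how such a gate is typically implemented (this implementation is an assumption, not the repo's code):

import pytest
from importlib.metadata import PackageNotFoundError, version
from packaging.specifiers import SpecifierSet

def require_package_version_ut(package: str, spec: str):
    """Skip the test unless `package` is installed and satisfies `spec`."""
    try:
        ok = version(package) in SpecifierSet(spec)
    except PackageNotFoundError:
        ok = False
    return pytest.mark.skipif(not ok, reason=f"requires {package}{spec}")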