diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py
index acfed7772..98fde7b73 100644
--- a/auto_round/export/export_to_autogptq/export.py
+++ b/auto_round/export/export_to_autogptq/export.py
@@ -179,6 +179,8 @@ def pack_layer(name, model, backend, device=None):
         zero = int(zero.flatten()[0])
     else:
         layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero
+        if isinstance(zero, torch.Tensor) and zero.dtype == torch.bfloat16:
+            zero = zero.float()
     sig = inspect.signature(qlayer.pack)
     param_count = len(sig.parameters)
     if param_count == 2:
diff --git a/auto_round/utils/missing_tensors.py b/auto_round/utils/missing_tensors.py
index 481bf7068..bc05dbe01 100644
--- a/auto_round/utils/missing_tensors.py
+++ b/auto_round/utils/missing_tensors.py
@@ -235,8 +235,7 @@ def _is_truly_missing(name: str) -> bool:
     parent_summary = compress_layer_names(list({name.rsplit(".", 1)[0] for name in missing_tensor_names}))
     logger.info(
         f"Found {len(missing_tensor_names)} tensor(s) in the source checkpoint that are "
-        f"absent from the saved output (e.g., MTP parameters). Copying them now...\n"
-        f" Layers: {parent_summary}"
+        f"absent from the saved output (e.g., MTP parameters): {parent_summary}. Copying them now...\n"
     )
 
     # ------------------------------------------------------------------ #
diff --git a/test/conftest.py b/test/conftest.py
index 5a500ed3d..f3761d66b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -18,7 +18,7 @@
     try:
         Version(_gguf_mod.__version__)
     except Exception:
-        _gguf_mod.__version__ = "0.0.0"
+        _gguf_mod.__version__ = "0.10.0"
 except ImportError:
     pass
 
diff --git a/test/test_cpu/integrations/test_llmc_integration.py b/test/test_cpu/integrations/test_llmc_integration.py
index 23bd55d07..48a66ba75 100644
--- a/test/test_cpu/integrations/test_llmc_integration.py
+++ b/test/test_cpu/integrations/test_llmc_integration.py
@@ -54,6 +54,7 @@
 w8a8_dynamic_recipe_modifier = AutoRoundModifier(
     ignore=["lm_head"],
     iters=0,
+    enable_torch_compile=False,
     config_groups={
         "group_0": QuantizationScheme(
             targets=["Linear"],
@@ -66,6 +67,7 @@
 w8a8_static_recipe_modifier = AutoRoundModifier(
     ignore=["lm_head"],
     iters=0,
+    enable_torch_compile=False,
     config_groups={
         "group_0": QuantizationScheme(
             targets=["Linear"],
@@ -192,9 +194,9 @@ def test_oneshot_with_device_ids(tiny_tiny_llama_model_path, tmp_path):
     "recipe",
     [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier],
 )
-def test_rtn_oneshot(recipe, tmp_path):
+def test_rtn_oneshot(recipe, tmp_path, tiny_tiny_llama_model_path):
     output = tmp_path / "oneshot_output"
-    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    model = tiny_tiny_llama_model_path
     tokenizer = AutoTokenizer.from_pretrained(model)
     dataset = get_dataset(
         tokenizer=tokenizer,
diff --git a/test/test_cuda/algorithms/test_auto_scheme.py b/test/test_cuda/algorithms/test_auto_scheme.py
index 133284bfc..526a33a31 100644
--- a/test/test_cuda/algorithms/test_auto_scheme.py
+++ b/test/test_cuda/algorithms/test_auto_scheme.py
@@ -263,7 +263,7 @@ def test_auto_scheme_export(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16"))
         ar = AutoRound(model=model_name, scheme=scheme)
-        ar.quantize()
+        ar.quantize_and_save(output_dir=self.save_dir)
         evaluate_accuracy(self.save_dir, threshold=0.25)
 
     @pytest.mark.skip_ci(reason="The evaluation is time-consuming")
@@ -271,5 +271,5 @@ def test_enable_torch_compile(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True)
         ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True)
-        ar.quantize()
+        ar.quantize_and_save(output_dir=self.save_dir)
         evaluate_accuracy(self.save_dir, threshold=0.10)
diff --git a/test/test_cuda/export/test_auto_gptq_format.py b/test/test_cuda/export/test_auto_gptq_format.py
index 2e98ad71b..474e058d0 100644
--- a/test/test_cuda/export/test_auto_gptq_format.py
+++ b/test/test_cuda/export/test_auto_gptq_format.py
@@ -9,7 +9,7 @@
 
 from auto_round import AutoRound
 
-from ...envs import require_optimum
+from ...envs import require_gptqmodel
 from ...helpers import eval_generated_prompt, get_model_path, get_tiny_model, transformers_version
 
 
@@ -33,7 +33,7 @@ def _save_dir(self, tmp_path):
         yield
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
-    @require_optimum
+    @require_gptqmodel
     def test_autogptq_format(self, tiny_opt_model_path):
         bits, group_size, sym = 4, 128, False
         autoround = AutoRound(
@@ -52,6 +52,7 @@ def test_autogptq_format(self, tiny_opt_model_path):
         assert model is not None, "Loaded model should not be None."
 
     @pytest.mark.skip_ci(reason="Only tiny model is suggested")  # skip this test in CI
+    @require_gptqmodel
     def test_autogptq_format_qsave_ignore_layers(self):
         model = AutoModelForCausalLM.from_pretrained(get_model_path("facebook/opt-125m"))
 
diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py
index e927a3d64..58de4e072 100644
--- a/test/test_cuda/models/test_support_vlms.py
+++ b/test/test_cuda/models/test_support_vlms.py
@@ -84,6 +84,7 @@ def test_qwen2(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     @require_vlm_env
+    # This model has not been updated for newer transformers releases. https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39
     @require_package_version_ut("transformers", "<4.54.0")
     def test_phi3(self):
         model_path = get_model_path("microsoft/Phi-3.5-vision-instruct")
@@ -135,6 +136,8 @@ def test_phi3(self):
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     @require_vlm_env
+    # This model has not been updated for newer transformers releases. https://huggingface.co/microsoft/Phi-3.5-vision-instruct/discussions/39
+    @require_package_version_ut("transformers", "<4.54.0")
     def test_phi3_vision_awq(self):
         model_path = get_model_path("microsoft/Phi-3.5-vision-instruct")
         ## test tune
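
Note on the export.py hunk (editor's addition, not part of the patch): a minimal sketch of why the bfloat16 zero point is cast to float32 before qlayer.pack(). NumPy has no bfloat16 dtype, so any .numpy() conversion of the zero-point tensor downstream fails for bfloat16 inputs; whether pack() actually hits that path is an assumption here, since its implementation is not shown in this diff.

import torch

# Hypothetical zero point as quantization might produce it in bfloat16.
zero = torch.tensor([8.0, 7.0], dtype=torch.bfloat16)

# zero.numpy()  # would raise TypeError: Got unsupported ScalarType BFloat16

# The guard added in pack_layer(): promote bfloat16 zero points to float32.
if isinstance(zero, torch.Tensor) and zero.dtype == torch.bfloat16:
    zero = zero.float()

print(zero.numpy())  # array([8., 7.], dtype=float32) -- now convertible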