diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
index bbf0268c8b48..60ed30ea7da6 100644
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -20,6 +20,7 @@ from transformers import set_seed
 from transformers.testing_utils import (
     CaptureStderr,
+    cleanup,
     get_gpu_count,
     is_torch_available,
     require_gptq,
@@ -53,6 +54,8 @@

 @require_torch
 class CacheTest(unittest.TestCase):
+    """Cache tests that don't require loading models"""
+
     def test_dynamic_cache_retrocompatibility(self):
         """Tests that we can convert back and forth between the legacy cache format and DynamicCache"""
         legacy_cache = ()
@@ -173,120 +176,17 @@ def _random_kvs(config):
         self.assertTrue(cached_keys.shape == (1, 1, 10, 128))
         self.assertTrue(cached_values.shape == (1, 1, 10, 128))

-    def test_dynamic_cache_exportability(self):
-        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
-        model = model.eval()
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
-        prompt = "What is the best way to debug python script?"
-        inputs = tokenizer(prompt, return_tensors="pt")
-        attention_mask = inputs.attention_mask
-        input_ids = inputs.input_ids
-
-        past_key_values = DynamicCache()
-        ep = torch.export.export(
-            model,
-            (),
-            {
-                "input_ids": input_ids,
-                "attention_mask": attention_mask,
-                "past_key_values": past_key_values,
-                "use_cache": True,
-            },
-            strict=False,
-        )
-        res = ep.module()(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            use_cache=True,
-        )
-        self.assertTrue(len(res.past_key_values.key_cache) == model.config.num_hidden_layers)
-        self.assertEqual(2 * model.config.num_hidden_layers + 1, len(ep.graph_signature.output_specs))
-        self.assertEqual(
-            3,
-            len(
-                [
-                    x
-                    for x in ep.graph_signature.input_specs
-                    if x.kind == torch.export.graph_signature.InputKind.USER_INPUT
-                ]
-            ),
-        )
-        past_key_values_eager = DynamicCache()
-        res_eager = model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values_eager,
-            use_cache=True,
-        )
-        self.assertTrue(torch.allclose(res.logits, res_eager.logits))
-        for k1, k2 in zip(res.past_key_values.key_cache, res_eager.past_key_values.key_cache):
-            self.assertTrue(torch.allclose(k1, k2))
+
+@require_torch_accelerator
+class CacheIntegrationTest(unittest.TestCase):
+    """Cache tests that require loading models"""

-        for v1, v2 in zip(res.past_key_values.value_cache, res_eager.past_key_values.value_cache):
-            self.assertTrue(torch.allclose(v1, v2))
+    def tearDown(self):
+        # Some tests use large models, which might result in suboptimal torch re-allocation if we run multiple tests
+        # in a row
+        cleanup(torch_device, gc_collect=True)

     @slow
-    @require_read_token
-    def test_static_cache_exportability(self):
-        """
-        Tests that static cache works with `torch.export()`
-        """
-        if not is_torch_greater_or_equal("2.3"):
-            self.skipTest(reason="This test requires torch >= 2.3 to run.")
-
-        set_seed(0)
-        device = "cpu"
-        dtype = "bfloat16"
-        cache_implementation = "static"
-        attn_implementation = "sdpa"  # Export and ExecuTorch only works for SdpaAttention
-        batch_size = 1
-        max_cache_len = 1234
-        model = AutoModelForCausalLM.from_pretrained(
-            "google/gemma-2b",
-            device_map=device,
-            torch_dtype=dtype,
-            attn_implementation=attn_implementation,
-            generation_config=GenerationConfig(
-                use_cache=True,
-                cache_implementation=cache_implementation,
-                max_length=max_cache_len,
-                cache_config={
-                    "batch_size": batch_size,
-                    "max_cache_len": max_cache_len,
-                    "device": device,
-                },
-            ),
-        )
-        # Check if cache config is passed through correctly
-        self.assertEqual(model.generation_config.use_cache, True)
-        self.assertEqual(model.generation_config.cache_implementation, cache_implementation)
-        self.assertEqual(model.generation_config.max_length, max_cache_len)
-        self.assertTrue(model.generation_config.cache_config is not None)
-        self.assertEqual(model.generation_config.cache_config.batch_size, batch_size)
-        self.assertEqual(model.generation_config.cache_config.max_cache_len, max_cache_len)
-
-        exported_program = convert_and_export_with_cache(model)
-
-        # Check if the exported model is configured with the `StaticCache` correctly
-        n_static_key_caches = n_static_value_caches = 0
-        for buffer_name, buffer in exported_program.named_buffers():
-            if buffer_name.startswith("key_cache"):
-                self.assertTrue(buffer.shape[0] == batch_size)
-                self.assertTrue(buffer.shape[2] == max_cache_len)
-                n_static_key_caches = n_static_key_caches + 1
-            if buffer_name.startswith("value_cache"):
-                self.assertTrue(buffer.shape[0] == batch_size)
-                self.assertTrue(buffer.shape[2] == max_cache_len)
-                n_static_value_caches = n_static_value_caches + 1
-        self.assertEqual(n_static_key_caches, model.config.num_hidden_layers)
-        self.assertEqual(n_static_value_caches, model.config.num_hidden_layers)
-
-
-@require_torch_accelerator
-@slow
-class CacheIntegrationTest(unittest.TestCase):
     def test_dynamic_cache_hard(self):
         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
         model = AutoModelForCausalLM.from_pretrained(
@@ -316,6 +216,7 @@ def test_dynamic_cache_hard(self):
         )
         self.assertEqual(decoded[0], expected_text)

+    @slow
     def test_dynamic_cache_batched(self):
         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
         tokenizer.pad_token = tokenizer.eos_token
@@ -331,6 +232,7 @@ def test_dynamic_cache_batched(self):
         expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"]
         self.assertListEqual(decoded, expected_text)

+    @slow
     def test_dynamic_cache_beam_search(self):
         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
         model = AutoModelForCausalLM.from_pretrained(
@@ -352,6 +254,7 @@ def test_dynamic_cache_beam_search(self):
         ]
         self.assertListEqual(decoded, expected_text)

+    @slow
     def test_hybrid_cache_n_sequences(self):
         tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
         model = AutoModelForCausalLM.from_pretrained(
@@ -379,6 +282,7 @@

     @require_non_xpu
     @require_gptq
+    @slow
     def test_sink_cache_hard(self):
         tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ")
         model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto")
@@ -392,6 +296,7 @@ def test_sink_cache_hard(self):
         decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
         self.assertTrue(decoded[0].endswith("to perform a variety of tasks. The Transformer is a neural network"))

+    @slow
     def test_sink_cache_iterative_prompts(self):
         """Tests that SinkCache supports more than one new token at once, when shifting the cache"""
         tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
@@ -434,13 +339,14 @@ def test_sink_cache_iterative_prompts(self):
         )
         self.assertTrue(decoded[0].endswith(last_output))

-    @require_torch_gpu
     @parameterized.expand(
         [
             ("eager", "static"),
             ("sdpa", "static"),
         ]
     )
+    @require_torch_gpu
+    @slow
     def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_implementation):
         EXPECTED_GENERATION = [
             "The best color is the one that complements the skin tone of the",
@@ -479,44 +385,7 @@ def test_static_cache_greedy_decoding_pad_left(self, attn_implementation, cache_
         with self.subTest(f"{attn_implementation}, static, compiled"):
             self.assertListEqual(decoded, EXPECTED_GENERATION)

-    @require_torch_gpu
-    @parameterized.expand(
-        [
-            ("eager", "static"),
-            ("sdpa", "static"),
-        ]
-    )
-    def test_static_cache_greedy_decoding_pad_right(self, attn_implementation, cache_implementation):
-        EXPECTED_GENERATION = [
-            "The best color isЋ the one that complements the skin tone of",
-            "We should not undermind the issues at hand.\nWe should not undermind the issues",
-        ]
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            "NousResearch/Llama-2-7b-chat-hf", padding_side="right", pad_token="<s>"
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            "NousResearch/Llama-2-7b-chat-hf",
-            torch_dtype=torch.bfloat16,
-            attn_implementation=attn_implementation,
-        ).to(torch_device)
-        inputs = tokenizer(
-            ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt"
-        ).to(model.device)
-
-        set_seed(0)
-        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
-        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
-        with self.subTest(f"{attn_implementation}, dynamic"):
-            self.assertListEqual(decoded, EXPECTED_GENERATION)
-
-        set_seed(0)
-        model.generation_config.cache_implementation = cache_implementation
-        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
-        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
-        with self.subTest(f"{attn_implementation}, static, eager"):
-            self.assertListEqual(decoded, EXPECTED_GENERATION)
-
+    @slow
     def test_dynamic_cache_extra_left_padding(self):
         """Tests that adding extra left-padding does not affect the generation with the dynamic cache"""
         EXPECTED_GENERATION = [
@@ -551,12 +420,8 @@ def test_dynamic_cache_extra_left_padding(self):
         decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
         self.assertListEqual(decoded, EXPECTED_GENERATION)

-    @parameterized.expand(
-        [
-            "static",
-        ]
-    )
-    def test_static_cache_extra_left_padding(self, cache_implementation):
+    @slow
+    def test_static_cache_extra_left_padding(self):
         """Tests that adding extra left-padding does not affect the generation with the static cache"""
         EXPECTED_GENERATION = [
             "The best color is the one that complements the skin tone of the",
@@ -574,7 +439,7 @@ def test_static_cache_extra_left_padding(self):
             ["The best color is", "We should not undermind the issues at hand"], padding=True, return_tensors="pt"
         ).to(model.device)

-        model.generation_config.cache_implementation = cache_implementation
+        model.generation_config.cache_implementation = "static"

         gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
         decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
@@ -597,6 +462,7 @@ def test_static_cache_beam_search(self):
         pass

     @require_torch_accelerator
+    @slow
     def test_offloaded_cache_equivalent_to_dynamic_cache(self):
         """Tests that OffloadedCache produces the same result as the default DynamicCache"""
         model_name = "microsoft/Phi-3-mini-4k-instruct"
@@ -625,6 +491,7 @@ def test_offloaded_cache_equivalent_to_dynamic_cache(self):
         assert torch.all(original_output == offloaded_output).item()

     @require_torch_accelerator
+    @slow
     def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self):
         """Tests that OffloadedCache uses less memory than the default DynamicCache"""
         model_name = "microsoft/Phi-3-mini-4k-instruct"
@@ -664,6 +531,7 @@ def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self):
         assert offloaded_peak_memory < original_peak_memory

     @require_torch_gpu
+    @slow
     def test_cache_copy(self):
         model_name = "microsoft/Phi-3-mini-4k-instruct"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -745,6 +613,7 @@ def test_static_cache_no_cuda_graph_skips(self):
         self.assertEqual(cap.err, "")

     @require_torch_multi_gpu
+    @slow
     def test_static_cache_multi_gpu(self):
         """Regression test for #35164: static cache with multi-gpu"""

@@ -764,3 +633,118 @@
         inputs = tokenizer("Today is a beautiful day!", return_tensors="pt").to(0)
         _ = model(**inputs)
         _ = model.generate(**inputs, max_new_tokens=2, cache_implementation="hybrid")
+
+
+@require_torch
+class CacheExportIntegrationTest(unittest.TestCase):
+    """Cache tests that rely on `torch.export()` and model loading"""
+
+    def test_dynamic_cache_exportability(self):
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
+        model = model.eval()
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
+        prompt = "What is the best way to debug python script?"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+        past_key_values = DynamicCache()
+        ep = torch.export.export(
+            model,
+            (),
+            {
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
+                "past_key_values": past_key_values,
+                "use_cache": True,
+            },
+            strict=False,
+        )
+        res = ep.module()(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=True,
+        )
+        self.assertTrue(len(res.past_key_values.key_cache) == model.config.num_hidden_layers)
+        self.assertEqual(2 * model.config.num_hidden_layers + 1, len(ep.graph_signature.output_specs))
+        self.assertEqual(
+            3,
+            len(
+                [
+                    x
+                    for x in ep.graph_signature.input_specs
+                    if x.kind == torch.export.graph_signature.InputKind.USER_INPUT
+                ]
+            ),
+        )
+
+        past_key_values_eager = DynamicCache()
+        res_eager = model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values_eager,
+            use_cache=True,
+        )
+        self.assertTrue(torch.allclose(res.logits, res_eager.logits))
+        for k1, k2 in zip(res.past_key_values.key_cache, res_eager.past_key_values.key_cache):
+            self.assertTrue(torch.allclose(k1, k2))
+
+        for v1, v2 in zip(res.past_key_values.value_cache, res_eager.past_key_values.value_cache):
+            self.assertTrue(torch.allclose(v1, v2))
+
+    @slow
+    @require_read_token
+    def test_static_cache_exportability(self):
+        """
+        Tests that static cache works with `torch.export()`
+        """
+        if not is_torch_greater_or_equal("2.3"):
+            self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
+        set_seed(0)
+        device = "cpu"
+        dtype = "bfloat16"
+        cache_implementation = "static"
+        attn_implementation = "sdpa"  # Export and ExecuTorch only works for SdpaAttention
+        batch_size = 1
+        max_cache_len = 1234
+        model = AutoModelForCausalLM.from_pretrained(
+            "google/gemma-2b",
+            device_map=device,
+            torch_dtype=dtype,
+            attn_implementation=attn_implementation,
+            generation_config=GenerationConfig(
+                use_cache=True,
+                cache_implementation=cache_implementation,
+                max_length=max_cache_len,
+                cache_config={
+                    "batch_size": batch_size,
+                    "max_cache_len": max_cache_len,
+                    "device": device,
+                },
+            ),
+        )
+        # Check if cache config is passed through correctly
+        self.assertEqual(model.generation_config.use_cache, True)
+        self.assertEqual(model.generation_config.cache_implementation, cache_implementation)
+        self.assertEqual(model.generation_config.max_length, max_cache_len)
+        self.assertTrue(model.generation_config.cache_config is not None)
+        self.assertEqual(model.generation_config.cache_config.batch_size, batch_size)
+        self.assertEqual(model.generation_config.cache_config.max_cache_len, max_cache_len)
+
+        exported_program = convert_and_export_with_cache(model)
+
+        # Check if the exported model is configured with the `StaticCache` correctly
+        n_static_key_caches = n_static_value_caches = 0
+        for buffer_name, buffer in exported_program.named_buffers():
+            if buffer_name.startswith("key_cache"):
+                self.assertTrue(buffer.shape[0] == batch_size)
+                self.assertTrue(buffer.shape[2] == max_cache_len)
+                n_static_key_caches = n_static_key_caches + 1
+            if buffer_name.startswith("value_cache"):
+                self.assertTrue(buffer.shape[0] == batch_size)
+                self.assertTrue(buffer.shape[2] == max_cache_len)
+                n_static_value_caches = n_static_value_caches + 1
+        self.assertEqual(n_static_key_caches, model.config.num_hidden_layers)
+        self.assertEqual(n_static_value_caches, model.config.num_hidden_layers)