81 changes: 38 additions & 43 deletions tests/models/qwen2_moe/test_modeling_qwen2_moe.py
@@ -13,18 +13,18 @@
 # limitations under the License.
 """Testing suite for the PyTorch Qwen2MoE model."""

-import gc
 import unittest

 import pytest

 from transformers import AutoTokenizer, Qwen2MoeConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
-    backend_empty_cache,
-    require_bitsandbytes,
+    cleanup,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
+    run_first,
+    run_test_using_subprocess,
     slow,
     torch_device,
 )
@@ -145,54 +145,67 @@ def test_load_balancing_loss(self):

 @require_torch
 class Qwen2MoeIntegrationTest(unittest.TestCase):
+    model = None
+
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = Qwen2MoeForCausalLM.from_pretrained(
+                "Qwen/Qwen1.5-MoE-A2.7B", device_map="auto", dtype=torch.float16
+            )
+        return cls.model
+
+    @classmethod
+    def tearDownClass(cls):
+        if cls.model is not None:
+            del cls.model
+        cleanup(torch_device, gc_collect=True)
+
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
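
The class-level `get_model` cache loads the checkpoint once per test class instead of once per test, and `cleanup` frees accelerator memory between tests. A minimal sketch of what a helper along the lines of `transformers.testing_utils.cleanup` is assumed to do (the real implementation may differ):

```python
# Hedged sketch of a cleanup helper in the spirit of
# transformers.testing_utils.cleanup; signature and behavior are assumptions.
import gc

import torch


def cleanup(device: str, gc_collect: bool = False) -> None:
    if gc_collect:
        gc.collect()  # drop unreachable Python objects first
    if device.startswith("cuda") and torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached CUDA blocks to the driver
```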

     @slow
     def test_model_a2_7b_logits(self):
         input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]
-        model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", device_map="auto")
+        model = self.get_model()
         input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
         with torch.no_grad():
             out = model(input_ids).logits.float().cpu()
         # Expected mean on dim = -1
-        EXPECTED_MEAN = torch.tensor([[-4.2125, -3.6416, -4.9136, -4.3005, -4.9938, -3.4393, -3.5195, -4.1621]])
+        EXPECTED_MEAN = torch.tensor([[-4.2106, -3.6411, -4.9111, -4.2840, -4.9950, -3.4438, -3.5262, -4.1624]])
Collaborator (author) commented: changed because now we use fp16 (previously fp32).

         torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, rtol=1e-2, atol=1e-2)
-        # slicing logits[0, 0, 0:30]
-        EXPECTED_SLICE = torch.tensor([2.3013, -0.6595, -0.1389, -1.4095, -1.7381, -1.7609, -2.0449, -2.4289, -3.0271, -2.1351, -0.6568, -4.6012, -1.9102, -0.7475, -3.1377, 4.6904, 7.1936, 7.0991, 6.4414, 6.1720, 6.2617, 5.8751, 5.6997, 5.6011, 5.5828, -3.9505, -0.5384, -0.3392, 1.2445, 2.0714]) # fmt: skip
-        print(out[0, 0, :30])
-        torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, rtol=1e-4, atol=1e-4)
-
-        del model
-        backend_empty_cache(torch_device)
-        gc.collect()
+        # slicing logits[0, 0, 0:10]
+        EXPECTED_SLICE = torch.tensor([2.3008, -0.6777, -0.1287, -1.4043, -1.7393, -1.7627, -2.0547, -2.4414, -3.0332, -2.1406]) # fmt: skip
+        torch.testing.assert_close(out[0, 0, :10], EXPECTED_SLICE, rtol=1e-4, atol=1e-4)
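
With `device_map="auto"`, layers can be sharded across devices, which is why the test moves its inputs to the device that holds the embedding table rather than assuming a single device. A self-contained toy version of that pattern (the `Dummy` module is illustrative, not part of the test):

```python
# Toy illustration: send input ids to wherever the embedding weights live.
import torch
import torch.nn as nn


class Dummy(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(32, 8)  # stand-in for the model's embed_tokens


model = Dummy()  # with device_map="auto", this layer could sit on any device
input_ids = torch.tensor([[1, 3, 5]]).to(model.embed_tokens.weight.device)
print(input_ids.device)  # matches the embedding table's device
```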

     @slow
     def test_model_a2_7b_generation(self):
         EXPECTED_TEXT_COMPLETION = """To be or not to be, that is the question. This is the question that has been asked by many people over the"""
         prompt = "To be or not to"
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False)
-        model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", device_map="auto")
+        model = self.get_model()
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)

         # greedy generation outputs
         generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0)
         text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

-        del model
-        backend_empty_cache(torch_device)
-        gc.collect()
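
The generation test is the usual encode, greedy generate, decode round trip. A hedged sketch of that loop, assuming `model` and `tokenizer` are already loaded as in the test (with `do_sample` left at its default of `False`, generation is deterministic):

```python
# Sketch of the greedy-generation round trip used above; assumes `model`
# and `tokenizer` are already loaded as in the test.
prompt = "To be or not to"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(input_ids, max_new_tokens=20)  # greedy by default
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(text)
```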

-    @require_bitsandbytes
+    # Run this test first within this class, and in a separate process
+    # (to avoid potential CPU memory issues caused by `device_map="auto"`).
+    @run_first
+    @run_test_using_subprocess
     @slow
     @require_flash_attn
     @pytest.mark.flash_attn_test
-    def test_model_a2_7b_long_prompt(self):
+    def test_model_a2_7b_long_prompt_flash_attn(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
         # An input with 4097 tokens that is above the size of the sliding window
         input_ids = [1] + [306, 338] * 2048
         model = Qwen2MoeForCausalLM.from_pretrained(
             "Qwen/Qwen1.5-MoE-A2.7B",
             device_map="auto",
-            load_in_4bit=True,
+            dtype=torch.float16,
             attn_implementation="flash_attention_2",
         )
         input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
@@ -206,21 +219,12 @@ def test_model_a2_7b_long_prompt(self):
         generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
Member commented: another test where assistant_model is not actually used 😄

         self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())

-        del assistant_model
-        del model
-        backend_empty_cache(torch_device)
-        gc.collect()
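
The long-prompt input is built to be exactly one token longer than a 4096-token sliding window, which is the case the test wants to exercise:

```python
# The prompt construction used above: one leading token plus 2048 repeats of
# a two-token pair gives 4097 tokens, one past a 4096-token sliding window.
input_ids = [1] + [306, 338] * 2048
assert len(input_ids) == 4097
```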

     @slow
     def test_model_a2_7b_long_prompt_sdpa(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
         # An input with 4097 tokens that is above the size of the sliding window
         input_ids = [1] + [306, 338] * 2048
-        model = Qwen2MoeForCausalLM.from_pretrained(
-            "Qwen/Qwen1.5-MoE-A2.7B",
-            device_map="auto",
-            attn_implementation="sdpa",
-        )
+        model = self.get_model()
         input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
         generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
         self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
@@ -232,10 +236,7 @@ def test_model_a2_7b_long_prompt_sdpa(self):
         generated_ids = assistant_model.generate(input_ids, max_new_tokens=4, temperature=0)
         self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())

-        del assistant_model
-
-        backend_empty_cache(torch_device)
-        gc.collect()
+        cleanup(torch_device, gc_collect=True)

         EXPECTED_TEXT_COMPLETION = """To be or not to be, that is the question. This is the question that has been asked by many people over the"""
         prompt = "To be or not to"
@@ -251,14 +252,12 @@ def test_model_a2_7b_long_prompt_sdpa(self):
     @slow
     def test_speculative_generation(self):
         EXPECTED_TEXT_COMPLETION = (
-            "To be or not to be, that is the question.\nThe answer is to be, of course. But what does it"
+            "To be or not to be, that is the question. Whether 'tis nobler in the mind to suffer the sl"
Collaborator (author) commented: the previous expected value never passed.

         )
         prompt = "To be or not to"
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False)
         model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", device_map="auto", dtype=torch.float16)
-        assistant_model = Qwen2MoeForCausalLM.from_pretrained(
-            "Qwen/Qwen1.5-MoE-A2.7B", device_map="auto", dtype=torch.float16
-        )
+        assistant_model = model
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)

         # greedy generation outputs
@@ -268,7 +267,3 @@ def test_speculative_generation(self):
         )
         text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
-
-        del model
-        backend_empty_cache(torch_device)
-        gc.collect()
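
For reference, assisted (speculative) generation in transformers is driven by passing `assistant_model` to `generate`; after this PR the test reuses the target model as its own assistant, which exercises the assisted-decoding path without loading a second checkpoint. A hedged sketch, assuming `model` and `tokenizer` are loaded as in the test (the exact generation arguments are elided in the diff above, so these values are illustrative):

```python
# Hedged sketch of an assisted-generation call; argument values are
# illustrative, not the exact ones used in the (elided) test body.
input_ids = tokenizer.encode("To be or not to", return_tensors="pt").to(model.device)
generated_ids = model.generate(
    input_ids,
    max_new_tokens=20,
    assistant_model=model,  # the model drafts candidate tokens for itself here
)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
```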