Merged
6 changes: 3 additions & 3 deletions src/transformers/models/blt/modeling_blt.py
@@ -670,7 +670,7 @@ def forward(
attention_mask=encoder_attention_mask,
**kwargs,
)
-patch_embeds = patch_embeds + cross_attention_output
+patch_embeds = patch_embeds + cross_attention_output.to(patch_embeds.device)
encoder_cross_states = patch_embeds
return hidden_states, encoder_cross_states

@@ -1228,7 +1228,7 @@ def forward(
else:
batch_size, sequence_length = input_ids.shape
encoder_embeds = compute_hash_embeddings(
-input_ids,
+input_ids.to(self.local_encoder.embed_tokens.weight.device),
self.local_encoder,
self.encoder_hash_tok_embedding,
self.config.encoder_hash_byte_group_nb_functions,
@@ -1241,7 +1241,7 @@ def forward(
if input_ids is None:
raise ValueError("input_ids is required for entropy-based patching")
_, patch_lengths, _ = self.patcher(
-input_ids,
+input_ids.to(self.patcher.embed_tokens.weight.device),
Collaborator:

it's not clear why self.patcher doesn't handle the input device well.

Collaborator:

Likely the BltPatcher itself is a BltPreTrainedModel.

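As context for the thread above, here is a minimal, self-contained sketch of the failure mode these `.to(...)` calls guard against. `TinyPatcher` is a hypothetical stand-in, not the real `BltPatcher`: when `device_map="auto"` places a submodule's weights on a different device from the caller-supplied `input_ids`, the embedding lookup fails unless the inputs are moved first.

```python
import torch
import torch.nn as nn

class TinyPatcher(nn.Module):  # hypothetical stand-in for BltPatcher
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(256, 8)

    def forward(self, input_ids):
        # Mirrors the PR fix: route inputs to wherever the embedding weights live.
        input_ids = input_ids.to(self.embed_tokens.weight.device)
        return self.embed_tokens(input_ids)

patcher = TinyPatcher()
if torch.cuda.is_available():
    # Pretend device_map="auto" placed this submodule on a GPU.
    patcher.to("cuda:0")

ids = torch.tensor([[1, 42, 21, 12]])  # stays on CPU, like caller-supplied input_ids
out = patcher(ids)  # succeeds because forward relocates ids; without the .to()
                    # the lookup would raise a cross-device RuntimeError
```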
patch_size=self.config.patch_size,
threshold=self.config.patching_threshold,
max_patch_length=self.config.max_patch_length,
6 changes: 3 additions & 3 deletions src/transformers/models/blt/modular_blt.py
@@ -604,7 +604,7 @@ def forward(
attention_mask=encoder_attention_mask,
**kwargs,
)
-patch_embeds = patch_embeds + cross_attention_output
+patch_embeds = patch_embeds + cross_attention_output.to(patch_embeds.device)
encoder_cross_states = patch_embeds
return hidden_states, encoder_cross_states

@@ -955,7 +955,7 @@ def forward(
else:
batch_size, sequence_length = input_ids.shape
encoder_embeds = compute_hash_embeddings(
-input_ids,
+input_ids.to(self.local_encoder.embed_tokens.weight.device),
self.local_encoder,
self.encoder_hash_tok_embedding,
self.config.encoder_hash_byte_group_nb_functions,
@@ -968,7 +968,7 @@ def forward(
if input_ids is None:
raise ValueError("input_ids is required for entropy-based patching")
_, patch_lengths, _ = self.patcher(
-input_ids,
+input_ids.to(self.patcher.embed_tokens.weight.device),
patch_size=self.config.patch_size,
threshold=self.config.patching_threshold,
max_patch_length=self.config.max_patch_length,
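(The modular_blt.py hunks mirror modeling_blt.py because, in transformers' modular system, modeling_blt.py is auto-generated from modular_blt.py, so both copies must stay in sync.)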
216 changes: 72 additions & 144 deletions tests/models/blt/test_modeling_blt.py
@@ -20,6 +20,7 @@

from transformers import AutoTokenizer, is_torch_available
from transformers.testing_utils import (
+Expectations,
cleanup,
require_torch,
require_torch_accelerator,
@@ -184,6 +185,10 @@ class BltModelTest(CausalLMModelTest, unittest.TestCase):
def test_generate_from_inputs_embeds(self, _, num_beams):
pass

+@pytest.mark.generate
+def test_generate_with_quant_cache(self):
+    self.skipTest("BLT uses EncoderDecoderCache internally and does not support quantized cache")
+
@pytest.mark.generate
@unittest.skip(
"Blt requires real token IDs for its hash-based embedding computation, making inputs_embeds generation incompatible with identical outputs"
@@ -260,74 +265,25 @@ def test_model(self):

@slow
def test_model_logits(self):
-EXPECTED_OUTPUT = torch.tensor(
-    [
-        [-10.4948, -10.7065, -6.1813, -10.5545, -10.3428, -9.1493, -8.4937, -8.6382, -9.2159, -9.5907, -9.3679, -8.4184, -9.0655, -3.4436, 2.9616, -10.3157, -6.3723, -6.0133, -9.7100, -9.2128, -8.8064, -9.8179, -9.7516, -9.4681, -9.7715, -9.4897, -9.0491, -9.8098, -9.4648, -9.3294],
-        [-13.3010, -13.1910, -5.7230, -13.2895, -13.4864, -8.7140, -7.0275, -7.0182, -10.1362, -10.3762, -9.9086, -7.8049, -8.8660, -5.2711, -3.5778, -12.5346, -9.1609, -6.7925, -10.3717, -9.2650, -10.6393, -11.4807, -11.2128, -10.9615, -10.5806, -10.8873, -11.0651, -11.3471, -10.5437, -9.9688],
-    ]
-).to(torch_device)
+# fmt: off
+EXPECTED_OUTPUT = Expectations(
+    {
+        (None, None): torch.tensor(
+            [
+                [-10.5000, -10.6875, -6.2500, -10.5625, -10.3125, -9.1875, -8.5000, -8.5625, -9.1875, -9.6250, -9.3750, -8.5000, -9.1250, -3.3906, 2.9688, -10.3125, -6.4688, -6.0312, -9.7500, -9.1875, -8.8125, -9.8750, -9.8125, -9.5000, -9.8125, -9.5000, -9.0625, -9.8125, -9.5000, -9.3750],
+                [-13.2500, -13.1250, -5.6875, -13.1875, -13.3750, -8.6875, -6.9688, -6.9375, -10.0625, -10.3125, -9.8125, -7.7188, -8.8125, -5.2188, -3.5000, -12.4375, -9.0625, -6.6250, -10.3125, -9.1875, -10.6250, -11.4375, -11.1250, -10.8750, -10.5000, -10.8750, -11.0000, -11.3125, -10.5000, -9.8750],
+            ]
+        ),
+        ("xpu", None): torch.tensor(
+            [
+                [-10.4375, -10.6875, -6.1875, -10.5000, -10.3125, -9.1250, -8.4375, -8.6250, -9.1875, -9.5625, -9.3125, -8.4375, -9.0625, -3.4375, 2.9531, -10.2500, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.7500, -9.4375, -9.0000, -9.8125, -9.4375, -9.3125],
+                [-13.3125, -13.2500, -5.5938, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1875, -10.3750, -9.9375, -7.8438, -8.8750, -5.3438, -3.5938, -12.5625, -9.2500, -6.8125, -10.3750, -9.3125, -10.6875, -11.5625, -11.3125, -11.0000, -10.6250, -10.9375, -11.0625, -11.3750, -10.5625, -10.0000],
+            ]
+        ),
+    }
+).get_expectation()
+EXPECTED_OUTPUT = EXPECTED_OUTPUT.to(torch_device)
+# fmt: on

input_ids = [1, 42, 21, 12, 43, 23, 1, 4]

@@ -336,14 +292,21 @@ def test_model_logits(self):
with torch.no_grad():
output = model(torch.tensor([input_ids]).to(torch_device))[0]

-torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30], rtol=1e-4, atol=1e-4)
+torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30].to(torch_device), rtol=1e-3, atol=1e-3)

@slow
@require_torch_bf16
def test_model_bf16(self):
"""Test Blt model with bfloat16 precision."""
NUM_TOKENS_TO_GENERATE = 200
EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m"
# fmt: off
EXPECTED_TEXT = Expectations(
{
(None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
}
)
# fmt: on

prompt = "my name is"

@@ -360,81 +323,32 @@ def test_model_logits_bf16(self):
)

output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-self.assertEqual(output_text, EXPECTED_TEXT)
+self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())

@slow
@require_torch_bf16
def test_model_logits_bf16(self):
"""Test Blt model logits with bfloat16 precision."""

-EXPECTED_OUTPUT = torch.tensor(
-    [
-        [-10.5000, -10.6875, -6.1875, -10.5625, -10.3125, -9.1875, -8.5000, -8.6875, -9.1875, -9.5625, -9.3750, -8.5000, -9.0625, -3.4219, 2.9531, -10.3125, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.8125, -9.5000, -9.0000, -9.8125, -9.4375, -9.3125],
-        [-13.2500, -13.1875, -5.6875, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1250, -10.3750, -9.8750, -7.8438, -8.8750, -5.2812, -3.5625, -12.5000, -9.1875, -6.8125, -10.3750, -9.3125, -10.6250, -11.5000, -11.2500, -11.0000, -10.5625, -10.8750, -11.0625, -11.3750, -10.5625, -10.0000],
-    ]
-).to(torch_device)
+# fmt: off
+EXPECTED_OUTPUT = Expectations(
+    {
+        (None, None): torch.tensor(
+            [
+                [-10.5000, -10.6875, -6.2500, -10.5625, -10.3125, -9.1875, -8.5000, -8.5625, -9.1875, -9.6250, -9.3750, -8.5000, -9.1250, -3.3906, 2.9688, -10.3125, -6.4688, -6.0312, -9.7500, -9.1875, -8.8125, -9.8750, -9.8125, -9.5000, -9.8125, -9.5000, -9.0625, -9.8125, -9.5000, -9.3750],
+                [-13.2500, -13.1250, -5.6875, -13.1875, -13.3750, -8.6875, -6.9688, -6.9375, -10.0625, -10.3125, -9.8125, -7.7188, -8.8125, -5.2188, -3.5000, -12.4375, -9.0625, -6.6250, -10.3125, -9.1875, -10.6250, -11.4375, -11.1250, -10.8750, -10.5000, -10.8750, -11.0000, -11.3125, -10.5000, -9.8750],
+            ]
+        ),
+        ("xpu", None): torch.tensor(
+            [
+                [-10.4375, -10.6875, -6.1875, -10.5000, -10.3125, -9.1250, -8.4375, -8.6250, -9.1875, -9.5625, -9.3125, -8.4375, -9.0625, -3.4375, 2.9531, -10.2500, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.7500, -9.4375, -9.0000, -9.8125, -9.4375, -9.3125],
+                [-13.3125, -13.2500, -5.5938, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1875, -10.3750, -9.9375, -7.8438, -8.8750, -5.3438, -3.5938, -12.5625, -9.2500, -6.8125, -10.3750, -9.3125, -10.6875, -11.5625, -11.3125, -11.0000, -10.6250, -10.9375, -11.0625, -11.3750, -10.5625, -10.0000],
+            ]
+        ),
+    }
+).get_expectation()
+EXPECTED_OUTPUT = EXPECTED_OUTPUT.to(torch_device)
+# fmt: on

input_ids = [1, 42, 21, 12, 43, 23, 1, 4]

@@ -445,13 +359,20 @@ def test_model_logits_bf16(self):
with torch.no_grad():
output = model(torch.tensor([input_ids]).to(torch_device))[0]

-torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30], rtol=1e-3, atol=1e-3)
+torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30].to(torch_device), rtol=1e-3, atol=1e-3)

@slow
def test_model_eager(self):
"""Test Blt model with bfloat16 precision using eager attention implementation."""
NUM_TOKENS_TO_GENERATE = 200
EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s"
# fmt: off
EXPECTED_TEXT = Expectations(
{
(None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
("xpu", None): "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m",
}
)
# fmt: on

prompt = "my name is"

@@ -466,14 +387,21 @@ def test_model_eager(self):
)

output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-self.assertEqual(output_text, EXPECTED_TEXT)
+self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())

@slow
@require_torch_bf16
def test_model_bf16_static_cache(self):
"""Test Blt model with bfloat16 precision and static cache."""
NUM_TOKENS_TO_GENERATE = 200
EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m"
# fmt: off
EXPECTED_TEXT = Expectations(
{
(None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
}
)
# fmt: on

prompt = "my name is"

@@ -492,4 +420,4 @@
)

output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-self.assertEqual(output_text, EXPECTED_TEXT)
+self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())
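For readers unfamiliar with the Expectations helper these tests switch to, here is a minimal sketch with toy values. Only the construction and the .get_expectation() call are taken from the diff; the (device_type, version) reading of the tuple keys follows the diff's (None, None) / ("xpu", None) usage.

```python
# Minimal sketch of the device-keyed expectation pattern adopted above.
# (None, None) is the generic fallback; ("xpu", None) overrides it on XPU.
from transformers.testing_utils import Expectations

EXPECTED = Expectations(
    {
        (None, None): [1.0, 2.0],   # default expectation for any device (toy values)
        ("xpu", None): [1.1, 2.1],  # device-specific override (toy values)
    }
).get_expectation()  # resolves to the entry matching the current device
```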