From 0f4f4f569dc7e6420d4e930693551a596e96c170 Mon Sep 17 00:00:00 2001
From: medmekk
Date: Wed, 28 Jan 2026 10:39:42 +0000
Subject: [PATCH 1/2] fix

---
 .../modeling_flash_attention_utils.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index b5f59b4bb1f9..372712150023 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -533,11 +533,24 @@ def _process_flash_attention_kwargs(
         flash_kwargs (`dict`):
             A dict of kwargs that are requested and supported.
     """
+
+    user_kwargs = {
+        "dropout_p": dropout,
+        "window_size": sliding_window,
+        "deterministic": deterministic,
+        "softcap": softcap,
+        "s_aux": s_aux,
+    }
+    # Note 'window_size' in supports_mapping maps to our 'sliding_window' param
+    for k, v in user_kwargs.items():
+        if not supports_mapping[k] and v is not None:
+            raise ValueError(f"Parameter `{k}` is not supported by this Flash Attention implementation but was set, please use a different attention implementation.")
+
     flash_kwargs = {
         "causal": is_causal and not (use_top_left_mask and query_length == 1),
         "softmax_scale": softmax_scale,
     }
-
+    
     if supports_mapping["dropout_p"]:
         flash_kwargs["dropout_p"] = dropout

From a9790b2ed10377a642545c4e4e680a86f2bc6879 Mon Sep 17 00:00:00 2001
From: medmekk
Date: Wed, 28 Jan 2026 10:42:14 +0000
Subject: [PATCH 2/2] style

---
 src/transformers/modeling_flash_attention_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index 372712150023..c6149e101d89 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -544,13 +544,15 @@ def _process_flash_attention_kwargs(
     # Note 'window_size' in supports_mapping maps to our 'sliding_window' param
     for k, v in user_kwargs.items():
         if not supports_mapping[k] and v is not None:
-            raise ValueError(f"Parameter `{k}` is not supported by this Flash Attention implementation but was set, please use a different attention implementation.")
+            raise ValueError(
+                f"Parameter `{k}` is not supported by this Flash Attention implementation but was set, please use a different attention implementation."
+            )
 
     flash_kwargs = {
         "causal": is_causal and not (use_top_left_mask and query_length == 1),
         "softmax_scale": softmax_scale,
     }
-    
+
     if supports_mapping["dropout_p"]:
         flash_kwargs["dropout_p"] = dropout