diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 29ec365e7ce2..c9ba021c54db 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -89,6 +89,21 @@
         "expert_count": "num_experts",
         "expert_used_count": "num_experts_per_tok",
     },
+    "gpt_oss": {
+        "context_length": "max_position_embeddings",
+        "block_count": "num_hidden_layers",
+        "feed_forward_length": "intermediate_size",
+        "embedding_length": "hidden_size",
+        "rope.dimension_count": None,
+        "rope.freq_base": "rope_theta",
+        "attention.head_count": "num_attention_heads",
+        "attention.head_count_kv": "num_key_value_heads",
+        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
+        "vocab_size": "vocab_size",
+        "expert_count": "num_local_experts",
+        "expert_used_count": "num_experts_per_tok",
+        "sliding_window": "sliding_window",
+    },
     "lfm2": {
         "context_length": "max_position_embeddings",
         "block_count": "num_hidden_layers",
diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 4e6c21d8517f..2de6cc13fc85 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -171,6 +171,107 @@ def _set_moe_expert_tensor(self, weights: np.ndarray, parsed_parameters: dict[st
                 out.copy_(torch_weights)
 
 
+class GptOssTensorProcessor(TensorProcessor):
+    """
+    Tensor processor for GPT-OSS models (MoE with 128 experts).
+    Handles:
+    - Splitting stacked expert tensors (down_proj, gate_proj, up_proj) into individual experts.
+    - Interleaving gate and up projections if stored in a combined tensor (gate_up_projs).
+    - Bias tensors (1D) are passed through without transpose.
+    """
+
+    # Regex for separate expert tensors: e.g., blk.0.ffn_down_projs.weight
+    GGUF_MOE_WEIGHTS_PATTERN = re.compile(r"blk\.(?P<bid>\d+)\.ffn_(?P<proj>down|gate|up)_projs\.weight$")
+    # Regex for combined gate+up tensor: e.g., blk.0.ffn_gate_up_projs.weight
+    GGUF_MOE_COMBINED_PATTERN = re.compile(r"blk\.(?P<bid>\d+)\.ffn_gate_up_projs\.weight$")
+
+    def __init__(self, config=None):
+        super().__init__(config=config)
+
+    def process(self, weights, name: str, **kwargs):
+        # 1. Handle separate MoE expert tensors (down, gate, up)
+        if m := self.GGUF_MOE_WEIGHTS_PATTERN.match(name):
+            tensor_key_mapping = kwargs.get("tensor_key_mapping")
+            parsed_parameters = kwargs.get("parsed_parameters")
+            if tensor_key_mapping and parsed_parameters:
+                self._split_moe_expert_tensor(weights, parsed_parameters, m["bid"], m["proj"], tensor_key_mapping)
+            return GGUFTensor(weights, None, {})  # signal handled
+
+        # 2. Handle combined gate+up tensor
+        if m := self.GGUF_MOE_COMBINED_PATTERN.match(name):
+            tensor_key_mapping = kwargs.get("tensor_key_mapping")
+            parsed_parameters = kwargs.get("parsed_parameters")
+            if tensor_key_mapping and parsed_parameters:
+                self._interleave_gate_up_tensor(weights, parsed_parameters, m["bid"], tensor_key_mapping)
+            return GGUFTensor(weights, None, {})
+
+        # 3. Bias tensors (1D) → no transpose
+        if ".bias" in name and len(weights.shape) == 1:
+            return GGUFTensor(weights, name, {})
+
+        # 4. Default handling for all other tensors
+        return GGUFTensor(weights, name, {})
+
+    def _split_moe_expert_tensor(
+        self,
+        weights: np.ndarray,
+        parsed_parameters: dict,
+        bid: str,
+        proj: str,
+        tensor_key_mapping: dict,
+    ):
+        """Split a stacked MoE tensor into individual expert tensors."""
+        num_experts = self.config.get("num_local_experts", 128)
+        # Expected shape: [num_experts, hidden_size, intermediate_size] (or swapped).
+        # We assume the stored order is correct for the projection after splitting.
+        for i in range(min(num_experts, weights.shape[0])):
+            expert_weight = weights[i]  # shape: [hidden, inter] or [inter, hidden]
+            # Build HF parameter name
+            hf_name = f"model.layers.{bid}.block_sparse_moe.experts.{i}.{proj}_proj.weight"
+            # Apply any user-provided tensor key mapping
+            for key, mapped_key in tensor_key_mapping.items():
+                if key in hf_name:
+                    hf_name = hf_name.replace(key, mapped_key)
+            # Store the tensor
+            parsed_parameters["tensors"][hf_name] = torch.tensor(expert_weight, copy=True)
+
+    def _interleave_gate_up_tensor(
+        self,
+        weights: np.ndarray,
+        parsed_parameters: dict,
+        bid: str,
+        tensor_key_mapping: dict,
+    ):
+        """
+        Process a combined gate+up tensor.
+        Expected shape: [num_experts, intermediate_size, hidden_size].
+        Interleaving: gate occupies first half of intermediate dimension,
+        up occupies second half. Transpose to [hidden, half_inter] per expert.
+        """
+        num_experts = self.config.get("num_local_experts", 128)
+        inter_size = weights.shape[1]
+        half_inter = inter_size // 2
+        gate_part = weights[:, :half_inter, :]  # [E, half_inter, hidden]
+        up_part = weights[:, half_inter:, :]  # [E, half_inter, hidden]
+
+        for i in range(min(num_experts, weights.shape[0])):
+            gate_weight = gate_part[i].T  # [hidden, half_inter]
+            up_weight = up_part[i].T  # [hidden, half_inter]
+
+            gate_name = f"model.layers.{bid}.block_sparse_moe.experts.{i}.gate_proj.weight"
+            up_name = f"model.layers.{bid}.block_sparse_moe.experts.{i}.up_proj.weight"
+
+            # Apply mapping
+            for key, mapped_key in tensor_key_mapping.items():
+                if key in gate_name:
+                    gate_name = gate_name.replace(key, mapped_key)
+                if key in up_name:
+                    up_name = up_name.replace(key, mapped_key)
+
+            parsed_parameters["tensors"][gate_name] = torch.tensor(gate_weight, copy=True)
+            parsed_parameters["tensors"][up_name] = torch.tensor(up_weight, copy=True)
+
+
 class BloomTensorProcessor(TensorProcessor):
     def __init__(self, config=None):
         super().__init__(config=config)
@@ -355,6 +456,7 @@ def _set_moe_expert_tensor(self, weights: np.ndarray, parsed_parameters: dict[st
 TENSOR_PROCESSORS = {
     "llama": LlamaTensorProcessor,
     "qwen2moe": Qwen2MoeTensorProcessor,
+    "gpt_oss": GptOssTensorProcessor,
     "qwen3moe": Qwen2MoeTensorProcessor,
     "bloom": BloomTensorProcessor,
     "t5": T5TensorProcessor,
@@ -416,6 +518,8 @@ def get_gguf_hf_weights_map(
         model_type = "t5"
     elif model_type == "minimax_m2":
         model_type = "minimax-m2"
+    elif model_type == "gpt_oss":
+        model_type = "gpt-oss"
     arch = None
     for key, value in MODEL_ARCH_NAMES.items():
         if value == model_type:
@@ -522,6 +626,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
     if "qwen2moe" in architecture:
         updated_architecture = "qwen2_moe"
+    elif "gpt_oss" in architecture or "gpt-oss" in architecture:
+        updated_architecture = "gpt_oss"
     elif "qwen3moe" in architecture:
         updated_architecture = "qwen3_moe"
     elif "minimax-m2" in architecture:
@@ -609,6 +715,45 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
             i for i, num_kv_heads in enumerate(gguf_num_key_value_heads) if num_kv_heads > 0
         ]
+
+    if updated_architecture == "gpt_oss":
+        # Helper to read keys with the correct prefix
+        def read_gpt_key(reader, suffix, default=None):
+            key = f"gpt-oss.{suffix}"
+            if key in reader.fields:
+                val = reader.fields[key].parts[0]
+                if isinstance(val, bytes):
+                    val = val.decode("utf-8")
+                return val
+            return default
+
+        # Reconstruct rope_scaling from GGUF metadata
+        rope_type = read_gpt_key(reader, "rope.scaling.type")
+        if rope_type is not None:
+            rope_scaling = {"rope_type": rope_type}
+
+            # Collect all rope.scaling keys dynamically
+            for key in reader.fields:
+                if not key.startswith("gpt-oss.rope.scaling."):
+                    continue
+                suffix = key[len("gpt-oss.rope.scaling.") :]
+                if suffix == "type":
+                    continue
+                value = reader.fields[key].parts[0]
+                if isinstance(value, bytes):
+                    value = value.decode("utf-8")
+                # Convert to appropriate type
+                if suffix in ("factor", "attention_factor", "beta_fast", "beta_slow"):
+                    value = float(value)
+                elif suffix in ("original_context_length", "original_max_position_embeddings"):
+                    # Map GGUF's original_context_length to HF's original_max_position_embeddings
+                    suffix = "original_max_position_embeddings"
+                    value = int(value)
+                rope_scaling[suffix] = value
+
+            parsed_parameters["config"]["rope_scaling"] = rope_scaling
+
     # retrieve config vocab_size from tokenizer
     # Please refer to https://github.com/huggingface/transformers/issues/32526 for more details
     if "vocab_size" not in parsed_parameters["config"]:
diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index 763f8ac40502..aa5cdbc7adc6 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -351,6 +351,8 @@ class GgufModelTests(unittest.TestCase):
     q4_k_m_qwen3moe_model_id = "Qwen3-30B-A3B-Q4_K_M.gguf"
     q8_0_umt5_encoder_model_id = "umt5-xxl-encoder-Q8_0.gguf"
     q4_k_m_lfm2_model_id = "LFM2-1.2B-Q4_K_M.gguf"
+    gpt_oss_model_id = "unsloth/gpt-oss-20b-GGUF"
+    gpt_oss_gguf_file = "gpt-oss-20b-Q5_K_M.gguf"
 
     example_text = "Hello"
 
@@ -384,6 +386,20 @@ def test_qwen2_q4_0(self):
         EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
 
+    def test_gpt_oss_q5_k_m(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.gpt_oss_model_id, gguf_file=self.gpt_oss_gguf_file)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.gpt_oss_model_id,
+            gguf_file=self.gpt_oss_gguf_file,
+            device_map="auto",
+            dtype=torch.float16,
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+        EXPECTED_TEXT = "Hello, I just want to say that I am just"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
     def test_qwen2moe_q8(self):
         tokenizer = AutoTokenizer.from_pretrained(self.qwen2moe_model_id, gguf_file=self.q8_qwen2moe_model_id)
         model = AutoModelForCausalLM.from_pretrained(
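Not part of the patch: a minimal, self-contained sketch of the tensor layout `_interleave_gate_up_tensor` assumes (gate rows in the first half of the intermediate dimension of the combined `ffn_gate_up_projs` tensor, up rows in the second half, each expert slice transposed to `[hidden, half_inter]`). The expert count, dimensions, and the layer index in the parameter names are illustrative only.

```python
# Hypothetical shapes, purely to illustrate the slicing/transpose convention above.
import numpy as np

num_experts, half_inter, hidden = 2, 4, 6
combined = np.arange(num_experts * 2 * half_inter * hidden, dtype=np.float32)
combined = combined.reshape(num_experts, 2 * half_inter, hidden)  # [E, inter, hidden]

tensors = {}
gate_part = combined[:, :half_inter, :]  # gate occupies the first half of the intermediate dim
up_part = combined[:, half_inter:, :]    # up occupies the second half
for i in range(num_experts):
    # Same naming scheme the processor emits before tensor_key_mapping is applied
    tensors[f"model.layers.0.block_sparse_moe.experts.{i}.gate_proj.weight"] = gate_part[i].T
    tensors[f"model.layers.0.block_sparse_moe.experts.{i}.up_proj.weight"] = up_part[i].T

# Every per-expert projection ends up as [hidden, half_inter]
assert all(t.shape == (hidden, half_inter) for t in tensors.values())
```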
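For reference, the loading path the new `test_gpt_oss_q5_k_m` test exercises looks roughly like the sketch below. The repo and file names are the ones used in the test; the device map, dtype, and prompt are just example choices, and running it requires downloading the full 20B GGUF checkpoint.

```python
# Mirrors the new test: load the gpt-oss GGUF through the standard Auto* entry points.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "unsloth/gpt-oss-20b-GGUF"
gguf_file = "gpt-oss-20b-Q5_K_M.gguf"

tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=gguf_file,
    device_map="auto",
    dtype=torch.float16,
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```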