2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -713,6 +713,8 @@
title: MegatronBERT
- local: model_doc/megatron_gpt2
title: MegatronGPT2
- local: model_doc/minicpm3
title: MiniCPM3
- local: model_doc/minimax
title: MiniMax
- local: model_doc/minimax_m2
45 changes: 45 additions & 0 deletions docs/source/en/model_doc/minicpm3.md
@@ -0,0 +1,45 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# MiniCPM3

## Overview

The MiniCPM3 model was proposed in [MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies](https://huggingface.co/papers/2404.06395) by OpenBMB.

MiniCPM3-4B is a dense language model that uses Multi-head Latent Attention (MLA) for efficient KV-cache compression, combined with embedding scaling, depth-dependent residual scaling, and logit scaling for stable training. Despite having only 4B parameters, it achieves performance comparable to larger 7B-9B models.
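
The sketch below illustrates how the three scaling factors exposed on [`MiniCPM3Config`] (`scale_emb`, `scale_depth`, `dim_model_base`) are typically applied. It uses illustrative values rather than the actual checkpoint constants, and it is a simplified sketch, not the exact modeling code.

```python
import math

import torch

# Illustrative values; a loaded `MiniCPM3Config` may carry different constants.
hidden_size, num_hidden_layers, vocab_size = 2560, 62, 73448
scale_emb, scale_depth, dim_model_base = 1.0, 1.0, 1

# 1. Embedding scaling: token embeddings are multiplied by `scale_emb`.
hidden_states = torch.randn(1, 8, hidden_size) * scale_emb

# 2. Depth-dependent residual scaling: each block's output is damped by
#    `scale_depth / sqrt(num_hidden_layers)` before being added back to the residual stream.
block_output = torch.randn_like(hidden_states)
hidden_states = hidden_states + block_output * (scale_depth / math.sqrt(num_hidden_layers))

# 3. Logit scaling: hidden states are divided by `hidden_size / dim_model_base`
#    before the language-model head produces logits.
lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False)
logits = lm_head(hidden_states / (hidden_size / dim_model_base))
```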

This model was contributed by [aliyevaladddin](https://github.com/aliyevaladddin).
The original code can be found [here](https://huggingface.co/openbmb/MiniCPM3-4B).
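
A minimal generation example is shown below. It assumes the [openbmb/MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B) checkpoint on the Hub is compatible with this native `transformers` implementation; adjust the checkpoint name and generation settings as needed.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM3-4B")
model = AutoModelForCausalLM.from_pretrained("openbmb/MiniCPM3-4B", dtype="auto", device_map="auto")

inputs = tokenizer("Write a short poem about the ocean.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```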

## MiniCPM3Config

[[autodoc]] MiniCPM3Config

## MiniCPM3Model

[[autodoc]] MiniCPM3Model
- forward

## MiniCPM3ForCausalLM

[[autodoc]] MiniCPM3ForCausalLM
- forward

## MiniCPM3ForSequenceClassification

[[autodoc]] MiniCPM3ForSequenceClassification
- forward
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -699,6 +699,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("marian", "MarianForCausalLM"),
("mbart", "MBartForCausalLM"),
("megatron-bert", "MegatronBertForCausalLM"),
("minicpm3", "MiniCPM3ForCausalLM"),
("minimax", "MiniMaxForCausalLM"),
("minimax_m2", "MiniMaxM2ForCausalLM"),
("ministral", "MinistralForCausalLM"),
@@ -1299,6 +1300,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("markuplm", "MarkupLMForSequenceClassification"),
("mbart", "MBartForSequenceClassification"),
("megatron-bert", "MegatronBertForSequenceClassification"),
("minicpm3", "MiniCPM3ForSequenceClassification"),
("minimax", "MiniMaxForSequenceClassification"),
("ministral", "MinistralForSequenceClassification"),
("ministral3", "Ministral3ForSequenceClassification"),
29 changes: 29 additions & 0 deletions src/transformers/models/minicpm3/__init__.py
@@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
from .configuration_minicpm3 import *
from .modeling_minicpm3 import *
else:
import sys

_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
126 changes: 126 additions & 0 deletions src/transformers/models/minicpm3/configuration_minicpm3.py
@@ -0,0 +1,126 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/minicpm3/modular_minicpm3.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_minicpm3.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring


@auto_docstring(checkpoint="openbmb/MiniCPM3-4B")
@strict
class MiniCPM3Config(PreTrainedConfig):
r"""
kv_lora_rank (`int`, *optional*, defaults to 256):
Rank of the low-rank KV projection in multi-head latent attention.
q_lora_rank (`int`, *optional*, defaults to 768):
Rank of the low-rank query projection in multi-head latent attention.
qk_nope_head_dim (`int`, *optional*, defaults to 64):
Dimension of the non-RoPE part of each query/key head.
qk_rope_head_dim (`int`, *optional*, defaults to 32):
Dimension of the RoPE part of each query/key head.
v_head_dim (`int`, *optional*, defaults to 128):
Dimension of each value head.
scale_emb (`int`, *optional*, defaults to 1):
Scaling factor applied to input embeddings.
scale_depth (`float`, *optional*, defaults to 1.0):
Scaling factor for residual connections, applied as `scale_depth / sqrt(num_hidden_layers)`.
dim_model_base (`int`, *optional*, defaults to 1):
Base model dimension used to scale logits before the language model head.

Example:

```python
>>> from transformers import MiniCPM3Model, MiniCPM3Config

>>> # Initializing a MiniCPM3 style configuration
>>> configuration = MiniCPM3Config()

>>> # Initializing a model (with random weights) from the configuration
>>> model = MiniCPM3Model(configuration)

>>> # Accessing the model configuration
>>> print(model.config)
```
"""

model_type = "minicpm3"
keys_to_ignore_at_inference = ["past_key_values"]

base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.q_b_proj": "colwise",
"layers.*.self_attn.kv_a_proj_with_mqa": "mla_kv_a_proj",
"layers.*.self_attn.kv_b_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}

vocab_size: int = 73448
hidden_size: int = 2560
intermediate_size: int = 6400
num_hidden_layers: int = 62
num_attention_heads: int = 40
num_key_value_heads: int | None = 40
hidden_act: str = "silu"
max_position_embeddings: int = 32768
initializer_range: float = 0.1
rms_norm_eps: float = 1e-5
use_cache: bool = True
pad_token_id: int | None = None
bos_token_id: int | None = 1
eos_token_id: int | list[int] | None = 2
pretraining_tp: int | None = 1
tie_word_embeddings: bool = False
rope_parameters: RopeParameters | dict | None = None
attention_bias: bool = False
attention_dropout: float | None = 0.0
mlp_bias: bool = False
head_dim: int | None = None
kv_lora_rank: int = 256
q_lora_rank: int | None = 768
qk_nope_head_dim: int = 64
qk_rope_head_dim: int = 32
v_head_dim: int = 128
scale_emb: int = 1
scale_depth: float = 1.0
dim_model_base: int = 1

def __post_init__(self, **kwargs):
# With multi-head latent attention, RoPE is applied only to the `qk_rope_head_dim` slice of each
# head, so it is used as the rotary `head_dim`; the usual `hidden_size // num_attention_heads`
# fallback does not apply here.
self.head_dim = self.qk_rope_head_dim
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_attention_heads

super().__post_init__(**kwargs)

def validate_architecture(self):
"""Part of `@strict`-powered validation. Validates the architecture of the config."""
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})."
)


__all__ = ["MiniCPM3Config"]