Tencent · yghstill · Apr 21, 2026 · Apr 17, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/README.md b/README.md
@@ -94,7 +94,7 @@ A more accessible, comprehensive, and efficient toolkit for large model compress
           <li>
             <strong>Sparse Attention</strong>
             <ul style="padding-left: 1.5rem">
-              <li>Under Development</li>
+              <li><a href="https://angelslim.readthedocs.io/zh-cn/latest/features/sparse_attention/stem.html">Stem</a></li>
             </ul>
           </li>
         </ul>

diff --git a/README_cn.md b/README_cn.md
@@ -95,7 +95,7 @@
           <li>
             <strong>稀疏注意力</strong>
             <ul style="padding-left: 1.5rem">
-              <li>Minference(建设中)</li>
+              <li><a href="https://angelslim.readthedocs.io/zh-cn/latest/features/sparse_attention/stem.html">Stem</a></li>
             </ul>
           </li>
         </ul>

diff --git a/angelslim/compressor/sparsity/__init__.py b/angelslim/compressor/sparsity/__init__.py
@@ -11,3 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .stem import StemInference  # noqa: F401
+
+__all__ = ["StemInference"]
diff --git a/angelslim/compressor/sparsity/stem/__init__.py b/angelslim/compressor/sparsity/stem/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Stem — Sparse Token Estimation Module for long-context LLM inference.
+
+Public API:
+    StemInference: Callable that patches a HuggingFace model to use Stem
+        sparse attention during the prefill stage.
+"""
+
+from .stem import StemInference
+
+__all__ = ["StemInference"]
diff --git a/angelslim/compressor/sparsity/stem/backends/__init__.py b/angelslim/compressor/sparsity/stem/backends/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Stem backend implementations (torch / HPC)."""
+
+from .dispatcher import stem_forward
+
+__all__ = ["stem_forward"]
diff --git a/angelslim/compressor/sparsity/stem/backends/dispatcher.py b/angelslim/compressor/sparsity/stem/backends/dispatcher.py
@@ -0,0 +1,59 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Backend dispatcher: routes Stem prefill to the correct implementation."""
+
+from __future__ import annotations
+
+import torch
+
+from .torch_impl import stem_forward_torch
+
+
+def stem_forward(
+    query_states: torch.Tensor,
+    key_states: torch.Tensor,
+    value_states: torch.Tensor,
+    prefill_kwargs: dict,
+) -> torch.Tensor:
+    """Dispatch a Stem prefill call to the appropriate backend.
+
+    Args:
+        query_states: Query tensor of shape ``(B, H_q, L_q, D)``.
+        key_states: Key tensor of shape ``(B, H_kv, L_kv, D)``.
+        value_states: Value tensor of shape ``(B, H_kv, L_kv, D)``.
+        prefill_kwargs: Must contain ``"attn_forward_config"`` (with a
+            ``"backend"`` key) and ``"layer_idx"``.
+
+    Returns:
+        Attention output tensor of shape ``(B, H_q, L_q, D)``.
+
+    Raises:
+        ValueError: If the requested backend is not ``"torch"`` or ``"hpc"``.
+    """
+    config = prefill_kwargs["attn_forward_config"]
+    backend = config.get("backend", "torch")
+
+    if backend == "torch":
+        return stem_forward_torch(query_states, key_states, value_states, prefill_kwargs)
+
+    if backend == "hpc":
+        # Lazy import to avoid hard dependency on the ``hpc`` C++ extension
+        # when only the pure-torch path is needed.
+        from .hpc_impl import stem_forward_hpc
+
+        return stem_forward_hpc(query_states, key_states, value_states, prefill_kwargs)
+
+    raise ValueError(f"Unknown stem backend: {backend!r}")