From 7b30a360276729bf79448cd9a7a03e9aac21b2b9 Mon Sep 17 00:00:00 2001
From: Torsten Scholak
Date: Sun, 7 Dec 2025 17:19:20 +0000
Subject: [PATCH] Bump base image and dependencies for KDA support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update to nvcr.io/nvidia/pytorch:25.11-py3 which includes:
- PyTorch 2.10
- CUDA 13.0
- flash-attn 2.7.4.post1 (pre-installed, no compilation needed)

Dependency updates:
- causal-conv1d: v1.5.4 (was pinned to commit 2a288a1)
- mamba-ssm: 2.2.6.post3 (was pinned to commit 4a8a2a2)
- flash-linear-attention: pin to commit 67eee20 (was @main)
- flash-attn: 2.7.4.post1 to match base image (was 2.7.3)
- triton: 3.5.1 in Dockerfile (was 3.1.0)

These updates enable Kimi Delta Attention (KDA) support via the
flash-linear-attention library. The pinned versions are tested and
working, unlike the nightly/unpinned approach in #395.

Note: Dropless MoE kernel remains broken with triton >= 3.2.0 and needs
a complete rewrite (also limited to 32 experts). This is tracked
separately and doesn't block KDA work.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 Dockerfile | 9 +++++----
 setup.cfg  | 12 ++++++------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6bc900ae7..5804d0e47 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.7-labs
-FROM nvcr.io/nvidia/pytorch:25.05-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
 # Install dependencies.
 RUN apt-get update \
@@ -29,8 +29,9 @@ ENV PIP_CONSTRAINT=""
 # There is no pre-build mamba image for pytorch 2.8, we build it before the rest to avoid rebuilds.
 # We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 (same for causal-conv1d)
 # We set the number of workers to avoid OOM when compiling on laptop. (TODO: Can we make it configurable?)
-RUN MAX_JOBS=2 pip install --no-build-isolation "causal-conv1d@git+https://github.com/Dao-AILab/causal-conv1d@2a288a1"
-RUN MAX_JOBS=2 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@4a8a2a2"
+RUN MAX_JOBS=2 pip install --no-build-isolation "causal-conv1d @ git+https://github.com/Dao-AILab/causal-conv1d@v1.5.4"
+RUN MAX_JOBS=2 pip install --no-build-isolation mamba-ssm==2.2.6.post3
+RUN MAX_JOBS=2 pip install --no-build-isolation "flash-linear-attention @ git+https://github.com/fla-org/flash-linear-attention@67eee20c8503cd19eeb52aa1b99821308e9260c5"
 # Copy dependency files with universal write permissions for all users.
 COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
 COPY --chmod=777 ./fast_llm_external_models/__init__.py fast_llm_external_models/
@@ -38,7 +39,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
 COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/
 
 # Install dependencies within the virtual environment.
-RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,DEV]" triton==3.1.0
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,DEV]" triton==3.5.1
 
 # Copy the remaining source code with universal write permissions.
 COPY --chmod=777 ./Megatron-LM Megatron-LM
diff --git a/setup.cfg b/setup.cfg
index f4b2c904b..58f8ea2d1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -25,10 +25,10 @@ CORE =
     # Used for checkpoints
     safetensors>=0.5.3
     # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation
-    flash-attn==2.7.3
-    # Dropless MLP is broken with triton 3.2.0, 3.3.0 and 3.3.1. TODO: Remove once a working triton version is released.
-    # TODO: Removed because it breaks cpu-only installs and pip dependency resolution.
-    # triton==3.1.0
+    flash-attn==2.7.4.post1
+    # Dropless MoE kernel is broken with triton >= 3.2.0 and needs a rewrite (also limited to 32 experts).
+    # Not pinning triton here as it breaks cpu-only installs and pip dependency resolution.
+    # triton==3.5.1
 
 
     # Small packages required for some optional features and tools.
@@ -52,8 +52,8 @@ HUGGINGFACE =
 # To install on cpu environment (ex. for IDE support):
 # MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install -e ".[CORE,SSM]" --no-build-isolation
 SSM =
-    mamba_ssm[causal-conv1d]==2.2.4
-    flash-linear-attention @ git+https://github.com/fla-org/flash-linear-attention@main
+    mamba_ssm[causal-conv1d]==2.2.6.post3
+    flash-linear-attention @ git+https://github.com/fla-org/flash-linear-attention@67eee20c8503cd19eeb52aa1b99821308e9260c5
 GENERATION =
     lm_eval>=0.4.9
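
Verification note (not part of the patch): a minimal sketch of how the new
pins could be sanity-checked inside the rebuilt image. The expected version
strings below simply mirror the pins in this diff; flash-linear-attention
(import name `fla`) is pinned by commit rather than by version, so only its
importability is checked.

    # Hypothetical smoke test, not part of this patch: import each pinned
    # package inside the rebuilt image and compare the version each module
    # reports against the pins in the diff above.
    import importlib

    EXPECTED = {
        "torch": "2.10",             # from nvcr.io/nvidia/pytorch:25.11-py3
        "triton": "3.5.1",
        "flash_attn": "2.7.4.post1",
        "mamba_ssm": "2.2.6.post3",
        "causal_conv1d": "1.5.4",
        "fla": None,                 # flash-linear-attention @ 67eee20
    }

    for name, expected in EXPECTED.items():
        module = importlib.import_module(name)
        found = getattr(module, "__version__", "unknown")
        ok = expected is None or found.startswith(expected)
        print(f"{name}: {found} [{'OK' if ok else 'MISMATCH'}]")

Assuming the image builds, running this inside the container should print
OK for every entry; a MISMATCH would point at a stale layer cache or a
dependency resolved past its pin.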