-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Port DeepSeek Sparse Attention to MambaModel
#3553
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
225a529
9429a63
70d02a7
43e6625
1e116b0
9b59922
c85dc05
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ | |
| make_sharded_tensors_for_checkpoint, | ||
| sharded_state_dict_default, | ||
| ) | ||
| from megatron.core.fp8_utils import get_fp8_align_size | ||
| from megatron.core.utils import ( | ||
| deprecate_inference_params, | ||
| is_causal_conv1d_min_version, | ||
|
|
@@ -207,9 +208,10 @@ def __init__( | |
| self.nheads = self.d_inner // self.headdim | ||
|
|
||
| if self.config.fp8: | ||
| assert (2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads) % 16 == 0, ( | ||
| fp8_align_size = get_fp8_align_size(self.config.fp8_recipe) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What prompts this fix in this PR?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, just wanted to push it for safety's sake. I'll move the DeepSeek-related changes to another branch. :) |
||
| assert (2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads) % fp8_align_size == 0, ( | ||
| "For FP8, the innermost dimension of the Mamba layer " | ||
| "input projection output tensor must be a multiple of 16." | ||
| f"input projection output tensor must be a multiple of {fp8_align_size}." | ||
| ) | ||
|
|
||
| tp_size = self.pg_collection.tp.size() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| {"0": {"input_prompt": "The quick brown fox jumps over the lazy dog.", "generated_text": "", "generated_tokens": [], "logprobs": []}} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
| ENV_VARS: | ||
| CUDA_DEVICE_MAX_CONNECTIONS: 1 | ||
| NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 | ||
| NCCL_ALGO: Ring | ||
| CUBLAS_WORKSPACE_CONFIG: ":4096:8" | ||
| TEST_TYPE: frozen-start | ||
| MODE: inference | ||
| MODEL_ARGS: | ||
| --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_v3_proxy/dcp/checkpoint | ||
| --tokenizer-type: TikTokenizer | ||
| --tiktoken-pattern: v2 | ||
| --transformer-impl: transformer_engine | ||
| --tensor-model-parallel-size: 1 | ||
| --pipeline-model-parallel-size: 1 | ||
| --use-mcore-models: true | ||
| --is-hybrid-model: true | ||
| --model-provider: mamba | ||
| --disable-bias-linear: true | ||
| --position-embedding-type: rope | ||
| --multi-latent-attention: true | ||
| --q-lora-rank: 64 | ||
| --kv-lora-rank: 64 | ||
| --qk-head-dim: 64 | ||
| --qk-pos-emb-head-dim: 32 | ||
| --v-head-dim: 64 | ||
| --dsa-indexer-n-heads: 8 | ||
| --dsa-indexer-head-dim: 64 | ||
| --dsa-indexer-topk: 32 | ||
| --num-layers: 8 | ||
| --hidden-size: 256 | ||
| --num-attention-heads: 16 | ||
| --hybrid-override-pattern: "S-S-S-S-" | ||
| --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec | ||
| --normalization: RMSNorm | ||
| --swiglu: true | ||
| --bf16: true | ||
| --attention-backend: flash | ||
| --deterministic-mode: true | ||
| --temperature: 1.0 | ||
| --top_k: 1 | ||
| --return-log-probs: true | ||
| --num-tokens-to-generate: 30 | ||
| --inference-max-seq-length: 256 | ||
| --output-path: ${INFERENCE_OUTPUT_PATH} | ||
| --prompts: "The quick brown fox jumps over the lazy dog." | ||
| --incoming-requests-per-sec: -1 | ||
| METRICS: | ||
| - "generated_tokens" | ||
| - "logprobs" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| {"0": {"input_prompt": "The quick brown fox jumps over the lazy dog.", "generated_text": "", "generated_tokens": [], "logprobs": []}} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
| ENV_VARS: | ||
| CUDA_DEVICE_MAX_CONNECTIONS: 1 | ||
| NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 | ||
| NCCL_ALGO: Ring | ||
| CUBLAS_WORKSPACE_CONFIG: ":4096:8" | ||
| TEST_TYPE: frozen-start | ||
| MODE: inference | ||
| MODEL_ARGS: | ||
| --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_v3_proxy/dcp/checkpoint | ||
| --tokenizer-type: TikTokenizer | ||
| --tiktoken-pattern: v2 | ||
| --transformer-impl: transformer_engine | ||
| --tensor-model-parallel-size: 2 | ||
| --pipeline-model-parallel-size: 1 | ||
| --use-mcore-models: true | ||
| --is-hybrid-model: true | ||
| --model-provider: mamba | ||
| --disable-bias-linear: true | ||
| --position-embedding-type: rope | ||
| --multi-latent-attention: true | ||
| --q-lora-rank: 64 | ||
| --kv-lora-rank: 64 | ||
| --qk-head-dim: 64 | ||
| --qk-pos-emb-head-dim: 32 | ||
| --v-head-dim: 64 | ||
| --dsa-indexer-n-heads: 8 | ||
| --dsa-indexer-head-dim: 64 | ||
| --dsa-indexer-topk: 32 | ||
| --num-layers: 8 | ||
| --hidden-size: 256 | ||
| --num-attention-heads: 16 | ||
| --hybrid-override-pattern: "S-S-S-S-" | ||
| --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec | ||
| --normalization: RMSNorm | ||
| --swiglu: true | ||
| --bf16: true | ||
| --attention-backend: flash | ||
| --deterministic-mode: true | ||
| --temperature: 1.0 | ||
| --top_k: 1 | ||
| --return-log-probs: true | ||
| --num-tokens-to-generate: 30 | ||
| --inference-max-seq-length: 256 | ||
| --output-path: ${INFERENCE_OUTPUT_PATH} | ||
| --prompts: "The quick brown fox jumps over the lazy dog." | ||
| --incoming-requests-per-sec: -1 | ||
| METRICS: | ||
| - "generated_tokens" | ||
| - "logprobs" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| {"0": {"input_prompt": "The quick brown fox jumps over the lazy dog.", "generated_text": "", "generated_tokens": [], "logprobs": []}} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| ENV_VARS: | ||
| CUDA_DEVICE_MAX_CONNECTIONS: 1 | ||
| NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 | ||
| NCCL_ALGO: Ring | ||
| CUBLAS_WORKSPACE_CONFIG: ":4096:8" | ||
| TEST_TYPE: frozen-start | ||
| MODE: inference | ||
| MODEL_ARGS: | ||
| --load: ${CHECKPOINT_LOAD_PATH}/model/deepseek_v3_proxy/dcp/checkpoint | ||
| --tokenizer-type: TikTokenizer | ||
| --tiktoken-pattern: v2 | ||
| --transformer-impl: transformer_engine | ||
| --tensor-model-parallel-size: 1 | ||
| --pipeline-model-parallel-size: 1 | ||
| --use-mcore-models: true | ||
| --is-hybrid-model: true | ||
| --model-provider: mamba | ||
| --disable-bias-linear: true | ||
| --position-embedding-type: rope | ||
| --multi-latent-attention: true | ||
| --q-lora-rank: 64 | ||
| --kv-lora-rank: 64 | ||
| --qk-head-dim: 64 | ||
| --qk-pos-emb-head-dim: 32 | ||
| --v-head-dim: 64 | ||
| --dsa-indexer-n-heads: 8 | ||
| --dsa-indexer-head-dim: 64 | ||
| --dsa-indexer-topk: 32 | ||
| --num-layers: 8 | ||
| --hidden-size: 256 | ||
| --num-attention-heads: 16 | ||
| --hybrid-override-pattern: "S-S-SESE" | ||
| --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec | ||
| --normalization: RMSNorm | ||
| --swiglu: true | ||
| --bf16: true | ||
| --attention-backend: flash | ||
| --deterministic-mode: true | ||
| --temperature: 1.0 | ||
| --top_k: 1 | ||
| --return-log-probs: true | ||
| --num-tokens-to-generate: 30 | ||
| --inference-max-seq-length: 256 | ||
| --output-path: ${INFERENCE_OUTPUT_PATH} | ||
| --prompts: "The quick brown fox jumps over the lazy dog." | ||
| --incoming-requests-per-sec: -1 | ||
| # MoE args | ||
| --num-experts: 4 | ||
| --moe-layer-freq: ([0]*2+[1]*2) | ||
| --moe-ffn-hidden-size: 512 | ||
| --moe-shared-expert-intermediate-size: 512 | ||
| --moe-router-topk: 2 | ||
| --moe-grouped-gemm: true | ||
| --moe-token-dispatcher-type: allgather | ||
| --moe-router-load-balancing-type: aux_loss | ||
| --moe-aux-loss-coeff: 0.0 | ||
| METRICS: | ||
| - "generated_tokens" | ||
| - "logprobs" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| {"0": {"input_prompt": "The quick brown fox jumps over the lazy dog.", "generated_text": "", "generated_tokens": [], "logprobs": []}} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wondering if 'S' should be reserved for sliding-window attention. Wondering if this should be 'D'. Of course, these choices are arbitrary and hopefully ultimately temporary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense!