From b7712aa9c010a7ddbbf88fea5782cbe7abdc74ea Mon Sep 17 00:00:00 2001 From: sayalinvidia <178231576+sayalinvidia@users.noreply.github.com> Date: Fri, 17 Apr 2026 18:06:13 +0000 Subject: [PATCH] chore: sync skills (CUDA-Q,cuOpt,TensorRT-LLM,Model-Optimizer,Megatron-Bridge,Nemotron Voice Agent,NeMo Gym,NeMo Evaluator) --- skills/CUDA-Q/cudaq-guide/SKILL.md | 312 +++++++ .../adding-model-support/SKILL.md | 443 ++++++++++ .../adding-model-support/llm-patterns.md | 217 +++++ .../adding-model-support/recipe-patterns.md | 169 ++++ .../tests-and-examples.md | 326 ++++++++ .../adding-model-support/vlm-patterns.md | 197 +++++ skills/Megatron-Bridge/code-style/SKILL.md | 304 +++++++ .../Megatron-Bridge/developer-guide/SKILL.md | 472 +++++++++++ .../mlm-bridge-training/SKILL.md | 161 ++++ .../mlm-bridge-training/card.yaml | 47 ++ .../Megatron-Bridge/multi-node-slurm/SKILL.md | 534 ++++++++++++ .../Megatron-Bridge/parity-testing/SKILL.md | 176 ++++ .../Megatron-Bridge/perf-techniques/README.md | 13 + .../activation-recompute/SKILL.md | 204 +++++ .../activation-recompute/card.yaml | 174 ++++ .../perf-techniques/cpu-offloading/SKILL.md | 304 +++++++ .../perf-techniques/cpu-offloading/card.yaml | 211 +++++ .../perf-techniques/cuda-graphs/SKILL.md | 321 ++++++++ .../perf-techniques/cuda-graphs/card.yaml | 283 +++++++ .../expert-parallel-overlap/SKILL.md | 249 ++++++ .../expert-parallel-overlap/card.yaml | 186 +++++ .../hybrid-context-parallel/SKILL.md | 154 ++++ .../hybrid-context-parallel/card.yaml | 65 ++ .../perf-techniques/megatron-fsdp/SKILL.md | 122 +++ .../perf-techniques/megatron-fsdp/card.yaml | 50 ++ .../perf-techniques/memory-tuning/SKILL.md | 230 ++++++ .../perf-techniques/memory-tuning/card.yaml | 173 ++++ .../perf-techniques/moe-comm-overlap/SKILL.md | 87 ++ .../moe-comm-overlap/card.yaml | 47 ++ .../moe-dispatcher-selection/SKILL.md | 161 ++++ .../moe-dispatcher-selection/card.yaml | 130 +++ .../moe-hardware-configs/SKILL.md | 148 ++++ 
.../moe-hardware-configs/card.yaml | 204 +++++ .../perf-techniques/moe-long-context/SKILL.md | 137 ++++ .../moe-long-context/card.yaml | 125 +++ .../moe-optimization-workflow/SKILL.md | 153 ++++ .../moe-optimization-workflow/card.yaml | 147 ++++ .../perf-techniques/moe-vlm-training/SKILL.md | 134 +++ .../moe-vlm-training/card.yaml | 102 +++ .../parallelism-strategies/SKILL.md | 233 ++++++ .../parallelism-strategies/card.yaml | 72 ++ .../perf-techniques/sequence-packing/SKILL.md | 142 ++++ .../sequence-packing/card.yaml | 93 +++ .../tp-dp-comm-overlap/SKILL.md | 117 +++ .../tp-dp-comm-overlap/card.yaml | 51 ++ .../recipe-recommender/SKILL.md | 415 ++++++++++ skills/Megatron-Bridge/resiliency/SKILL.md | 305 +++++++ skills/Megatron-Bridge/resiliency/card.yaml | 121 +++ .../common/environment-setup.md | 80 ++ .../common/remote-execution.md | 147 ++++ skills/Model-Optimizer/common/remote_exec.sh | 519 ++++++++++++ skills/Model-Optimizer/common/slurm-setup.md | 319 ++++++++ .../common/workspace-management.md | 110 +++ skills/Model-Optimizer/debug/SKILL.md | 33 + skills/Model-Optimizer/deployment/SKILL.md | 237 ++++++ .../deployment/references/setup.md | 106 +++ .../deployment/references/sglang.md | 81 ++ .../deployment/references/support-matrix.md | 65 ++ .../deployment/references/trtllm.md | 109 +++ .../references/unsupported-models.md | 70 ++ .../deployment/references/vllm.md | 91 +++ .../deployment/scripts/deploy.sh | 590 ++++++++++++++ .../deployment/tests/evals.json | 58 ++ skills/Model-Optimizer/evaluation/SKILL.md | 339 ++++++++ .../references/model-card-research.md | 30 + .../evaluation/references/multi-node.md | 53 ++ .../references/quantization-benchmarks.md | 26 + .../evaluation/tests/evals.json | 65 ++ skills/Model-Optimizer/ptq/SKILL.md | 170 ++++ .../ptq/references/checkpoint-validation.md | 86 ++ .../ptq/references/launcher-guide.md | 92 +++ .../ptq/references/slurm-setup-ptq.md | 95 +++ .../ptq/references/unsupported-models.md | 351 ++++++++ 
skills/Model-Optimizer/ptq/tests.json | 77 ++ .../accessing-mlflow/SKILL.md | 98 +++ .../launching-evals/SKILL.md | 65 ++ .../references/analyze-results.md | 57 ++ .../benchmarks/swebench-general-info.md | 188 +++++ .../benchmarks/terminal-bench-general-info.md | 122 +++ .../terminal-bench-trace-analysis.md | 145 ++++ .../references/check-progress.md | 24 + .../references/debug-failed-runs.md | 130 +++ .../references/run-evaluation.md | 26 + .../launching-evals/tests.json | 46 ++ .../nel-assistant/SKILL.md | 326 ++++++++ .../evals/nemotron3-nano-bf16-reasoning.json | 25 + skills/NeMo-Evaluator/byob/SKILL.md | 306 +++++++ skills/NeMo-Gym/add-benchmark/SKILL.md | 252 ++++++ .../add-benchmark/references/patterns.md | 711 ++++++++++++++++ skills/TensorRT-LLM/ad-model-onboard/SKILL.md | 317 ++++++++ .../ad-pipeline-failure-pr/SKILL.md | 320 ++++++++ .../ci-failure-retrieval/SKILL.md | 89 ++ .../TensorRT-LLM/exec-local-compile/SKILL.md | 97 +++ .../TensorRT-LLM/exec-slurm-compile/SKILL.md | 251 ++++++ .../exec-slurm-compile/scripts/compile.sh | 43 + .../exec-slurm-compile/scripts/compile.slurm | 42 + .../exec-slurm-compile/scripts/enroot-import | 160 ++++ .../scripts/submit_compile.sh | 61 ++ .../TensorRT-LLM/kernel-cute-writing/SKILL.md | 368 +++++++++ .../references/api-arch.md | 181 +++++ .../references/api-core.md | 239 ++++++ .../references/api-nvgpu.md | 268 ++++++ .../references/api-runtime-utils.md | 244 ++++++ .../references/concepts-architecture.md | 113 +++ .../references/concepts-layouts.md | 184 +++++ .../references/concepts-mma.md | 187 +++++ .../references/concepts-tensors.md | 195 +++++ .../references/patterns-compilation.md | 259 ++++++ .../references/patterns-elementwise.md | 279 +++++++ .../references/patterns-gemm.md | 294 +++++++ .../references/patterns-getting-started.md | 199 +++++ .../references/patterns-memory.md | 227 ++++++ .../references/patterns-pipeline.md | 269 ++++++ .../references/patterns-reduction.md | 239 ++++++ 
.../references/troubleshooting.md | 166 ++++ .../kernel-cute-writing/scripts/__init__.py | 14 + .../scripts/benchmark_kernel.py | 375 +++++++++ .../scripts/verify_kernel.py | 372 +++++++++ .../kernel-tileir-optimization/SKILL.md | 273 +++++++ .../references/config-templates.md | 191 +++++ .../references/tma-conversion.md | 107 +++ .../scripts/classify_kernel.py | 389 +++++++++ .../scripts/tileir_check.py | 181 +++++ .../kernel-triton-writing/SKILL.md | 342 ++++++++ .../references/api-core.md | 325 ++++++++ .../references/api-language.md | 280 +++++++ .../references/concepts-semantics.md | 196 +++++ .../references/operator-routing.md | 122 +++ .../references/patterns-advanced.md | 315 +++++++ .../references/patterns-basic.md | 235 ++++++ .../references/patterns-fusion.md | 346 ++++++++ .../references/patterns-gemm.md | 289 +++++++ .../references/troubleshooting.md | 278 +++++++ .../kernel-triton-writing/scripts/__init__.py | 14 + .../scripts/benchmark_kernel.py | 304 +++++++ .../scripts/verify_kernel.py | 370 +++++++++ skills/TensorRT-LLM/perf-analysis/SKILL.md | 154 ++++ .../TensorRT-LLM/perf-host-analysis/SKILL.md | 534 ++++++++++++ .../perf-host-analysis/references/examples.md | 137 ++++ .../iteration-isolation-techniques.md | 172 ++++ .../perf-host-analysis/references/metrics.md | 189 +++++ .../references/output-format.md | 172 ++++ .../references/phase-classification.md | 116 +++ .../references/thresholds.md | 48 ++ .../references/trtllm-nvtx-ranges.md | 179 ++++ .../scripts/analyze_host_overhead.py | 769 ++++++++++++++++++ .../perf-host-optimization/SKILL.md | 291 +++++++ .../references/examples.md | 151 ++++ .../references/hot-path-files.md | 120 +++ .../references/hotspot-classification.md | 274 +++++++ .../references/optimization-patterns.md | 657 +++++++++++++++ .../perf-nsight-compute-analysis/SKILL.md | 392 +++++++++ .../references/advanced-profiling.md | 290 +++++++ .../references/bottleneck-guide.md | 173 ++++ .../references/cli-reference.md | 311 
+++++++ .../references/memory-analysis.md | 161 ++++ .../references/metrics-guide.md | 195 +++++ .../references/python-report-api.md | 254 ++++++ .../references/roofline-analysis.md | 119 +++ .../references/sections-guide.md | 258 ++++++ .../TensorRT-LLM/perf-nsight-systems/SKILL.md | 397 +++++++++ .../references/app-preparation.md | 236 ++++++ .../references/cli-post-collection.md | 221 +++++ .../references/cli-profiling.md | 264 ++++++ .../references/expert-systems.md | 162 ++++ .../references/nvtx-analysis.md | 183 +++++ .../references/recipes-dl.md | 268 ++++++ .../references/stats-reports.md | 191 +++++ .../TensorRT-LLM/perf-optimization/SKILL.md | 347 ++++++++ .../perf-torch-cuda-graphs/SKILL.md | 634 +++++++++++++++ .../references/api-pytorch.md | 253 ++++++ .../references/api-te-megatron.md | 238 ++++++ .../references/patterns-compatibility.md | 153 ++++ .../references/patterns-dynamic.md | 264 ++++++ .../references/troubleshooting.md | 223 +++++ .../scripts/verify_workload.py | 225 +++++ .../perf-torch-sync-free/SKILL.md | 269 ++++++ .../references/sync-patterns.md | 424 ++++++++++ .../scripts/verify_workload.py | 225 +++++ .../perf-workload-profiling/SKILL.md | 199 +++++ .../references/benchmarking-patterns.md | 184 +++++ .../references/nvtx-api.md | 95 +++ .../references/pytorch-profiler-api.md | 58 ++ .../TensorRT-LLM/serve-config-guide/SKILL.md | 77 ++ .../references/knob-heuristics.md | 70 ++ .../trtllm-code-contribution/SKILL.md | 413 ++++++++++ .../trtllm-codebase-exploration/SKILL.md | 186 +++++ skills/cuopt/cuopt-developer/SKILL.md | 399 +++++++++ .../resources/python_bindings.md | 233 ++++++ .../cuopt/cuopt-installation-api-c/SKILL.md | 32 + .../resources/verification_examples.md | 172 ++++ .../cuopt-installation-api-python/SKILL.md | 73 ++ .../resources/verification_examples.md | 172 ++++ .../cuopt/cuopt-installation-common/SKILL.md | 29 + .../cuopt-installation-developer/SKILL.md | 36 + skills/cuopt/cuopt-lp-milp-api-c/SKILL.md | 57 ++ 
.../cuopt-lp-milp-api-c/assets/README.md | 33 + .../assets/lp_basic/README.md | 15 + .../assets/lp_basic/lp_simple.c | 109 +++ .../assets/lp_duals/README.md | 14 + .../assets/lp_duals/lp_duals.c | 115 +++ .../assets/lp_warmstart/README.md | 5 + .../assets/milp_basic/README.md | 12 + .../assets/milp_basic/milp_simple.c | 102 +++ .../assets/milp_production_planning/README.md | 12 + .../milp_production.c | 98 +++ .../assets/mps_solver/README.md | 14 + .../assets/mps_solver/data/sample.mps | 19 + .../assets/mps_solver/mps_solver.c | 107 +++ .../cuopt-lp-milp-api-c/resources/examples.md | 291 +++++++ skills/cuopt/cuopt-lp-milp-api-cli/SKILL.md | 66 ++ .../cuopt-lp-milp-api-cli/assets/README.md | 21 + .../assets/lp_production/README.md | 5 + .../assets/lp_production/production.mps | 16 + .../assets/lp_simple/README.md | 5 + .../assets/lp_simple/sample.mps | 19 + .../assets/milp_facility/README.md | 5 + .../assets/milp_facility/facility.mps | 27 + .../cuopt/cuopt-lp-milp-api-python/SKILL.md | 226 +++++ .../cuopt-lp-milp-api-python/assets/README.md | 12 + .../assets/lp_basic/README.md | 7 + .../assets/lp_basic/model.py | 36 + .../assets/lp_duals/README.md | 7 + .../assets/lp_duals/model.py | 38 + .../assets/lp_warmstart/README.md | 5 + .../assets/lp_warmstart/model.py | 52 ++ .../assets/milp_basic/README.md | 10 + .../assets/milp_basic/incumbent_callback.py | 50 ++ .../assets/milp_basic/model.py | 36 + .../assets/milp_production_planning/README.md | 5 + .../assets/milp_production_planning/model.py | 33 + .../assets/mps_solver/README.md | 88 ++ .../assets/mps_solver/data/README.md | 82 ++ .../assets/mps_solver/data/sample.mps | 19 + .../assets/mps_solver/model.py | 283 +++++++ .../assets/mps_solver/results.md | 90 ++ skills/cuopt/cuopt-qp-api-c/SKILL.md | 19 + skills/cuopt/cuopt-qp-api-c/assets/README.md | 9 + skills/cuopt/cuopt-qp-api-cli/SKILL.md | 37 + .../cuopt/cuopt-qp-api-cli/assets/README.md | 9 + skills/cuopt/cuopt-qp-api-python/SKILL.md | 61 ++ 
.../cuopt-qp-api-python/assets/README.md | 11 + .../assets/least_squares/README.md | 5 + .../assets/least_squares/model.py | 24 + .../assets/maximization_workaround/README.md | 5 + .../assets/maximization_workaround/model.py | 22 + .../assets/portfolio/README.md | 7 + .../assets/portfolio/model.py | 49 ++ .../cuopt-qp-api-python/resources/examples.md | 198 +++++ .../cuopt/cuopt-routing-api-python/SKILL.md | 101 +++ .../cuopt-routing-api-python/assets/README.md | 10 + .../assets/pdp_basic/README.md | 7 + .../assets/pdp_basic/model.py | 56 ++ .../assets/vrp_basic/README.md | 7 + .../assets/vrp_basic/model.py | 31 + .../resources/examples.md | 249 ++++++ .../resources/server_examples.md | 204 +++++ skills/cuopt/cuopt-server-api-python/SKILL.md | 80 ++ .../cuopt-server-api-python/assets/README.md | 14 + .../assets/lp_basic/README.md | 10 + .../assets/lp_basic/client.py | 84 ++ .../assets/milp_basic/README.md | 6 + .../assets/milp_basic/client.py | 82 ++ .../assets/pdp_basic/README.md | 6 + .../assets/pdp_basic/client.py | 97 +++ .../assets/vrp_basic/README.md | 10 + .../assets/vrp_basic/client.py | 101 +++ .../assets/vrp_simple/README.md | 6 + .../assets/vrp_simple/client.py | 95 +++ skills/cuopt/cuopt-server-common/SKILL.md | 46 ++ skills/cuopt/cuopt-user-rules/SKILL.md | 222 +++++ skills/cuopt/lp-milp-formulation/SKILL.md | 240 ++++++ skills/cuopt/qp-formulation/SKILL.md | 33 + skills/cuopt/routing-formulation/SKILL.md | 31 + skills/cuopt/skill-evolution/SKILL.md | 256 ++++++ .../nemotron-voice-agent-deploy/SKILL.md | 81 ++ .../references/jetson-deployment.md | 93 +++ .../references/workstation-deployment.md | 104 +++ 278 files changed, 45135 insertions(+) create mode 100644 skills/CUDA-Q/cudaq-guide/SKILL.md create mode 100644 skills/Megatron-Bridge/adding-model-support/SKILL.md create mode 100644 skills/Megatron-Bridge/adding-model-support/llm-patterns.md create mode 100644 skills/Megatron-Bridge/adding-model-support/recipe-patterns.md create mode 100644 
skills/Megatron-Bridge/adding-model-support/tests-and-examples.md create mode 100644 skills/Megatron-Bridge/adding-model-support/vlm-patterns.md create mode 100644 skills/Megatron-Bridge/code-style/SKILL.md create mode 100644 skills/Megatron-Bridge/developer-guide/SKILL.md create mode 100644 skills/Megatron-Bridge/mlm-bridge-training/SKILL.md create mode 100644 skills/Megatron-Bridge/mlm-bridge-training/card.yaml create mode 100644 skills/Megatron-Bridge/multi-node-slurm/SKILL.md create mode 100644 skills/Megatron-Bridge/parity-testing/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/README.md create mode 100644 skills/Megatron-Bridge/perf-techniques/activation-recompute/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/activation-recompute/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/cpu-offloading/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/cpu-offloading/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/cuda-graphs/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/cuda-graphs/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/expert-parallel-overlap/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/expert-parallel-overlap/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/hybrid-context-parallel/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/hybrid-context-parallel/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/megatron-fsdp/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/megatron-fsdp/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/memory-tuning/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/memory-tuning/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-comm-overlap/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-comm-overlap/card.yaml create mode 100644 
skills/Megatron-Bridge/perf-techniques/moe-dispatcher-selection/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-dispatcher-selection/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-hardware-configs/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-hardware-configs/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-long-context/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-long-context/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-optimization-workflow/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-optimization-workflow/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-vlm-training/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/moe-vlm-training/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/parallelism-strategies/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/parallelism-strategies/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/sequence-packing/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/sequence-packing/card.yaml create mode 100644 skills/Megatron-Bridge/perf-techniques/tp-dp-comm-overlap/SKILL.md create mode 100644 skills/Megatron-Bridge/perf-techniques/tp-dp-comm-overlap/card.yaml create mode 100644 skills/Megatron-Bridge/recipe-recommender/SKILL.md create mode 100644 skills/Megatron-Bridge/resiliency/SKILL.md create mode 100644 skills/Megatron-Bridge/resiliency/card.yaml create mode 100644 skills/Model-Optimizer/common/environment-setup.md create mode 100644 skills/Model-Optimizer/common/remote-execution.md create mode 100644 skills/Model-Optimizer/common/remote_exec.sh create mode 100644 skills/Model-Optimizer/common/slurm-setup.md create mode 100644 skills/Model-Optimizer/common/workspace-management.md create mode 100644 skills/Model-Optimizer/debug/SKILL.md create mode 
100644 skills/Model-Optimizer/deployment/SKILL.md create mode 100644 skills/Model-Optimizer/deployment/references/setup.md create mode 100644 skills/Model-Optimizer/deployment/references/sglang.md create mode 100644 skills/Model-Optimizer/deployment/references/support-matrix.md create mode 100644 skills/Model-Optimizer/deployment/references/trtllm.md create mode 100644 skills/Model-Optimizer/deployment/references/unsupported-models.md create mode 100644 skills/Model-Optimizer/deployment/references/vllm.md create mode 100755 skills/Model-Optimizer/deployment/scripts/deploy.sh create mode 100644 skills/Model-Optimizer/deployment/tests/evals.json create mode 100644 skills/Model-Optimizer/evaluation/SKILL.md create mode 100644 skills/Model-Optimizer/evaluation/references/model-card-research.md create mode 100644 skills/Model-Optimizer/evaluation/references/multi-node.md create mode 100644 skills/Model-Optimizer/evaluation/references/quantization-benchmarks.md create mode 100644 skills/Model-Optimizer/evaluation/tests/evals.json create mode 100644 skills/Model-Optimizer/ptq/SKILL.md create mode 100644 skills/Model-Optimizer/ptq/references/checkpoint-validation.md create mode 100644 skills/Model-Optimizer/ptq/references/launcher-guide.md create mode 100644 skills/Model-Optimizer/ptq/references/slurm-setup-ptq.md create mode 100644 skills/Model-Optimizer/ptq/references/unsupported-models.md create mode 100644 skills/Model-Optimizer/ptq/tests.json create mode 100644 skills/NeMo-Evaluator-Launcher/accessing-mlflow/SKILL.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/SKILL.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/references/analyze-results.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/references/benchmarks/swebench-general-info.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/references/benchmarks/terminal-bench-general-info.md create mode 100644 
skills/NeMo-Evaluator-Launcher/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/references/check-progress.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/references/debug-failed-runs.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/references/run-evaluation.md create mode 100644 skills/NeMo-Evaluator-Launcher/launching-evals/tests.json create mode 100644 skills/NeMo-Evaluator-Launcher/nel-assistant/SKILL.md create mode 100644 skills/NeMo-Evaluator-Launcher/nel-assistant/evals/nemotron3-nano-bf16-reasoning.json create mode 100644 skills/NeMo-Evaluator/byob/SKILL.md create mode 100644 skills/NeMo-Gym/add-benchmark/SKILL.md create mode 100644 skills/NeMo-Gym/add-benchmark/references/patterns.md create mode 100644 skills/TensorRT-LLM/ad-model-onboard/SKILL.md create mode 100644 skills/TensorRT-LLM/ad-pipeline-failure-pr/SKILL.md create mode 100644 skills/TensorRT-LLM/ci-failure-retrieval/SKILL.md create mode 100644 skills/TensorRT-LLM/exec-local-compile/SKILL.md create mode 100644 skills/TensorRT-LLM/exec-slurm-compile/SKILL.md create mode 100755 skills/TensorRT-LLM/exec-slurm-compile/scripts/compile.sh create mode 100644 skills/TensorRT-LLM/exec-slurm-compile/scripts/compile.slurm create mode 100755 skills/TensorRT-LLM/exec-slurm-compile/scripts/enroot-import create mode 100755 skills/TensorRT-LLM/exec-slurm-compile/scripts/submit_compile.sh create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/SKILL.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/api-arch.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/api-core.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/api-nvgpu.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/api-runtime-utils.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/concepts-architecture.md create 
mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/concepts-layouts.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/concepts-mma.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/concepts-tensors.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-compilation.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-elementwise.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-gemm.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-getting-started.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-memory.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-pipeline.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/patterns-reduction.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/references/troubleshooting.md create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/scripts/__init__.py create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/scripts/benchmark_kernel.py create mode 100644 skills/TensorRT-LLM/kernel-cute-writing/scripts/verify_kernel.py create mode 100644 skills/TensorRT-LLM/kernel-tileir-optimization/SKILL.md create mode 100644 skills/TensorRT-LLM/kernel-tileir-optimization/references/config-templates.md create mode 100644 skills/TensorRT-LLM/kernel-tileir-optimization/references/tma-conversion.md create mode 100644 skills/TensorRT-LLM/kernel-tileir-optimization/scripts/classify_kernel.py create mode 100644 skills/TensorRT-LLM/kernel-tileir-optimization/scripts/tileir_check.py create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/SKILL.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/api-core.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/api-language.md create mode 100644 
skills/TensorRT-LLM/kernel-triton-writing/references/concepts-semantics.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/operator-routing.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/patterns-advanced.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/patterns-basic.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/patterns-fusion.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/patterns-gemm.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/references/troubleshooting.md create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/scripts/__init__.py create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/scripts/benchmark_kernel.py create mode 100644 skills/TensorRT-LLM/kernel-triton-writing/scripts/verify_kernel.py create mode 100644 skills/TensorRT-LLM/perf-analysis/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/examples.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/iteration-isolation-techniques.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/metrics.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/output-format.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/phase-classification.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/thresholds.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/references/trtllm-nvtx-ranges.md create mode 100644 skills/TensorRT-LLM/perf-host-analysis/scripts/analyze_host_overhead.py create mode 100644 skills/TensorRT-LLM/perf-host-optimization/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-host-optimization/references/examples.md create mode 100644 skills/TensorRT-LLM/perf-host-optimization/references/hot-path-files.md create mode 100644 
skills/TensorRT-LLM/perf-host-optimization/references/hotspot-classification.md create mode 100644 skills/TensorRT-LLM/perf-host-optimization/references/optimization-patterns.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/advanced-profiling.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/bottleneck-guide.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/cli-reference.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/memory-analysis.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/metrics-guide.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/python-report-api.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/roofline-analysis.md create mode 100644 skills/TensorRT-LLM/perf-nsight-compute-analysis/references/sections-guide.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/app-preparation.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/cli-post-collection.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/cli-profiling.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/expert-systems.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/nvtx-analysis.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/recipes-dl.md create mode 100644 skills/TensorRT-LLM/perf-nsight-systems/references/stats-reports.md create mode 100644 skills/TensorRT-LLM/perf-optimization/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-torch-cuda-graphs/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-torch-cuda-graphs/references/api-pytorch.md create mode 100644 
skills/TensorRT-LLM/perf-torch-cuda-graphs/references/api-te-megatron.md create mode 100644 skills/TensorRT-LLM/perf-torch-cuda-graphs/references/patterns-compatibility.md create mode 100644 skills/TensorRT-LLM/perf-torch-cuda-graphs/references/patterns-dynamic.md create mode 100644 skills/TensorRT-LLM/perf-torch-cuda-graphs/references/troubleshooting.md create mode 100644 skills/TensorRT-LLM/perf-torch-cuda-graphs/scripts/verify_workload.py create mode 100644 skills/TensorRT-LLM/perf-torch-sync-free/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-torch-sync-free/references/sync-patterns.md create mode 100644 skills/TensorRT-LLM/perf-torch-sync-free/scripts/verify_workload.py create mode 100644 skills/TensorRT-LLM/perf-workload-profiling/SKILL.md create mode 100644 skills/TensorRT-LLM/perf-workload-profiling/references/benchmarking-patterns.md create mode 100644 skills/TensorRT-LLM/perf-workload-profiling/references/nvtx-api.md create mode 100644 skills/TensorRT-LLM/perf-workload-profiling/references/pytorch-profiler-api.md create mode 100644 skills/TensorRT-LLM/serve-config-guide/SKILL.md create mode 100644 skills/TensorRT-LLM/serve-config-guide/references/knob-heuristics.md create mode 100644 skills/TensorRT-LLM/trtllm-code-contribution/SKILL.md create mode 100644 skills/TensorRT-LLM/trtllm-codebase-exploration/SKILL.md create mode 100644 skills/cuopt/cuopt-developer/SKILL.md create mode 100644 skills/cuopt/cuopt-developer/resources/python_bindings.md create mode 100644 skills/cuopt/cuopt-installation-api-c/SKILL.md create mode 100644 skills/cuopt/cuopt-installation-api-c/resources/verification_examples.md create mode 100644 skills/cuopt/cuopt-installation-api-python/SKILL.md create mode 100644 skills/cuopt/cuopt-installation-api-python/resources/verification_examples.md create mode 100644 skills/cuopt/cuopt-installation-common/SKILL.md create mode 100644 skills/cuopt/cuopt-installation-developer/SKILL.md create mode 100644 
skills/cuopt/cuopt-lp-milp-api-c/SKILL.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/lp_basic/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/lp_basic/lp_simple.c create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/lp_duals/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/lp_duals/lp_duals.c create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/lp_warmstart/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/milp_basic/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/milp_basic/milp_simple.c create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/milp_production_planning/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/milp_production_planning/milp_production.c create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/mps_solver/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/mps_solver/data/sample.mps create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/assets/mps_solver/mps_solver.c create mode 100644 skills/cuopt/cuopt-lp-milp-api-c/resources/examples.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/SKILL.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/lp_production/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/lp_production/production.mps create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/lp_simple/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/lp_simple/sample.mps create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/milp_facility/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-cli/assets/milp_facility/facility.mps create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/SKILL.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/README.md create mode 100644 
skills/cuopt/cuopt-lp-milp-api-python/assets/lp_basic/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/lp_basic/model.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/lp_duals/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/lp_duals/model.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/lp_warmstart/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/lp_warmstart/model.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/milp_basic/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/milp_basic/incumbent_callback.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/milp_basic/model.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/milp_production_planning/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/milp_production_planning/model.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/mps_solver/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/mps_solver/data/README.md create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/mps_solver/data/sample.mps create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/mps_solver/model.py create mode 100644 skills/cuopt/cuopt-lp-milp-api-python/assets/mps_solver/results.md create mode 100644 skills/cuopt/cuopt-qp-api-c/SKILL.md create mode 100644 skills/cuopt/cuopt-qp-api-c/assets/README.md create mode 100644 skills/cuopt/cuopt-qp-api-cli/SKILL.md create mode 100644 skills/cuopt/cuopt-qp-api-cli/assets/README.md create mode 100644 skills/cuopt/cuopt-qp-api-python/SKILL.md create mode 100644 skills/cuopt/cuopt-qp-api-python/assets/README.md create mode 100644 skills/cuopt/cuopt-qp-api-python/assets/least_squares/README.md create mode 100644 skills/cuopt/cuopt-qp-api-python/assets/least_squares/model.py create mode 100644 
skills/cuopt/cuopt-qp-api-python/assets/maximization_workaround/README.md create mode 100644 skills/cuopt/cuopt-qp-api-python/assets/maximization_workaround/model.py create mode 100644 skills/cuopt/cuopt-qp-api-python/assets/portfolio/README.md create mode 100644 skills/cuopt/cuopt-qp-api-python/assets/portfolio/model.py create mode 100644 skills/cuopt/cuopt-qp-api-python/resources/examples.md create mode 100644 skills/cuopt/cuopt-routing-api-python/SKILL.md create mode 100644 skills/cuopt/cuopt-routing-api-python/assets/README.md create mode 100644 skills/cuopt/cuopt-routing-api-python/assets/pdp_basic/README.md create mode 100644 skills/cuopt/cuopt-routing-api-python/assets/pdp_basic/model.py create mode 100644 skills/cuopt/cuopt-routing-api-python/assets/vrp_basic/README.md create mode 100644 skills/cuopt/cuopt-routing-api-python/assets/vrp_basic/model.py create mode 100644 skills/cuopt/cuopt-routing-api-python/resources/examples.md create mode 100644 skills/cuopt/cuopt-routing-api-python/resources/server_examples.md create mode 100644 skills/cuopt/cuopt-server-api-python/SKILL.md create mode 100644 skills/cuopt/cuopt-server-api-python/assets/README.md create mode 100644 skills/cuopt/cuopt-server-api-python/assets/lp_basic/README.md create mode 100644 skills/cuopt/cuopt-server-api-python/assets/lp_basic/client.py create mode 100644 skills/cuopt/cuopt-server-api-python/assets/milp_basic/README.md create mode 100644 skills/cuopt/cuopt-server-api-python/assets/milp_basic/client.py create mode 100644 skills/cuopt/cuopt-server-api-python/assets/pdp_basic/README.md create mode 100644 skills/cuopt/cuopt-server-api-python/assets/pdp_basic/client.py create mode 100644 skills/cuopt/cuopt-server-api-python/assets/vrp_basic/README.md create mode 100644 skills/cuopt/cuopt-server-api-python/assets/vrp_basic/client.py create mode 100644 skills/cuopt/cuopt-server-api-python/assets/vrp_simple/README.md create mode 100644 
skills/cuopt/cuopt-server-api-python/assets/vrp_simple/client.py create mode 100644 skills/cuopt/cuopt-server-common/SKILL.md create mode 100644 skills/cuopt/cuopt-user-rules/SKILL.md create mode 100644 skills/cuopt/lp-milp-formulation/SKILL.md create mode 100644 skills/cuopt/qp-formulation/SKILL.md create mode 100644 skills/cuopt/routing-formulation/SKILL.md create mode 100644 skills/cuopt/skill-evolution/SKILL.md create mode 100644 skills/nemotron-voice-agent/nemotron-voice-agent-deploy/SKILL.md create mode 100644 skills/nemotron-voice-agent/nemotron-voice-agent-deploy/references/jetson-deployment.md create mode 100644 skills/nemotron-voice-agent/nemotron-voice-agent-deploy/references/workstation-deployment.md diff --git a/skills/CUDA-Q/cudaq-guide/SKILL.md b/skills/CUDA-Q/cudaq-guide/SKILL.md new file mode 100644 index 0000000..9dda483 --- /dev/null +++ b/skills/CUDA-Q/cudaq-guide/SKILL.md @@ -0,0 +1,312 @@ +--- +name: "cudaq-guide" +title: "Cuda Quantum" +description: "CUDA-Q onboarding guide for installation, test programs, GPU simulation, QPU hardware, and quantum applications." +version: "1.0.0" +author: "Sachin Pisal " +tags: [cuda-quantum, quantum-computing, onboarding, getting-started, nvidia] +tools: [Read, Glob, Grep, Bash] +license: "Apache License 2.0" +compatibility: "Python 3.10+, C++ 20" +metadata: + author: "Sachin Pisal " + tags: + - cuda-quantum + - quantum-computing + - onboarding + - getting-started + - nvidia + languages: + - python + - c++ + domain: "quantum" +--- + +## CUDA-Q Getting Started Guide + +You are a CUDA-Q expert assistant. Guide the user through the CUDA-Q platform +based on their `$ARGUMENTS`. If no argument is given, present the full +onboarding menu. + +## Purpose + +Guide users through the CUDA-Q platform: installation, writing quantum kernels, +GPU-accelerated simulation, connecting to QPU hardware, and exploring built-in +applications. 
+ +## Prerequisites + +- Python 3.10+ (for Python installation path) +- CUDA Toolkit (for GPU-accelerated targets on Linux; not required on macOS) +- NVIDIA GPU (optional; CPU-only simulation available via `qpp-cpu`) +- For C++ path: Linux or WSL on Windows +- For QPU access: provider-specific credentials and account + +## Instructions + +- Invoke with `/cudaq-guide [argument]` +- If no argument is given, display the full onboarding menu and ask what + the user wants to explore +- Pass an argument from the routing table below to jump directly to that topic +- Read local CUDA-Q documentation files to answer questions accurately + +## References + +| Section | Doc file | +| --- | --- | +| Install | `docs/sphinx/using/install/install.rst`, `docs/sphinx/using/quick_start.rst` | +| Test Program | `docs/sphinx/using/basics/kernel_intro.rst`, `docs/sphinx/using/basics/build_kernel.rst` | +| GPU Simulation | `docs/sphinx/using/backends/sims/svsims.rst`, `docs/sphinx/using/examples/multi_gpu_workflows.rst` | +| QPU | `docs/sphinx/using/backends/hardware.rst`, `docs/sphinx/using/backends/cloud.rst` | +| Applications | `docs/sphinx/using/applications.rst` | +| Parallelize | `docs/sphinx/using/examples/multi_gpu_workflows.rst` | + +## Routing by Argument + +| Argument | Action | +|---|---| +| `install` | Walk through installation (see Install section) | +| `test-program` | Build and run a Bell state kernel to verify CUDA-Q is working properly | +| `gpu-sim` | Explain GPU-accelerated simulation targets (see GPU Simulation section) | +| `qpu` | Explain how to run on real QPU hardware (see QPU section) | +| `applications` | Showcase what can be built with CUDA-Q (see Applications section) | +| `parallelize` | Show how to run circuits in parallel across multiple QPUs (see Parallelize section) | +| _(none)_ | Print the full menu below and ask what they'd like to explore | + +--- + +## Full Menu (no argument) + +Present this when invoked with no argument + +```text +CUDA-Q Getting 
Started + +CUDA-Q is NVIDIA's unified quantum-classical programming model for CPUs, GPUs, and QPUs. +Supports Python and C++. Docs https://nvidia.github.io/cuda-quantum/ + +Choose a topic + /cudaq-guide install Install CUDA-Q (Python pip or C++ binary) + /cudaq-guide test-program Write and run your quantum kernel + /cudaq-guide gpu-sim Accelerate simulation on NVIDIA GPUs + /cudaq-guide qpu Connect to real QPU hardware + /cudaq-guide applications Explore what you can build + /cudaq-guide parallelize Run circuits in parallel across multiple QPUs + +Specialized skills + /cudaq-qec Quantum Error Correction memory experiments + /cudaq-chemistry Quantum chemistry (VQE, ADAPT-VQE) + /cudaq-add-backend Add a new hardware backend + /cudaq-compiler Work with the CUDA-Q compiler IR + /cudaq-benchmark Benchmark and optimize performance +``` + +--- + +## Install + +Instructions + +- Default to Python installation unless the user explicitly mentions C++ or + the `nvq++` compiler. +- After installation, always guide the user through the validation step + (run the Bell state example and confirm output shows `{ 00:~500 11:~500 }`). +- Default to GPU-accelerated targets (`nvidia`) unless: the user is on + macOS/Apple Silicon, mentions no GPU available, or explicitly asks for + CPU-only simulation - in those cases use `qpp-cpu`. +- Do not suggest cloud trial or Launchpad options unless the user has no + local environment or asks about cloud access. 
+ +Platform notes + +- Linux (x86_64, ARM64): full GPU support - + `pip install cudaq` + CUDA Toolkit +- macOS (ARM64/Apple Silicon): CPU simulation only - + `pip install cudaq` (no CUDA Toolkit needed) +- Windows: use WSL, then follow Linux instructions +- C++ (no sudo): + `bash install_cuda_quantum*.$(uname -m) --accept -- --installpath $HOME/.cudaq` +- Brev (cloud, no local setup): Log in at the NVIDIA Application Hub, + open a CUDA-Q workspace, then SSH in with the Brev CLI: + + ```bash + brev open ${WORKSPACE_NAME} + ``` + + CUDA-Q and the CUDA Toolkit are pre-installed. + +--- + +## Test Program + +Key concepts to explain + +- `@cudaq.kernel` / `__qpu__` marks a quantum kernel - compiled to Quake MLIR +- `cudaq.qvector(N)` allocates N qubits in |0⟩ +- `cudaq.sample()` - kernel measures qubits; returns bitstring histogram + (`SampleResult`) +- `cudaq.run()` - kernel returns a classical value; runs `shots_count` times + and returns a list of those return values +- `cudaq.observe()` - computes expectation value ⟨H⟩ for a spin operator +- `cudaq.get_state()` - returns the full statevector (simulator only) + +Kernel restrictions + +- Only a restricted Python subset is valid inside a kernel - it compiles to + Quake MLIR, not regular Python. +- NumPy and SciPy cannot be used inside a kernel. Use them outside the kernel + for classical pre/post-processing. +- Kernels can call other kernels; the callee must also be a `@cudaq.kernel`. + +For compiler internals (`inspect` module -> `ast_bridge.py` -> Quake MLIR -> +QIR -> JIT), route to `/cudaq-compiler`. 
+ +--- + +## GPU Simulation + +To recommend the best simulation backend for the user, consult the full +comparison table in `docs/sphinx/using/backends/sims/svsims.rst`. + +### Available GPU Targets + +| Target | Description | Use when | +|---|---|---| +| `nvidia` (default) | Single-GPU state vector via cuStateVec (up to ~30 qubits) | Default choice for most simulations on a single GPU | +| `nvidia --target-option fp64` | Double-precision single GPU | Higher numerical precision needed (e.g. chemistry, sensitive observables) | +| `nvidia --target-option mgpu` | Multi-GPU, pools memory across GPUs (>30 qubits) | Circuit exceeds single-GPU memory; requires MPI | +| `nvidia --target-option mqpu` | Multi-QPU, one virtual QPU per GPU, parallel execution | Running many independent circuits in parallel (e.g. parameter sweeps, VQE gradients) | +| `tensornet` | Tensor network simulator | Shallow or low-entanglement circuits; qubit count exceeds statevector feasibility | +| `qpp-cpu` | CPU-only fallback (OpenMP) | No GPU available; macOS; small circuits for testing | + +--- + +## QPU + +When the user invokes this section, do not dump all providers at once. +Instead, follow this two-step dialogue: + +Step 1 - ask which technology they want + +```text +Which QPU technology are you targeting? + 1. Ion trap (IonQ, Quantinuum) + 2. Superconducting (IQM, OQC, Anyon, TII, QCI) + 3. Neutral atom (QuEra, Infleqtion, Pasqal) + 4. Cloud / multi-platform (AWS Braket, Scaleway) +``` + +Step 2 - once they pick a technology, ask which provider, then read the +corresponding doc file and walk the user through it step by step. 
+ +| Technology | Provider | Doc file | +|---|---|---| +| Ion trap | IonQ | `docs/sphinx/using/backends/hardware/iontrap.rst` (IonQ section) | +| Ion trap | Quantinuum | `docs/sphinx/using/backends/hardware/iontrap.rst` (Quantinuum section) | +| Superconducting | IQM | `docs/sphinx/using/backends/hardware/superconducting.rst` (IQM section) | +| Superconducting | OQC | `docs/sphinx/using/backends/hardware/superconducting.rst` (OQC section) | +| Superconducting | Anyon | `docs/sphinx/using/backends/hardware/superconducting.rst` (Anyon section) | +| Superconducting | TII | `docs/sphinx/using/backends/hardware/superconducting.rst` (TII section) | +| Superconducting | QCI | `docs/sphinx/using/backends/hardware/superconducting.rst` (QCI section) | +| Neutral atom | Infleqtion | `docs/sphinx/using/backends/hardware/neutralatom.rst` (Infleqtion section) | +| Neutral atom | QuEra | `docs/sphinx/using/backends/hardware/neutralatom.rst` (QuEra section) | +| Neutral atom | Pasqal | `docs/sphinx/using/backends/hardware/neutralatom.rst` (Pasqal section) | +| Cloud | AWS Braket | `docs/sphinx/using/backends/cloud/braket.rst` | +| Cloud | Scaleway | `docs/sphinx/using/backends/cloud/scaleway.rst` | + +After walking through the provider steps, always close with + +- Test locally first with `emulate=True` before submitting to real hardware. +- Use `cudaq.sample_async()` / `cudaq.observe_async()` for non-blocking submission. 
+ +--- + +## Applications + +CUDA-Q ships with ready-to-run application notebooks + +| Category | Examples | +|---|---| +| Optimization | QAOA, ADAPT-QAOA, MaxCut | +| Chemistry | VQE, UCCSD, ADAPT-VQE -> see `/cudaq-chemistry` | +| Error Correction | Surface codes, QEC memory -> see `/cudaq-qec` | +| Algorithms | Grover's, Shor's, QFT, Deutsch-Jozsa, HHL | +| ML | Quantum neural networks, kernel methods | +| Simulation | Hamiltonian dynamics, Trotter evolution | +| Finance | Portfolio optimization, Monte Carlo | + +Point to sub-skills for specialized topics + +- `/cudaq-qec` - full QEC memory experiment walkthrough +- `/cudaq-chemistry` - VQE and ADAPT-VQE for molecular energies +- `/cudaq-benchmark` - performance profiling and multi-GPU scaling + +--- + +## Parallelize + +CUDA-Q supports two distinct multi-GPU parallelization strategies - pick based +on what you are trying to scale. + +| Goal | Strategy | Target option | +|---|---|---| +| Single circuit too large for one GPU | Pool GPU memory | `nvidia --target-option mgpu` | +| Many independent circuits at once | Run circuits in parallel | `nvidia --target-option mqpu` | +| Large Hamiltonian expectation value | Distribute terms across GPUs | `mqpu` + `execution=cudaq.parallel.thread` | + +### Circuit batching with mqpu (`sample_async` / `observe_async`) + +The `mqpu` option maps one virtual QPU to each GPU. Dispatch circuits +asynchronously with `qpu_id` to all GPUs simultaneously. + +```python +import cudaq + +cudaq.set_target("nvidia", option="mqpu") +n_qpus = cudaq.get_platform().num_qpus() + +futures = [ + cudaq.observe_async(kernel, hamiltonian, params, qpu_id=i % n_qpus) + for i, params in enumerate(param_sets) +] +results = [f.get().expectation() for f in futures] +``` + +### Hamiltonian batching + +For a single kernel with a large Hamiltonian, add `execution=` to +`cudaq.observe` — no other code change needed. 
+ +```python +# Single node, multiple GPUs +result = cudaq.observe(kernel, hamiltonian, *args, + execution=cudaq.parallel.thread) + +# Multi-node via MPI +result = cudaq.observe(kernel, hamiltonian, *args, + execution=cudaq.parallel.mpi) +``` + +See the docs above for complete working examples of both patterns. + +--- + +## Limitations + +- GPU simulation requires Linux (x86_64 or ARM64); macOS is CPU-only +- Multi-GPU `mgpu` target requires MPI +- Kernel code must use a restricted Python subset; NumPy/SciPy are not + allowed inside kernels +- QPU access requires provider-specific credentials and accounts + +## Troubleshooting + +- Import error after `pip install cudaq`: Ensure Python 3.10+ and a + supported OS (Linux or macOS) +- No GPU detected: Verify CUDA Toolkit is installed and `nvidia-smi` + shows your GPU; fall back to `qpp-cpu` +- Kernel compile error: Check that only supported Python constructs are + used inside `@cudaq.kernel` +- QPU submission fails: Confirm credentials are set as environment + variables per the provider docs diff --git a/skills/Megatron-Bridge/adding-model-support/SKILL.md b/skills/Megatron-Bridge/adding-model-support/SKILL.md new file mode 100644 index 0000000..7965d03 --- /dev/null +++ b/skills/Megatron-Bridge/adding-model-support/SKILL.md @@ -0,0 +1,443 @@ +--- +name: adding-model-support +description: Guide for adding support for new LLM or VLM models in Megatron-Bridge. Covers bridge, provider, recipe, tests, docs, and examples. Use when the user asks to add, support, onboard, or integrate a new model, or when creating bridges, providers, or recipes for a new model family. +--- + +# Adding New Model Support in Megatron-Bridge + +## Phase 1: Discovery + +### Step 1 — Get the HF model link + +Ask the user for the HuggingFace model link (e.g. `https://huggingface.co/Qwen/Qwen3.5-VL-27B`). + +If the model is **not public**, ask the user to provide the `config.json` file directly. 
+ +### Step 2 — Fetch and analyze config.json + +Read the model's `config.json` from HuggingFace (or from the user-provided file). Key fields to extract: + +- `model_type` — used for `@register_bridge(model_type=...)` +- `architectures` — the HF model class name (used for `source=...` in registration) +- `tie_word_embeddings` — critical for weight tying +- Architecture fields: `num_hidden_layers`, `hidden_size`, `intermediate_size`, `num_attention_heads`, `num_key_value_heads`, `vocab_size`, `max_position_embeddings`, `rope_theta`, etc. +- MoE fields (if present): `num_local_experts`, `num_experts_per_tok`, `moe_intermediate_size` +- MLA fields (if present): `q_lora_rank`, `kv_lora_rank`, `qk_nope_head_dim`, `qk_rope_head_dim` + +If there are config fields you don't recognize from previously supported models (check `CONFIG_MAPPING` in `model_bridge.py` and existing bridges), this likely indicates a **new architectural block** (e.g., a novel attention variant, custom normalization, or a new layer type). Ask the user to provide the HuggingFace `modeling_*.py` implementation of that block so you can understand the computation and create the correct Megatron-side mapping or custom module. 
+ +### Step 3 — Determine VLM vs LLM + +**VLM** (Vision-Language Model) if config.json contains: +- `text_config` AND `vision_config` sub-configs +- Note: VLMs may or may not have "VL" in the name + +**LLM** (Text-only) if: +- No `text_config` / `vision_config` +- Single flat config for the language model + +This distinction affects: +- Which files to create (VLMs need a model.py combining vision + language) +- Where to read config fields from (`text_config` vs top-level for VLMs) +- Test patterns (VLMs need vision inputs in functional tests) + +### Step 4 — Check for quantized weights (FP8 / FP4) + +Inspect the HF checkpoint's `model.safetensors` (or `model.safetensors.index.json`) for quantized +weight dtypes such as `float8_e4m3fn` (FP8) or `uint8`/`uint4` with accompanying `*_scale_inv` or +`*_scale` tensors. Common signs: + +- `config.json` mentions `quantization_config` or dtype fields like `"torch_dtype": "float8_e4m3fn"` +- Safetensors contain `weight_scale_inv` keys alongside the main weight keys +- The model card mentions FP8/FP4/INT4 weights + +**Why this matters:** The bridge's `import_ckpt` path does **not** automatically dequantize — it +loads raw quantized values as-is. This produces a silently broken model (random-level loss, huge +grad norms) instead of raising an error. + +**Fix:** Dequantize before conversion. Two approaches: + +1. **Standalone script** (recommended for user-facing models) — Write a + `dequant_fp8_for_bridge.py` in the model's examples folder. + Reference: `examples/models/vlm/ministral3/dequant_fp8_for_bridge.py`. + The pattern is: `w_bf16 = fp8_weight.to(bfloat16) * weight_scale_inv`. + +2. 
**In-bridge hook** — Override `maybe_modify_loaded_hf_weight()` in the bridge class to + dequantize on the fly during import: + + ```python + def maybe_modify_loaded_hf_weight(self, hf_param, hf_state_dict): + weight = hf_state_dict[hf_param] + scale_key = hf_param + "_scale_inv" + if weight.dtype == torch.float8_e4m3fn and scale_key in hf_state_dict: + return weight.to(torch.bfloat16) * hf_state_dict[scale_key].to(torch.bfloat16) + return weight + ``` + +Always add a sanity check in the verification workflow (e.g., print `std` of a weight tensor — +quantized models typically have `std ≈ 13` before dequantization vs `std ≈ 0.006` after). + +## Phase 2: Bridge Support + +### File structure + +**LLM** — Reference: Qwen2 (`src/megatron/bridge/models/qwen/qwen2_bridge.py`) + +``` +src/megatron/bridge/models// +├── __init__.py +├── _bridge.py # Config + weight mappings (no provider file needed) +└── modeling_/ # (optional) Custom nn.Module implementations if needed + └── ... +``` + +**VLM** — Reference: Qwen3.5-VL (`src/megatron/bridge/models/qwen_vl/`) + +``` +src/megatron/bridge/models// +├── __init__.py +├── _bridge.py # Config + weight mappings +├── _provider.py # Only for VLMs that need custom provide() +└── modeling_/ # If using Megatron vision encoder + ├── __init__.py + └── model.py # Combines vision + language +``` + +OR with HF vision encoder (Reference: Gemma3-VL): + +``` +src/megatron/bridge/models// +├── __init__.py +├── _bridge.py +├── _provider.py # Only for VLMs that need custom provide() +└── modeling_.py # HF vision + Megatron language wrapper +``` + +**Model-specific modeling code:** If the model requires custom `nn.Module` implementations +(e.g. a custom RoPE variant, non-standard transformer config, custom thinker/talker +architecture), place them in a `modeling_/` directory or a single `modeling_.py` +file inside the model family folder. 
Use a directory when there are multiple files (model, +transformer config, custom ops); use a single file when one module suffices. Never put +model-specific modeling code in shared directories or as loose files in the bridge family +directory — keep them namespaced under the `modeling_` prefix. + +### Implementation order + +**LLM:** +1. **Bridge only** — Register bridge, implement `provider_bridge()` and `mapping_registry()`. + The bridge calls `super().provider_bridge()` to get a `GPTModelProvider` from `CONFIG_MAPPING`, + then sets model-specific attributes on it. **Do not create a provider file** — the stock + provider returned by `super().provider_bridge()` is usually sufficient for LLMs + (e.g., `GPTModelProvider`, or another base provider selected via `PROVIDER_CLASS`). + +**VLM:** +1. **Bridge** — Register bridge, implement config and weight mappings. +2. **Provider** (when needed) — Only VLMs that require a custom `provide()` to instantiate a + combined vision+language model need a provider subclass. The bridge manually calls + `hf_config_to_provider_kwargs(text_config)` and instantiates the custom provider. +3. **Model class** — Combine vision encoder + language decoder. + +For detailed patterns, see: +- VLM: [vlm-patterns.md](vlm-patterns.md) +- LLM: [llm-patterns.md](llm-patterns.md) + +### Critical: `tie_word_embeddings` for VLMs + +For VLMs, `tie_word_embeddings` lives on the **top-level** HF config, NOT on `text_config`. Always read from the parent config: + +```python +provider.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False) +``` + +### Critical: Config field location for VLMs + +When reading HF config for VLMs, check whether each field is in: +- `hf_config` (top-level) — e.g. `tie_word_embeddings`, `image_token_id`, `video_token_id` +- `hf_config.text_config` — e.g. `num_hidden_layers`, `hidden_size`, etc. +- `hf_config.vision_config` — e.g. 
vision encoder dimensions + +### Encapsulating model-specific layers + +When a new model introduces custom or non-standard layers (novel attention variants, custom +normalization, fused expert layouts, MTP heads, etc.), **keep all model-specific logic inside +the model family directory**. Do not modify shared files in `src/megatron/bridge/models/conversion/` +(e.g. `param_mapping.py`, `model_bridge.py`, `quant_mapping.py`) unless the change is genuinely +reusable across multiple model families. + +**Principle:** The bridge and provider files for a model family are your primary extension surface. +Shared conversion infrastructure provides hooks and base classes — subclass them locally rather +than adding conditionals to shared code. + +#### Strategy 1: Create a local mapping subclass + +If the model has a layer whose weight layout doesn't match any existing mapping class, create a +private mapping class in the bridge file or a `_mappings.py` file in the family directory. + +Example — GLM's fused expert down-projection disables grouped-export transpose: + +```python +# src/megatron/bridge/models/glm/glm_moe_mappings.py +class GLMExpertDownProjMapping(FusedExpertMapping): + def __init__(self, megatron_param, hf_param, permute_dims=None): + super().__init__(megatron_param, hf_param, permute_dims, transpose_on_export=False) +``` + +Example — Nemotron-H's MTP layers flatten indices during resolve: + +```python +# Inside nemotron_h_bridge.py (private to the module) +class _MTPFlatteningMapping(MegatronParamMapping): + def resolve(self, captures): + return AutoMapping(self._flatten(captures), ...) +``` + +Example — MiniMax-M2's non-standard QK norm layout: + +```python +# Inside minimax_m2_bridge.py (private to the module) +class _FullDimQKNormMapping(MegatronParamMapping): + def hf_to_megatron(self, hf_weights): + # Custom scatter logic for full-dim QK norm + ... + def megatron_to_hf(self, megatron_weights): + # Custom gather logic + ... 
+``` + +#### Strategy 2: Override bridge hooks + +`MegatronModelBridge` provides several override hooks — use them instead of modifying the base class: + +| Hook | When to use | +|------|-------------| +| `mapping_registry()` | Define all weight name mappings (abstract, always overridden) | +| `provider_bridge()` | Configure the provider with model-specific flags (call `super()` then setattr) | +| `maybe_modify_loaded_hf_weight()` | Dequantize, rename, or reshape HF weights before conversion | +| `maybe_modify_converted_hf_weight()` | Synthesize extra HF keys on export (e.g. `inv_freq`) | +| `megatron_to_hf_config()` | Build HF `config.json` for export | +| `hf_config_to_provider_kwargs()` | Override CONFIG_MAPPING behavior for specific fields | + +**Accessing HF config in `mapping_registry()`:** The bridge instance has `self.hf_config` +available during conversion — it is set automatically by the dispatch system before +`mapping_registry()` is called. Use it when your mapping registry needs config-dependent +logic (e.g. dynamic MTP layer count, number of experts): + +```python +def mapping_registry(self) -> MegatronMappingRegistry: + hf_config = getattr(self, "hf_config", None) + num_mtp_layers = getattr(hf_config, "num_nextn_predict_layers", 0) if hf_config else 0 + ... +``` + +Do **not** override `build_conversion_tasks()` to stash `self._hf_config` — that pattern is +deprecated. + +#### Strategy 3: Custom provider subclass (VLMs only) + +Most models do **not** need a provider file — the stock provider (e.g., `GPTModelProvider`, or +another base selected via `PROVIDER_CLASS`) is usually sufficient for LLMs. Only create a provider subclass when a VLM needs custom `provide()` logic to instantiate +a combined vision+language model: + +```python +# src/megatron/bridge/models//_provider.py +class MyVLModelProvider(GPTModelProvider): + image_token_id: int = 0 + + def provide(self, ...): + # Custom model construction combining vision encoder + language decoder + ... 
+``` + +The bridge then references it via `PROVIDER_CLASS = MyVLModelProvider` or instantiates it directly +in `provider_bridge()`. + +#### When shared file changes ARE justified + +Modify `param_mapping.py` or `model_bridge.py` only when the pattern is **reusable by 2+ model +families**. Examples of justified shared changes: + +- `FusedExpertMapping` / `FusedGatedExpertMapping` — used by GLM, DeepSeek, OLMoE, etc. +- `RMSNorm2ZeroCenteredRMSNormMapping` — used by Gemma, Nemotron, etc. +- New `CONFIG_MAPPING` entries — when a standard HF config key maps to a standard provider attribute + +If you're tempted to add a model-specific `if model_type == "..."` branch in shared code, or +pattern-matching on specific weight names in shared conversion logic, that's a signal to use a +local subclass or hook override instead. + +### Update FLOPs calculator for new architectural blocks + +If the model introduces a new computational block that differs from standard attention or MLP +(e.g., Gated DeltaNet / GDN linear attention, Multi-Token Prediction / MTP heads, Mamba SSM layers), +update the FLOPs calculator in `src/megatron/bridge/training/utils/flop_utils.py` so that +training throughput metrics (TFLOPs/GPU) are accurate. + +**When to update:** Any time the new block has different FLOPs-per-token than standard self-attention +or standard MLP. Common cases: +- Linear attention variants (GDN, RetNet, RWKV) — replace the `O(s²)` attention term with the + block's actual operation count +- MTP / speculative decoding heads — add FLOPs for the extra projection and norm layers +- SSM layers (Mamba) — different recurrence FLOPs than attention +- Novel MoE routing — may change the effective expert count + +**How to update:** + +1. Read the existing `transformer_flops()` function in `flop_utils.py` to understand the structure. +2. Add a conditional block gated on a config attribute (e.g., `experimental_attention_variant`, + `mtp_num_layers`). 
Follow the existing MoE pattern for config validation — raise on invalid + types, assert list lengths, and use direct attribute access instead of `getattr` with fallback + defaults so that misconfigurations fail explicitly. +3. Compute the per-layer FLOPs for the new block and blend it with the standard attention term + based on the layer pattern. +4. Add unit tests in `tests/unit_tests/training/utils/test_flop_utils.py` that verify: + - New-block FLOPs differ from pure-attention baseline + - Exact formula matches hand-computed expected values + - Varying the block ratio (e.g., `linear_attention_freq`) changes FLOPs + +Reference PR: [#2925 — GDN FLOPs calculator](https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/2925) +adds GDN support with both the calculator code and comprehensive tests. + +## Phase 3: Recipe Support + +Recipes provide pre-configured training settings for each model size. + +**LLM recipes:** `src/megatron/bridge/recipes//.py` +**VLM recipes:** `src/megatron/bridge/recipes//.py` + +Each recipe file defines functions for each model size + training mode: +- `__sft_config()` — Full supervised fine-tuning +- `__peft_config()` — LoRA/DoRA parameter-efficient fine-tuning +- `__pretrain_config()` — Pretraining (LLM only, usually) + +For detailed recipe patterns, see [recipe-patterns.md](recipe-patterns.md). + +### Export checklist + +1. Family `__init__.py` — import and add to `__all__` +2. Top-level `src/megatron/bridge/recipes/__init__.py` — wildcard import +3. 
`train_any_basic.py` — add to `config_map`, docstring, and `--model` choices + +## Phase 4: Tests + +### Unit tests (no GPU) + +```text +tests/unit_tests/models// +├── __init__.py +├── test__bridge.py # Mock HF config → verify provider mapping +└── test__provider.py # (optional) Only if custom provider subclass exists +``` + +### Functional tests (GPU) + +```text +tests/functional_tests/models// +├── __init__.py +├── test__conversion.py # Toy model HF↔Megatron roundtrip +└── test__provider.py # compare_provider_configs (optional) +``` + +For detailed test patterns, see [tests-and-examples.md](tests-and-examples.md). + +## Phase 5: Docs and Examples + +### Examples + +LLM examples: `examples/models//` +VLM examples: `examples/models/vlm//` + +```text +examples/models// # LLM +examples/models/vlm// # VLM +├── README.md +├── conversion.sh # HF↔Megatron conversion commands (real model) +├── inference.sh # Generation commands (real model, reasonable output) +├── slurm_sft.sh # SFT training on SLURM +└── slurm_peft.sh # PEFT training on SLURM +``` + +**Key deliverable requirement:** `conversion.sh` and `inference.sh` must target a real published model (e.g. `Qwen/Qwen3-8B`, not a toy). The inference script must produce reasonable output — for LLMs a coherent text continuation, for VLMs a plausible image description. This is the acceptance bar: conversion runs cleanly and generation makes sense. + +### Documentation + +Add a model page at `docs/models//.md` covering: +- Supported variants and sizes +- Conversion commands +- Training examples (SFT, PEFT) +- Known limitations + +## Verification Workflow + +After implementing bridge support, prompt the user to run these commands on the cluster: + +### 1. 
Smoke test (single GPU) + +```bash +uv run python -c " +from megatron.bridge import AutoBridge +bridge = AutoBridge.from_hf_pretrained('/') +provider = bridge.to_megatron_provider() +provider.tensor_model_parallel_size = 1 +provider.pipeline_model_parallel_size = 1 +provider.finalize() +model = provider.provide_distributed_model(wrap_with_ddp=False) +bridge.load_hf_weights(model) +for i, (name, tensor) in enumerate(bridge.export_hf_weights(model, cpu=True)): + print(name, tuple(tensor.shape)) + if i > 10: break +" +``` + +### 2. Conversion roundtrip (multi-GPU) + +```bash +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model / \ + --megatron-path /workspace/ \ + --torch-dtype bfloat16 + +uv run python examples/conversion/convert_checkpoints.py export \ + --hf-model / \ + --megatron-path /workspace//iter_0000000 \ + --hf-path /workspace/-hf-export +``` + +### 3. Generation test + +For LLMs: +```bash +uv run python examples/conversion/hf_to_megatron_generate_text.py \ + --hf_model_path / --prompt "Hello" +``` + +For VLMs: +```bash +uv run python examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path / \ + --image_path "https://example.com/image.jpeg" \ + --prompt "Describe this image." +``` + +### 4. Run tests + +```bash +uv run python -m pytest tests/unit_tests/models// -v +uv run python -m pytest tests/functional_tests/models// -v --run-gpu +``` + +## Quick Decision Tree + +``` +User wants to add a model +│ +├─ Has HF link? ─── No ──→ Ask for link (or config.json if private) +│ +├─ Has text_config + vision_config? ─── Yes ──→ VLM path +│ ├─ Has Megatron vision encoder? ──→ Megatron encoder (Qwen3.5 pattern) +│ └─ No Megatron encoder ──→ HF encoder (Gemma3 pattern) +│ +└─ No vision config ──→ LLM path (bridge only, no provider file) + ├─ Standard GPT-style? ──→ Bridge with stock mappings + └─ Custom layers? ──→ Bridge + local mapping subclasses / hook overrides + ├─ Custom weight layout? 
──→ Local mapping subclass in family dir + └─ Custom import/export? ──→ Override bridge hooks (maybe_modify_*) +``` diff --git a/skills/Megatron-Bridge/adding-model-support/llm-patterns.md b/skills/Megatron-Bridge/adding-model-support/llm-patterns.md new file mode 100644 index 0000000..674ef9e --- /dev/null +++ b/skills/Megatron-Bridge/adding-model-support/llm-patterns.md @@ -0,0 +1,217 @@ +# LLM Bridge Patterns + +Reference implementations: +- Simple dense: Qwen2 (`src/megatron/bridge/models/qwen/qwen2_bridge.py`) +- MoE: GLM-4.5 (`src/megatron/bridge/models/glm/glm45_bridge.py`) +- MoE with custom layer spec: OLMoE (`src/megatron/bridge/models/olmoe/olmoe_bridge.py`) +- Advanced (YARN, MoE, provider re-wrap): GPT-OSS (`src/megatron/bridge/models/gpt_oss/`) + +## Provider Pattern + +Most bridges do **not** need a custom provider subclass. The base `provider_bridge()` uses +`CONFIG_MAPPING` to auto-create a `GPTModelProvider` from HF config. The bridge then sets +model-specific attributes directly on the returned provider instance. + +```python +def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GPTModelProvider: + provider = super().provider_bridge(hf_pretrained) + + provider.normalization = "RMSNorm" + provider.gated_linear_unit = True + provider.position_embedding_type = "rope" + provider.add_bias_linear = False + provider.hidden_dropout = 0.0 + provider.autocast_dtype = torch.bfloat16 + + # MoE settings (if applicable) + provider.moe_grouped_gemm = True + provider.moe_token_dispatcher_type = "alltoall" + + return provider +``` + +### When you DO need a provider subclass + +Create a `GPTModelProvider` subclass only when: + +1. **Extra dataclass fields** — The provider has fields not on `GPTModelProvider` (e.g., YARN + RoPE params, custom MoE fields) that need to serialize into `run_config.yaml`. +2. 
**Custom `provide()` logic** — The model needs special instantiation (e.g., TE version + checks, sink attention, custom layer specs that require runtime logic). +3. **Predefined size variants for recipes** — Hardcoded configs like `LlamaModelProvider8B` + used by recipe functions (not by the bridge itself). + +```python +@dataclass +class MyModelProvider(GPTModelProvider): + yarn_rotary_scaling_factor: Optional[float] = None + yarn_original_max_position_embeddings: Optional[int] = None + + def provide(self, pre_process=None, post_process=None, vp_stage=None): + # Custom logic only if needed + return super().provide(pre_process, post_process, vp_stage) +``` + +If the bridge uses a custom provider, re-wrap the base provider in `provider_bridge()`: + +```python +def provider_bridge(self, hf_pretrained) -> MyModelProvider: + provider = super().provider_bridge(hf_pretrained) + provider = MyModelProvider(**{f.name: getattr(provider, f.name) for f in fields(provider)}) + provider.yarn_rotary_scaling_factor = ... 
+ return provider +``` + +### Predefined size variants (for recipes only) + +Size-specific subclasses are used by recipes, not by the bridge: + +```python +@dataclass +class MyModelProvider7B(MyModelProvider): + num_layers: int = 32 + hidden_size: int = 4096 + num_attention_heads: int = 32 + num_query_groups: int = 8 + ffn_hidden_size: int = 14336 + vocab_size: int = 128256 +``` + +## Bridge Pattern + +```python +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.conversion.param_mapping import AutoMapping, QKVMapping, GatedMLPMapping +from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM + +@MegatronModelBridge.register_bridge( + source=MyModelForCausalLM, # HF class (or string "MyModelForCausalLM") + target=GPTModel, # Megatron target + model_type="my_model", # HF model_type +) +class MyModelBridge(MegatronModelBridge): + + def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GPTModelProvider: + provider = super().provider_bridge(hf_pretrained) + + provider.normalization = "RMSNorm" + provider.gated_linear_unit = True + provider.position_embedding_type = "rope" + provider.add_bias_linear = False + provider.hidden_dropout = 0.0 + provider.autocast_dtype = torch.bfloat16 + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + return MegatronMappingRegistry( + # Embeddings + AutoMapping( + megatron_param="embedding.word_embeddings.weight", + hf_param="model.embed_tokens.weight", + ), + # Output layer + AutoMapping( + megatron_param="output_layer.weight", + hf_param="lm_head.weight", + ), + # Final layernorm + AutoMapping( + megatron_param="decoder.final_layernorm.weight", + hf_param="model.norm.weight", + ), + # QKV (fused) + QKVMapping( + 
megatron_param="decoder.layers.*.self_attention.linear_qkv.weight", + q="model.layers.*.self_attn.q_proj.weight", + k="model.layers.*.self_attn.k_proj.weight", + v="model.layers.*.self_attn.v_proj.weight", + ), + # Attention output projection + AutoMapping( + megatron_param="decoder.layers.*.self_attention.linear_proj.weight", + hf_param="model.layers.*.self_attn.o_proj.weight", + ), + # MLP (gated) + GatedMLPMapping( + megatron_param="decoder.layers.*.mlp.linear_fc1.weight", + gate="model.layers.*.mlp.gate_proj.weight", + up="model.layers.*.mlp.up_proj.weight", + ), + AutoMapping( + megatron_param="decoder.layers.*.mlp.linear_fc2.weight", + hf_param="model.layers.*.mlp.down_proj.weight", + ), + # Layer norms + AutoMapping( + megatron_param="decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + hf_param="model.layers.*.input_layernorm.weight", + ), + AutoMapping( + megatron_param="decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + hf_param="model.layers.*.post_attention_layernorm.weight", + ), + ) +``` + +### Base CONFIG_MAPPING + +The base class provides automatic mapping for common fields — no need to duplicate: + +```text +(num_hidden_layers, num_layers), (hidden_size, hidden_size), +(intermediate_size, ffn_hidden_size), (num_attention_heads, num_attention_heads), +(num_key_value_heads, num_query_groups), (head_dim, kv_channels), +(vocab_size, vocab_size), (max_position_embeddings, seq_length), +(rms_norm_eps, layernorm_epsilon), (rope_theta, rotary_base), +(tie_word_embeddings, share_embeddings_and_output_weights), +(attention_bias, add_qkv_bias), (mlp_bias, add_bias_linear), +``` + +### MoE weight mappings + +For models with Mixture of Experts, use expert-specific mappings: + +```python +ExpertMLPGateUpProjMapping( + megatron_param="decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weight", + gate="model.layers.*.mlp.experts.*.gate_proj.weight", + up="model.layers.*.mlp.experts.*.up_proj.weight", +), +ExpertMLPDownProjMapping( + 
megatron_param="decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight", + hf_param="model.layers.*.mlp.experts.*.down_proj.weight", +), +AutoMapping( + megatron_param="decoder.layers.*.mlp.router.weight", + hf_param="model.layers.*.mlp.gate.weight", +), +``` + +### Optional weight modification hooks + +Override these for special handling (e.g., quantized weights, expert layout): + +```python +def maybe_modify_loaded_hf_weight(self, hf_param, hf_state_dict): + """Transform HF weights before loading into Megatron (e.g., dequantize).""" + return hf_state_dict[hf_param] + +def maybe_modify_converted_hf_weight(self, task, converted_weights_dict, hf_state_dict): + """Transform weights after Megatron→HF conversion (e.g., merge expert shards).""" + return converted_weights_dict +``` + +## Registration Options + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `source` | Yes | HF model class or string class name | +| `target` | Yes | Megatron model class (usually `GPTModel`) | +| `provider` | No | Provider class (defaults to `GPTModelProvider`) | +| `model_type` | No | HF `model_type` string for export config | + +If `source` is a string (model not importable), the bridge is matched by class name. diff --git a/skills/Megatron-Bridge/adding-model-support/recipe-patterns.md b/skills/Megatron-Bridge/adding-model-support/recipe-patterns.md new file mode 100644 index 0000000..94a706d --- /dev/null +++ b/skills/Megatron-Bridge/adding-model-support/recipe-patterns.md @@ -0,0 +1,169 @@ +# Recipe Patterns + +Recipes provide pre-configured `ConfigContainer` objects for training each model variant. 
+ +Reference implementations: +- **VLM:** `src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py` +- **LLM:** `src/megatron/bridge/recipes/gpt_oss/gpt_oss.py` + +## File Structure + +```text +src/megatron/bridge/recipes// +├── __init__.py # Import and expose recipe functions +└── .py # Recipe functions for all sizes +``` + +## Recipe Function Pattern + +Each model size gets dedicated functions for SFT, PEFT, and optionally pretrain: + +```python +def __sft_config() -> ConfigContainer: + """SFT config for .""" + cfg = _sft_common() # or _sft_common_vlm() for VLMs + + # Model + cfg.model = AutoBridge.from_hf_pretrained("/").to_megatron_provider(load_weights=False) + + # Parallelism + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.sequence_parallel = True + + # Training + cfg.training.max_steps = 100 + cfg.training.global_batch_size = 128 + cfg.training.micro_batch_size = 1 + + # Optimizer + cfg.optimizer.lr = 5e-6 + cfg.optimizer.weight_decay = 0.01 + + # VLM-specific (if applicable) + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + return cfg + + +def __peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """PEFT config for .""" + cfg = _peft_common() # or _peft_common_vlm() for VLMs + + cfg.model = AutoBridge.from_hf_pretrained("/").to_megatron_provider(load_weights=False) + + # PEFT typically uses smaller parallelism + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + + # PEFT uses higher LR + cfg.optimizer.lr = 2e-4 + + # PEFT config + peft_cfg = default_peft_config(peft_scheme) + cfg.peft = peft_cfg + + return cfg +``` + +## Common Base Functions + +| Function | Use Case | +|----------|----------| +| `_pretrain_common()` | LLM pretraining | +| `_sft_common()` | LLM supervised fine-tuning | +| `_peft_common()` | LLM parameter-efficient fine-tuning | +| `_sft_common_vlm()` | VLM SFT 
(adds vision dataset, null tokenizer) | +| `_peft_common_vlm()` | VLM PEFT | + +VLM variants additionally set: +- `cfg.dataset` to `HFDatasetConversationProvider` (e.g., CORD-v2) +- `cfg.dataset.hf_processor_path` for the vision processor +- `NullTokenizer` (tokenization handled by processor) +- DDP without overlap (for vision model compatibility) + +## Parallelism Guidelines + +**Constraint:** `max(TP*CP, EP) * PP` = minimum GPUs, with 8 GPUs per node. + +| Model Size | TP | PP | EP | CP | Notes | +|-----------|----|----|----|----|-------| +| < 3B | 1 | 1 | 1 | 1 | Single GPU | +| 3-8B | 2 | 1 | 1 | 1 | | +| 8-13B | 4 | 1 | 1 | 1 | | +| 13-70B | 4 | 4 | 1 | 1 | | +| MoE (any) | 1-2 | 1-4 | 8-32 | 1 | EP dominates | + +**Rules:** +- TP must be <= `num_key_value_heads` +- When EP > 1 and TP > 1, `sequence_parallel` must be True +- PEFT typically uses smaller parallelism (TP=1, PP=1) + +## Export / Registration + +### Family `__init__.py` + +```python +from megatron.bridge.recipes.<family>.<model> import ( + <model>_<size1>_sft_config, + <model>_<size1>_peft_config, + <model>_<size2>_sft_config, + <model>_<size2>_peft_config, +) + +__all__ = [ + "<model>_<size1>_sft_config", + "<model>_<size1>_peft_config", + # ... +] +``` + +### Top-level `recipes/__init__.py` + +Add a wildcard import: + +```python +from megatron.bridge.recipes.<family> import * +``` + +### `train_any_basic.py` + +Add entry to `config_map` dict, docstring model list, and `--model` argparse choices. + +## Recipe Test Patterns + +### Unit test (no GPU) + +Monkeypatch `AutoBridge` to return a mock provider.
Verify `ConfigContainer` structure: + +```python +def test_sft_config(monkeypatch): + monkeypatch.setattr("megatron.bridge.AutoBridge.from_hf_pretrained", mock_bridge) + cfg = model_size_sft_config() + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.training.global_batch_size == 128 +``` + +### Functional test (GPU) + +Use `run_pretrain_vl_recipe_test()` from `tests/functional_tests/recipes/utils.py`: + +```python +RECIPES = [ + (model_size_sft_config, "model_size_sft", {}, {}), +] + +PEFT_RECIPES = [ + (partial(model_size_peft_config, peft="lora"), "model_size_peft", {}, {}), +] +``` + +### Five training scenarios to cover (VLMs) + +1. SFT nothing frozen +2. SFT language frozen (train vision + projection) +3. SFT vision + language frozen (train projection only) +4. PEFT with vision frozen +5. PEFT with nothing frozen diff --git a/skills/Megatron-Bridge/adding-model-support/tests-and-examples.md b/skills/Megatron-Bridge/adding-model-support/tests-and-examples.md new file mode 100644 index 0000000..66d0c7b --- /dev/null +++ b/skills/Megatron-Bridge/adding-model-support/tests-and-examples.md @@ -0,0 +1,326 @@ +# Test and Example Patterns + +## Unit Tests + +Location: `tests/unit_tests/models//` + +### Bridge Unit Test + +Mock the HF config and pretrained model, then verify `provider_bridge()` and `mapping_registry()`. 
+ +```python +import pytest +from unittest.mock import Mock +from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM # or .causal_lm + +def _make_mock_config(): + """Create a mock HF config with model-specific attributes.""" + config = Mock() + config.num_hidden_layers = 4 + config.hidden_size = 256 + config.intermediate_size = 512 + config.num_attention_heads = 4 + config.num_key_value_heads = 2 + config.vocab_size = 32000 + config.max_position_embeddings = 2048 + config.rope_theta = 10000.0 + config.rms_norm_eps = 1e-6 + config.tie_word_embeddings = False + # For VLMs: add text_config and vision_config + # config.text_config = _make_text_config() + # config.vision_config = _make_vision_config() + return config + +def _make_mock_pretrained(config): + pretrained = Mock(spec=PreTrainedVLM) # or PreTrainedCausalLM + pretrained.config = config + return pretrained + +class TestMyModelBridgeProviderBridge: + @pytest.fixture + def bridge(self): + return MyModelBridge() + + @pytest.fixture + def mock_pretrained(self): + return _make_mock_pretrained(_make_mock_config()) + + def test_provider_type(self, bridge, mock_pretrained): + provider = bridge.provider_bridge(mock_pretrained) + assert isinstance(provider, GPTModelProvider) # or custom provider class if one exists + + def test_config_mapping(self, bridge, mock_pretrained): + provider = bridge.provider_bridge(mock_pretrained) + assert provider.num_layers == 4 + assert provider.hidden_size == 256 + assert provider.num_attention_heads == 4 + + def test_tie_word_embeddings(self, bridge, mock_pretrained): + provider = bridge.provider_bridge(mock_pretrained) + assert provider.share_embeddings_and_output_weights == False + +class TestMyModelBridgeMappingRegistry: + @pytest.fixture + def bridge(self): + return MyModelBridge() + + def test_has_embedding_mapping(self, bridge): + registry = bridge.mapping_registry() + hf_params = {m.hf_param for m in registry.mappings if hasattr(m, 'hf_param')} + assert 
"model.embed_tokens.weight" in hf_params + + def test_has_output_layer_mapping(self, bridge): + registry = bridge.mapping_registry() + megatron_params = {m.megatron_param for m in registry.mappings} + assert any("output_layer" in p for p in megatron_params) +``` + +### Provider Unit Test (only if custom provider subclass exists) + +Skip this if the bridge uses `GPTModelProvider` directly (most LLM bridges). +Only needed for VLM providers or LLM providers with custom fields/`provide()` logic. + +```python +class TestMyModelProvider: + def test_defaults(self): + provider = MyModelProvider( + num_layers=32, hidden_size=4096, + num_attention_heads=32, num_query_groups=8, + ) + assert provider.normalization == "RMSNorm" + + def test_tp_validation(self): + with pytest.raises(ValueError): + provider = MyModelProvider( + num_query_groups=2, + tensor_model_parallel_size=4, + ) + provider.validate_parallelism() +``` + +### Skip conditions + +```python +# Module-level skip for optional dependencies +pytestmark = pytest.mark.skipif( + not _HAS_MODEL_CLASS, + reason="transformers version does not support MyModel" +) + +# Class-level skip +@pytest.mark.skipif(not _HAS_MOE_CLASS, reason="MoE class not available") +class TestMyMoEBridge: + ... +``` + +## Functional Tests + +Location: `tests/functional_tests/models//` + +### Conversion Functional Test + +Tests HF ↔ Megatron roundtrip on GPU with a toy model. + +```python +import subprocess +import pytest + +# Toy model config (reduced sizes for fast testing) +HF_TOY_MODEL_CONFIG = { + "model_type": "my_model", + "num_hidden_layers": 4, + "hidden_size": 256, + "intermediate_size": 512, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "vocab_size": 2048, + "max_position_embeddings": 512, + # ... 
model-specific fields +} + +@pytest.fixture(scope="class") +def toy_model_path(tmp_path_factory): + """Create a small HF model for testing.""" + from transformers import AutoConfig + model_dir = tmp_path_factory.mktemp("toy_model") + config = AutoConfig.for_model(**HF_TOY_MODEL_CONFIG) + model = MyModelForCausalLM(config) + model.save_pretrained(str(model_dir), safe_serialization=True) + return str(model_dir) + +@pytest.mark.run_only_on("GPU") +class TestMyModelConversion: + @pytest.mark.parametrize("tp,pp", [(1, 1), (2, 1)]) + def test_roundtrip(self, toy_model_path, tp, pp, tmp_path): + result = subprocess.run( + [ + "uv", "run", "python", "-m", "torch.distributed.run", + f"--nproc_per_node={tp * pp}", + "examples/conversion/hf_megatron_roundtrip_multi_gpu.py", + f"--hf-model-id={toy_model_path}", + f"--output-dir={tmp_path}", + f"--tp={tp}", f"--pp={pp}", + ], + capture_output=True, text=True, + ) + assert result.returncode == 0, f"Conversion failed: {result.stderr}" +``` + +### VLM toy model creation + +VLM toy models need both text and vision configs: + +```python +HF_VLM_TOY_CONFIG = { + "model_type": "my_vlm", + "text_config": { + "num_hidden_layers": 4, + "hidden_size": 256, + # ... + }, + "vision_config": { + "hidden_size": 128, + "num_hidden_layers": 2, + # ... + }, + "image_token_id": 151655, + "video_token_id": 151656, + "tie_word_embeddings": False, +} +``` + +### MoE toy model: fuse expert weights + +Some MoE models store experts in fused format. After creating the model, fuse: + +```python +def _fuse_moe_expert_weights(model_dir): + """Convert per-expert weights to fused gate_up_proj/down_proj layout.""" + # Load safetensors, reshape per-expert into combined tensors, save back + ... +``` + +### Test marks + +```python +@pytest.mark.run_only_on("GPU") # Requires GPU +@pytest.mark.parametrize("tp,pp", [(2, 1)]) # Parallelism variants +@pytest.mark.skipif(...) 
# Conditional skip +``` + +## Example Scripts + +Example scripts target **real published models** (e.g. `Qwen/Qwen3-8B`), not toy configs. +The inference script must produce reasonable output — a coherent text completion for LLMs, +a plausible image description for VLMs. This is the acceptance bar for the deliverable. + +### Conversion example (`examples/models///conversion.sh`) + +```bash +#!/usr/bin/env bash +set -e + +WORKSPACE=${WORKSPACE:-/workspace} +MODEL_NAME= +HF_MODEL=/${MODEL_NAME} +TP=1; PP=8; EP=1 # Adjust per model + +# Import HF → Megatron +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model ${HF_MODEL} \ + --megatron-path ${WORKSPACE}/${MODEL_NAME} \ + --torch-dtype bfloat16 + +# Compare logits +uv run python -m torch.distributed.run --nproc_per_node=8 \ + examples/conversion/compare_hf_and_megatron/compare.py \ + --hf_model_path ${HF_MODEL} \ + --megatron_model_path ${WORKSPACE}/${MODEL_NAME} \ + --prompt "Hello, how are you?" \ + --tp ${TP} --pp ${PP} --ep ${EP} + +# Export Megatron → HF +uv run python examples/conversion/convert_checkpoints.py export \ + --hf-model ${HF_MODEL} \ + --megatron-path ${WORKSPACE}/${MODEL_NAME}/iter_0000000 \ + --hf-path ${WORKSPACE}/${MODEL_NAME}-hf-export + +# Roundtrip validation +uv run python -m torch.distributed.run --nproc_per_node=8 \ + examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id ${HF_MODEL} --tp ${TP} --pp ${PP} --ep ${EP} +``` + +### Inference example (`examples/models///inference.sh`) + +For LLMs: +```bash +uv run python examples/conversion/hf_to_megatron_generate_text.py \ + --hf_model_path ${HF_MODEL} --prompt "Hello" +``` + +For VLMs: +```bash +uv run python examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path ${HF_MODEL} \ + --image_path "https://example.com/image.jpeg" \ + --prompt "Describe this image." 
+``` + +### VLM inference adds `--model_class` for non-default HF classes: +```bash +--model_class "MyModelForConditionalGeneration" +``` + +## Documentation Page + +Create `docs/models/<family>/<model>.md`: + +```markdown +# <Model Name> + +## Supported Variants + +| Variant | Parameters | HF Path | +|---------|-----------|---------| +| <model>-7B | 7B | <org>/<model>-7B | +| <model>-70B | 70B | <org>/<model>-70B | + +## Conversion + +\`\`\`bash +# HF → Megatron +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model <org>/<model> --megatron-path /workspace/<model> +\`\`\` + +## Training + +See `examples/models/<family>/<model>/slurm_sft.sh` and `slurm_peft.sh` for full Slurm scripts. +Single-node quick-start: + +### SFT +\`\`\`bash +uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe <model>_<size>_sft_config \ + checkpoint.pretrained_checkpoint=/workspace/models/<model> \ + model.tensor_model_parallel_size=<TP> \ + model.pipeline_model_parallel_size=<PP> \ + train.train_iters=1000 \ + train.global_batch_size=<GBS> +\`\`\` + +### PEFT (LoRA) +\`\`\`bash +uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \ + --recipe <model>_<size>_peft_config \ + checkpoint.pretrained_checkpoint=/workspace/models/<model> \ + model.tensor_model_parallel_size=<TP> \ + model.pipeline_model_parallel_size=<PP> \ + train.train_iters=1000 \ + train.global_batch_size=<GBS> +\`\`\` + +## Known Limitations +- [List any known issues] +``` diff --git a/skills/Megatron-Bridge/adding-model-support/vlm-patterns.md b/skills/Megatron-Bridge/adding-model-support/vlm-patterns.md new file mode 100644 index 0000000..a84e6d0 --- /dev/null +++ b/skills/Megatron-Bridge/adding-model-support/vlm-patterns.md @@ -0,0 +1,197 @@ +# VLM Bridge Patterns + +Reference implementations: +- **Megatron vision encoder:** Qwen3.5-VL (`src/megatron/bridge/models/qwen_vl/`) +- **HF vision encoder:** Gemma3-VL (`src/megatron/bridge/models/gemma_vl/`) + +## Provider Pattern + +Subclass `GPTModelProvider`.
VLM providers add vision-specific fields on top of standard LLM fields. + +```python +@dataclass +class MyVLModelProvider(GPTModelProvider): + # Vision config (passed as a HF config object) + vision_config: Optional[Any] = None + + # VLM-specific token IDs + image_token_id: Optional[int] = None + video_token_id: Optional[int] = None + + # Freeze options + freeze_language_model: bool = False + freeze_vision_model: bool = False + freeze_vision_projection: bool = False + + # Whether to use HF vision model (vs Megatron) + use_hf_vision_model: bool = False + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> MyVLModel: + # Build language layer spec + language_transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(...) + # Build vision config if needed + # Instantiate combined model + model = MyVLModel(config=self, ...) + if self.freeze_language_model or self.freeze_vision_model or self.freeze_vision_projection: + model.freeze(self.freeze_language_model, self.freeze_vision_model, self.freeze_vision_projection) + return model + + def provide_language_model(self, pre_process=None, post_process=None, vp_stage=None): + """Returns language-only model (for text-only inference).""" + return GPTModel(config=self, ...) 
+ + def validate_parallelism(self): + if self.num_query_groups < self.tensor_model_parallel_size: + raise ValueError(f"TP ({self.tensor_model_parallel_size}) must be <= num_query_groups ({self.num_query_groups})") +``` + +### Key provider fields by source + +Read these from the correct config level: + +| Field | Source (VLM) | Notes | +|-------|-------------|-------| +| `num_layers`, `hidden_size`, `ffn_hidden_size` | `text_config` | Core architecture | +| `num_attention_heads`, `num_key_value_heads` | `text_config` | Attention config | +| `vocab_size`, `max_position_embeddings` | `text_config` | Tokenizer/position | +| `rope_theta` | `text_config` | RoPE | +| `tie_word_embeddings` | **top-level** `hf_config` | CRITICAL: not text_config | +| `vision_config` | **top-level** `hf_config` | Vision encoder config | +| `image_token_id`, `video_token_id` | **top-level** `hf_config` | Special token IDs | + +## Bridge Pattern + +```python +@MegatronModelBridge.register_bridge( + source="MyModelForConditionalGeneration", # HF class name (string if not importable) + target=MyVLModel, # Megatron model class + provider=MyVLModelProvider, # Provider class + model_type="my_model", # HF model_type for export +) +class MyVLBridge(MegatronModelBridge): + def provider_bridge(self, hf_pretrained: PreTrainedVLM) -> MyVLModelProvider: + hf_config = hf_pretrained.config + text_config = hf_config.text_config + + # Map text config to provider kwargs using base class helper + provider_kwargs = self.hf_config_to_provider_kwargs(text_config) + provider = MyVLModelProvider(**provider_kwargs) + + # CRITICAL: tie_word_embeddings from top-level config + provider.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False) + + # Vision config + provider.vision_config = hf_config.vision_config + + # VLM-specific fields from top-level config + provider.image_token_id = getattr(hf_config, "image_token_id", None) + provider.video_token_id = getattr(hf_config, 
"video_token_id", None) + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + return MegatronMappingRegistry( + # Language model mappings (prefixed with language_model.*) + AutoMapping(megatron_param="language_model.embedding.word_embeddings.weight", + hf_param="model.embed_tokens.weight"), + AutoMapping(megatron_param="language_model.output_layer.weight", + hf_param="model.lm_head.weight"), + # ... language decoder layers ... + QKVMapping( + megatron_param="language_model.decoder.layers.*.self_attention.linear_qkv.weight", + q="model.language_model.layers.*.self_attn.q_proj.weight", + k="model.language_model.layers.*.self_attn.k_proj.weight", + v="model.language_model.layers.*.self_attn.v_proj.weight", + ), + # Vision model mappings + AutoMapping(megatron_param="vision_model.patch_embed.proj.**", + hf_param="model.visual.patch_embed.proj.**"), + # ... vision layers ... + ) +``` + +### Import types + +```python +from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM # VLM +from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM # LLM +``` + +## VLM Model Class Patterns + +### Option A: Megatron Vision Encoder (Qwen3.5 pattern) + +Both vision and language use Megatron modules. Full parallelism support. + +```python +class MyVLModel(MegatronModule): + def __init__(self, config, pre_process=True, post_process=True, ...): + if pre_process: + self.vision_model = MyVisionModel(config.vision_config, ...) + self.language_model = MyGPTModel(config, ...) + + def forward(self, input_ids, pixel_values, image_grid_thw, ...): + # 1. Vision: pixel_values → vision_embeds + vision_embeds = self.vision_model(pixel_values, image_grid_thw) + # 2. Text embeddings + text_embeds = self.language_model.embedding(input_ids) + # 3. Scatter vision into text at image token positions + combined = text_embeds.clone() + combined[vision_mask] = vision_embeds + # 4. 
Language model forward + return self.language_model(decoder_input=combined, ...) + + def freeze(self, freeze_language, freeze_vision, freeze_projection): + if freeze_language: + for p in self.language_model.parameters(): p.requires_grad = False + if freeze_vision: + for p in self.vision_model.parameters(): p.requires_grad = False + # projection freeze logic +``` + +### Option B: HF Vision Encoder (Gemma3 pattern) + +HF vision encoder + Megatron projector + Megatron language model. Simpler to implement. + +```python +class MyVLModel(MegatronModule): + def __init__(self, config, pre_process=True, post_process=True, ...): + if pre_process: + self.vision_tower = AutoModel.from_config(config.vision_config) + hook_hf_module_setattr_for_tp_grad_sync(self.vision_tower) + self.multi_modal_projector = MyProjector(config) + self.language_model = config.provide_language_model(pre_process, post_process) + + def forward(self, input_ids, pixel_values, ...): + text_embeds = self.language_model.embedding(input_ids) + if pixel_values is not None: + image_features = self.vision_tower(pixel_values).pooler_output + image_features = self.multi_modal_projector(image_features) + text_embeds.masked_scatter_(special_image_mask, image_features) + return self.language_model(decoder_input=text_embeds, ...) +``` + +## Weight Mapping Naming Conventions + +VLM weight names typically have these prefixes: + +| Megatron prefix | HF prefix | Component | +|----------------|-----------|-----------| +| `language_model.*` | `model.language_model.*` or `model.layers.*` | Text decoder | +| `language_model.embedding.*` | `model.embed_tokens.*` | Text embeddings | +| `language_model.output_layer.*` | `model.lm_head.*` or `lm_head.*` | Output head | +| `vision_model.*` | `model.visual.*` or `vision_tower.*` | Vision encoder | + +Check the actual HF model's `state_dict()` keys to determine exact naming. 
+ +## Common Mapping Types for VLMs + +| Mapping Class | Use Case | +|--------------|----------| +| `AutoMapping` | 1:1 name mapping (most weights) | +| `QKVMapping` | Fused Q/K/V projections | +| `ConcatenatedQKVMapping` | Vision QKV (different from language) | +| `GatedMLPMapping` | gate_proj + up_proj → linear_fc1 | +| `ReplicatedMapping` | Weights replicated across TP ranks (e.g. patch_embed) | +| `ExpertMLPGateUpProjMapping` | MoE gate+up projections | +| `ExpertMLPDownProjMapping` | MoE down projections | diff --git a/skills/Megatron-Bridge/code-style/SKILL.md b/skills/Megatron-Bridge/code-style/SKILL.md new file mode 100644 index 0000000..e908055 --- /dev/null +++ b/skills/Megatron-Bridge/code-style/SKILL.md @@ -0,0 +1,304 @@ +--- +name: code-style +description: Code style and quality guidelines for Megatron Bridge. Covers naming, type hints, ruff enforcement, keyword-arg safety, copyright headers, logging, and common anti-patterns. Auto-invoked during code review and when writing new code. +--- + +# Code Style for Megatron Bridge + +This is the single source of truth for code style conventions in +Megatron Bridge, combining the ruff/pre-commit configuration with +project-specific rules. Read this before writing new code or reviewing PRs. + +## Style Guides + +- Python: [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) +- Shell: [Google Shell Style Guide](https://google.github.io/styleguide/shellguide.html) + +This repository is Python-first. Target Python 3.10+. + +## Formatting and Linting + +Run before every commit: + +```bash +uv run ruff check --fix . +uv run ruff format . +``` + +Pre-commit hooks run these automatically. If hooks auto-fix files, re-stage +and re-run until clean. 
+ +### Ruff Rules (from `ruff.toml`) + +| Rule | ID | Description | +|---|---|---| +| Line length | — | 119 characters (formatter) | +| Quote style | — | Double quotes | +| f-string without placeholders | F541 | Error | +| Unused local variable | F841 | Auto-removed by `--fix` | +| Unused import | F401 | Auto-removed by `--fix` (ignored in `__init__.py`) | +| Ambiguous variable name | E741 | Error (e.g., `l`, `O`, `I`) | +| Undefined name | F821 | Error | +| Block comment format | E266 | Error (too many `#`) | +| Import sorting | I | isort-compatible, auto-fixed | +| Public class docstring | D101 | Warning (ignored in test files) | +| Public function docstring | D103 | Warning (ignored in test files) | + +**Per-file overrides:** +- `__init__.py`: F401 and F403 are ignored (re-exports are expected). +- `test_*.py`, `*_test.py`, `tests/*.py`: D101 and D103 are ignored. + +## Naming Conventions + +| Kind | Convention | Example | +|---|---|---| +| Files | snake_case | `model_bridge.py` | +| Classes | PascalCase | `MegatronModelBridge` | +| Functions/methods | snake_case | `load_weights_hf_to_megatron` | +| Local variables | snake_case | `megatron_weights` | +| Variables starting with digit | prefix `k` | `k_99th_percentile` | +| Global variables | UPPER_SNAKE + prefix `G` | `G_LOGGER` | +| Constants | UPPER_SNAKE | `DEFAULT_HIDDEN_SIZE` | + +- Avoid shadowing variables from an outer scope. +- Initialize all externally visible class members in the constructor. + +## Import Order + +Organize imports in this order, separated by blank lines: + +1. `__future__` imports +2. Standard library +3. Third-party (`megatron.core`, `torch`, `transformers`, etc.) +4. First-party (`megatron.bridge.*`) +5. Local folder imports + +ruff auto-fixes import ordering via the `I` rule. First-party is configured +as `known-first-party = ["megatron.bridge"]`. + +## Type Hints + +Required on all public API functions and methods. 
+ +- Use `T | None` instead of `Optional[T]` +- Use `X | Y` instead of `Union[X, Y]` +- Use built-in generics (`list`, `dict`, `tuple`) instead of `typing` equivalents +- Use `TypeVar` for generic type parameters + +```python +def get_module_by_name( + model: torch.nn.Module, + name: str, + default: torch.nn.Module | None = None, +) -> torch.nn.Module | None: + ... +``` + +### Mypy + +Run mypy on changed files before submitting: + +```bash +uv run mypy --strict path/to/file.py +``` + +Key rules enforced by mypy: + +- **No `Any` leaks** — avoid `Any` in public signatures. Use `object` for truly + unknown types or a `TypeVar` for generic patterns. +- **No untyped defs** — every function must have parameter and return annotations. + Use `-> None` for procedures. +- **No implicit `Optional`** — write `x: int | None = None`, never `x: int = None`. +- **Explicit casts** — use `typing.cast()` only when the type system cannot infer + the correct type; add a comment explaining why. +- **Typed dictionaries** — prefer `TypedDict` over `dict[str, Any]` for + structured dictionaries with known keys. +- **Callable signatures** — use `Callable[[ArgType], ReturnType]` or + `Protocol` instead of bare `Callable`. +- **Ignore sparingly** — `# type: ignore[code]` must include the specific error + code and a comment justifying the suppression. + +## Enforce Keyword Arguments for Ambiguous Parameters + +When a function has multiple parameters of the same type that could be +swapped by mistake, use a bare `*` to force keyword-only arguments. + +**Don't:** +```python +def scatter_weights(tensor: Tensor, tp_group: ProcessGroup, ep_group: ProcessGroup): + ... +scatter_weights(t, ep_group, tp_group) # silently wrong +``` + +**Do:** +```python +def scatter_weights(tensor: Tensor, *, tp_group: ProcessGroup, ep_group: ProcessGroup): + ... +scatter_weights(t, tp_group=tp_group, ep_group=ep_group) # clear +``` + +## Docstrings + +Use Google-style docstrings for public classes and functions. 
These are +parseable by Sphinx. + +```python +def convert_weights( + source_model: torch.nn.Module, + target_model: torch.nn.Module, + mapping: MegatronParamMapping, +) -> dict[str, torch.Tensor]: + """Convert weights from source to target model format. + + Args: + source_model: The source model containing weights to convert. + target_model: The target model that will receive converted weights. + mapping: Parameter mapping defining the conversion rules. + + Returns: + Dictionary mapping parameter names to converted weight tensors. + + Raises: + ValueError: If source and target models have incompatible shapes. + """ + ... +``` + +For interfaces used outside a file, prefer docstrings over comments. Comments +are for code within a function or file-local interfaces. + +## Comments + +- Commented-out code must have a comment explaining why. Otherwise remove it. +- Do not add comments that merely narrate what the code does. +- Comments should explain non-obvious intent, trade-offs, or constraints. + +## Logging + +Use `logging.getLogger(__name__)` for module-level loggers. Use +`print_rank_0` / `warn_rank_0` for user-facing messages in distributed +contexts. + +**Don't:** +```python +print(f"Loading weights for {model_name}") +``` + +**Do:** +```python +logger = logging.getLogger(__name__) +logger.info("Loading weights for %s", model_name) + +# Or for distributed-aware output: +from megatron.bridge.utils.common_utils import print_rank_0 +print_rank_0(f"Loading weights for {model_name}") +``` + +## Error Handling + +Use specific exceptions. Keep try bodies minimal. 
+
+**Don't:**
+```python
+try:
+    result = load_and_convert(path)
+except:
+    print("Conversion failed")
+```
+
+**Do:**
+```python
+try:
+    state_dict = torch.load(path)
+except FileNotFoundError:
+    raise ValueError(f"Checkpoint not found at {path}") from None
+else:
+    result = convert(state_dict)
+```
+
+When using try-except for duck typing, keep the try body as small as possible
+and use the else block for logic:
+
+```python
+try:
+    f.seek  # probe, do not call
+except AttributeError:
+    ...  # not file-like
+else:
+    f.seek(0)
+    f.read()
+```
+
+## Avoid Reflection
+
+Do not use reflection when functionality can be achieved without it.
+
+**Don't:**
+```python
+def make_config(*args):
+    x, y = args
+    return dict(**locals())
+```
+
+**Do:**
+```python
+def make_config(x, y):
+    return {"x": x, "y": y}
+```
+
+## Configuration and Dataclasses
+
+- Use `dataclasses` or `NamedTuple` for configuration objects.
+- Be explicit about required vs optional fields.
+- Do not add arbitrary defaults — be as explicit as possible.
+
+## NVIDIA Copyright Header
+
+Add this header to all Python files and shell scripts. Replace `<YEAR>` with the
+current year. Exclude test files under `tests/`.
+
+```python
+# Copyright (c) <YEAR>, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+```
+
+## String Quotes
+
+Use double quotes for all strings (matching ruff formatter configuration).
+
+## Testing Conventions
+
+- Unit tests go in `tests/unit_tests/`, named `test_*.py`.
+- Functional tests go in `tests/functional_tests/`. +- Use pytest fixtures for common setup. +- Use pytest markers: `@pytest.mark.unit`, `@pytest.mark.integration`. +- Keep unit test configs tiny: small hidden dims, 1-2 layers, short sequences. +- Functional tests are capped at 2 GPUs. +- Set `CUDA_VISIBLE_DEVICES` explicitly for multi-GPU tests. + +## Code Review Checklist + +When reviewing code, check for: + +1. **Copyright header** present on new Python files (not test files) +2. **Type hints** on public functions and methods +3. **Docstrings** on public classes and functions (Google style) +4. **Specific exceptions** in try-except blocks +5. **No bare `print()`** — use `logger` or `print_rank_0` +6. **No hidden defaults** in function parameters for config values +7. **Keyword-only args** for ambiguous same-type parameters +8. **Double quotes** for strings +9. **Import order** follows the 5-group convention +10. **No commented-out code** without explanation +11. **Mypy clean** — no untyped defs, no `Any` in public APIs, no bare `# type: ignore` diff --git a/skills/Megatron-Bridge/developer-guide/SKILL.md b/skills/Megatron-Bridge/developer-guide/SKILL.md new file mode 100644 index 0000000..84a1e24 --- /dev/null +++ b/skills/Megatron-Bridge/developer-guide/SKILL.md @@ -0,0 +1,472 @@ +--- +name: developer-guide +description: Developer environment setup, CI/CD workflows, and CI failure debugging for Megatron Bridge. Covers container-based development, uv package management, pre-commit hooks, running tests, CI failure investigation, and common pitfalls. Use when onboarding, setting up a dev environment, troubleshooting build issues, investigating CI failures, or dealing with lockfile issues (corrupted, regenerating, or updating uv.lock). +--- + +# Developer Guide + +This guide covers the recommended development workflow for Megatron Bridge. +Two core principles apply everywhere: **build and develop inside containers**, +and **always use uv** for package management. 
+ +--- + +## Why Containers + +Megatron Bridge depends on CUDA, NCCL, PyTorch with GPU support, +Transformer Engine, and optional components like TRT-LLM, vLLM, and DeepEP. +Installing these on a bare host is fragile and hard to reproduce. The project +ships production-quality Dockerfiles that pin every dependency. + +**Use the container as your development environment.** This guarantees: + +- Identical CUDA / NCCL / cuDNN versions across all developers and CI. +- `uv.lock` resolves the same way locally and in CI (the lockfile is + Linux-only; it cannot be regenerated on macOS). +- GPU-dependent operations (training, conversion, `uv lock`) work out of the + box. + +### Option 1: Use the NeMo Framework Container + +The fastest way to get started is the pre-built +[NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags), +which ships with Megatron Bridge, Megatron-Core, and all GPU dependencies +pre-installed. No build step required: + +```bash +docker run --rm -it --gpus all --shm-size=24g \ + nvcr.io/nvidia/nemo:latest \ + bash +``` + +### Option 2: Build the Megatron Bridge Container + +If you need to test against your local source tree, build the image from the +repository root: + +```bash +docker build \ + -f docker/Dockerfile.ci \ + --target megatron_bridge \ + -t megatron-bridge:latest \ + . +``` + +This builds the CI image with all dependencies installed via `uv sync --locked`. +See `docker/README.md` for the full NeMo Framework image stack +(fw-base -> megatron-bridge -> fw-final) and build argument reference. 
+ +Key build args: +- `BASE_IMAGE` — base PyTorch image (default: `nvcr.io/nvidia/pytorch:26.02-py3`) +- `MCORE_TRIGGERED_TESTING` — set to `true` when testing against a non-pinned MCore commit +- `UV_CACHE_PRUNE_ARGS` — optional args passed to `uv cache prune` during image build + +### Running the Container + +Interactive development shell: + +```bash +docker run --rm -it -w /opt/Megatron-Bridge \ + -v $(pwd):/opt/Megatron-Bridge \ + --gpus all \ + --shm-size=24g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + megatron-bridge:latest \ + bash +``` + +### Containers on Slurm Clusters + +On Slurm clusters with Enroot/Pyxis, containers are passed to `srun` directly: + +```bash +srun --mpi=pmix \ + --container-image="$CONTAINER_IMAGE" \ + --container-mounts="$CONTAINER_MOUNTS" \ + --no-container-mount-home \ + bash -c "cd /opt/Megatron-Bridge && uv run --no-sync python ..." +``` + +If you use the built container (or the NeMo Framework container) as-is, +dependencies are already installed and no `uv sync` is needed. If you +**bind-mount a custom Megatron Bridge source tree** into the container +(e.g., for development), you need to `uv sync` so dependencies match +your local `pyproject.toml` and `uv.lock`. In that case, only rank 0 +should sync while other ranks wait: + +```bash +if [ "$SLURM_LOCALID" -eq 0 ]; then uv sync; else sleep 10; fi +``` + +Other key points: + +- `--no-container-mount-home` is an **srun flag**, not an `#SBATCH` directive. +- Set `UV_CACHE_DIR` to shared storage to avoid filling the container's + `/root/.cache/`. + +--- + +## Always Use uv + +Megatron Bridge uses [uv](https://docs.astral.sh/uv/) as its sole package +manager. The `uv.lock` file is checked into the repository for reproducible +builds. **Never use `pip install`, `conda`, or bare `python`** — always go +through `uv`. 
+ +**Never install or upgrade dependencies outside the CI container.** All `uv` +commands must be run inside a `megatron-bridge` container — either one you +built locally or a pre-built image. + +### Why uv + +- **Reproducibility**: `uv.lock` pins every transitive dependency, ensuring + identical environments across developers, CI, and production containers. +- **Speed**: uv resolves and installs dependencies 10-100x faster than pip. +- **Single tool**: uv handles virtual environments, dependency resolution, + locking, syncing, and running scripts — no need for separate tools. +- **CI integration**: `Dockerfile.ci` installs everything via + `uv sync --locked`. If you use pip to install something locally, it will + diverge from what CI tests against. +- **Cache-friendly**: Set `UV_CACHE_DIR` to a persistent host directory and + mount it into the container to avoid re-downloading wheels on every + `docker run`. This is especially useful when you mount a frequently + changing workdir that triggers re-syncs: + ```bash + docker run --rm -it \ + -v $(pwd):/opt/Megatron-Bridge \ + -v $HOME/.cache/uv:/root/.cache/uv \ + --gpus all --shm-size=24g \ + megatron-bridge:latest bash + ``` + +### Essential uv Commands + +| Task | Command | +|---|---| +| Install all deps from lockfile | `uv sync --locked` | +| Install with all extras and dev groups | `uv sync --locked --all-extras --all-groups` | +| Run a Python command | `uv run python script.py` | +| Run training | `uv run python -m torch.distributed.run --nproc_per_node=N script.py` | +| Add a new dependency | `uv add ` | +| Add an optional dependency | `uv add --optional --extra ` | +| Regenerate the lockfile | `uv lock` (must be done inside the container on Linux) | +| Run linting | `uv run ruff check --fix . 
&& uv run ruff format .` | +| Install pre-commit hooks | `uv run --group dev pre-commit install` | + +### uv run, Not bare python + +Always launch scripts with `uv run`: + +```bash +# Correct +uv run python -m torch.distributed.run --nproc_per_node=1 scripts/training/run_recipe.py ... + +# Wrong — bypasses the uv-managed environment +python -m torch.distributed.run --nproc_per_node=1 scripts/training/run_recipe.py ... +torchrun --nproc_per_node=1 scripts/training/run_recipe.py ... +``` + +After running `uv sync` inside a container, you can also use bare `python` +since the virtual environment is already activated. But `uv run` is always the +safer default. + +### Adding Dependencies + +```bash +uv add some-package + +# For an optional extra group (e.g., trtllm-specific deps) +uv add --optional --extra trtllm some-package +``` + +This updates `pyproject.toml` and `uv.lock`. Commit both files: + +```bash +git add pyproject.toml uv.lock +git commit -s -m "build: add some-package dependency" +``` + +### Regenerating uv.lock + +The lockfile is Linux-only (it resolves against CUDA wheels). **You cannot +regenerate it on macOS.** Run `uv lock` inside the Docker container or on a +Linux workstation: + +```bash +docker run --gpus all --rm \ + -v $(pwd):/opt/Megatron-Bridge \ + megatron-bridge:latest \ + bash -c 'cd /opt/Megatron-Bridge && uv lock' +``` + +### uv sync After Switching MCore Branches + +The lockfile is generated against the main MCore commit. When switching to the +dev branch: + +```bash +./scripts/switch_mcore.sh dev +uv sync # without --locked +``` + +When switching back to main: + +```bash +./scripts/switch_mcore.sh main +uv sync --locked # lockfile matches again +``` + +--- + +## Pre-commit Hooks + +Install pre-commit hooks before your first commit: + +```bash +uv run --group dev pre-commit install +``` + +The hooks run [ruff](https://docs.astral.sh/ruff/) for linting and formatting, +plus end-of-file and trailing-whitespace fixers. 
If hooks auto-fix files, +re-stage and re-run: + +```bash +git add -u +pre-commit run +# If it auto-fixed files: +git add -u +pre-commit run +``` + +Repeat until all hooks pass. + +Before committing, you can also run linting manually: + +```bash +ruff check --fix +ruff format +pre-commit run --all-files +``` + +--- + +## Running Tests + +Tests live under `tests/`: + +| Path | Description | +|------|-------------| +| `tests/unit_tests/` | Fast, isolated unit tests grouped by domain (models, core, data, etc.) | +| `tests/functional_tests/` | Integration tests with models/datasets, tiered L0/L1/L2 | + +**Pytest markers available:** `unit`, `integration`, `system`, `acceptance`, `docs`, `skipduringci`, `pleasefixme` + +### Unit Tests + +```bash +uv run pytest tests/unit_tests/ -x -v +``` + +Unit tests run without GPUs and do not depend on large artifacts. Or inside Docker: + +```bash +docker run --rm --gpus all -v $(pwd):/workdir/ -w /workdir/ megatron-bridge \ + uv run pytest tests/unit_tests/ +``` + +### Functional Tests + +Functional tests require GPUs and are typically run inside the container: + +```bash +uv run pytest tests/functional_tests/ -x -v +``` + +Longer functional tests use `L2_Launch_*.sh` launcher scripts in +`tests/functional_tests/`. Each launcher must be registered in +`.github/workflows/cicd-main.yml` under `matrix.include` to be picked up +by CI. + +### Adding a Unit Test + +1. Place it under `tests/unit_tests//test_.py`. +2. Use the appropriate pytest marker: `@pytest.mark.unit`. +3. Run locally: `uv run --no-sync --active pytest tests/unit_tests/.py` + +### Adding a Functional Test + +1. Create a launch script under `tests/functional_tests/launch_scripts/active/`. +2. Follow the naming convention: `L0_Launch__.sh`, `L1_Launch_...`, or `L2_Launch_...`. +3. Tier guidance: + - **L0** — smoke tests that run on every PR; must be fast and stable. + - **L1** — broader coverage; runs nightly. 
+ - **L2** — heavy tests (large models, checkpoint conversion); runs on schedule or manual trigger. +4. Apply the `needs-more-tests` PR label to trigger L0 + L1 for a PR. + +--- + +## Commit and PR Workflow + +- **Never commit directly to `main`** — always create a feature branch. +- **Always sign commits**: `git commit -s -m "message"`. +- **PR title format**: `[{areas}] {type}: {description}` + (e.g., `[model] feat: Add Qwen3 model bridge`). +- **Trigger CI**: Comment `/ok to test ` on the PR, or set up + signed commits for automatic CI triggering. + +See `CONTRIBUTING.md` for the full PR workflow, area/type labels, and DCO +requirements. + +--- + +## CI Pipeline + +The CI pipeline is defined in `.github/workflows/cicd-main.yml`. It is +triggered by schedule, pushes to `main`, `deploy-release/*`, and +`pull-request/` branches, merge groups, and `workflow_dispatch`. + +### Pipeline Structure + +```text +pre-flight + └── lint-check + └── cicd-wait-in-queue # requires maintainer approval for untrusted PRs + └── cicd-container-build # builds and caches the Docker image + ├── unit-tests-core + ├── unit-tests-diffusion + └── functional-tests (L0 always; L1 with needs-more-tests label; L2 on schedule) +``` + +- The CI branch `pull-request/` is created automatically when a PR is opened against `main` or `deploy-release/*`. +- Concurrent runs for the same PR are cancelled automatically (concurrency group per PR number). +- Slack notifications are sent on completion for scheduled and nightly runs. + +--- + +## CI Failure Investigation + +For PR-scoped CI runs, branches follow the pattern `pull-request/`. +This workflow can also be triggered by schedule, push to `main`/`deploy-release/*`, and `workflow_dispatch`. + +### Locating the PR from a CI Branch + +```bash +# Extract PR number from the CI branch name (e.g. 
pull-request/1234)
+PR_NUMBER=$(git rev-parse --abbrev-ref HEAD | grep -oP '(?<=pull-request/)\d+')
+
+# Or, given a branch name string directly:
+PR_NUMBER=$(echo "pull-request/1234" | grep -oP '(?<=pull-request/)\d+')
+
+# Fetch PR metadata
+gh pr view "$PR_NUMBER" --repo NVIDIA-NeMo/Megatron-Bridge
+
+# List files changed in the PR
+gh pr diff "$PR_NUMBER" --repo NVIDIA-NeMo/Megatron-Bridge --name-only
+
+# View PR checks / CI status
+gh pr checks "$PR_NUMBER" --repo NVIDIA-NeMo/Megatron-Bridge
+```
+
+### Investigating a Failing CI Job
+
+1. **Get the PR number** from the branch name (see above).
+2. **Review the changeset** to understand what changed:
+   ```bash
+   gh pr diff "$PR_NUMBER" --repo NVIDIA-NeMo/Megatron-Bridge
+   ```
+3. **Identify the failing job** from `gh pr checks` output or from the GitHub Actions URL in the failure notification.
+4. **Fetch job logs** for deeper inspection:
+   ```bash
+   # List runs for the PR's head SHA
+   gh run list --repo NVIDIA-NeMo/Megatron-Bridge --branch "pull-request/$PR_NUMBER"
+
+   # Download logs for a specific run to a local file
+   gh run view <run-id> --repo NVIDIA-NeMo/Megatron-Bridge --log-failed > run.log
+   ```
+5. **Scan the log file in chunks.** Log files can exceed 10,000 lines — never load them whole into context. Read them in chunks of ~200 lines and stop as soon as the root cause is found:
+   ```bash
+   # Total line count
+   wc -l run.log
+
+   # Read chunk N (lines 1–200, 201–400, …)
+   sed -n '1,200p' run.log
+   sed -n '201,400p' run.log
+   # … continue until the failure is located
+   ```
+   Scan from the end first if looking for the final error, then work backwards:
+   ```bash
+   # Last 200 lines
+   tail -200 run.log
+   ```
+6. **Cross-reference the changeset** against the failing test or step to narrow down the root cause.
+ +### Common Failure Patterns + +| Symptom | Likely Cause | Action | +|---------|-------------|--------| +| Lint job fails | `ruff` or `pre-commit` violation | Run `ruff check --fix` + `ruff format` locally | +| Container build fails | Dependency conflict or stale `uv.lock` | Re-run `uv lock` inside Docker and commit updated lock | +| Unit tests fail | Code regression or missing import | Run failing test locally; check the PR diff for the relevant module | +| Functional test (L0) fails | Integration breakage | Check GPU runner logs; reproduce with the corresponding `L0_Launch_*.sh` script | +| `cicd-wait-in-queue` blocked | PR not yet approved for CI | A maintainer must comment `/ok to test ` or approve via the test queue | +| MCore submodule mismatch | Pinned commit out of sync | Update `3rdparty/Megatron-LM` submodule and re-lock | + +--- + +## Common Pitfalls + +| Problem | Cause | Fix | +|---|---|---| +| `uv sync --locked` fails on macOS | Lockfile resolves CUDA wheels that don't exist on macOS | Run inside Docker or on a Linux machine | +| `ModuleNotFoundError` after pip install | pip installed outside the uv-managed venv | Use `uv add` and `uv sync`, never bare `pip install` | +| `uv sync --locked` fails after MCore branch switch | Lockfile was generated against main MCore | Use `uv sync` (without `--locked`) on dev | +| Stale checkpoint auto-resume in Bridge | `nemo_experiments/` from a previous run exists | `rm -rf nemo_experiments` before starting fresh | +| Port collision on Slurm (EADDRINUSE) | `ntasks-per-node=8` with `torchrun --nproc_per_node=8` | Drop torchrun; use `ntasks-per-node=8` with `uv run python script.py` (srun-native) | +| `uv: command not found` inside container | Container doesn't have uv | Use the `megatron-bridge` image built from `Dockerfile.ci` | +| `No space left on device` during uv ops | Cache fills container's `/root/.cache/` | Set `UV_CACHE_DIR` to shared/persistent storage | +| Pre-commit fails with ruff errors | Code style 
violations | Run `uv run ruff check --fix . && uv run ruff format .` | + +--- + +## Quick Start Checklist + +1. Clone the repo and initialize submodules: + ```bash + git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge megatron-bridge + cd megatron-bridge + git submodule update --init 3rdparty/Megatron-LM + ``` + +2. Build the container: + ```bash + docker build -f docker/Dockerfile.ci --target megatron_bridge -t megatron-bridge:latest . + ``` + +3. Start a dev shell: + ```bash + docker run --rm -it -v $(pwd):/opt/Megatron-Bridge --gpus all --shm-size=24g megatron-bridge:latest bash + ``` + +4. Install pre-commit hooks (inside container): + ```bash + uv run --group dev pre-commit install + ``` + +5. Run a quick training sanity check: + ```bash + uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe vanilla_gpt_pretrain_config \ + train.train_iters=5 train.global_batch_size=8 train.micro_batch_size=4 \ + scheduler.lr_warmup_iters=1 scheduler.lr_decay_iters=5 \ + logger.log_interval=1 + ``` + +6. Create a branch, make changes, and submit a PR: + ```bash + git switch -c your-feature-name + # ... make changes ... + git add -u && git commit -s -m "[area] type: description" + git push origin your-feature-name + ``` diff --git a/skills/Megatron-Bridge/mlm-bridge-training/SKILL.md b/skills/Megatron-Bridge/mlm-bridge-training/SKILL.md new file mode 100644 index 0000000..54aac6e --- /dev/null +++ b/skills/Megatron-Bridge/mlm-bridge-training/SKILL.md @@ -0,0 +1,161 @@ +--- +name: mlm-bridge-training +description: Run Megatron-LM (MLM) and Megatron Bridge training with mock or real data. Covers correlation testing, available recipes, and multi-GPU examples. Use when running training, comparing MLM vs Bridge, or translating configs. 
+--- + +# MLM vs Bridge Training + +For how they differ, the arg mapping tables, gotchas, and translation script, see: + +- `docs/megatron-lm-to-megatron-bridge.md` + +## Correlation Testing + +Use `vanilla_gpt_pretrain_config` for loss-correlation testing. This recipe uses +bare `GPTModelProvider` defaults (LayerNorm, GeLU, learned_absolute position +embeddings, `vocab_size` inherited from tokenizer) — matching MLM +`pretrain_gpt.py` defaults with no args. + +### MLM Correlation Run (2L/256H, 1 GPU) + +```bash +PYTHONPATH=3rdparty/Megatron-LM:$PYTHONPATH \ +uv run python -m torch.distributed.run --nproc_per_node=1 \ + 3rdparty/Megatron-LM/pretrain_gpt.py \ + --num-layers 2 --hidden-size 256 --num-attention-heads 4 \ + --ffn-hidden-size 1024 --seq-length 512 --max-position-embeddings 512 \ + --micro-batch-size 4 --global-batch-size 32 \ + --train-iters 10 --eval-iters 2 --eval-interval 10 \ + --mock-data --bf16 --use-mcore-models \ + --tokenizer-type NullTokenizer --vocab-size 32000 \ + --lr 3e-4 --min-lr 3e-5 --seed 1234 --log-interval 1 +``` + +### Bridge Correlation Run (same config, 1 GPU) + +```bash +rm -rf nemo_experiments && \ +uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe vanilla_gpt_pretrain_config \ + model.num_layers=2 model.hidden_size=256 \ + model.num_attention_heads=4 model.ffn_hidden_size=1024 \ + model.seq_length=512 dataset.sequence_length=512 \ + train.train_iters=10 train.global_batch_size=32 train.micro_batch_size=4 \ + validation.eval_interval=10 validation.eval_iters=2 \ + optimizer.lr=3e-4 optimizer.min_lr=3e-5 \ + scheduler.lr_warmup_iters=1 scheduler.lr_decay_iters=10 \ + rng.seed=1234 logger.log_interval=1 +``` + +### Verification + +With matched parameters the LM losses should be nearly identical at each +iteration. Compare `lm loss` values from both logs — they should agree to +within BF16 rounding. 
+ +## Multi-GPU Examples + +### MLM 2-GPU with TP=2 + +```bash +PYTHONPATH=3rdparty/Megatron-LM:$PYTHONPATH \ +uv run python -m torch.distributed.run --nproc_per_node=2 \ + 3rdparty/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size 2 --sequence-parallel \ + --num-layers 4 --hidden-size 256 --num-attention-heads 4 \ + --seq-length 1024 --max-position-embeddings 1024 \ + --micro-batch-size 2 --global-batch-size 16 \ + --train-iters 10 --eval-iters 2 --eval-interval 10 \ + --mock-data --bf16 --use-mcore-models \ + --tokenizer-type NullTokenizer --vocab-size 1024 \ + --lr 1e-4 --log-interval 1 +``` + +### Bridge 2-GPU with TP=2 + +```bash +rm -rf nemo_experiments && \ +uv run python -m torch.distributed.run --nproc_per_node=2 \ + scripts/training/run_recipe.py \ + --recipe vanilla_gpt_pretrain_config \ + model.tensor_model_parallel_size=2 model.sequence_parallel=true \ + model.num_layers=4 model.hidden_size=256 \ + model.num_attention_heads=4 model.ffn_hidden_size=1024 \ + model.seq_length=1024 dataset.sequence_length=1024 \ + train.train_iters=10 train.global_batch_size=16 train.micro_batch_size=2 \ + validation.eval_interval=10 validation.eval_iters=2 \ + scheduler.lr_warmup_iters=2 scheduler.lr_decay_iters=10 \ + logger.log_interval=1 +``` + +## Available Recipes + +Common recipes (use with `--recipe`): + +- `vanilla_gpt_pretrain_config` — Minimal GPT (bare GPTModelProvider defaults, + ideal for correlation testing and custom configs) +- `llama32_1b_pretrain_config` — Llama 3.2 1B (16L, 2048H, GBS=512, seq=8192) +- `llama3_8b_pretrain_config` — Llama 3 8B +- `qwen3_8b_pretrain_config` — Qwen3 8B +- `deepseek_v2_lite_pretrain_config` — DeepSeek-V2-Lite 16B MoE + +SFT/PEFT variants use `_sft_config` / `_peft_config` suffix. + +## Megatron-Core Submodule + +For what the submodule is and why two versions exist, see +`docs/megatron-lm-to-megatron-bridge.md`. 
+ +### Check current version + +```bash +./scripts/switch_mcore.sh status +``` + +### Switch to dev for testing newer MCore features + +```bash +./scripts/switch_mcore.sh dev + +# uv sync (without --locked) since lockfile is for main +uv sync +``` + +### Switch back to main + +```bash +./scripts/switch_mcore.sh main +``` + +### After pulling latest main + +When you pull the latest Bridge main branch, the submodule pointer may have +been updated. Re-sync the submodule: + +```bash +git submodule update --init 3rdparty/Megatron-LM +``` + +## Pitfalls + +1. **Always `rm -rf nemo_experiments`** before a fresh correlation run. Bridge + auto-resumes from stale checkpoints silently. + +2. **`uv run` required**: Always use `uv run python -m torch.distributed.run` + (not bare `torchrun` or `python`). + +3. **MLM PYTHONPATH**: Must include `3rdparty/Megatron-LM` so `gpt_builders.py` + is importable. + +4. **Scheduler overrides**: When overriding `train.train_iters` to a small + value, also set `scheduler.lr_warmup_iters` and `scheduler.lr_decay_iters` + or you get an assertion error. + +5. **Use `dataset.sequence_length`** in CLI overrides, not `dataset.seq_length`. + +6. **MoE OOM**: Large MoE models require full activation recomputation and + typically multi-node EP. TP does NOT reduce per-GPU expert memory. + +7. **`uv sync --locked` fails after switching to dev**: The lockfile is generated + against the main MCore commit. Use `uv sync` (without `--locked`) when on dev. 
diff --git a/skills/Megatron-Bridge/mlm-bridge-training/card.yaml b/skills/Megatron-Bridge/mlm-bridge-training/card.yaml new file mode 100644 index 0000000..45b1f55 --- /dev/null +++ b/skills/Megatron-Bridge/mlm-bridge-training/card.yaml @@ -0,0 +1,47 @@ +title: mlm_bridge_training +validated_on: "2026-03-17" +summary: > + Operational guide for running Megatron-LM (pretrain_gpt.py) and Megatron + Bridge (run_recipe.py) training side by side, including correlation testing, + arg mapping, and the translation script. +validation_status: + mlm_pretrain_gpt_launch: + - code_verified + bridge_run_recipe_launch: + - code_verified + vanilla_gpt_correlation: + - code_verified + translation_script: + - code_verified + arg_mapping_tables: + - doc_only +feature_meaning: + vanilla_gpt_pretrain_config: > + Bare GPTModelProvider recipe with no model-specific overrides. Matches MLM + pretrain_gpt.py defaults for loss-correlation testing. + translate_mlm_to_bridge: > + Script that converts Megatron-LM YAML configs or raw CLI args into Bridge + overrides, launch commands, or standalone recipe files. +recommended_path: + correlation_testing: vanilla_gpt_pretrain_config + arg_mapping_reference: docs/megatron-lm-to-megatron-bridge.md +known_constraints: + - MLM requires --eval-iters and --eval-interval (no defaults). + - Bridge scheduler asserts lr_warmup_iters < lr_decay_iters. + - Use dataset.sequence_length (not dataset.seq_length) in CLI overrides. + - MLM requires PYTHONPATH to include 3rdparty/Megatron-LM. + - Bridge auto-resumes from nemo_experiments/ if previous checkpoint exists. +known_limitations: + - Not all MLM CLI flags have a direct Bridge equivalent. + - Model-specific recipes carry their own vocab_size which may not match the tokenizer. + - Translation script covers common args but may not handle all edge cases. 
+evidence: + - docs/megatron-lm-to-megatron-bridge.md + - scripts/training/run_recipe.py + - scripts/translate_mlm_to_bridge.py + - 3rdparty/Megatron-LM/pretrain_gpt.py + - src/megatron/bridge/training/config.py + - src/megatron/bridge/recipes/common.py +follow_up_validation: + - Add a checked-in CI job that runs MLM vs Bridge correlation and asserts loss match. + - Extend translation script coverage to recompute and CUDA-graph args. diff --git a/skills/Megatron-Bridge/multi-node-slurm/SKILL.md b/skills/Megatron-Bridge/multi-node-slurm/SKILL.md new file mode 100644 index 0000000..52a23ee --- /dev/null +++ b/skills/Megatron-Bridge/multi-node-slurm/SKILL.md @@ -0,0 +1,534 @@ +--- +name: multi-node-slurm +description: Convert single-node scripts to multi-node Slurm sbatch jobs and debug common multi-node failures. Covers srun-native vs uv run torch.distributed approaches, container setup, NCCL timeouts, OOM sizing for MoE models, and interactive allocation. Use when creating Slurm scripts, scaling to multi-node, or debugging multi-node job failures. +--- + +# Multi-Node Slurm + +Convert single-node `uv run python -m torch.distributed.run` commands into multi-node Slurm sbatch scripts with Enroot container support, and debug common multi-node failures. + +## Two Approaches: srun-native vs uv run torch.distributed + +| Approach | `ntasks-per-node` | Process spawning | Best for | +|---|---|---|---| +| **srun-native** (preferred) | 8 | Slurm spawns 8 tasks/node | Conversion, inference, Bridge scripts | +| **uv run torch.distributed** (legacy) | 1 | `uv run python -m torch.distributed.run` spawns 8 procs/node | MLM pretrain_gpt.py | + +**Prefer srun-native** — simpler, avoids shell escaping issues with TRAIN_CMD. 
Megatron Bridge auto-derives `RANK`, `WORLD_SIZE`, `LOCAL_RANK`, `MASTER_ADDR`, `MASTER_PORT` from SLURM env vars (`SLURM_PROCID`, `SLURM_NTASKS`, `SLURM_LOCALID`, `SLURM_NODELIST`) via `common_utils.py` helpers called during `initialize.py` distributed init, so you never need to set them manually.
+
+## Cluster Environment
+
+### Container
+
+```bash
+CONTAINER_IMAGE="<path-to-image>.sqsh"
+CONTAINER_MOUNTS="<host-path>:<container-path>,<megatron-bridge-repo>:/opt/Megatron-Bridge,<data-root>:/opt/data"
+```
+
+### Standard Paths
+
+```bash
+WORKDIR="/opt/Megatron-Bridge"
+DATA_PATH="<data-root>/dclm_01_01_text_document"
+```
+
+### Tokens / Caches
+
+```bash
+export GH_TOKEN=<github-token>
+export HF_TOKEN=<huggingface-token>
+export HF_HOME=<shared-fs>/HF_HOME
+export UV_CACHE_DIR="<shared-fs>/uv_cache"
+export NEMO_HOME="<shared-fs>/cache/nemo"
+```
+
+**Important**: `NEMO_HOME` must point to a shared filesystem (e.g. Lustre) for multi-node SFT/PEFT jobs.
+The default (`/root/.cache/nemo`) is container-local and not shared across nodes.
+Without this, packed-sequence data files prepared on node 0 are invisible to other
+nodes, causing `TypeError: 'NoneType' object is not an iterator`.
+
+### Log Directory
+
+```text
+<shared-fs>/logs/<model>_<job>
+```
+
+## srun-native Approach (Preferred)
+
+Slurm spawns all processes directly. No `torch.distributed.run`, no TRAIN_CMD escaping.
+
+### SBATCH Headers
+
+```bash
+#SBATCH --job-name=<model>-<task>
+#SBATCH --nodes=<num-nodes>
+#SBATCH --ntasks-per-node=8 # Slurm spawns 8 tasks per node
+#SBATCH --gpus-per-node=8
+#SBATCH --time=00:30:00
+#SBATCH --account=<account>
+#SBATCH --partition=batch
+#SBATCH --output=<shared-fs>/logs/<model>_%j.log
+#SBATCH --exclusive
+```
+
+### Build and Launch
+
+Two-phase srun: first a single-process srun to populate the uv cache, then the full multi-node srun.
+
+```bash
+# Env exports at sbatch level (before srun)
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+# Phase 1: Single-process uv sync to build/populate the shared cache
+srun --mpi=pmix -N 1 --ntasks=1 \
+ --container-image="$CONTAINER_IMAGE" \
+ --container-mounts="$CONTAINER_MOUNTS" \
+ --no-container-mount-home \
+ bash -c "cd $WORKDIR && uv sync"
+
+# Phase 2: Full multi-node run (uv sync is a fast no-op since cache is warm)
+srun --mpi=pmix \
+ --container-image="$CONTAINER_IMAGE" \
+ --container-mounts="$CONTAINER_MOUNTS" \
+ --no-container-mount-home \
+ bash -c "cd $WORKDIR && uv sync && uv run --no-sync python <your-script.py> <script-args>"
+```
+
+### srun-native Key Points
+
+- Phase 1 runs `uv sync` once on a single node/process, building all wheels into the shared cache on Lustre
+- Phase 2's `uv sync` is a fast no-op (everything is cached) — safe to run on all ranks without sleep guards
+- `initialize.py` + `common_utils.py` auto-set `RANK`, `WORLD_SIZE`, `LOCAL_RANK`, `MASTER_ADDR`, `MASTER_PORT` from SLURM env vars
+- Env vars like `HF_TOKEN`, `HF_HOME`, `UV_CACHE_DIR` exported at sbatch level are inherited by srun tasks
+- Reference: `examples/models/vlm/glm_45v/slurm_sft.sh`, `examples/models/minimax_m2/slurm_conversion.sh`
+
+---
+
+## uv run torch.distributed Approach (Legacy)
+
+Use when the script requires `torch.distributed.run` (e.g., MLM pretrain_gpt.py) or when Bridge's `initialize.py` is not in the call path.
+
+### 1. Add SBATCH Headers
+
+```bash
+#SBATCH --job-name=<model>-<task>
+#SBATCH --nodes=<num-nodes>
+#SBATCH --ntasks-per-node=1 # ALWAYS 1 — torchrun handles per-node spawning
+#SBATCH --gpus-per-node=8
+#SBATCH --time=00:30:00
+#SBATCH --account=<account>
+#SBATCH --partition=batch
+#SBATCH --output=<shared-fs>/logs/<model>_%j.log
+#SBATCH --exclusive
+```
+
+**Critical**: `--ntasks-per-node=1`, NOT 8. `uv run python -m torch.distributed.run --nproc_per_node=8` spawns 8 processes per node.
Using `ntasks-per-node=8` causes EADDRINUSE port collisions (8 tasks x 8 procs = 64 per node). + +### 2. Convert to Multi-Node + +Replace single-node: + +```bash +uv run python -m torch.distributed.run --nproc_per_node=8 \ +