From cdd7d79f952941f09f709f5188621a489305a46d Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 13 Feb 2026 01:07:11 -0800
Subject: [PATCH 1/2] Separate CI job for Megatron GPU tests

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .github/workflows/gpu_tests.yml                | 14 +++++++++++---
 tests/gpu_megatron/_extensions                 |  1 +
 tests/gpu_megatron/torch/conftest.py           |  1 +
 .../distill/plugins/test_distill_megatron.py   |  0
 .../export/test_unified_export_megatron.py     |  0
 .../test_vllm_fakequant_megatron_export.py     |  0
 .../test_megatron_gpt_dynamic_modules.py       |  0
 .../test_megatron_mamba_dynamic_modules.py     |  0
 .../opt/plugins/test_megatron_chaining.py      |  0
 .../torch/peft/plugins}/test_megatron_peft.py  |  0
 .../plugins/test_mcore_gpt_minitron_pruning.py |  0
 .../test_mcore_mamba_minitron_pruning.py       |  0
 .../torch/quantization/plugins/test_apex.py    |  0
 .../quantization/plugins/test_megatron.py      |  0
 .../plugins/test_transformer_engine.py         |  0
 .../plugins/test_megatron_sparsity.py          |  0
 .../test_speculative_megatron_modules.py       |  0
 .../torch/utils/plugins/test_utils_megatron.py |  0
 tox.ini                                        | 18 +++++++++++++-----
 19 files changed, 26 insertions(+), 8 deletions(-)
 create mode 120000 tests/gpu_megatron/_extensions
 create mode 120000 tests/gpu_megatron/torch/conftest.py
 rename tests/{gpu => gpu_megatron}/torch/distill/plugins/test_distill_megatron.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/export/test_unified_export_megatron.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/export/test_vllm_fakequant_megatron_export.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/opt/plugins/test_megatron_chaining.py (100%)
 rename tests/{gpu/torch/peft => gpu_megatron/torch/peft/plugins}/test_megatron_peft.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/quantization/plugins/test_apex.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/quantization/plugins/test_megatron.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/quantization/plugins/test_transformer_engine.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/speculative/plugins/test_speculative_megatron_modules.py (100%)
 rename tests/{gpu => gpu_megatron}/torch/utils/plugins/test_utils_megatron.py (100%)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index cb4686815..f807363a8 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -59,8 +59,12 @@ jobs:
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        example: [py312-cuda12-gpu, py312-cuda12-gpu-megatron]
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 120
+    timeout-minutes: 90
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
@@ -74,11 +78,15 @@ jobs:
         run: |
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
-        run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
+        run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        example: [py312-cuda12-gpu, py312-cuda12-gpu-megatron]
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 150
+    timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
   gpu-pr-required-check:
diff --git a/tests/gpu_megatron/_extensions b/tests/gpu_megatron/_extensions
new file mode 120000
index 000000000..dc4ffce33
--- /dev/null
+++ b/tests/gpu_megatron/_extensions
@@ -0,0 +1 @@
+../gpu/_extensions/
\ No newline at end of file
diff --git a/tests/gpu_megatron/torch/conftest.py b/tests/gpu_megatron/torch/conftest.py
new file mode 120000
index 000000000..40eda16c0
--- /dev/null
+++ b/tests/gpu_megatron/torch/conftest.py
@@ -0,0 +1 @@
+../../gpu/torch/conftest.py
\ No newline at end of file
diff --git a/tests/gpu/torch/distill/plugins/test_distill_megatron.py b/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py
similarity index 100%
rename from tests/gpu/torch/distill/plugins/test_distill_megatron.py
rename to tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py
diff --git a/tests/gpu/torch/export/test_unified_export_megatron.py b/tests/gpu_megatron/torch/export/test_unified_export_megatron.py
similarity index 100%
rename from tests/gpu/torch/export/test_unified_export_megatron.py
rename to tests/gpu_megatron/torch/export/test_unified_export_megatron.py
diff --git a/tests/gpu/torch/export/test_vllm_fakequant_megatron_export.py b/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py
similarity index 100%
rename from tests/gpu/torch/export/test_vllm_fakequant_megatron_export.py
rename to tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py
diff --git a/tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
similarity index 100%
rename from tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
rename to tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
diff --git a/tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
similarity index 100%
rename from tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
rename to tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
diff --git a/tests/gpu/torch/opt/plugins/test_megatron_chaining.py b/tests/gpu_megatron/torch/opt/plugins/test_megatron_chaining.py
similarity index 100%
rename from tests/gpu/torch/opt/plugins/test_megatron_chaining.py
rename to tests/gpu_megatron/torch/opt/plugins/test_megatron_chaining.py
diff --git a/tests/gpu/torch/peft/test_megatron_peft.py b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py
similarity index 100%
rename from tests/gpu/torch/peft/test_megatron_peft.py
rename to tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py
diff --git a/tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
similarity index 100%
rename from tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
rename to tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
diff --git a/tests/gpu/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
similarity index 100%
rename from tests/gpu/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
rename to tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
diff --git a/tests/gpu/torch/quantization/plugins/test_apex.py b/tests/gpu_megatron/torch/quantization/plugins/test_apex.py
similarity index 100%
rename from tests/gpu/torch/quantization/plugins/test_apex.py
rename to tests/gpu_megatron/torch/quantization/plugins/test_apex.py
diff --git a/tests/gpu/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
similarity index 100%
rename from tests/gpu/torch/quantization/plugins/test_megatron.py
rename to tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
diff --git a/tests/gpu/torch/quantization/plugins/test_transformer_engine.py b/tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py
similarity index 100%
rename from tests/gpu/torch/quantization/plugins/test_transformer_engine.py
rename to tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py
diff --git a/tests/gpu/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py b/tests/gpu_megatron/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py
similarity index 100%
rename from tests/gpu/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py
rename to tests/gpu_megatron/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py
diff --git a/tests/gpu/torch/speculative/plugins/test_speculative_megatron_modules.py b/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py
similarity index 100%
rename from tests/gpu/torch/speculative/plugins/test_speculative_megatron_modules.py
rename to tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py
diff --git a/tests/gpu/torch/utils/plugins/test_utils_megatron.py b/tests/gpu_megatron/torch/utils/plugins/test_utils_megatron.py
similarity index 100%
rename from tests/gpu/torch/utils/plugins/test_utils_megatron.py
rename to tests/gpu_megatron/torch/utils/plugins/test_utils_megatron.py
diff --git a/tox.ini b/tox.ini
index ee7acf029..19fd22a48 100644
--- a/tox.ini
+++ b/tox.ini
@@ -60,23 +60,31 @@ commands =
 [testenv:{py310,py311,py312}-cuda12-gpu]
 commands_pre =
     # Install deps here so that it gets installed even in --current-env
-    pip install -U megatron-core
     pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
 
+    # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
+    #   to avoid possible CUDA version mismatch
+    pip install -e .[all,dev-test]
+commands =
+    # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
+    python -m pytest tests/gpu
+
+[testenv:{py310,py311,py312}-cuda12-gpu-megatron]
+commands_pre =
+    # Install deps here so that it gets installed even in --current-env
+    pip install -U megatron-core
+
     # Skip triton because pytorch-triton is installed in the NGC PyTorch containers
     pip install pip-mark-installed
     pip-mark-installed triton
     pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
 
-    # Install Eagle-3 test dependencies
-    pip install tiktoken blobfile sentencepiece
-
     # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
     #   to avoid possible CUDA version mismatch
     pip install -e .[all,dev-test]
 commands =
     # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
-    python -m pytest tests/gpu
+    python -m pytest tests/gpu_megatron
 
 #############################################
 # Code quality checks on all files or on diff

From 7f8ccafb19c8f5525bfc61530188613b7190795d Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 13 Feb 2026 11:38:06 -0800
Subject: [PATCH 2/2] Add back sentencepiece dependency for tests + clean up
 workflow

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .github/workflows/example_tests.yml | 37 ++++++++---------------------
 .github/workflows/gpu_tests.yml     | 16 +++++++++----
 pyproject.toml                      |  4 ++--
 setup.py                            |  1 +
 tox.ini                             |  4 ----
 5 files changed, 25 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
index 8442125f3..c1dab5dab 100644
--- a/.github/workflows/example_tests.yml
+++ b/.github/workflows/example_tests.yml
@@ -56,7 +56,7 @@ jobs:
       match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
 
-  ##### PyTorch Example Tests #####
+  ##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
   torch-pr:
     needs: [check-file-changes, wait-checks]
     if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
@@ -64,10 +64,13 @@ jobs:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
@@ -78,36 +81,17 @@ jobs:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
 
-  ##### Speculative Decoding Example Tests (requires 26.01 image) #####
-  speculative-decoding-pr:
-    needs: [check-file-changes, wait-checks]
-    if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-l4-latest-1
-
-  speculative-decoding-non-pr:
-    if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-h100-latest-2
-
   ##### TensorRT-LLM Example Tests #####
   trtllm-pr:
     needs: [check-file-changes, wait-checks]
@@ -172,7 +156,7 @@ jobs:
   example-pr-required-check:
     # Run even if example tests are skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [check-file-changes, torch-pr, speculative-decoding-pr, trtllm-pr, onnx-pr]
+    needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
@@ -180,7 +164,6 @@ jobs:
           needs.check-file-changes.result != 'success' ||
           (needs.check-file-changes.outputs.any_changed == 'true' && (
             needs.torch-pr.result != 'success' ||
-            needs.speculative-decoding-pr.result != 'success' ||
             needs.trtllm-pr.result != 'success' ||
             needs.onnx-pr.result != 'success'
           ))
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index f807363a8..3e55682cd 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -62,9 +62,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        example: [py312-cuda12-gpu, py312-cuda12-gpu-megatron]
+        include:
+          - example: py312-cuda12-gpu
+            timeout: 90
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 90
+    timeout-minutes: ${{ matrix.timeout }}
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
@@ -84,9 +88,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        example: [py312-cuda12-gpu, py312-cuda12-gpu-megatron]
+        include:
+          - example: py312-cuda12-gpu
+            timeout: 90
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 90
+    timeout-minutes: ${{ matrix.timeout }}
     container: *gpu_container
     steps: *gpu_steps
   gpu-pr-required-check:
diff --git a/pyproject.toml b/pyproject.toml
index 176866d41..bffa547b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -132,8 +132,8 @@ disable_error_code = ["attr-defined"]
 [tool.pytest.ini_options]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
-# print execution time for 20 slowest tests and generate coverage reports
-addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+# print execution time for 50 slowest tests and generate coverage reports
+addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=50 --strict-markers"
 pythonpath = ["tests/"]
 markers = [
     "manual: Only run when --run-manual is given",
diff --git a/setup.py b/setup.py
index 242505302..8f5578e89 100644
--- a/setup.py
+++ b/setup.py
@@ -77,6 +77,7 @@
         "pytest-cov",
         "pytest-instafail",
         "pytest-timeout",
+        "sentencepiece",  # For test_unified_export_megatron.py, test_vllm_fakequant_megatron_export.py
         "timm",
         "torchprofile>=0.0.4",  # For computing flops of CV models
         "torchvision",
diff --git a/tox.ini b/tox.ini
index 19fd22a48..ae296e5bd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -62,8 +62,6 @@ commands_pre =
     # Install deps here so that it gets installed even in --current-env
     pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
 
-    # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
-    #   to avoid possible CUDA version mismatch
     pip install -e .[all,dev-test]
 commands =
     # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
@@ -79,8 +77,6 @@ commands_pre =
     pip-mark-installed triton
     pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
 
-    # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
-    #   to avoid possible CUDA version mismatch
     pip install -e .[all,dev-test]
 commands =
     # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"