diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
index 8442125f3..c1dab5dab 100644
--- a/.github/workflows/example_tests.yml
+++ b/.github/workflows/example_tests.yml
@@ -56,7 +56,7 @@ jobs:
       match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
 
-  ##### PyTorch Example Tests #####
+  ##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
   torch-pr:
     needs: [check-file-changes, wait-checks]
     if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
@@ -64,10 +64,13 @@ jobs:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:  # adds speculative_decoding as an extra matrix entry
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"  # image for this entry only
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}  # 25.06 is the fallback
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
@@ -78,36 +81,17 @@ jobs:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:  # adds speculative_decoding as an extra matrix entry
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"  # image for this entry only
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}  # 25.06 is the fallback
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
 
-  ##### Speculative Decoding Example Tests (requires 26.01 image) #####
-  speculative-decoding-pr:
-    needs: [check-file-changes, wait-checks]
-    if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-l4-latest-1
-
-  speculative-decoding-non-pr:
-    if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-h100-latest-2
-
   ##### TensorRT-LLM Example Tests #####
   trtllm-pr:
     needs: [check-file-changes, wait-checks]
@@ -172,7 +156,7 @@ jobs:
   example-pr-required-check:
     # Run even if example tests are skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [check-file-changes, torch-pr, speculative-decoding-pr, trtllm-pr, onnx-pr]
+    needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
@@ -180,7 +164,6 @@ jobs:
           needs.check-file-changes.result != 'success' ||
           (needs.check-file-changes.outputs.any_changed == 'true' && (
             needs.torch-pr.result != 'success' ||
-            needs.speculative-decoding-pr.result != 'success' ||
             needs.trtllm-pr.result != 'success' ||
             needs.onnx-pr.result != 'success'
           ))
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index cb4686815..3e55682cd 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -59,8 +59,16 @@ jobs:
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - example: py312-cuda12-gpu  # tox env name, passed to "tox -e" in the run step
+            timeout: 90  # minutes, fed to timeout-minutes
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 120
+    timeout-minutes: ${{ matrix.timeout }}  # per-entry limit from the matrix
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
@@ -74,11 +82,19 @@ jobs:
         run: |
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
-        run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
+        run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - example: py312-cuda12-gpu  # tox env name; assumed to be read by the aliased gpu_steps - confirm
+            timeout: 90  # minutes, fed to timeout-minutes
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 150
+    timeout-minutes: ${{ matrix.timeout }}  # per-entry limit from the matrix
     container: *gpu_container
     steps: *gpu_steps
   gpu-pr-required-check:
diff --git a/pyproject.toml b/pyproject.toml
index 176866d41..bffa547b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -132,8 +132,8 @@ disable_error_code = ["attr-defined"]
 [tool.pytest.ini_options]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
-# print execution time for 20 slowest tests and generate coverage reports
-addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+# print execution time for 50 slowest tests and generate coverage reports
+addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=50 --strict-markers"
 pythonpath = ["tests/"]
 markers = [
     "manual: Only run when --run-manual is given",
diff --git a/setup.py b/setup.py
index 242505302..8f5578e89 100644
--- a/setup.py
+++ b/setup.py
@@ -77,6 +77,7 @@
         "pytest-cov",
         "pytest-instafail",
         "pytest-timeout",
+        "sentencepiece",  # For test_unified_export_megatron.py, test_vllm_fakequant_megatron_export.py
         "timm",
         "torchprofile>=0.0.4",  # For computing flops of CV models
         "torchvision",
diff --git a/tests/gpu_megatron/_extensions b/tests/gpu_megatron/_extensions
new file mode 120000
index 000000000..dc4ffce33
--- /dev/null
+++ b/tests/gpu_megatron/_extensions
@@ -0,0 +1 @@
+../gpu/_extensions/
\ No newline at end of file
diff --git a/tests/gpu_megatron/torch/conftest.py b/tests/gpu_megatron/torch/conftest.py
new file mode 120000
index 000000000..40eda16c0
--- /dev/null
+++ b/tests/gpu_megatron/torch/conftest.py
@@ -0,0 +1 @@
+../../gpu/torch/conftest.py
\ No newline at end of file
diff --git a/tests/gpu/torch/distill/plugins/test_distill_megatron.py b/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py
similarity index 100%
rename from tests/gpu/torch/distill/plugins/test_distill_megatron.py
rename to tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py
diff --git a/tests/gpu/torch/export/test_unified_export_megatron.py b/tests/gpu_megatron/torch/export/test_unified_export_megatron.py
similarity index 100%
rename from tests/gpu/torch/export/test_unified_export_megatron.py
rename to tests/gpu_megatron/torch/export/test_unified_export_megatron.py
diff --git a/tests/gpu/torch/export/test_vllm_fakequant_megatron_export.py b/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py
similarity index 100%
rename from tests/gpu/torch/export/test_vllm_fakequant_megatron_export.py
rename to tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py
diff --git a/tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
similarity index 100%
rename from tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
rename to tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
diff --git a/tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
similarity index 100%
rename from tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
rename to tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
diff --git a/tests/gpu/torch/opt/plugins/test_megatron_chaining.py b/tests/gpu_megatron/torch/opt/plugins/test_megatron_chaining.py
similarity index 100%
rename from tests/gpu/torch/opt/plugins/test_megatron_chaining.py
rename to tests/gpu_megatron/torch/opt/plugins/test_megatron_chaining.py
diff --git a/tests/gpu/torch/peft/test_megatron_peft.py b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py
similarity index 100%
rename from tests/gpu/torch/peft/test_megatron_peft.py
rename to tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py
diff --git a/tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
similarity index 100%
rename from tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
rename to tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
diff --git a/tests/gpu/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
similarity index 100%
rename from tests/gpu/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
rename to tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
diff --git a/tests/gpu/torch/quantization/plugins/test_apex.py b/tests/gpu_megatron/torch/quantization/plugins/test_apex.py
similarity index 100%
rename from tests/gpu/torch/quantization/plugins/test_apex.py
rename to tests/gpu_megatron/torch/quantization/plugins/test_apex.py
diff --git a/tests/gpu/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
similarity index 100%
rename from tests/gpu/torch/quantization/plugins/test_megatron.py
rename to tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
diff --git a/tests/gpu/torch/quantization/plugins/test_transformer_engine.py b/tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py
similarity index 100%
rename from tests/gpu/torch/quantization/plugins/test_transformer_engine.py
rename to tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py
diff --git a/tests/gpu/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py b/tests/gpu_megatron/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py
similarity index 100%
rename from tests/gpu/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py
rename to tests/gpu_megatron/torch/sparsity/weight_sparsity/plugins/test_megatron_sparsity.py
diff --git a/tests/gpu/torch/speculative/plugins/test_speculative_megatron_modules.py b/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py
similarity index 100%
rename from tests/gpu/torch/speculative/plugins/test_speculative_megatron_modules.py
rename to tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py
diff --git a/tests/gpu/torch/utils/plugins/test_utils_megatron.py b/tests/gpu_megatron/torch/utils/plugins/test_utils_megatron.py
similarity index 100%
rename from tests/gpu/torch/utils/plugins/test_utils_megatron.py
rename to tests/gpu_megatron/torch/utils/plugins/test_utils_megatron.py
diff --git a/tox.ini b/tox.ini
index ee7acf029..ae296e5bd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -60,23 +60,27 @@ commands =
 [testenv:{py310,py311,py312}-cuda12-gpu]
 commands_pre =
     # Install deps here so that it gets installed even in --current-env
-    pip install -U megatron-core
     pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
 
+    pip install -e .[all,dev-test]
+commands =
+    # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
+    python -m pytest tests/gpu
+
+[testenv:{py310,py311,py312}-cuda12-gpu-megatron]
+commands_pre =
+    # Install deps here so that it gets installed even in --current-env
+    pip install -U megatron-core
+
     # Skip triton because pytorch-triton is installed in the NGC PyTorch containers
     pip install pip-mark-installed
     pip-mark-installed triton
     pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
 
-    # Install Eagle-3 test dependencies
-    pip install tiktoken blobfile sentencepiece
-
-    # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
-    #   to avoid possible CUDA version mismatch
     pip install -e .[all,dev-test]
 commands =
     # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
-    python -m pytest tests/gpu
+    python -m pytest tests/gpu_megatron
 
 #############################################
 # Code quality checks on all files or on diff