Closed
136 commits
e5c4d96
Add MBridge distillation support for AnyModel checkpoints
danielkorzekwa Feb 18, 2026
96fa55e
Add missing files from modelopt/torch/puzzletron/export/mbridge
danielkorzekwa Feb 18, 2026
c0be29b
A tutorial on mbridge distillation for puzzletron/any_model
danielkorzekwa Feb 19, 2026
4523c9a
Update distillation readme
danielkorzekwa Feb 19, 2026
7bbf994
Improve mbridge tutorial for anymodel
danielkorzekwa Feb 19, 2026
aca03d2
Fixing distillation for heterogeneous models (call self.teacher.finali…
danielkorzekwa Feb 20, 2026
194712e
Add original keval distill script.
danielkorzekwa Feb 23, 2026
888292f
Replace null tokenizer with a teacher tokenizer
danielkorzekwa Feb 23, 2026
ce356bd
Ensure exception is printed (and not lost during distributed run)
danielkorzekwa Feb 23, 2026
d5c9e06
Add anymodel support to mbridge distillation.
danielkorzekwa Feb 23, 2026
232896e
Improve error handling
danielkorzekwa Feb 23, 2026
5f8eaea
Improve mbridge distillation readme
danielkorzekwa Feb 23, 2026
252a296
Update mbridge distillation readme.
danielkorzekwa Feb 23, 2026
cd5346c
delete old code
danielkorzekwa Feb 23, 2026
fa2c32c
Make mbridge distillation work with nvidian+nemo+26.02.rc5
danielkorzekwa Feb 23, 2026
a39e63c
rename distill_hf_keval.py to distill_hf.py
danielkorzekwa Feb 23, 2026
342e367
Restore deleted import_anymodel_to_mbridge script (could be useful yet)
danielkorzekwa Feb 24, 2026
285ac98
Top-K KL Divergence loss (#747)
AAnoosheh Jan 13, 2026
af35e96
[5796745][ONNX][Autocast] Fix opset check for model with custom ops (…
gcunhase Jan 13, 2026
fdbdf78
remove duplicated RMSNorm and use LlamaRMSNorm from transformers (#774)
yeyu-nvidia Jan 13, 2026
06602b9
[1/2] Address security concerns in code (#626)
kevalmorabia97 Jan 13, 2026
4c418cf
[NVBUG 5801937] Disable dq_only by default (#777)
ajrasane Jan 13, 2026
cab1e87
Add static per block MSE for NVFP4 weight (#613)
Fridah-nv Jan 13, 2026
59996af
[5763448][ONNX][Autocast] Fix Resize input type mismatch error (#757)
gcunhase Jan 14, 2026
3eb3fa1
Fix AWQ export when quantization of some layers are disabled (#721)
meenchen Jan 14, 2026
0eecf2c
Fix Qwen3 recipe and update autoquant example cmd (#749)
meenchen Jan 14, 2026
511c0b6
chg: passing through trust_remote_code (#778)
ChenhanYu Jan 14, 2026
21bcb6a
Remove quantization_config in config.json from original deepseek mode…
Edwardf0t1 Jan 15, 2026
3c22904
Change trust_remote_code default to False for security reason (#787)
ChenhanYu Jan 15, 2026
6708374
[0.5/3] Diffusion ckpt export for NVFP4 & FP8 (#783)
jingyu-ml Jan 15, 2026
9132bc8
[5763424][ONNX][Autocast] Fix ConstantOfShape layer output precision …
gcunhase Jan 16, 2026
767d3ba
[NVBug 5702186] Fix awq model export for Gemma3 (#793)
meenchen Jan 18, 2026
913ca27
[5750013][5591945][5360813]: AutoCast standalone implementation for t…
galagam Jan 18, 2026
d6b8b76
[5676209] Fix duplicated calib data (#794)
gcunhase Jan 19, 2026
ea97609
Define kv cache scaling factor as amax / 448 (#790)
cjluo-nv Jan 20, 2026
2494e9a
Add Quantizers for Qwen3VLMoeTextDecoderLayer (#666)
soodoshll Jan 20, 2026
ee7b807
Support KIMI K2 Thinking int4 checkpoint PTQ (#669)
cjluo-nv Jan 21, 2026
217f3b1
Revert onnxruntime-gpu version to 1.22.0 for Windows (#801)
hthadicherla Jan 21, 2026
4cc01b7
Add Security considerations in docs (#803)
kevalmorabia97 Jan 21, 2026
f81f2e6
Add NAS to Minitron pruning for parameter based auto-pruning (#720)
kevalmorabia97 Jan 21, 2026
a1d20e2
[1/3] Diffusion ckpt export for NVFP4 & FP8 (#781)
jingyu-ml Jan 21, 2026
028c93d
[1/3] Add the fastvideo support (#804)
jingyu-ml Jan 21, 2026
a761422
Svdquant huggingface checkpoint export support (#754)
sychen52 Jan 22, 2026
ae47e9b
Fix moe amax remedy for dsr1 and remove global barrier in quantizatio…
ChenhanYu Jan 23, 2026
8542660
Fix a nvfp4 weight amax attribute issue during export (#785)
Edwardf0t1 Jan 23, 2026
0636457
Support megatron generate for vlm (#773)
yueshen2016 Jan 23, 2026
0e706a8
[Minor] Force 'fuse_wgrad_accumulation' to false for TE GroupedLinear…
realAsma Jan 23, 2026
01b767f
Feat: Context Parallel for Eagle3 Training (#745)
h-guo18 Jan 24, 2026
02badd8
Ynankani/update windows benchmark md (#762)
ynankani Jan 25, 2026
ad01ecd
[5676209][ONNX][Autocast] Add support for single `npz` file with mult…
gcunhase Jan 26, 2026
498968a
Support VLM calibration with image-text data (#755)
Edwardf0t1 Jan 26, 2026
2ce8f17
[5725362] AutoCast Fixes for models with external data (#731)
galagam Jan 26, 2026
d5c0706
add FP8 sweep option for static NVFP4 MSE (#758)
Fridah-nv Jan 26, 2026
451b650
Change cnn_dailymail to abisee/cnn_dailymail (#819)
kevalmorabia97 Jan 27, 2026
be040cb
[5525939] Allow user to select target opset in MOQ (#809)
galagam Jan 27, 2026
d1dac55
Modelopt-windows documentation update (#812)
vishalpandya1990 Jan 28, 2026
2dfa873
Add support for MXFP8 PTQ (#736)
danisereb Jan 28, 2026
1ef0ffe
Nemotron Nano PTQ fix where MoELayer forward has additional named arg…
ChenhanYu Jan 28, 2026
3bd7315
Support MLA nvfp4 quant for Deepseek for max perf (#582)
binghanc Jan 29, 2026
2322a39
Rename MLM teacher arg (#829)
AAnoosheh Jan 29, 2026
fb92a58
Context parallelism for Megatron core models (#818)
yeyu-nvidia Jan 29, 2026
c5375e4
GPTQ Lite implementation (#555)
sugunav14 Jan 30, 2026
936aea1
Added column-major storage of weights and scales in INT4 quantization…
hthadicherla Feb 2, 2026
8eba86f
Layerwise KD mode (#802)
AAnoosheh Feb 2, 2026
89778e5
Integrate Automated QDQ placement tool - Part 1 (#701)
willg-nv Feb 3, 2026
dcdc484
Add Megatron-Bridge pruning example scripts (#800)
kevalmorabia97 Feb 3, 2026
155bbf3
Increase nightly gpu test timeout to 150 mins
kevalmorabia97 Feb 3, 2026
4242f02
Fixes for Megatron Expert Parallel, GroupedMLP and SequentialMLP (#831)
realAsma Feb 4, 2026
a098ecd
Noeyy/add test cases for the newly added checkpoints on HF (#827)
noeyy-mino Feb 4, 2026
262e948
Update on the QuantModule & DynamicModule to accept external forward …
jingyu-ml Feb 4, 2026
9b84e13
[2/4] Diffusion Quantized ckpt export (#810)
jingyu-ml Feb 4, 2026
33d4d27
Latent MOE & Repeated MTP support for NemotronH; fix KV cache quant e…
jenchen13 Feb 4, 2026
bb5771b
Move parallel_state init and warnings to Quant DynamicModule + MBridg…
kevalmorabia97 Feb 4, 2026
151d451
GLM-4.7 MTP support (#792)
Edwardf0t1 Feb 4, 2026
0b8624b
Integrate Automated QDQ placement tool - part 2.1 (#844)
willg-nv Feb 6, 2026
533e8d6
Fix TEGroupedLinear quantization for expert parallelism (EP > 1) (#833)
yueshen2016 Feb 6, 2026
ec258b3
Track global_amax for weight FP4 MSE sweep; Refactor to NVFP4StaticQa…
realAsma Feb 6, 2026
26a38af
Fix Sequential MLP amax sync deadlock (#862)
ChenhanYu Feb 6, 2026
9811fc7
Add contribution guidelines for experimental features (#867)
kaix-nv Feb 7, 2026
ba4d461
[5868890][ONNX][Autocast] Fix: failure when checking input shape with…
gcunhase Feb 9, 2026
a9f5895
fix the path change in torch v2.10 for spec dec (#863)
yeyu-nvidia Feb 9, 2026
3d61944
Support Qwen3 Next MTP load and export (#860)
cjluo-nv Feb 9, 2026
84aceb8
Add Dynamic Memory Sparsification (DMS) training and inference implem…
kstaniszewsknv Feb 10, 2026
ffa08c1
[3.1/4] Diffusion Quantized ckpt export - WAN 2.2 14B (#855)
jingyu-ml Feb 11, 2026
393b97d
[fix][5875912] Fix autoquant-autodeploy example (#878)
Fridah-nv Feb 11, 2026
278c70b
Add Megatron-Bridge recipe-free distillation example script (#861)
kevalmorabia97 Feb 11, 2026
331c835
Integrate Automated QDQ placement tool - part 2.2 (#845)
willg-nv Feb 12, 2026
d1f3618
Integrate Automated QDQ placement tool - part 2.3 (#846)
willg-nv Feb 12, 2026
a091eba
Chenhany/megatron export per layer (#881)
ChenhanYu Feb 12, 2026
b3edcf4
Add Nemotron parse PTQ support (#786)
Edwardf0t1 Feb 13, 2026
8e19ec3
MBridge pruning minor fix for saving pruned NemotronH (#887)
kevalmorabia97 Feb 13, 2026
47ced39
Separate CI job for Megatron GPU tests (#888)
kevalmorabia97 Feb 13, 2026
a5fd5b2
[OMNIML-3232] Support full TE spec for NemotronH HF-to-Megatron impor…
yueshen2016 Feb 14, 2026
7b47e72
[OMNIML-3505] LTX-2 Distillation Trainer (#892)
mxinO Feb 14, 2026
0bd4313
[fix][5889686] AutoCast: Fix logger (#890)
galagam Feb 17, 2026
ad84a0d
Mamba MOE Quant Configs + Fix Export Bug (#882)
jenchen13 Feb 17, 2026
23133ca
Support MiniMax M2.1 (FP8 checkpoint) (#817)
cjluo-nv Feb 17, 2026
946bf53
[OMNIML-2850] [3/n] Adds sparse attention calibration (#538)
kaix-nv Feb 18, 2026
e880e74
Refactor: Eagle data loading (#668)
h-guo18 Feb 18, 2026
2ce27c0
Diffusion export bug fixed for model_index.json (#901)
jingyu-ml Feb 18, 2026
ae4885f
Fix: restore requires_grad in transformers5 reloading (#907)
h-guo18 Feb 19, 2026
d9d3203
[Bug fix] Fake quantized model save after HF accelerate hooks are add…
realAsma Feb 19, 2026
fb60ab6
[NVBUG: 5804406] Auto detect MOE layers (#900)
cjluo-nv Feb 19, 2026
8627a3d
add local hessian calibration (#788)
Fridah-nv Feb 20, 2026
8098f98
Support multiple-batch input for autocast calibration. (#760)
byte-deve Feb 20, 2026
44e54aa
Fix DeepSeek PTQ script (#912)
cjluo-nv Feb 20, 2026
64eda9b
Upgrade Dev containers for CICD to latest (#891)
kevalmorabia97 Feb 21, 2026
a030d7d
Remove test_llama_eval_sparse_attention (#914)
kaix-nv Feb 21, 2026
eace1ae
Sync MOE layer input quantizer only (#903)
jenchen13 Feb 21, 2026
b353110
Added support to rotate in fp32 (optional) (#885)
kinjalpatel27 Feb 23, 2026
b2788ef
Improve megatron dataset preprocessing script and update docs (#918)
kevalmorabia97 Feb 23, 2026
dc3a6ea
Fix test_transformers_tp for torch 2.10 env (#915)
kevalmorabia97 Feb 23, 2026
dd33fce
flush print megatron tokenization stats and update readme (#927)
kevalmorabia97 Feb 24, 2026
481cd83
SpecDec Bench: February Update (#875)
IzzyPutterman Feb 24, 2026
65b3f88
Fix: quant config error on quantized offline eagle (#925)
h-guo18 Feb 24, 2026
fd2d279
Fix serializing a distillation checkpoint to also serialize fields fr…
danielkorzekwa Feb 25, 2026
4aee231
Create a script for export_mbridge_to_hf.py based on examples/convers…
danielkorzekwa Feb 25, 2026
c07835c
remove import functionality
danielkorzekwa Feb 25, 2026
9d9acbd
Destroy process group before creating a new one + README for distilla…
danielkorzekwa Feb 25, 2026
2bbc183
Add automodel_distillation example: KD with NeMo AutoModel for AnyMod…
Separius Feb 25, 2026
28bd48b
Add qwen distillation results to distillation tutorial
danielkorzekwa Feb 26, 2026
94bbddf
code refactoring
danielkorzekwa Feb 26, 2026
48e89ed
Remove not needed method
danielkorzekwa Feb 27, 2026
b4107a9
clean up docs
danielkorzekwa Mar 2, 2026
b687072
Delete not needed file
danielkorzekwa Mar 2, 2026
a342b96
Integration test for distill_hf
danielkorzekwa Mar 3, 2026
7833d96
Use strict=False for test_distill_hf.py which uses small models (2 la…
danielkorzekwa Mar 3, 2026
5c9c9fb
change mbs in the test from 2 to 1 (to match mbridge distillation tut…
danielkorzekwa Mar 3, 2026
b20b2e1
Improve comments
danielkorzekwa Mar 3, 2026
88f2295
Verify that the distilled model can be loaded in HuggingFace format
danielkorzekwa Mar 3, 2026
c070633
Delete not needed import script
danielkorzekwa Mar 3, 2026
c515be2
Update Dataset Preparation step
danielkorzekwa Mar 3, 2026
3372022
Improve ## Setup section in mbridge distillation readme. Code refacto…
danielkorzekwa Mar 4, 2026
dd010f1
Set the current working dir in a docker container: -w /opt/Model-Opti…
danielkorzekwa Mar 4, 2026
5ce7362
replace submit_job with srun
Separius Feb 26, 2026
9fea797
gpt-oss 20b support (#889)
chochowski Feb 27, 2026
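Among the merged commits above is a Top-K KL Divergence loss (#747) for distillation. As a rough illustration of the idea only — not the repository's actual implementation — here is a minimal pure-Python sketch of KL divergence restricted to the teacher's top-k logits (function names `topk_kl` and `_softmax` are hypothetical):

```python
import math

def _softmax(xs):
    # numerically stable softmax over a short list of logits
    m = max(xs)
    exps = [math.exp(x - m) for x in xs]
    z = sum(exps)
    return [e / z for e in exps]

def topk_kl(student_logits, teacher_logits, k=2):
    """KL(teacher || student) restricted to the teacher's top-k vocabulary slots."""
    # indices of the teacher's k largest logits
    idx = sorted(range(len(teacher_logits)), key=lambda i: -teacher_logits[i])[:k]
    p = _softmax([teacher_logits[i] for i in idx])  # teacher probs on truncated support
    q = _softmax([student_logits[i] for i in idx])  # student probs on the same slots
    return sum(pi * math.log(pi / qi) for pi, qi in zip(p, q))
```

Restricting the support to the teacher's top-k entries keeps the loss focused on the tokens the teacher actually considers plausible and avoids summing over the full vocabulary.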
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -45,6 +45,7 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
/examples/megatron_bridge @NVIDIA/modelopt-examples-megatron-codeowners
/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
170 changes: 170 additions & 0 deletions .github/workflows/example_tests.yml
@@ -0,0 +1,170 @@
name: Example tests

on:
push:
branches: ["pull-request/[0-9]+"]
# NOTE: paths cannot be used since push happens to copied PR and only latest commit to PR is used
schedule:
- cron: "0 0 * * *" # Nightly
workflow_dispatch: # On-demand

# Cancel previous runs if new commit is pushed to the same PR
concurrency:
group: ${{ github.workflow }}-${{ startsWith(github.ref, 'refs/heads/pull-request/') && github.ref || github.sha }}
cancel-in-progress: true

jobs:
check-file-changes:
if: startsWith(github.ref, 'refs/heads/pull-request/')
runs-on: ubuntu-latest
outputs:
any_changed: ${{ steps.changed-tests.outputs.any_changed }}
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- id: get-pr-info
uses: nv-gha-runners/get-pr-info@main
# Get commit from main branch that is present in the PR to use as base for changed files
- id: calculate-merge-base
env:
PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
run: |
(echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
- name: Check for changes in test-relevant directories
id: changed-tests
uses: step-security/changed-files@v46.0.5
with:
base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
files: |
.github/workflows/example_tests.yml
examples/**
modelopt/**
setup.py
tests/examples/**
fail_on_initial_diff_error: true
wait-checks:
needs: [check-file-changes]
if: needs.check-file-changes.outputs.any_changed == 'true'
uses: ./.github/workflows/_wait_for_checks.yml
permissions:
checks: read
secrets: inherit
with:
match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
delay: 300s

##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
torch-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
example: [llm_distill, llm_qat, llm_sparsity]
include:
- example: speculative_decoding
docker_image: "26.01"
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-l4-latest-1

torch-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy:
fail-fast: false
matrix:
example: [llm_distill, llm_qat, llm_sparsity]
include:
- example: speculative_decoding
docker_image: "26.01"
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-2

##### TensorRT-LLM Example Tests #####
trtllm-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
example: [llm_ptq] # vlm_ptq temporarily disabled due to pipeline error
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-1

trtllm-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy:
fail-fast: false
matrix:
example: [llm_autodeploy, llm_eval, llm_ptq, vlm_ptq]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-2

##### ONNX/TensorRT Example Tests #####
onnx-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
example: [diffusers, torch_onnx]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
runner: linux-amd64-gpu-l4-latest-1

onnx-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy:
fail-fast: false
matrix:
example: [diffusers, torch_onnx]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
runner: linux-amd64-gpu-l4-latest-1

##### Required Check for PR #####
example-pr-required-check:
# Run even if example tests are skipped
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
runs-on: ubuntu-latest
steps:
- name: Required GPU tests did not succeed
if: |
needs.check-file-changes.result != 'success' ||
(needs.check-file-changes.outputs.any_changed == 'true' && (
needs.torch-pr.result != 'success' ||
needs.trtllm-pr.result != 'success' ||
needs.onnx-pr.result != 'success'
))
run: exit 1
26 changes: 21 additions & 5 deletions .github/workflows/gpu_tests.yml
@@ -1,4 +1,4 @@
-# NOTE: Make sure this file is consistent with .gitlab/tests.yml
+# TODO: Optimize gpu tests runtime!
name: GPU tests

on:
@@ -59,10 +59,18 @@ jobs:
gpu-tests-pr:
needs: [check-file-changes, wait-checks]
if: needs.check-file-changes.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
include:
- example: cuda13-gpu
timeout: 90
- example: cuda13-gpu-megatron
timeout: 120
runs-on: linux-amd64-gpu-l4-latest-1
-timeout-minutes: 120
+timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
-image: nvcr.io/nvidia/pytorch:25.06-py3
+image: nvcr.io/nvidia/pytorch:26.01-py3
env:
GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -76,11 +84,19 @@ jobs:
- name: Install dependencies for mip
run: apt-get update && apt-get install -y libffi-dev
- name: Run gpu tests
-run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
+run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
gpu-tests-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy:
fail-fast: false
matrix:
include:
- example: cuda13-gpu
timeout: 90
- example: cuda13-gpu-megatron
timeout: 120
runs-on: linux-amd64-gpu-h100-latest-2
-timeout-minutes: 120
+timeout-minutes: ${{ matrix.timeout }}
container: *gpu_container
steps: *gpu_steps
gpu-pr-required-check:
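The `matrix.include` entries in the gpu_tests.yml diff above give each GPU lane its own tox environment and timeout. A small sketch (hypothetical names, not GitHub Actions internals) of how such entries expand into per-job settings:

```python
# Mirrors the two `include` entries from the workflow above.
MATRIX_INCLUDE = [
    {"example": "cuda13-gpu", "timeout": 90},
    {"example": "cuda13-gpu-megatron", "timeout": 120},
]

def expand_jobs(include):
    """One job per include entry: its tox command plus its timeout-minutes."""
    return [
        {
            "tox_command": f"tox -e {entry['example']} --current-env",
            "timeout_minutes": entry["timeout"],
        }
        for entry in include
    ]
```

Splitting the Megatron tests into their own matrix entry is what lets them carry a longer timeout than the base CUDA lane.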
9 changes: 5 additions & 4 deletions .github/workflows/unit_tests.yml
@@ -37,7 +37,7 @@ jobs:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
- name: Run unit tests
-run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
+run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
@@ -55,6 +55,7 @@ jobs:
with:
python-version: "3.12"
- name: Run unit tests (without coverage)
# Some issues with torch 2.10 on Windows, so using 2.9 for now
run: pip install tox && tox -e py312-torch29-tf_latest-unit
multi-py:
if: github.event_name == 'pull_request'
@@ -70,15 +71,15 @@
with:
python-version: "3.${{ matrix.py }}"
- name: Run unit tests
-run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
+run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit
multi-torch:
if: github.event_name == 'pull_request'
needs: [linux]
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
matrix:
-torch: [26, 27, 28]
+torch: [26, 27, 28, 29]
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
@@ -96,7 +97,7 @@
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
- name: Run unit tests
-run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
+run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit
partial-install:
if: github.event_name == 'pull_request'
needs: [linux]
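The unit-test jobs in the diff above assemble tox environment names from the Python minor version, torch version, and transformers version factors. A sketch of that naming scheme as a hypothetical helper:

```python
def tox_env(py_minor, torch, tf="latest", suite="unit"):
    """Compose a tox environment name such as `py312-torch210-tf_latest-unit`,
    matching the `py3{py}-torch{ver}-tf_{tf}-unit` pattern used in the workflow."""
    return f"py3{py_minor}-torch{torch}-tf_{tf}-{suite}"
```

Bumping the default env from `torch29` to `torch210` (and adding `29` to the multi-torch matrix) is then just a change to the version factor, not to the job structure.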
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -109,7 +109,8 @@ repos:
examples/speculative_decoding/main.py|
examples/speculative_decoding/medusa_utils.py|
examples/speculative_decoding/server_generate.py|
examples/puzzletron/evaluation/hf_deployable_anymodel\.py|
examples/puzzletron/evaluation/lm_eval_anymodel.py|
modelopt/torch/puzzletron/anymodel/models/gpt_oss_20b/gpt_oss_pruned_to_mxfp4.py|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py|
)$

2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -40,7 +40,7 @@
"--no-cov",
],
"evenBetterToml.schema.enabled": false, // disable toml/json schema since we have custom fields
-"python.analysis.extraPaths": [
+"cursorpyright.analysis.extraPaths": [
"./tests/" // add tests to python path just like pytest does in pyproject.toml
],
"git.alwaysSignOff": true,
17 changes: 15 additions & 2 deletions CHANGELOG-Windows.rst
@@ -1,6 +1,19 @@
NVIDIA Model Optimizer Changelog (Windows)
==========================================

0.41 (TBD)
^^^^^^^^^^

**Bug Fixes**

- Fix ONNX 1.19 compatibility issues with CuPy during ONNX INT4 AWQ quantization. ONNX 1.19 uses ml_dtypes.int4 instead of numpy.int8, which caused CuPy failures.

**New Features**

- Add support for ONNX Mixed Precision Weight-only quantization using INT4 and INT8 precisions. Refer to the quantization `example for GenAI LLMs <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/windows/onnx_ptq/genai_llm>`_.
- Add support for some diffusion models' quantization on Windows. Refer to the `example script <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/windows/torch_onnx/diffusers>`_ for details.
- Add `Perplexity <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/windows/accuracy_benchmark/perplexity_metrics>`_ and `KL-Divergence <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/windows/accuracy_benchmark/kl_divergence_metrics>`_ accuracy benchmarks.

0.33 (2025-07-21)
^^^^^^^^^^^^^^^^^

@@ -25,8 +38,8 @@ NVIDIA Model Optimizer Changelog (Windows)

- This is the first official release of Model Optimizer for Windows
- **ONNX INT4 Quantization:** :meth:`modelopt.onnx.quantization.quantize_int4 <modelopt.onnx.quantization.int4.quantize>` now supports ONNX INT4 quantization for DirectML and TensorRT* deployment. See :ref:`Support_Matrix` for details about supported features and models.
-- **LLM Quantization with Olive:** Enabled LLM quantization through Olive, streamlining model optimization workflows. Refer `example <https://github.com/microsoft/Olive/tree/main/examples/phi3#quantize-models-with-nvidia-Model-Optimizer>`_
-- **DirectML Deployment Guide:** Added DML deployment guide. Refer :ref:`DirectML_Deployment`.
+- **LLM Quantization with Olive:** Enabled LLM quantization through Olive, streamlining model optimization workflows. Refer `Olive example <https://github.com/microsoft/Olive/tree/main/examples/phi3#quantize-models-with-nvidia-Model-Optimizer>`_.
+- **DirectML Deployment Guide:** Added DML deployment guide. Refer :ref:`Onnxruntime_Deployment` deployment guide for details.
- **MMLU Benchmark for Accuracy Evaluations:** Introduced `MMLU benchmarking <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/windows/accuracy_benchmark/README.md>`_ for accuracy evaluation of ONNX models on DirectML (DML).
- **Published quantized ONNX models collection:** Published quantized ONNX models at HuggingFace `NVIDIA collections <https://huggingface.co/collections/nvidia/optimized-onnx-models-for-nvidia-rtx-gpus>`_.

Expand Down
37 changes: 35 additions & 2 deletions CHANGELOG.rst
@@ -1,6 +1,39 @@
NVIDIA Model Optimizer Changelog (Linux)
========================================

0.43 (2026-03-xx)
^^^^^^^^^^^^^^^^^

**New Features**

- Users no longer need to manually register MOE modules to ensure expert calibration coverage in the PTQ workflow.
- ``hf_ptq.py`` now saves the quantization summary and MoE expert token count table to the export directory.
- Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
- Add support for rotating the input before quantization for RHT.
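The sparse attention entry above reduces cost by skipping attention computation below a calibrated threshold. A toy single-query sketch of that idea (not ModelOpt's actual API; assumes the threshold sits below the row's maximum score so at least one key survives):

```python
import math

def sparse_attention_row(query, keys, threshold):
    """Attention weights for one query, dropping keys whose scaled dot-product
    score falls below `threshold` (their weight becomes exactly 0)."""
    d = len(query)
    scores = [
        sum(qi * ki for qi, ki in zip(query, key)) / math.sqrt(d) for key in keys
    ]
    kept = [s if s >= threshold else None for s in scores]  # None = skipped
    m = max(s for s in kept if s is not None)
    exps = [math.exp(s - m) if s is not None else 0.0 for s in kept]
    z = sum(exps)
    return [e / z for e in exps]
```

In a real kernel the skipped positions would never be computed at all; the calibration step mentioned in the entry is what picks a `threshold` that preserves accuracy.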

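The RHT entry in the 0.43 list above rotates inputs before quantization; RHT here refers to a randomized Hadamard transform. A self-contained sketch (Sylvester construction; `hadamard` and `rht_rotate` are hypothetical names) showing the key property — the rotation spreads outliers across channels while preserving the vector's norm:

```python
import math
import random

def hadamard(n):
    """Sylvester-construction Hadamard matrix; n must be a power of two."""
    H = [[1.0]]
    while len(H) < n:
        H = [row + row for row in H] + [row + [-v for v in row] for row in H]
    return H

def rht_rotate(x, seed=0):
    """Random sign flips followed by H / sqrt(n): an orthogonal rotation."""
    n = len(x)
    rng = random.Random(seed)
    signs = [rng.choice((-1.0, 1.0)) for _ in range(n)]
    flipped = [s * v for s, v in zip(signs, x)]
    H = hadamard(n)
    inv_sqrt_n = 1.0 / math.sqrt(n)
    return [
        inv_sqrt_n * sum(H[i][j] * flipped[j] for j in range(n)) for i in range(n)
    ]
```

Because the transform is orthogonal, quantizing the rotated tensor and undoing the rotation later loses no information beyond the quantization error itself.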
0.42 (2026-02-xx)
^^^^^^^^^^^^^^^^^

**Bug Fixes**

- Fix calibration data generation with multiple samples in the ONNX workflow.

**New Features**

- Add standalone type inference option (``--use_standalone_type_inference``) in ONNX AutoCast as an alternative to ONNX's ``infer_shapes``. This experimental feature performs type-only inference without shape inference, useful as a workaround when shape inference fails or to avoid unnecessary shape inference overhead.
- Add support for Kimi K2 Thinking model quantization from the original int4 checkpoint.
- Add support for ``params`` constraint based automatic neural architecture search in Minitron pruning (``mcore_minitron``) as an alternative to manual pruning (using ``export_config``). See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details on its usage.
- Add a new example for Minitron pruning with the Megatron-Bridge framework, including advanced pruning usage with the new ``params`` constraint based pruning, as well as an example for distillation with Megatron-Bridge. Check `examples/megatron_bridge/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge>`_ for example scripts.
- Add support for calibration data with multiple samples in ``npz`` format in the ONNX Autocast workflow.
- Add ``--opset`` option to ONNX quantization CLI to specify the target opset version for the quantized model.
- Add support for context parallelism in Eagle speculative decoding for Hugging Face and Megatron Core models.
- Add unified Hugging Face export support for diffusers pipelines/components.
- Add LTX-2 and Wan2.2 (T2V) support in the diffusers quantization workflow.
- Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
- Add support for image-text data calibration in PTQ for Nemotron VL models.
- Add PTQ support for Nemotron Parse.
- Add distillation support for LTX-2. See `examples/diffusers/distillation/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/diffusers/distillation>`_ for more details.

0.41 (2026-01-19)
^^^^^^^^^^^^^^^^^

@@ -84,7 +117,7 @@ NVIDIA Model Optimizer Changelog (Linux)

**Documentation**

-- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
+- Add general guidelines for Minitron pruning and distillation. See `pruning guidelines <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
- Added example for exporting QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md <https://github.com/NVIDIA/Model-Optimizer/blob/79ef31bc7269ba4da0cfab446da5b64509cbfcef/examples/llm_qat/README.md#qlora-deployment>`_ for more details

0.37 (2025-10-08)
@@ -209,7 +242,7 @@ NVIDIA Model Optimizer Changelog (Linux)
- Add support for UNet ONNX quantization.
- Enable ``concat_elimination`` pass by default to improve the performance of quantized ONNX models.
- Enable Redundant Cast elimination pass by default in :meth:`moq.quantize <modelopt.onnx.quantization.quantize>`.
-- Add new attribute ``parallel_state`` to :class:`DynamicModule <modelopt.torch.opt.dynamic.DynamicModule>` to support distributed parallelism such as data parallel and tensor parallel.
+- Add new attribute ``parallel_state`` to :class:`QuantModule <modelopt.torch.quantization.nn.modules.quant_module.QuantModule>` to support distributed parallelism such as data parallel and tensor parallel.
- Add MXFP8, NVFP4 quantized ONNX export support.
- Add new example for torch quantization to ONNX for MXFP8, NVFP4 precision.
