diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0d85ba5342..abca17810f 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -6,6 +6,8 @@ on:
       - reopened
       - synchronize
   workflow_dispatch:
+permissions:
+  contents: read
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
   cancel-in-progress: true
@@ -21,6 +23,8 @@ jobs:
     runs-on: ${{ matrix.OS }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - name: Python check
         uses: actions/setup-python@v4
         with:
@@ -37,3 +41,109 @@ jobs:
       - name: Linux test
         run: |
           bash .github/workflows/scripts_new/linux/4_test.sh
+      - name: Upload wheel for CUDA job
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: linux-wheel
+          path: dist/*.whl
+      - name: Upload CPU coverage data
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-cpu
+          path: |
+            coverage.xml
+            pytest-coverage.txt
+
+  test-cuda:
+    # Runs the CUDA test suite on a GPU runner against the wheel produced by the `build` job.
+    name: Linux CUDA Test
+    needs: build
+    runs-on: gpu-t4-4-core
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Python check
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Install CUDA libraries
+        # After installing the CUDA 12.8 libraries, add the CUDA targets directory to the dynamic
+        # linker configuration and refresh the linker cache.
+        run: |
+          sudo apt-get install -y libcusolver-dev-12-8 libcusolver-12-8 libcusparse-dev-12-8 libcusparse-12-8 libnvjitlink-12-8 libcublas-12-8
+          echo "/usr/local/cuda/targets/x86_64-linux/lib" | sudo tee /etc/ld.so.conf.d/cuda-targets.conf
+          sudo ldconfig
+      - name: Download wheel
+        # Same artifact name as the "Upload wheel for CUDA job" step of the build job.
+        uses: actions/download-artifact@v4
+        with:
+          name: linux-wheel
+      - name: Install quadrants
+        # The downloaded wheel is expected in the working directory; move it under dist/ and install it.
+        run: |
+          set -x
+          mkdir -p dist
+          mv *.whl dist/
+          pip install dist/*.whl
+      - name: Install test requirements
+        run: |
+          pip install --group test
+          pip install -r requirements_test_xdist.txt
+      - name: Run CUDA tests with coverage
+        run: |
+          bash .github/workflows/scripts_new/linux/4_test_cuda.sh
+      - name: Upload CUDA coverage data
+        # `always()` uploads the coverage data even when the test step failed; the artifact is
+        # downloaded by the coverage-comment job.
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-cuda
+          path: coverage.xml
+
+  coverage-comment:
+    # Posts a diff-coverage report as a PR comment. `always()` lets the job run even when `build` or
+    # `test-cuda` failed, so whatever coverage artifacts were uploaded are still reported.
+    if: github.event_name == 'pull_request' && always()
+    needs: [build, test-cuda]
+    runs-on: ubuntu-latest
+    permissions:
+      # A job-level `permissions` block sets every scope it does not list to `none`, so the read
+      # access used by actions/checkout must be restated next to the write access for commenting.
+      contents: read
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # Full history so the report can be compared against the base branch.
+          fetch-depth: 0
+      - name: Python check
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      # Either artifact may be missing (its job can fail before uploading), so downloads are best-effort.
+      - name: Download CPU coverage
+        uses: actions/download-artifact@v4
+        continue-on-error: true
+        with:
+          name: coverage-cpu
+          path: coverage-cpu
+      - name: Download CUDA coverage
+        uses: actions/download-artifact@v4
+        continue-on-error: true
+        with:
+          name: coverage-cuda
+          path: coverage-cuda
+      - name: Generate coverage report
+        continue-on-error: true
+        env:
+          # Passed through the environment instead of being interpolated into the script text.
+          BASE_REF: ${{ github.base_ref }}
+        run: |
+          COV_XMLS=""
+          if [ -f coverage-cpu/coverage.xml ]; then
+            COV_XMLS="coverage-cpu/coverage.xml"
+          fi
+          if [ -f coverage-cuda/coverage.xml ]; then
+            COV_XMLS="$COV_XMLS coverage-cuda/coverage.xml"
+          fi
+          if [ -z "$COV_XMLS" ]; then
+            echo "No coverage XML files found, skipping report"
+            exit 0
+          fi
+
+          # $COV_XMLS is intentionally unquoted: it holds a space-separated list of paths.
+          python tests/coverage_report.py --report-only \
+            --compare-branch="origin/$BASE_REF" \
+            --coverage-xml $COV_XMLS \
+            --format markdown > coverage-comment.md.tmp
+          # Publish the report only when the generator succeeded and wrote something, so the next
+          # step never tries to post an empty comment.
+          if [ -s coverage-comment.md.tmp ]; then
+            mv coverage-comment.md.tmp coverage-comment.md
+          fi
+      - name: Post coverage comment
+        # hashFiles() returns an empty string when the file does not exist, which skips this step.
+        if: always() && hashFiles('coverage-comment.md') != ''
+        run: gh pr comment ${{ github.event.pull_request.number }} --body-file coverage-comment.md
+        env:
+          GH_TOKEN: ${{ github.token }}
diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh
index f0993bec44..630dc34783 100644
--- a/.github/workflows/scripts_new/linux/4_test.sh
+++ b/.github/workflows/scripts_new/linux/4_test.sh
@@ -7,9 +7,18 @@ pip install -r requirements_test_xdist.txt
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
 ./build/quadrants_cpp_tests  --gtest_filter=-AMDGPU.*
 
+TEST_EXIT=0
+
 # Phase 1: run all tests except torch-dependent ones
-python tests/run_tests.py -v -r 1 -m "not needs_torch"
+python tests/run_tests.py -v -r 3 --coverage -m "not needs_torch" || TEST_EXIT=$?
 
-# Phase 2: install torch, run only torch tests
 pip install torch --index-url https://download.pytorch.org/whl/cpu
-python tests/run_tests.py -v -r 1 -m needs_torch
+QD_KERNEL_COVERAGE=0 python tests/run_tests.py -v -r 3 --coverage --cov-append -m needs_torch || TEST_EXIT=$?
+
+# Phase 3: run tests that are skipped under kernel coverage (offline cache, snode layout, FE-LL observations,
+# etc.) without --coverage so QD_KERNEL_COVERAGE stays 0.
+QD_KERNEL_COVERAGE=0 python tests/run_tests.py -v -r 3 -m "not needs_torch" || TEST_EXIT=$?
+
+python tests/coverage_report.py --collect-only
+
+exit $TEST_EXIT
diff --git a/.github/workflows/scripts_new/linux/4_test_cuda.sh b/.github/workflows/scripts_new/linux/4_test_cuda.sh
new file mode 100755
index 0000000000..60a9a7e78f
--- /dev/null
+++ b/.github/workflows/scripts_new/linux/4_test_cuda.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Runs the CUDA test phases with coverage, then collects the coverage data.
+# Exits with the status of the last test phase that failed, or 0 when every phase passed.
+
+set -ex
+
+# A failing phase records its exit status here instead of aborting the script (despite `set -e`),
+# so the later phases and the coverage collection below still run.
+TEST_EXIT=0
+
+# Disable kernel-level coverage on CUDA: it changes field memory layout and breaks dlpack tests
+# (ValueError: Expected zero byte_offset).  Python code coverage (--cov) still runs.
+QD_KERNEL_COVERAGE=0 python tests/run_tests.py -v -r 1 --arch cuda --coverage -m "not needs_torch" || TEST_EXIT=$?
+
+# Install the CUDA 12.8 build of torch, then run only the torch-dependent tests.
+pip install torch --index-url https://download.pytorch.org/whl/cu128
+QD_KERNEL_COVERAGE=0 python tests/run_tests.py -v -r 1 --arch cuda --coverage --cov-append -m needs_torch || TEST_EXIT=$?
+
+# Run kernel coverage tests on CUDA with coverage enabled — these are skipped by the phases above
+# (QD_KERNEL_COVERAGE=0) and include GPU-only tests like test_kernel_coverage_simt_e2e.
+QD_KERNEL_COVERAGE=1 python tests/run_tests.py -v -r 1 --arch cuda --coverage --cov-append test_kernel_coverage.py || TEST_EXIT=$?
+
+# Not guarded with `||`: under `set -e` a failure here ends the script with this command's status.
+python tests/coverage_report.py --collect-only
+
+exit $TEST_EXIT
diff --git a/.gitignore b/.gitignore
index 0aaf63e31c..bf8a3e999e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,8 +65,13 @@ __pycache__
 /python/test_env
 /CHANGELOG.md
 /.coverage
+/.coverage.*
+_qd_kcov.*
 /coverage.xml
+/coverage-report.html
 /htmlcov
+/diff-cover.*
+/pytest-coverage.txt
 libpython_path.txt
 .vscode
 _build
diff --git a/docs/source/user_guide/index.md b/docs/source/user_guide/index.md
index beeaf38ebc..ce04e72f27 100644
--- a/docs/source/user_guide/index.md
+++ b/docs/source/user_guide/index.md
@@ -57,6 +57,14 @@ graph
 perf_dispatch
 ```
 
+```{toctree}
+:caption: Testing
+:maxdepth: 1
+:titlesonly:
+
+kernel_coverage
+```
+
 ```{toctree}
 :caption: Reference
 :maxdepth: 1
diff --git a/docs/source/user_guide/kernel_coverage.md b/docs/source/user_guide/kernel_coverage.md
new file mode 100644
index 0000000000..31cd58ae6c
--- /dev/null
+++ b/docs/source/user_guide/kernel_coverage.md
@@ -0,0 +1,103 @@
+# Kernel code coverage
+
+Standard Python coverage tools only measure host-side code. Quadrants kernel coverage goes further — it tracks which lines actually execute *inside* compiled kernels on the device (CPU or GPU), including which branches of `if`/`else` blocks are taken at runtime.
+
+The coverage data is written in the standard `coverage.py` format, so it works with `coverage report`, `pytest-cov`, `diff-cover`, and IDE coverage viewers out of the box.
+
+## Prerequisites
+
+Kernel coverage requires the `coverage` Python package:
+
+```bash
+pip install coverage
+```
+
+## Enabling kernel coverage
+
+### Automatic with pytest-cov
+
+If you use `pytest-cov`, kernel coverage is enabled automatically — no configuration needed. Quadrants ships a pytest plugin that detects `--cov` and sets `QD_KERNEL_COVERAGE=1` for you. Just run:
+
+```bash
+pytest --cov=my_package --cov-branch tests/
+```
+
+To disable kernel coverage while still collecting Python coverage, opt out explicitly:
+
+```bash
+QD_KERNEL_COVERAGE=0 pytest --cov=my_package --cov-branch tests/
+```
+
+### Manual with any script
+
+For scripts outside pytest, set the `QD_KERNEL_COVERAGE` environment variable:
+
+```bash
+QD_KERNEL_COVERAGE=1 python my_simulation.py
+```
+
+This works with any script that uses quadrants kernels — no changes to your code are needed.
+
+When the process exits, quadrants writes the collected coverage data to a `_qd_kcov.<pid>` file — one file per process that recorded kernel coverage — in the directory that was the working directory when coverage was first enabled.
+
+## Viewing results
+
+### With coverage.py
+
+Combine the kernel coverage files and produce a report using the standard `coverage` tool:
+
+```bash
+# Combine all kernel coverage files into .coverage
+coverage combine _qd_kcov.*
+
+# Terminal summary
+coverage report --show-missing
+
+# HTML report
+coverage html
+```
+
+### With pytest-cov
+
+When using `pytest-cov`, kernel coverage is enabled automatically (see above). After the run, merge the kernel coverage data into the Python coverage data:
+
+```bash
+coverage combine _qd_kcov.* .coverage
+```
+
+## Key properties
+
+- **Zero overhead when disabled.** The coverage module is never imported unless `QD_KERNEL_COVERAGE=1` is set. There is no cost in normal operation.
+- **Branch coverage.** Probes inside `if`/`else` bodies only fire when that branch is taken, so the report shows which branches actually ran on the device — not merely which kernels were launched or which conditionals exist in the source.
+- **Works with pytest-xdist.** Each worker writes to a separate file; combine them afterward.
+- **Survives `qd.init()` resets.** Coverage data is accumulated across multiple `qd.init()` calls within the same process.
+
+## Advanced usage
+
+### Probe capacity
+
+There is a limit of 100,000 coverage probes per process (one probe per unique source line per kernel/func). If you hit the limit — for example in a very large codebase with many kernels — increase it via the environment variable:
+
+```bash
+QD_COVERAGE_MAX_PROBES=500000 QD_KERNEL_COVERAGE=1 python my_simulation.py
+```
+
+## Coverage and autodiff
+
+The forward pass is covered. The backward pass is not, because instrumenting it would interfere with gradient computation. This is normally fine — the backward pass is auto-generated and replays the same control flow, so forward coverage is sufficient.
+
+One edge case: kernel calls inside a `qd.ad.Tape` with `validation=True` will not be covered.
+
+## Offline cache interaction
+
+Coverage probes change the compiled kernel, so the offline cache will see them as new kernels and recompile. This is expected and does not affect correctness, but the first run with coverage enabled will be slower if you normally rely on cached kernels.
+
+## CI integration
+
+The CI workflow posts a diff coverage report as a PR comment on each push. A **new comment** is created each time (rather than editing the previous one) so that the PR timeline shows a clear chronological sequence of commits and their corresponding coverage results.
+
+## Under the hood
+
+When `QD_KERNEL_COVERAGE=1` is set, quadrants rewrites the Python AST of each `@qd.kernel` and `@qd.func` before compilation. It inserts lightweight probe statements (`field[probe_id] = 1`) at each source line. These probes compile as ordinary field stores and execute on the device alongside your kernel code.
+
+At process exit, the probe data is read back from the device and written to a `.coverage`-compatible file.
diff --git a/pyproject.toml b/pyproject.toml
index a0a6223d49..4c600131e2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -99,6 +99,9 @@ test = [
     "pyright",
 ]
 
+[project.entry-points.pytest11]
+quadrants = "quadrants.pytest_plugin"
+
 [project.urls]
 Homepage = "https://github.com/Genesis-Embodied-AI/quadrants"
 
@@ -120,6 +123,14 @@ requires = [
 # things, without doing full c++ build
 build-backend = "setuptools.build_meta"
 
+[tool.coverage.paths]
+source = [
+    "python/quadrants",
+    "*/site-packages/quadrants",
+]
+
+[tool.coverage.report]
+
 [tool.pytest.ini_options]
 filterwarnings = [
     "ignore:Calling non-taichi function",
diff --git a/python/quadrants/lang/_fast_caching/src_hasher.py b/python/quadrants/lang/_fast_caching/src_hasher.py
index c0dcf7708d..cba05e505c 100644
--- a/python/quadrants/lang/_fast_caching/src_hasher.py
+++ b/python/quadrants/lang/_fast_caching/src_hasher.py
@@ -1,4 +1,5 @@
 import json
+import os
 import warnings
 from typing import Any, Iterable, Sequence
 
@@ -49,6 +50,7 @@ def create_cache_key(
             kernel_source_info.filepath,
             str(kernel_source_info.start_lineno),
             "pruned",
+            "kcov" if os.environ.get("QD_KERNEL_COVERAGE") == "1" else "",
         )
     )
     return cache_key
diff --git a/python/quadrants/lang/_func_base.py b/python/quadrants/lang/_func_base.py
index 73ada738e7..cde4eadb41 100644
--- a/python/quadrants/lang/_func_base.py
+++ b/python/quadrants/lang/_func_base.py
@@ -4,6 +4,7 @@
 import ast
 import inspect
 import math
+import os
 import sys
 import textwrap
 import types
@@ -21,6 +22,11 @@
 
 import numpy as np
 
+
+def _kernel_coverage_enabled() -> bool:
+    """Return True only when the ``QD_KERNEL_COVERAGE`` environment variable is exactly ``"1"``."""
+    flag = os.environ.get("QD_KERNEL_COVERAGE")
+    return flag == "1"
+
+
 from quadrants._lib import core as _qd_core
 from quadrants._lib.core.quadrants_python import KernelLaunchContext
 from quadrants.lang import _kernel_impl_dataclass, impl
@@ -246,9 +252,21 @@ def get_tree_and_ctx(
 
         autodiff_mode = current_kernel.autodiff_mode
 
+        _kcov = None
+        if _kernel_coverage_enabled() and autodiff_mode == _qd_core.AutodiffMode.NONE:
+            from . import (  # pylint: disable=import-outside-toplevel
+                _kernel_coverage as _kcov,
+            )
+
+            tree = _kcov.rewrite_ast(tree, function_source_info.filepath, function_source_info.start_lineno)
+
         quadrants_callable = current_kernel.quadrants_callable
         is_pure = quadrants_callable is not None and quadrants_callable.is_pure
         global_vars = self._get_global_vars(self.func)
+        if _kcov is not None:
+            cov_field = _kcov.get_field()
+            if cov_field is not None:
+                global_vars[_kcov.FIELD_VAR_NAME] = cov_field
 
         template_vars = {}
         if is_kernel or is_real_function:
diff --git a/python/quadrants/lang/_kernel_coverage.py b/python/quadrants/lang/_kernel_coverage.py
new file mode 100644
index 0000000000..67fa1f4710
--- /dev/null
+++ b/python/quadrants/lang/_kernel_coverage.py
@@ -0,0 +1,284 @@
+"""Kernel code coverage via Python AST rewriting.
+
+When enabled (QD_KERNEL_COVERAGE=1), this module rewrites kernel and func ASTs to insert coverage probes — field
+stores that record which source lines actually execute on the GPU. At process exit, the collected data is written
+to a .coverage file compatible with coverage.py / pytest-cov / diff-cover.
+
+The probes are compiled as ordinary field stores by the existing pipeline, so no C++ changes are needed. When
+disabled, this module is never imported and has zero impact on the normal runtime path.
+"""
+
+import ast
+import atexit
+import logging
+import os
+import threading
+import warnings
+from typing import TYPE_CHECKING
+
+from coverage import CoverageData  # type: ignore[import-not-found]
+
+import quadrants as qd
+from quadrants.lang import impl
+
+if TYPE_CHECKING:
+    from quadrants.lang.field import ScalarField
+
+FIELD_VAR_NAME = "_qd_cov"
+_MAX_PROBES = int(os.environ.get("QD_COVERAGE_MAX_PROBES", "100000"))
+
+_lock = threading.Lock()
+_cov_field: "ScalarField | None" = None
+_cov_field_prog: object | None = None  # tracks which Program instance owns _cov_field
+_probe_counter: int = 0
+# {probe_id: (filepath, absolute_lineno)}
+_probe_map: dict[int, tuple[str, int]] = {}
+# Accumulated coverage lines surviving across qd.init() resets
+_accumulated_lines: dict[str, set[int]] = {}
+_reset_hook_installed: bool = False
+# Directory for .coverage and _qd_kcov.* files, captured when coverage is first enabled
+_coverage_dir: str | None = None
+
+
+def _harvest_field() -> None:
+    """Fold the probes recorded in the current coverage field into ``_accumulated_lines``.
+
+    The field is detached from the module state under the lock and read back outside it. Must be called
+    while the runtime is still alive (before clear()).
+    """
+    global _cov_field, _cov_field_prog
+    with _lock:
+        nothing_to_harvest = _cov_field is None or not _probe_map
+        if nothing_to_harvest:
+            return
+        detached_field = _cov_field
+        probes = dict(_probe_map)
+        _cov_field = None
+        _cov_field_prog = None
+    try:
+        values = detached_field.to_numpy()
+    except Exception:
+        logging.warning("Failed to read coverage field, coverage data for this session will be lost", exc_info=True)
+        return
+    with _lock:
+        for probe_id, (filepath, lineno) in probes.items():
+            # Skip probes that fall outside the array or never fired.
+            if probe_id >= len(values) or values[probe_id] == 0:
+                continue
+            _accumulated_lines.setdefault(filepath, set()).add(lineno)
+
+
+def _install_reset_hook() -> None:
+    """Wrap ``PyQuadrants.clear()`` (once per process) so probes are harvested before it runs."""
+    global _reset_hook_installed
+    if not _reset_hook_installed:
+        wrapped_clear = impl.PyQuadrants.clear
+
+        def _hooked_clear(self) -> None:
+            # Read the probes first, then delegate to the original clear().
+            _harvest_field()
+            wrapped_clear(self)
+
+        impl.PyQuadrants.clear = _hooked_clear  # type: ignore[assignment]
+        _reset_hook_installed = True
+
+
+def ensure_field_allocated() -> None:
+    """Allocate (or re-allocate after qd.init()) the global coverage field.
+
+    Also installs the ``PyQuadrants.clear()`` harvest hook and, on the first call, records the current
+    working directory as the location for coverage output files. Returns without allocating when a field
+    already exists for the current Program instance.
+    """
+    global _cov_field, _cov_field_prog, _coverage_dir
+    _install_reset_hook()
+    if _coverage_dir is None:
+        _coverage_dir = os.getcwd()
+    current_prog = impl.get_runtime()._prog
+    # Fast path without the lock: the field is already bound to the current Program.
+    if _cov_field is not None and _cov_field_prog is current_prog:
+        return
+    with _lock:
+        # Re-read the Program and re-check under the lock in case the state changed in the meantime.
+        current_prog = impl.get_runtime()._prog
+        if _cov_field is not None and _cov_field_prog is current_prog:
+            return
+        # One i32 slot per possible probe id.
+        _cov_field = qd.field(dtype=qd.i32, shape=(_MAX_PROBES,))  # type: ignore[assignment]
+        _cov_field_prog = current_prog
+
+
+def get_field() -> "ScalarField | None":
+    """Return the coverage field if it belongs to the current Program instance, otherwise ``None``."""
+    with _lock:
+        owned_by_current_prog = _cov_field_prog is impl.get_runtime()._prog
+        return _cov_field if owned_by_current_prog else None
+
+
+def rewrite_ast(tree: ast.Module, filepath: str, start_lineno: int) -> ast.Module:
+    """Return the kernel/func AST with coverage probes inserted.
+
+    A probe of the form ``_qd_cov[<probe_id>] = 1`` is placed before the first statement found on each source
+    line. Probes that sit inside if/else bodies only execute when that branch runs, which is what yields
+    runtime branch coverage. Probe ids continue from the module-wide counter, and the new id -> (file, line)
+    entries are added to the module-wide probe map.
+    """
+    global _probe_counter
+    with _lock:
+        instrumenter = _CoverageASTRewriter(
+            field_name=FIELD_VAR_NAME,
+            filepath=filepath,
+            start_lineno=start_lineno,
+            probe_id_start=_probe_counter,
+        )
+        # fix_missing_locations() fills in locations in place and returns the node it was given.
+        instrumented = ast.fix_missing_locations(instrumenter.visit(tree))
+        _probe_counter = instrumenter.next_probe_id
+        _probe_map.update(instrumenter.probe_map)
+    return instrumented
+
+
+def _detect_arc_mode() -> bool:
+    """Report whether kernel coverage should be written as arcs (branch mode) rather than lines.
+
+    The ``_QD_KCOV_ARC`` environment variable (set by the pytest plugin) wins when present. Otherwise the
+    existing ``.coverage`` file is inspected. When nothing is known — no file, no measured files, or a read
+    error — the answer is False (line mode), since ``pytest --cov`` without ``--cov-branch`` is the more
+    common invocation.
+    """
+    env_override = os.environ.get("_QD_KCOV_ARC")
+    if env_override is not None:
+        return env_override == "1"
+    if _coverage_dir:
+        data_path = os.path.join(_coverage_dir, ".coverage")
+    else:
+        data_path = ".coverage"
+    try:
+        existing = CoverageData(basename=data_path)
+        existing.read()
+        if existing.measured_files():
+            return existing.has_arcs()
+        return False
+    except Exception:
+        logging.debug("Failed to detect arc mode from .coverage file, defaulting to line mode", exc_info=True)
+        return False
+
+
+def flush() -> None:
+    """Harvest any remaining field data and write all accumulated results to a coverage data file.
+
+    The output file is ``_qd_kcov.<pid>`` inside the directory captured when coverage was first enabled
+    (falling back to the current working directory). Nothing is written when no lines were recorded.
+    Data is written as arcs or as lines depending on ``_detect_arc_mode()``.
+
+    NOTE(review): this docstring previously said new data is merged into an existing ``.coverage.kernel``.
+    The code here writes ``_qd_kcov.<pid>`` and never explicitly reads an existing file — confirm whether
+    ``CoverageData`` itself merges with a pre-existing file of the same name.
+    """
+    _harvest_field()
+
+    with _lock:
+        if not _accumulated_lines:
+            return
+        # Copy under the lock so the file can be written without holding it.
+        snapshot = {f: set(lines) for f, lines in _accumulated_lines.items()}
+
+    base_dir = _coverage_dir or os.getcwd()
+    kernel_path = os.path.join(base_dir, f"_qd_kcov.{os.getpid()}")
+    use_arcs = _detect_arc_mode()
+
+    cov = CoverageData(basename=kernel_path)
+    if use_arcs:
+        arcs_by_file: dict[str, list[tuple[int, int]]] = {}
+        for filepath, lines in snapshot.items():
+            # Emit only entry/exit arcs per line — we know which lines ran but not the actual transitions
+            # between them, so we avoid fabricating inter-line arcs that would misrepresent branch coverage.
+            arcs = []
+            for line in sorted(lines):
+                arcs.append((-1, line))
+                arcs.append((line, -1))
+            arcs_by_file[filepath] = arcs
+        cov.add_arcs(arcs_by_file)
+    else:
+        cov.add_lines({f: sorted(lines) for f, lines in snapshot.items()})
+    cov.write()
+
+
+_capacity_warning_emitted = False
+
+
+class _CoverageASTRewriter(ast.NodeTransformer):
+    """AST transformer that puts a coverage probe before the first statement found on each source line.
+
+    Only statement lists are instrumented: function bodies and the body / else / handler / finally blocks of
+    ``if``, ``for``, ``while``, ``with`` and ``try``. After a visit, ``next_probe_id`` holds the first unused
+    probe id and ``probe_map`` maps every id handed out to ``(filepath, absolute_lineno)``.
+    """
+
+    def __init__(self, field_name: str, filepath: str, start_lineno: int, probe_id_start: int) -> None:
+        self._field_name = field_name
+        self._filepath = filepath
+        self._start_lineno = start_lineno
+        self.next_probe_id = probe_id_start
+        # Absolute line numbers that already received a probe during this rewrite.
+        self._seen_lines: set[int] = set()
+        self.probe_map: dict[int, tuple[str, int]] = {}
+
+    def _make_probe(self, abs_lineno: int, rel_lineno: int, col_offset: int) -> ast.Assign | None:
+        """Build ``<field>[<probe_id>] = 1`` for one line, or return None once capacity is exhausted."""
+        global _capacity_warning_emitted
+        probe_id = self.next_probe_id
+        if probe_id >= _MAX_PROBES:
+            # Warn only once per process, then silently stop adding probes.
+            if not _capacity_warning_emitted:
+                warnings.warn(
+                    f"Kernel coverage probe capacity ({_MAX_PROBES}) exceeded. "
+                    f"Additional kernel lines will not be tracked. "
+                    f"Set QD_COVERAGE_MAX_PROBES to a higher value.",
+                    stacklevel=2,
+                )
+                _capacity_warning_emitted = True
+            return None
+        self.probe_map[probe_id] = (self._filepath, abs_lineno)
+        self.next_probe_id += 1
+        store_target = ast.Subscript(
+            value=ast.Name(id=self._field_name, ctx=ast.Load()),
+            slice=ast.Constant(value=probe_id),
+            ctx=ast.Store(),
+        )
+        # The probe is given the location of the statement it precedes.
+        return ast.Assign(
+            targets=[store_target],
+            value=ast.Constant(value=1),
+            lineno=rel_lineno,
+            col_offset=col_offset,
+            end_lineno=rel_lineno,
+            end_col_offset=col_offset,
+        )
+
+    def _instrument_body(self, stmts: list[ast.stmt]) -> list[ast.stmt]:
+        """Return ``stmts`` with probes inserted and each statement itself visited."""
+        instrumented: list[ast.stmt] = []
+        for stmt in stmts:
+            rel_lineno = getattr(stmt, "lineno", None)
+            abs_lineno = None if rel_lineno is None else rel_lineno + self._start_lineno - 1
+            if abs_lineno is not None and abs_lineno not in self._seen_lines:
+                self._seen_lines.add(abs_lineno)
+                probe = self._make_probe(abs_lineno, rel_lineno, getattr(stmt, "col_offset", 0))
+                if probe is not None:
+                    instrumented.append(probe)
+            instrumented.append(self.visit(stmt))
+        return instrumented
+
+    def _instrument_body_only(self, node):
+        """Instrument ``node.body`` and return the node."""
+        node.body = self._instrument_body(node.body)
+        return node
+
+    def _instrument_body_and_else(self, node):
+        """Instrument ``node.body`` and, when non-empty, ``node.orelse``; return the node."""
+        node.body = self._instrument_body(node.body)
+        if node.orelse:
+            node.orelse = self._instrument_body(node.orelse)
+        return node
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.FunctionDef:
+        return self._instrument_body_only(node)
+
+    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AsyncFunctionDef:
+        return self._instrument_body_only(node)
+
+    def visit_If(self, node: ast.If) -> ast.If:
+        return self._instrument_body_and_else(node)
+
+    def visit_For(self, node: ast.For) -> ast.For:
+        return self._instrument_body_and_else(node)
+
+    def visit_While(self, node: ast.While) -> ast.While:
+        return self._instrument_body_and_else(node)
+
+    def visit_With(self, node: ast.With) -> ast.With:
+        return self._instrument_body_only(node)
+
+    def visit_Try(self, node: ast.Try) -> ast.Try:
+        # Order matters for probe numbering: body, handlers, else, finally.
+        node.body = self._instrument_body(node.body)
+        for handler in node.handlers:
+            handler.body = self._instrument_body(handler.body)
+        if node.orelse:
+            node.orelse = self._instrument_body(node.orelse)
+        if node.finalbody:
+            node.finalbody = self._instrument_body(node.finalbody)
+        return node
+
+
+atexit.register(flush)
diff --git a/python/quadrants/lang/ast/ast_transformer_utils.py b/python/quadrants/lang/ast/ast_transformer_utils.py
index beae3534cb..b65edd2905 100644
--- a/python/quadrants/lang/ast/ast_transformer_utils.py
+++ b/python/quadrants/lang/ast/ast_transformer_utils.py
@@ -332,8 +332,9 @@ def get_var_by_name(self, name: str) -> tuple[bool, Any, str | None]:
             found_name = True
         elif name in self.global_vars:
             var = self.global_vars[name]
-            reason = f"{name} is in global vars, therefore violates pure"
-            violates_pure = True
+            if not name.startswith("_qd_"):
+                reason = f"{name} is in global vars, therefore violates pure"
+                violates_pure = True
             found_name = True
             if self.raise_on_templated_floats and isinstance(var, float):
                 raise ValueError("Not permitted to access floats as global values")
diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py
index 1285add28a..ce99d6164c 100644
--- a/python/quadrants/lang/kernel.py
+++ b/python/quadrants/lang/kernel.py
@@ -16,6 +16,11 @@
 
 _GRAPH_ENABLED = os.environ.get("QD_GRAPH", "1") == "1"
 
+
+def _kernel_coverage_enabled() -> bool:
+    """Return True only when the ``QD_KERNEL_COVERAGE`` environment variable is exactly ``"1"``."""
+    flag = os.environ.get("QD_KERNEL_COVERAGE")
+    return flag == "1"
+
+
 from quadrants._lib.core.quadrants_python import (
     Arch,
     ASTBuilder,
@@ -374,6 +379,11 @@ def materialize(self, key: "CompiledKernelKeyType | None", py_args: tuple[Any, .
         if key in self.materialized_kernels:
             return
 
+        if _kernel_coverage_enabled():
+            from . import _kernel_coverage  # pylint: disable=import-outside-toplevel
+
+            _kernel_coverage.ensure_field_allocated()
+
         with self.runtime.compilation_lock:
             if key in self.materialized_kernels:
                 return
diff --git a/python/quadrants/lang/misc.py b/python/quadrants/lang/misc.py
index 38b29a0408..10cce48894 100644
--- a/python/quadrants/lang/misc.py
+++ b/python/quadrants/lang/misc.py
@@ -493,6 +493,12 @@ def init(
 
     # Recover the current working directory (https://github.com/taichi-dev/taichi/issues/4811)
     os.chdir(current_dir)
+
+    if os.environ.get("QD_KERNEL_COVERAGE") == "1":
+        from . import _kernel_coverage  # pylint: disable=import-outside-toplevel
+
+        _kernel_coverage.ensure_field_allocated()
+
     return None
 
 
diff --git a/python/quadrants/pytest_plugin.py b/python/quadrants/pytest_plugin.py
new file mode 100644
index 0000000000..9e9b6e704b
--- /dev/null
+++ b/python/quadrants/pytest_plugin.py
@@ -0,0 +1,20 @@
+"""Pytest plugin that auto-enables kernel coverage when pytest-cov is active.
+
+Registered via the ``pytest11`` entry point so it loads automatically when quadrants is installed.
+Opt out by setting ``QD_KERNEL_COVERAGE=0`` explicitly.
+"""
+
+import os
+
+
+def pytest_configure(config):
+    """Turn kernel coverage on when the ``_cov`` plugin is registered, unless the user opted out.
+
+    ``QD_KERNEL_COVERAGE`` is only defaulted to ``"1"``; a value already present in the environment is kept.
+    When the resulting value is ``"1"``, ``_QD_KCOV_ARC`` is set to ``"1"`` or ``"0"`` from the ``cov_branch``
+    option.
+    """
+    cov_active = config.pluginmanager.hasplugin("_cov")
+    if cov_active:
+        os.environ.setdefault("QD_KERNEL_COVERAGE", "1")
+        if os.environ.get("QD_KERNEL_COVERAGE") == "1":
+            # Tell the kernel coverage module whether pytest-cov is running in branch (arc) mode,
+            # so it writes the matching format and avoids "Can not mix line and arc data" at combine time.
+            # We read config.option.cov_branch which pytest-cov has already populated by this point.
+            branch_mode = bool(getattr(config.option, "cov_branch", False))
+            os.environ["_QD_KCOV_ARC"] = "1" if branch_mode else "0"
diff --git a/tests/coverage_report.py b/tests/coverage_report.py
new file mode 100644
index 0000000000..5324197bd3
--- /dev/null
+++ b/tests/coverage_report.py
@@ -0,0 +1,460 @@
+#!/usr/bin/env python3
+"""Combine kernel coverage data and generate diff coverage reports.
+
+Run tests first with run_tests.py --coverage, then use this script:
+
+  # Local dev: combine coverage data and generate HTML diff report
+  python tests/coverage_report.py
+
+  # CI: combine coverage data and generate coverage.xml (no diff report)
+  python tests/coverage_report.py --collect-only
+
+  # CI: generate diff report from previously collected coverage.xml files
+  python tests/coverage_report.py --report-only --format markdown \\
+      --coverage-xml coverage-cpu/coverage.xml coverage-cuda/coverage.xml
+"""
+
+import argparse
+import glob
+import html as html_mod
+import os
+import re
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+
+GREEN = "\033[32m"
+RED = "\033[31m"
+DIM = "\033[2m"
+BOLD = "\033[1m"
+RESET = "\033[0m"
+
+
+def _run(cmd, **kwargs):
+    print(f"{DIM}$ {cmd}{RESET}", flush=True)  # echo the command (dimmed) before running it
+    return subprocess.run(cmd, shell=True, cwd=REPO_ROOT, **kwargs)  # cmd is a shell string, run from the repo root
+
+
+def combine_coverage():
+    """Merge pytest-cov's .coverage with the _qd_kcov.* kernel data files; no-op without .coverage."""
+    pytest_data = REPO_ROOT / ".coverage"
+    if pytest_data.exists():
+        # Move pytest-cov's file aside so that it becomes one of the inputs of `coverage combine`.
+        pytest_data.rename(REPO_ROOT / ".coverage.pytest")
+        kernel_names = [os.path.basename(p) for p in glob.glob(str(REPO_ROOT / "_qd_kcov.*"))]
+        outcome = _run("coverage combine " + " ".join([".coverage.pytest"] + kernel_names))
+        if kernel_names and outcome.returncode != 0:
+            # Retry with the pytest-cov data alone when combining with kernel data fails.
+            _run("coverage combine .coverage.pytest")
+
+
+def generate_artifacts():
+    """Write coverage.xml and pytest-coverage.txt (text report with missing lines) from the combined .coverage."""
+    _run("coverage xml -o coverage.xml --ignore-errors")
+    _run("coverage report --show-missing --skip-covered --ignore-errors > pytest-coverage.txt")
+
+
+# ---------------------------------------------------------------------------
+# Report rendering
+# ---------------------------------------------------------------------------
+
+
+class _Renderer:
+    """No-op base renderer; subclasses override the hooks they need."""
+
+    def begin(self, total_hit, total_miss, total_pct):
+        """Called once, before any file, with the overall hit/miss counts and percentage."""
+
+    def begin_file(self, filename, pct, missing):
+        """Called once per reported file with its percentage and its missed line numbers."""
+
+    def write_line(self, lineno, text, status):
+        """Called for every changed line of the current file; status is hit, miss or no_data."""
+
+    def end_file(self):
+        """Called after the last line of the current file."""
+
+    def finish(self):
+        """Called once, after the last file."""
+
+    def output(self):
+        """Return None; the base renderer produces no output object."""
+
+
+class _TerminalRenderer(_Renderer):
+    """ANSI-colored summary: one line per file plus a grand total; 80% is the green/red cutoff."""
+
+    def begin(self, total_hit, total_miss, total_pct):
+        self._total_hit, self._total_miss, self._total_pct = total_hit, total_miss, total_pct
+        print(f"\n{BOLD}Diff Coverage Report{RESET}\n" + "=" * 70)
+
+    def begin_file(self, filename, pct, missing):
+        gaps = f"  Missing: {_format_ranges(missing)}" if missing else ""
+        print(f"  {filename}: {GREEN if pct >= 80 else RED}{pct:.0f}%{RESET}{gaps}")
+
+    def finish(self):
+        print("-" * 70)
+        pct, n_lines = self._total_pct, self._total_hit + self._total_miss
+        color = GREEN if pct >= 80 else RED
+        print(f"  {BOLD}Total: {n_lines} lines, {self._total_miss} missing, {color}{pct:.0f}%{RESET}")
+
+
+class _AnnotatedRenderer(_TerminalRenderer):
+    """Terminal summary first; the per-file annotated listings are buffered and printed at the end."""
+
+    _STATUS_FMT = {"hit": (GREEN, "\u2713"), "miss": (RED, "\u2717"), "no_data": (DIM, " ")}
+
+    def begin(self, total_hit, total_miss, total_pct):
+        super().begin(total_hit, total_miss, total_pct)
+        self._pending: list[tuple[tuple[str, float], list[tuple[int, str, str]]]] = []
+
+    def begin_file(self, filename, pct, missing):
+        super().begin_file(filename, pct, missing)
+        self._header: tuple[str, float] = (filename, pct)
+        self._annotated: list[tuple[int, str, str]] = []
+
+    def write_line(self, lineno, text, status):
+        self._annotated.append((lineno, text, status))
+
+    def end_file(self):
+        if self._annotated:
+            self._pending.append((self._header, self._annotated))
+
+    def finish(self):
+        super().finish()
+        for (filename, pct), annotated in self._pending:
+            print(f"\n{BOLD}=== {filename} ({pct:.0f}%) ==={RESET}")
+            for lineno, text, status in annotated:
+                color, marker = self._STATUS_FMT[status]
+                print(f"{color} {marker} {lineno:4d}{RESET} {color}{text}{RESET}")
+
+
+class _MarkdownRenderer(_Renderer):
+    """Print the report as Markdown on stdout: a summary table, then one collapsible block per file."""
+
+    _STATUS_MARKER = {"hit": "🟢", "miss": "🔴", "no_data": "  "}
+
+    def begin(self, total_hit, total_miss, total_pct):
+        # Tag the heading with the short commit hash; it is left out when git prints nothing.
+        cmd = ["git", "rev-parse", "--short", "HEAD"]
+        commit = subprocess.run(cmd, capture_output=True, text=True, cwd=REPO_ROOT).stdout.strip()
+        heading = f"## Coverage Report (`{commit}`)\n" if commit else "## Coverage Report\n"
+        print(heading)
+        print("| Metric | Value |")
+        print("|--------|-------|")
+        print(f"| **Diff coverage** (changed lines only) | **{total_pct:.0f}%** |")
+        overall = _get_overall_coverage()
+        if overall:
+            print(f"| Overall project coverage | {overall} |")
+        print()
+        print(f"**Total**: {total_hit + total_miss} lines, {total_miss} missing, {total_pct:.0f}% covered\n")
+
+    def begin_file(self, filename, pct, missing):
+        icon = "🟢" if pct >= 80 else "🔴"
+        # The name is emitted inside raw HTML, so escape it (as _HtmlRenderer does) to keep "<" or "&" intact.
+        print(f"<details><summary>{icon} <code>{html_mod.escape(filename)}</code> ({pct:.0f}%)</summary>\n")
+        print("```")
+
+    def write_line(self, lineno, text, status):
+        print(f"{self._STATUS_MARKER[status]} {lineno:4d}  {text}")
+
+    def end_file(self):
+        print("```\n</details>\n")
+
+
+_HTML_CSS = """\
+body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace;
+       max-width: 960px; margin: 2rem auto; padding: 0 1rem; background: #1e1e1e; color: #d4d4d4; }
+h1 { color: #e0e0e0; }
+table.summary { border-collapse: collapse; margin: 1rem 0; }
+table.summary td, table.summary th { padding: 0.4rem 1rem; border: 1px solid #444; }
+table.summary th { background: #2d2d2d; text-align: left; }
+details { margin: 0.5rem 0; }
+summary { cursor: pointer; padding: 0.4rem; background: #2d2d2d; border-radius: 4px; }
+summary:hover { background: #363636; }
+.file-header { font-weight: bold; }
+.pct-good { color: #4ec9b0; }
+.pct-bad { color: #f44747; }
+pre { margin: 0; padding: 0.5rem; background: #1a1a1a; border-radius: 4px; overflow-x: auto;
+      font-size: 13px; line-height: 1.5; }
+.line { display: block; }
+.hit { background: #1e3a1e; }
+.miss { background: #3a1e1e; }
+.no-data { opacity: 0.5; }
+.lineno { display: inline-block; width: 4em; text-align: right; color: #858585;
+          margin-right: 1em; user-select: none; }
+.status { display: inline-block; width: 1.5em; text-align: center; }
+.status-hit { color: #4ec9b0; }
+.status-miss { color: #f44747; }"""
+
+_HTML_STATUS = {  # line status -> (CSS class added to the line <span>, status icon markup)
+    "hit": ("hit", '<span class="status status-hit">&#10003;</span>'),
+    "miss": ("miss", '<span class="status status-miss">&#10007;</span>'),
+    "no_data": ("no-data", '<span class="status"> </span>'),
+}
+
+
+class _HtmlRenderer(_Renderer):
+    def __init__(self, output_path=None):
+        self._parts = []
+        self._out_path = REPO_ROOT / "coverage-report.html" if not output_path else Path(output_path)
+
+    def begin(self, total_hit, total_miss, total_pct):
+        """Start the page: document head with inline CSS, then the summary table."""
+        overall = _get_overall_coverage()
+        tone = "pct-good" if total_pct >= 80 else "pct-bad"
+        page_head = (
+            f'<!DOCTYPE html>\n<html><head><meta charset="utf-8"><title>Diff Coverage Report</title>\n'
+            f"<style>\n{_HTML_CSS}\n</style></head><body>\n<h1>Diff Coverage Report</h1>"
+        )
+        rows = [
+            '<table class="summary"><tr><th>Metric</th><th>Value</th></tr>',
+            f'<tr><td>Diff coverage (changed lines)</td><td class="{tone}"><b>{total_pct:.0f}%</b></td></tr>',
+        ]
+        if overall:
+            rows.append(f"<tr><td>Overall project coverage</td><td>{overall}</td></tr>")
+        rows.append(f"<tr><td>Total lines</td><td>{total_hit + total_miss} ({total_miss} missing)</td></tr></table>")
+        self._parts += [page_head, *rows]
+
+    def begin_file(self, filename, pct, missing):
+        self._line_parts = []
+        tone = "pct-good" if pct >= 80 else "pct-bad"
+        gaps = f" &mdash; missing: {_format_ranges(missing)}" if missing else ""
+        title = f'<span class="file-header">{html_mod.escape(filename)}</span>'
+        badge = f' <span class="{tone}">{pct:.0f}%</span>{gaps}'
+        # The <pre> opened here is closed by end_file().
+        self._parts.append(f"<details><summary>{title}{badge}</summary><pre>")
+
+    def write_line(self, lineno, text, status):
+        css_class, icon = _HTML_STATUS[status]
+        number = f'<span class="lineno">{lineno}</span>'
+        self._line_parts.append(f'<span class="line {css_class}">{number}{icon}{html_mod.escape(text)}</span>')
+
+    def end_file(self):
+        self._parts.append(f"{''.join(self._line_parts)}</pre></details>")
+
+    def finish(self):
+        self._parts += ["</body></html>"]
+        self._out_path.write_text("\n".join(self._parts))
+        print(f"Coverage report written to {self._out_path}")
+
+
+_RENDERERS = {  # --format value -> renderer class; generate_report() looks the class up here
+    "terminal": _TerminalRenderer,
+    "annotated": _AnnotatedRenderer,
+    "markdown": _MarkdownRenderer,
+    "html": _HtmlRenderer,
+}
+
+
+# ---------------------------------------------------------------------------
+# Report generation
+# ---------------------------------------------------------------------------
+
+
+def get_diff_lines(compare_branch):
+    """Return {filename: [(lineno, text)]} for lines added/modified in .py files vs. compare_branch."""
+    result = subprocess.run(
+        ["git", "diff", "-U0", compare_branch],
+        capture_output=True,
+        text=True,
+        cwd=REPO_ROOT,
+    )
+    if result.returncode != 0:  # e.g. unknown ref: report it instead of silently yielding an empty diff
+        print(f"git diff against {compare_branch} failed: {result.stderr.strip()}", file=sys.stderr)
+    diff_lines = {}
+    current_file = None
+    current_lineno = 0
+    in_hunk = False  # file headers ("+++ b/...") only occur before a file's first "@@" line
+    for line in result.stdout.splitlines():
+        if line.startswith("diff --git "):
+            current_file, in_hunk = None, False  # forget the previous file (matters for deletions)
+        elif not in_hunk and line.startswith("+++ b/"):
+            current_file = line[6:]
+        elif line.startswith("@@ "):
+            # Header is "@@ -a[,b] +c[,d] @@"; c is the new-file line number of the first added line.
+            plus_part = [p for p in line.split() if p.startswith("+")][0][1:]
+            current_lineno = int(plus_part.split(",")[0])
+            in_hunk = True
+        elif in_hunk and line.startswith("+"):
+            if current_file and current_file.endswith(".py"):
+                diff_lines.setdefault(current_file, []).append((current_lineno, line[1:]))
+            current_lineno += 1
+        elif in_hunk and not line.startswith(("-", "\\")):
+            current_lineno += 1  # context line (absent with -U0, handled for safety)
+    return diff_lines
+
+
+def get_covered_lines(xml_paths):
+    """Merge coverage.xml files into {filename: {lineno: hits}}; hits for the same line are summed."""
+    merged = {}
+    for path in xml_paths:
+        root = ET.parse(path).getroot()
+        for class_el in root.findall(".//class"):
+            filename = class_el.get("filename")
+            for line_el in class_el.findall(".//line"):
+                number, hits = int(line_el.get("number")), int(line_el.get("hits", 0))
+                # Entries are created lazily, so a <class> without <line> children adds no key.
+                per_file = merged.setdefault(filename, {})
+                per_file[number] = per_file.get(number, 0) + hits
+    return merged
+
+
+def generate_report(compare_branch, coverage_xmls, output_format="terminal", output_path=None):
+    """Render a coverage report restricted to the .py lines added or changed since ``compare_branch``.
+
+    Each changed line is classified as "hit", "miss" or "no_data" (absent from the coverage XML); only
+    hit and miss lines count towards percentages, and files without any such line are left out.
+
+    Args:
+        compare_branch: git revision the working tree is diffed against.
+        coverage_xmls: paths of the coverage.xml files whose hit counts are merged.
+        output_format: key into ``_RENDERERS``.
+        output_path: destination file; only passed on to the renderer when ``output_format`` is "html".
+
+    Returns:
+        Diff coverage as a percentage in [0, 100]. It is 100.0 when no changed line is measurable, so
+        that a diff without measurable Python lines does not count as an uncovered one.
+    """
+    diff_lines = get_diff_lines(compare_branch)
+    coverage = get_covered_lines(coverage_xmls)
+
+    files_report = []
+    total_hit = total_miss = 0
+
+    for filename in sorted(diff_lines):
+        file_cov = coverage.get(filename, {})
+        line_details = []
+        for lineno, text in diff_lines[filename]:
+            hits = file_cov.get(lineno)
+            if hits is None:
+                status = "no_data"
+            elif hits > 0:
+                status = "hit"
+            else:
+                status = "miss"
+            line_details.append((lineno, text, status))
+
+        missing = [ln for ln, _, s in line_details if s == "miss"]
+        hit = sum(1 for _, _, s in line_details if s == "hit")
+        measurable = hit + len(missing)
+        if measurable == 0:
+            continue  # nothing but no_data lines: the file cannot be scored
+
+        total_hit += hit
+        total_miss += len(missing)
+        pct = hit / measurable * 100
+        files_report.append({"filename": filename, "pct": pct, "missing": missing, "lines": line_details})
+
+    total_measurable = total_hit + total_miss
+    # An empty measurable set is vacuously covered; scoring it 0% made main() fail such diffs.
+    total_pct = (total_hit / total_measurable * 100) if total_measurable else 100.0
+
+    renderer_cls = _RENDERERS[output_format]
+    renderer = renderer_cls(output_path=output_path) if output_format == "html" else renderer_cls()
+    renderer.begin(total_hit, total_miss, total_pct)
+    for fr in files_report:
+        renderer.begin_file(fr["filename"], fr["pct"], fr["missing"])
+        for lineno, text, status in fr["lines"]:
+            renderer.write_line(lineno, text, status)
+        renderer.end_file()
+    renderer.finish()
+
+    return total_pct
+
+
+def _get_overall_coverage():
+    """Return the TOTAL percentage (e.g. '87%') found in a pytest-coverage.txt report, or None."""
+    for report in (REPO_ROOT / "pytest-coverage.txt", REPO_ROOT / "coverage-cpu" / "pytest-coverage.txt"):
+        if not report.exists():
+            continue
+        for row in reversed(report.read_text().splitlines()):
+            found = re.search(r"(\d+%)", row) if row.startswith("TOTAL") else None
+            if found:
+                return found.group(1)
+    return None
+
+
+def _format_ranges(numbers):
+    """Collapse runs of consecutive numbers: [1,2,3,5,7,8,9] -> '1-3,5,7-9'."""
+    if not numbers:
+        return ""
+    # Each run is a mutable [first, last] pair; a value extends the latest run only when it is
+    # exactly one more than that run's current last element.
+    runs = [[numbers[0], numbers[0]]]
+    for value in numbers[1:]:
+        if value == runs[-1][1] + 1:
+            runs[-1][1] = value
+        else:
+            runs.append([value, value])
+    # Single-element runs print as "n", longer ones as "first-last".
+    return ",".join(str(lo) if lo == hi else f"{lo}-{hi}" for lo, hi in runs)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Combine kernel coverage data and generate diff coverage reports",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    mode = parser.add_mutually_exclusive_group()  # --collect-only and --report-only cannot be combined
+    mode.add_argument(
+        "--collect-only",
+        action="store_true",
+        help="Combine coverage data and generate coverage.xml, but skip the diff report",
+    )
+    mode.add_argument(
+        "--report-only",
+        action="store_true",
+        help="Skip combining, generate report from existing coverage.xml",
+    )
+
+    parser.add_argument(
+        "--compare-branch",
+        default="origin/main",
+        help="Branch to compare against (default: origin/main)",
+    )
+    parser.add_argument(
+        "--coverage-xml",
+        nargs="+",
+        default=None,
+        help="Path(s) to coverage.xml file(s). Default: coverage.xml in repo root",
+    )
+    parser.add_argument(
+        "--format",
+        dest="output_format",
+        default="html",
+        choices=["html", "terminal", "annotated", "markdown"],
+        help="Output format (default: html)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default=None,
+        help="Output file path for HTML format (default: coverage-report.html in repo root)",
+    )
+
+    args = parser.parse_args()
+
+    if not args.report_only:  # default mode and --collect-only both (re)build coverage.xml first
+        combine_coverage()
+        generate_artifacts()
+
+    if args.collect_only:
+        return 0
+
+    xml_paths = args.coverage_xml or [str(REPO_ROOT / "coverage.xml")]
+    xml_paths = [p for p in xml_paths if os.path.exists(p)]  # paths that do not exist are dropped without notice
+    if not xml_paths:
+        print("No coverage.xml found. Run tests first or specify --coverage-xml.", file=sys.stderr)
+        sys.exit(1)
+
+    total_pct = generate_report(args.compare_branch, xml_paths, args.output_format, output_path=args.output)
+    return 0 if total_pct >= 80 else 1  # exit status: failure when diff coverage is below 80%
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py b/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
index aa60625bcb..711839cf5d 100644
--- a/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
+++ b/tests/python/quadrants/lang/fast_caching/test_src_ll_cache.py
@@ -8,6 +8,8 @@
 import pytest
 
 import quadrants as qd
+
+_KERNEL_COVERAGE = os.environ.get("QD_KERNEL_COVERAGE") == "1"
 import quadrants.lang
 from quadrants._test_tools import qd_init_same_arch
 from quadrants.lang._kernel_types import SrcLlCacheObservations
@@ -62,6 +64,10 @@ def has_pure() -> None:
         assert has_pure._primal._last_compiled_kernel_data._debug_dump_to_string() == last_compiled_kernel_data_str
 
 
+@pytest.mark.skipif(
+    _KERNEL_COVERAGE,
+    reason="Coverage probes change LLVM IR addresses after reinit, breaking recompile comparison",
+)
 @test_utils.test()
 def test_src_ll_cache_with_corruption(tmp_path: pathlib.Path) -> None:
     qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True)
diff --git a/tests/python/quadrants/lang/test_kernel_impl.py b/tests/python/quadrants/lang/test_kernel_impl.py
index ff6c817f33..78e85f2bca 100644
--- a/tests/python/quadrants/lang/test_kernel_impl.py
+++ b/tests/python/quadrants/lang/test_kernel_impl.py
@@ -1,3 +1,4 @@
+import os
 import pathlib
 
 import pytest
@@ -7,7 +8,13 @@
 
 from tests import test_utils
 
+_KERNEL_COVERAGE = os.environ.get("QD_KERNEL_COVERAGE") == "1"
 
+
+@pytest.mark.skipif(
+    _KERNEL_COVERAGE,
+    reason="Coverage probes change the kernel AST, preventing FE-LL cache hits after reinit",
+)
 @test_utils.test()
 def test_fe_ll_observations(tmp_path: pathlib.Path) -> None:
     @qd.kernel
diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index 9b931488e9..763969c553 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -435,5 +435,5 @@ def _get_expected_matrix_apis():
 @test_utils.test(arch=qd.cpu)
 def test_api(src):
     expected = sorted(user_api[src])
-    actual = sorted([s for s in dir(src) if not s.startswith("_")])
+    actual = sorted([s for s in dir(src) if not s.startswith(("_", "@")) and s != "pytest_plugin"])
     assert actual == expected, f"Failed for API={src}:\n  expected={expected}\n  actual={actual}"
diff --git a/tests/python/test_kernel_coverage.py b/tests/python/test_kernel_coverage.py
new file mode 100644
index 0000000000..dca2df3572
--- /dev/null
+++ b/tests/python/test_kernel_coverage.py
@@ -0,0 +1,488 @@
+"""Tests for kernel code coverage instrumentation.
+
+These tests verify that the AST rewriter correctly inserts coverage probes and that the probes fire when kernel
+code executes on the device.
+"""
+
+import ast
+import os
+import textwrap
+
+import pytest
+
+import quadrants as qd
+
+from tests import test_utils
+
+# These tests only run when QD_KERNEL_COVERAGE=1
+pytestmark = pytest.mark.skipif(
+    os.environ.get("QD_KERNEL_COVERAGE", "") != "1",
+    reason="QD_KERNEL_COVERAGE=1 not set",
+)
+
+
+# ---------------------------------------------------------------------------
+# AST rewriter unit tests
+# ---------------------------------------------------------------------------
+
+_AST_REWRITER_CASES = [
+    pytest.param(
+        """\
+        def f():
+            x = 1
+            y = 2
+            return x + y
+        """,
+        {11, 12, 13},
+        10,
+        id="straight_line",
+    ),
+    pytest.param(
+        """\
+        def f():
+            if x > 0:
+                a = 1
+            else:
+                b = 2
+        """,
+        {2, 3, 5},
+        1,
+        id="if_else",
+    ),
+    pytest.param(
+        """\
+        def f():
+            for i in range(10):
+                x = i
+        """,
+        {2, 3},
+        1,
+        id="for_loop",
+    ),
+    pytest.param(
+        """\
+        def f():
+            while x > 0:
+                x = x - 1
+            else:
+                y = 0
+        """,
+        {2, 3, 5},
+        1,
+        id="while_loop_else",
+    ),
+    pytest.param(
+        """\
+        def f():
+            with ctx:
+                a = 1
+                b = 2
+        """,
+        {2, 3, 4},
+        1,
+        id="with_statement",
+    ),
+    pytest.param(
+        """\
+        def f():
+            try:
+                a = 1
+            except:
+                b = 2
+            else:
+                c = 3
+            finally:
+                d = 4
+        """,
+        {3, 5, 7, 9},
+        1,
+        id="try_except_finally",
+    ),
+]
+
+
+@pytest.mark.parametrize("src,expected_lines,start_lineno", _AST_REWRITER_CASES)
+def test_ast_rewriter(src, expected_lines, start_lineno):
+    """Every expected (absolute) source line must appear among the rewriter's probe locations."""
+    from quadrants.lang._kernel_coverage import _CoverageASTRewriter
+
+    rewriter = _CoverageASTRewriter(
+        field_name="_qd_cov", filepath="test.py", start_lineno=start_lineno, probe_id_start=0
+    )
+    rewriter.visit(ast.parse(textwrap.dedent(src)))
+
+    # probe_map values are (filepath, lineno) pairs; extra probed lines are tolerated.
+    covered_lines = {location[1] for location in rewriter.probe_map.values()}
+    assert expected_lines.issubset(covered_lines), f"Expected lines {expected_lines} to be probed, got {covered_lines}"
+
+
+def test_ast_rewriter_capacity_limit():
+    """Verify that probes stop being inserted, with a single warning, once _MAX_PROBES ids are used up."""
+    import warnings
+
+    import quadrants.lang._kernel_coverage as kcov
+    from quadrants.lang._kernel_coverage import _CoverageASTRewriter
+
+    src = textwrap.dedent(
+        """\
+        def f():
+            a = 1
+            b = 2
+            c = 3
+    """
+    )
+    tree = ast.parse(src)
+    old_warning_state = kcov._capacity_warning_emitted  # module-level flag, restored in the finally below
+    kcov._capacity_warning_emitted = False
+    try:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            rewriter = _CoverageASTRewriter(
+                field_name="_qd_cov", filepath="test.py", start_lineno=1, probe_id_start=kcov._MAX_PROBES - 1
+            )
+            rewriter.visit(tree)
+
+        assert rewriter.next_probe_id == kcov._MAX_PROBES
+        assert len(rewriter.probe_map) == 1, f"Only 1 probe should fit, got {len(rewriter.probe_map)}"
+        assert len(w) == 1
+        assert "exceeded" in str(w[0].message).lower()
+    finally:
+        kcov._capacity_warning_emitted = old_warning_state
+
+
+def test_ast_rewriter_deduplicates_same_line():
+    """Verify that two statements on the same source line get only one probe."""
+    from quadrants.lang._kernel_coverage import _CoverageASTRewriter
+
+    src = "def f():\n    a = 1; b = 2\n"  # both assignments sit on line 2
+    tree = ast.parse(src)
+    rewriter = _CoverageASTRewriter(field_name="_qd_cov", filepath="test.py", start_lineno=1, probe_id_start=0)
+    rewriter.visit(tree)
+
+    abs_lines = [lineno for _, (_, lineno) in rewriter.probe_map.items()]  # line number of every probe
+    assert abs_lines.count(2) == 1, f"Line 2 should have exactly one probe, got {abs_lines.count(2)}"
+
+
+def test_env_var_max_probes():
+    """Verify that _MAX_PROBES equals QD_COVERAGE_MAX_PROBES from the environment (100000 when unset)."""
+    import quadrants.lang._kernel_coverage as kcov
+
+    assert kcov._MAX_PROBES == int(os.environ.get("QD_COVERAGE_MAX_PROBES", "100000"))
+
+
+def test_harvest_field_exception_path():
+    """Verify that _harvest_field survives a to_numpy() failure and clears the cached field state."""
+    from unittest.mock import MagicMock
+
+    import quadrants.lang._kernel_coverage as kcov
+
+    old_field = kcov._cov_field
+    old_prog = kcov._cov_field_prog
+    old_map = kcov._probe_map.copy()
+    try:
+        mock_field = MagicMock()
+        mock_field.to_numpy.side_effect = RuntimeError("runtime destroyed")
+        kcov._cov_field = mock_field
+        kcov._cov_field_prog = object()
+        kcov._probe_map[999999] = ("fake.py", 1)
+
+        # Must not raise: the RuntimeError from to_numpy() has to be handled inside _harvest_field()
+        kcov._harvest_field()
+
+        assert kcov._cov_field is None, "Field should be cleared after failure"
+        assert kcov._cov_field_prog is None, "Field prog should be cleared after failure"
+    finally:
+        kcov._cov_field = old_field
+        kcov._cov_field_prog = old_prog
+        kcov._probe_map = old_map  # NOTE: rebinds the module attribute to the snapshot copy taken above
+
+
+# ---------------------------------------------------------------------------
+# End-to-end tests
+# ---------------------------------------------------------------------------
+
+
+@test_utils.test(arch=[qd.cpu, qd.cuda])
+def test_kernel_coverage_branches_e2e():
+    """Verify that only the taken branch has its probe fired."""
+    from quadrants.lang import _kernel_coverage
+
+    _kernel_coverage.ensure_field_allocated()
+
+    probe_count_before = _kernel_coverage._probe_counter  # ids >= this are treated below as this test's probes
+    out = qd.field(dtype=qd.i32, shape=(1,))
+
+    @qd.kernel
+    def branching_kernel():
+        x = 10
+        if x > 5:
+            out[0] = 1
+        else:
+            out[0] = 2
+
+    branching_kernel()
+
+    assert out[0] == 1
+
+    cov_field = _kernel_coverage.get_field()
+    arr = cov_field.to_numpy()  # indexed by probe id; a non-zero entry is read below as "probe fired"
+
+    probes_for_kernel = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_before}
+
+    taken_probes = {pid for pid, loc in probes_for_kernel.items() if arr[pid] != 0}
+    not_taken_probes = {pid for pid, loc in probes_for_kernel.items() if arr[pid] == 0}
+
+    assert len(taken_probes) > 0, "At least some probes should have fired"
+    assert len(not_taken_probes) > 0, "The else branch should not have been reached"
+
+
+@test_utils.test(arch=qd.gpu)
+def test_kernel_coverage_simt_e2e():
+    """Verify coverage probes track branches with block.sync() and subgroup shuffle.
+
+    The if/else is based on a runtime value read from a field, so the compiler cannot constant-fold it away.
+    Only the taken branch's shuffle probe should fire.
+    """
+    from quadrants.lang import _kernel_coverage
+    from quadrants.lang.simt import subgroup
+
+    _kernel_coverage.ensure_field_allocated()
+
+    N = 64
+    probe_count_before = _kernel_coverage._probe_counter  # ids >= this are treated below as this test's probes
+    flag = qd.field(dtype=qd.i32, shape=(1,))
+    a = qd.field(dtype=qd.i32, shape=(N,))
+    out = qd.field(dtype=qd.i32, shape=(N,))
+
+    flag[0] = 1  # runtime value: take the if-branch
+
+    @qd.kernel
+    def simt_kernel():
+        qd.loop_config(block_dim=N)
+        for i in range(N):
+            a[i] = i + 1
+            qd.simt.block.sync()
+            if flag[0] > 0:
+                val = subgroup.shuffle(a[i], qd.u32(0))
+                out[i] = val
+            else:
+                val = subgroup.shuffle(a[i], qd.u32(1))
+                out[i] = val + 100
+
+    simt_kernel()
+
+    # NOTE(review): only the first 4 entries are checked, presumably a safe lower bound on subgroup width; confirm
+    for i in range(4):
+        assert out[i] == 1, f"Expected 1 at index {i}, got {out[i]}"
+
+    cov_field = _kernel_coverage.get_field()
+    arr = cov_field.to_numpy()
+    probes_for_kernel = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_before}
+
+    fired = {pid for pid in probes_for_kernel if arr[pid] != 0}
+    not_fired = {pid for pid in probes_for_kernel if arr[pid] == 0}
+    assert len(fired) >= 4, f"Expected at least 4 probes to fire, got {len(fired)}"
+    assert len(not_fired) >= 2, "The else branch should not have been reached"
+
+
+@test_utils.test(arch=[qd.cpu, qd.cuda])
+def test_kernel_coverage_survives_reinit():
+    """Verify that coverage data accumulated before qd.init() reset is preserved.
+
+    Runs a kernel, then resets via qd.reset()/qd.init() (which triggers the _hooked_clear harvest), runs another
+    kernel, harvests again, and checks that _accumulated_lines contains data from both sessions.
+    """
+    from quadrants.lang import _kernel_coverage, impl
+
+    current_arch = impl.get_runtime()._arch  # remembered so that qd.init() below uses the same arch
+    _kernel_coverage.ensure_field_allocated()
+
+    probe_count_before = _kernel_coverage._probe_counter
+    out1 = qd.field(dtype=qd.i32, shape=(1,))
+
+    @qd.kernel
+    def kernel_before_reset():
+        out1[0] = 1
+
+    kernel_before_reset()
+
+    cov_field = _kernel_coverage.get_field()
+    assert cov_field is not None
+    arr = cov_field.to_numpy()
+    probes_first = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_before}
+    fired_first = {pid for pid in probes_first if arr[pid] != 0}
+    assert len(fired_first) > 0, "Probes from first kernel should have fired"
+
+    # Don't call _harvest_field() manually — let qd.reset() trigger it via the _hooked_clear hook
+    qd.reset()
+
+    # Verify the hook harvested data from the first session (snapshot it for the comparison below)
+    files_before = set(_kernel_coverage._accumulated_lines.keys())
+    assert len(files_before) > 0, "Hook should have harvested data during reset"
+    lines_before = {}
+    for f, lines in _kernel_coverage._accumulated_lines.items():
+        lines_before[f] = set(lines)
+
+    qd.init(arch=current_arch)
+
+    _kernel_coverage.ensure_field_allocated()
+
+    probe_count_mid = _kernel_coverage._probe_counter
+    out2 = qd.field(dtype=qd.i32, shape=(1,))
+
+    @qd.kernel
+    def kernel_after_reset():
+        out2[0] = 2
+
+    kernel_after_reset()
+
+    _kernel_coverage._harvest_field()  # the second session is harvested explicitly; no reset follows
+
+    for f in files_before:
+        assert (
+            f in _kernel_coverage._accumulated_lines
+        ), f"File {f} from before reset should still be in _accumulated_lines"
+        assert lines_before[f].issubset(
+            _kernel_coverage._accumulated_lines[f]
+        ), "Lines from before reset should be preserved"
+
+    probes_second = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_mid}
+    second_files = {loc[0] for loc in probes_second.values()}
+    for f in second_files:
+        assert f in _kernel_coverage._accumulated_lines, f"File {f} from second kernel should be in _accumulated_lines"
+
+
+@test_utils.test(arch=[qd.cpu, qd.cuda])
+def test_kernel_coverage_autodiff():
+    """Verify that the forward pass run under qd.ad.Tape inserts probes that fire.
+
+    The forward compilation (AutodiffMode.NONE) should insert probes. NOTE(review): the backward compilation
+    (AutodiffMode.REVERSE) is meant to add none, but nothing below asserts that; only gradients are checked.
+    """
+    from quadrants.lang import _kernel_coverage
+
+    _kernel_coverage.ensure_field_allocated()
+
+    x = qd.field(dtype=qd.f32, shape=(), needs_grad=True)
+    loss = qd.field(dtype=qd.f32, shape=(), needs_grad=True)
+
+    @qd.kernel
+    def compute():
+        loss[None] = x[None] * x[None]
+
+    x[None] = 5.0
+
+    probe_count_before = _kernel_coverage._probe_counter
+
+    with qd.ad.Tape(loss):
+        compute()
+
+    probe_count_after_tape = _kernel_coverage._probe_counter
+    forward_probes = probe_count_after_tape - probe_count_before
+    assert forward_probes > 0, "Forward compilation should have inserted probes"
+
+    # Verify forward probes actually fired
+    cov_field = _kernel_coverage.get_field()
+    assert cov_field is not None
+    arr = cov_field.to_numpy()
+    probes = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_before}
+    fired = {pid for pid in probes if arr[pid] != 0}
+    assert len(fired) > 0, "Forward pass inside Tape should produce fired coverage probes"
+
+    # Verify backward pass computes correct gradients: loss = x * x, so d(loss)/dx = 2 * x = 10 at x = 5
+    assert loss[None] == pytest.approx(25.0)
+    assert x.grad[None] == pytest.approx(10.0)
+
+
+@test_utils.test(arch=[qd.cpu, qd.cuda])
+def test_kernel_coverage_qd_func():
+    """Verify that probes fire inside a @qd.func called from a kernel."""
+    from quadrants.lang import _kernel_coverage
+
+    _kernel_coverage.ensure_field_allocated()
+
+    probe_count_before = _kernel_coverage._probe_counter  # ids >= this are treated below as this test's probes
+    out = qd.field(dtype=qd.i32, shape=(1,))
+
+    @qd.func
+    def helper():
+        out[0] = 99
+
+    @qd.kernel
+    def caller():
+        helper()
+
+    caller()
+
+    assert out[0] == 99
+
+    cov_field = _kernel_coverage.get_field()
+    assert cov_field is not None
+    arr = cov_field.to_numpy()
+
+    probes = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_before}
+    fired = {pid for pid in probes if arr[pid] != 0}
+    # The kernel body has one statement (helper()), and the func body has one (out[0] = 99).
+    # Both should produce probes that fire, hence the lower bound of 2.
+    assert (
+        len(fired) >= 2
+    ), f"Expected probes from both kernel and func to fire, got {len(fired)} fired out of {len(probes)}"
+
+
+@test_utils.test(arch=[qd.cpu, qd.cuda])
+def test_kernel_coverage_multiple_kernels_same_session():
+    """Verify that probes from two different kernels both fire in the same session."""
+    from quadrants.lang import _kernel_coverage
+
+    _kernel_coverage.ensure_field_allocated()
+
+    probe_count_before = _kernel_coverage._probe_counter
+    a = qd.field(dtype=qd.i32, shape=(1,))
+    b = qd.field(dtype=qd.i32, shape=(1,))
+
+    @qd.kernel
+    def kernel_a():
+        a[0] = 10
+
+    @qd.kernel
+    def kernel_b():
+        b[0] = 20
+
+    kernel_a()
+    probe_count_after_a = _kernel_coverage._probe_counter  # used below as the kernel_a / kernel_b id boundary
+    kernel_b()
+
+    assert a[0] == 10
+    assert b[0] == 20
+
+    cov_field = _kernel_coverage.get_field()
+    arr = cov_field.to_numpy()
+
+    probes_a = {
+        pid: loc for pid, loc in _kernel_coverage._probe_map.items() if probe_count_before <= pid < probe_count_after_a
+    }
+    probes_b = {pid: loc for pid, loc in _kernel_coverage._probe_map.items() if pid >= probe_count_after_a}
+
+    fired_a = {pid for pid in probes_a if arr[pid] != 0}
+    fired_b = {pid for pid in probes_b if arr[pid] != 0}
+
+    assert len(fired_a) > 0, "Probes from kernel_a should have fired"
+    assert len(fired_b) > 0, "Probes from kernel_b should have fired"
+
+
+@test_utils.test(arch=[qd.cpu, qd.cuda])
+def test_qd_prefix_exemption_pure_kernel():
+    """Verify that _qd_-prefixed globals don't violate pure kernel checks.
+
+    With kernel coverage enabled, _qd_cov is injected as a global. This test verifies that a pure (fastcache)
+    kernel still compiles without error. The kernel uses ndarray arguments (not global fields) because pure
+    kernels prohibit non-_qd_ globals.
+    """
+    a = qd.ndarray(qd.i32, (1,))
+
+    @qd.kernel(fastcache=True)
+    def pure_kernel(arr: qd.types.NDArray) -> None:
+        arr[0] = 42
+
+    pure_kernel(a)  # compiling and running without an error is the actual check
+    assert a[0] == 42
diff --git a/tests/python/test_offline_cache.py b/tests/python/test_offline_cache.py
index 848c15dbba..36091b77bc 100644
--- a/tests/python/test_offline_cache.py
+++ b/tests/python/test_offline_cache.py
@@ -13,6 +13,13 @@
 
 from tests import test_utils
 
+# Coverage field allocation creates internal fill kernels that change cache file counts.
+# CI runs these tests in a separate phase without QD_KERNEL_COVERAGE (see 4_test.sh).
+pytestmark = pytest.mark.skipif(
+    os.environ.get("QD_KERNEL_COVERAGE") == "1",
+    reason="Kernel coverage adds internal kernels that invalidate cache file count assertions",
+)
+
 OFFLINE_CACHE_TEMP_DIR = pathlib.Path(mkdtemp())
 atexit.register(lambda: shutil.rmtree(OFFLINE_CACHE_TEMP_DIR))
 
diff --git a/tests/python/test_snode_layout_inspection.py b/tests/python/test_snode_layout_inspection.py
index 5877d0fe66..c2afd8992b 100644
--- a/tests/python/test_snode_layout_inspection.py
+++ b/tests/python/test_snode_layout_inspection.py
@@ -1,8 +1,16 @@
+import os
+
+import pytest
+
 import quadrants as qd
 
 from tests import test_utils
 
 
+@pytest.mark.skipif(
+    os.environ.get("QD_KERNEL_COVERAGE") == "1",
+    reason="Kernel coverage field on root shifts offset assertions",
+)
 @test_utils.test(arch=qd.cpu)
 def test_primitives():
     x = qd.field(dtype=qd.i16)
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 9a144771b7..da7df93f9a 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -37,7 +37,11 @@ def _test_python(args, default_dir="python"):
         pytest_args += ["--reruns", args.rerun]
     try:
         if args.coverage:
-            pytest_args += ["--cov-branch", "--cov=python/quadrants"]
+            os.environ.setdefault("QD_KERNEL_COVERAGE", "1")
+            import quadrants as _qd
+
+            _cov_src = os.path.dirname(_qd.__file__)
+            pytest_args += ["--cov-branch", f"--cov={_cov_src}", f"--cov={test_dir}"]
         if args.cov_append:
             pytest_args += ["--cov-append"]
         if args.keys: