From 026a7af6a5ec159959bee63e7cebd82bafb33e2a Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Thu, 23 Apr 2026 14:51:33 -0700
Subject: [PATCH 01/11] fix: drop taichi xdist fork, use stock pytest-xdist +
 pytest-timeout

The taichi fork of pytest-xdist used os._exit(0) in its retire() method,
which killed workers before the execnet channel could flush the test
failure report. This made all error messages invisible when running
with multiple xdist workers.

Stock xdist >= 3.4 already handles worker crashes correctly: it
preserves the failure report, restarts the worker, and displays full
error details in the terminal summary. The conftest hook now just calls
os._exit(1) after a failure, and stock xdist does the rest.

Changes:
- Replace taichi xdist fork with stock pytest-xdist >= 3.7 + pytest-timeout
- Simplify conftest pytest_runtest_logreport hook (no fork-specific APIs)
- Remove requirements_test_xdist.txt and all CI references to it
- Remove pytest_rerunfailures.works_with_current_xdist version hack
- Replace pytest_hardtle with pytest-timeout in run_tests.py
---
 .github/workflows/scripts_new/linux/4_test.sh |  1 -
 .../workflows/scripts_new/macosx/4_test.sh    |  1 -
 .../scripts_new/manylinux_wheel/5_test.sh     |  1 -
 .github/workflows/scripts_new/win/3_test.ps1  |  1 -
 .github/workflows/test_gpu.yml                |  3 --
 pyproject.toml                                |  4 +--
 requirements_test_xdist.txt                   |  6 ----
 tests/python/conftest.py                      | 30 +++++--------------
 tests/run_tests.py                            |  2 --
 9 files changed, 10 insertions(+), 39 deletions(-)
 delete mode 100644 requirements_test_xdist.txt

diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh
index b707ff68d5..829f2e2bef 100644
--- a/.github/workflows/scripts_new/linux/4_test.sh
+++ b/.github/workflows/scripts_new/linux/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
 ./build/quadrants_cpp_tests  --gtest_filter=-AMDGPU.*
 
diff --git a/.github/workflows/scripts_new/macosx/4_test.sh b/.github/workflows/scripts_new/macosx/4_test.sh
index 4fc44bf330..53ca92f165 100644
--- a/.github/workflows/scripts_new/macosx/4_test.sh
+++ b/.github/workflows/scripts_new/macosx/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --prefer-binary --group test
-pip install -r requirements_test_xdist.txt
 find . -name '*.bc'
 ls -lh build/
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
diff --git a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
index 7d8215d9a8..fa92bc533c 100644
--- a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
+++ b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 # Phase 1: run all tests except torch-dependent ones
 python tests/run_tests.py -v -r 3 -m "not needs_torch"
diff --git a/.github/workflows/scripts_new/win/3_test.ps1 b/.github/workflows/scripts_new/win/3_test.ps1
index 597da17b56..1678204f1f 100644
--- a/.github/workflows/scripts_new/win/3_test.ps1
+++ b/.github/workflows/scripts_new/win/3_test.ps1
@@ -6,7 +6,6 @@ python -c 'import gstaichi as ti; ti.init();'
 $env:QD_LIB_DIR="python/gstaichi/_lib/runtime"
 Get-ChildItem -Path build -Recurse
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 # Phase 1: run all tests except torch-dependent ones
 python .\tests\run_tests.py -v -r 3 -m "not needs_torch"
diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml
index c16c3cf748..388c3bd369 100644
--- a/.github/workflows/test_gpu.yml
+++ b/.github/workflows/test_gpu.yml
@@ -113,7 +113,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch cuda -m "not needs_torch"
@@ -149,7 +148,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 3 -v --arch vulkan -m "not needs_torch"
@@ -180,7 +178,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           export QD_AMDGPU_V520=1
diff --git a/pyproject.toml b/pyproject.toml
index a0a6223d49..37f4301c4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,8 +76,6 @@ dev = [
     "ruamel.yaml",
 ]
 test = [
-    # You also need to:
-    # pip install -r requirements_test_xdist.txt
     "Pillow",
     "pytest",
     # 16.0 upgrade broke xfail, caused fatal errors, see
@@ -85,6 +83,8 @@ test = [
     "pytest-rerunfailures<16.0",
     "pytest-cov",
     "pytest-retry",
+    "pytest-xdist>=3.7.0",
+    "pytest-timeout",
     "numpy>=2.0.0",  # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13
     "psutil",
     "autograd",
diff --git a/requirements_test_xdist.txt b/requirements_test_xdist.txt
deleted file mode 100644
index 12b246a1bc..0000000000
--- a/requirements_test_xdist.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# This URL format is incompatible with pyproject.toml
-# taichi created a fork of pytest-xdist, to handle restarting gpu
-# workers when they crash, in order to reset the GPU state.
-# TODO: come up with some approach that is compatible with pyproject.toml
-# Ticketed here: https://linear.app/genesis-ai-company/issue/CMP-141/clean-up-requirements-test-xdisttxt
-git+https://github.com/taichi-dev/pytest-xdist@a3b5ad3038#egg=pytest-xdist
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index ba2f621e58..69c826388b 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -1,17 +1,11 @@
 import gc
+import os
 import sys
 
 import pytest
 
-# rerunfailures use xdist version number to determine if it is compatible
-# but we are using a forked version of xdist(with git hash as it's version),
-# so we need to override it
-import pytest_rerunfailures
-
 import quadrants as qd
 
-pytest_rerunfailures.works_with_current_xdist = lambda: True
-
 
 @pytest.fixture(autouse=True)
 def run_gc_after_test():
@@ -86,28 +80,20 @@ def pytest_generate_tests(metafunc):
 @pytest.hookimpl(trylast=True)
 def pytest_runtest_logreport(report):
     """
-    Intentionally crash test workers when a test fails.
-    This is to avoid the failing test leaving a corrupted GPU state for the
-    following tests.
+    Kill the xdist worker process after a test failure so it restarts with
+    clean GPU state.  Stock xdist (>= 3.4) automatically restarts the worker
+    and preserves the failure report for the terminal summary.
     """
+    if not hasattr(pytest, "version_tuple"):
+        return
 
-    interactor = getattr(sys, "xdist_interactor", None)
-    if not interactor:
-        # not running under xdist, or xdist is not active,
-        # or using stock xdist (we need a customized version)
+    if not os.environ.get("PYTEST_XDIST_WORKER"):
         return
 
     if report.outcome not in ("rerun", "error", "failed"):
         return
 
-    layoff = False
-
-    for _, loc, _ in report.longrepr.chain:
-        if "CUDA_ERROR_OUT_OF_MEMORY" in loc.message:
-            layoff = True
-            break
-
-    interactor.retire(layoff=layoff)
+    os._exit(1)
 
 
 import importlib
diff --git a/tests/run_tests.py b/tests/run_tests.py
index a454003002..ed90b64fe0 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -51,8 +51,6 @@ def _test_python(args, default_dir="python"):
         if args.timeout > 0:
             pytest_args += [
                 "--durations=15",
-                "-p",
-                "pytest_hardtle",
                 f"--timeout={args.timeout}",
             ]
     except AttributeError:

From 8c83491dc164673dca88a8cc1e6ba89ef44adeba Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Thu, 23 Apr 2026 22:20:12 -0700
Subject: [PATCH 02/11] Keep pytest_hardtle for hard-kill timeouts on native
 GPU hangs

Stock pytest-timeout uses Python-level signals that can't interrupt
native CUDA/HIP kernel hangs. pytest_hardtle compiles a C watchdog
via CFFI that calls _exit(1) from a signal handler, which always works.

Restore `-p pytest_hardtle` (with `-p no:timeout` to suppress stock
pytest-timeout) and drop pytest-timeout from pyproject.toml.
---
 pyproject.toml     | 2 +-
 tests/run_tests.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 37f4301c4b..68d798e72a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,7 +84,7 @@ test = [
     "pytest-cov",
     "pytest-retry",
     "pytest-xdist>=3.7.0",
-    "pytest-timeout",
+
     "numpy>=2.0.0",  # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13
     "psutil",
     "autograd",
diff --git a/tests/run_tests.py b/tests/run_tests.py
index ed90b64fe0..36059493cc 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -51,6 +51,10 @@ def _test_python(args, default_dir="python"):
         if args.timeout > 0:
             pytest_args += [
                 "--durations=15",
+                "-p",
+                "no:timeout",
+                "-p",
+                "pytest_hardtle",
                 f"--timeout={args.timeout}",
             ]
     except AttributeError:

From 76672b1d86ffe908d14e80847acb3cda3a3a31e7 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Thu, 23 Apr 2026 22:35:05 -0700
Subject: [PATCH 03/11] Add docstring to pytest_hardtle explaining why it
 exists

Documents why we use a CFFI-based hard-kill timeout instead of stock
pytest-timeout, and why the two cannot be loaded simultaneously.
---
 tests/pytest_hardtle.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/pytest_hardtle.py b/tests/pytest_hardtle.py
index 68b6bf72e4..97f65d6ef0 100644
--- a/tests/pytest_hardtle.py
+++ b/tests/pytest_hardtle.py
@@ -1,4 +1,14 @@
 # -*- coding: utf-8 -*-
+#
+# Hard-kill timeout plugin (drop-in replacement for pytest-timeout).
+# Uses CFFI to compile a native C watchdog that calls _exit(1) on timeout.
+# Unlike stock pytest-timeout (which uses Python-level SIGALRM handlers),
+# this can kill tests that hang inside native CUDA/HIP kernel calls or
+# C extensions that don't release the GIL.
+#
+# Stock pytest-timeout must be suppressed (`-p no:timeout`) when this
+# plugin is loaded, because both register the same hook specs and pytest
+# will raise a ValueError on the duplicate.
 
 # -- stdlib --
 import importlib

From f82637749e871b1ae90c05563ed1300940e8ef97 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Thu, 23 Apr 2026 23:29:04 -0700
Subject: [PATCH 04/11] Address review feedback: remove dead guard, add
 clarifying comments

- Remove always-true `hasattr(pytest, "version_tuple")` guard in
  conftest.py (pytest >= 7.0 is guaranteed by dependency constraints).
- Add comments in run_tests.py explaining why we suppress stock
  pytest-timeout and why pytest_hardtle is preferred (CFFI-based
  native signal handler can kill GIL-held native GPU hangs).
---
 tests/python/conftest.py | 3 ---
 tests/run_tests.py       | 6 ++++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 69c826388b..4a6b85916e 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -84,9 +84,6 @@ def pytest_runtest_logreport(report):
     clean GPU state.  Stock xdist (>= 3.4) automatically restarts the worker
     and preserves the failure report for the terminal summary.
     """
-    if not hasattr(pytest, "version_tuple"):
-        return
-
     if not os.environ.get("PYTEST_XDIST_WORKER"):
         return
 
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 36059493cc..06227f836e 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -51,8 +51,14 @@ def _test_python(args, default_dir="python"):
         if args.timeout > 0:
             pytest_args += [
                 "--durations=15",
+                # Suppress stock pytest-timeout if installed — it conflicts
+                # with pytest_hardtle (both register the same hook specs).
                 "-p",
                 "no:timeout",
+                # pytest_hardtle uses a CFFI-compiled C watchdog that calls
+                # _exit(1) from a native signal handler, so it can kill tests
+                # hung in native GPU calls even when the GIL is held.
+                # Stock pytest-timeout's signal method cannot do this.
                 "-p",
                 "pytest_hardtle",
                 f"--timeout={args.timeout}",

From 431197766c4032c31f397e0dcfd81b7a1d50a263 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Fri, 24 Apr 2026 00:19:37 -0700
Subject: [PATCH 05/11] Fix double-counted failures and crash-cap shutdown with
 stock xdist

Stock xdist treats os._exit(1) as a worker crash, which causes:
1. A synthetic "worker crashed" report duplicating the real failure
2. Each exit counting toward --max-worker-restart, eventually shutting
   down the session and silently dropping remaining tests

Fix both by:
- Adding a pytest_handlecrashitem hook (firstresult=True) that marks
  the synthetic crash report as passed, since the real report was
  already sent before the worker exited
- Setting --max-worker-restart=999999 so intentional worker kills
  don't trigger xdist's crash-cap shutdown
---
 tests/python/conftest.py | 20 ++++++++++++++++++--
 tests/run_tests.py       | 11 ++++++++++-
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 4a6b85916e..4631434dc3 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -81,8 +81,10 @@ def pytest_generate_tests(metafunc):
 def pytest_runtest_logreport(report):
     """
     Kill the xdist worker process after a test failure so it restarts with
-    clean GPU state.  Stock xdist (>= 3.4) automatically restarts the worker
-    and preserves the failure report for the terminal summary.
+    clean GPU state.  The real test report is sent by xdist's own hook
+    (which runs before this trylast hook) before we exit.  The controller's
+    pytest_handlecrashitem hook below suppresses the synthetic "worker
+    crashed" duplicate.
     """
     if not os.environ.get("PYTEST_XDIST_WORKER"):
         return
@@ -93,6 +95,20 @@ def pytest_runtest_logreport(report):
     os._exit(1)
 
 
+def pytest_handlecrashitem(crashitem, report, sched):
+    """Suppress the synthetic 'worker crashed while running ...' report.
+
+    When pytest_runtest_logreport above kills a worker via os._exit(1),
+    stock xdist treats it as a crash and synthesizes a duplicate failure
+    report.  The real report was already sent before the exit, so we
+    mark the synthetic one as passed to keep it out of the failure summary.
+    This hook is firstresult=True in xdist, so returning here prevents
+    the default handler from running.
+    """
+    report.outcome = "passed"
+    report.longrepr = None
+
+
 import importlib
 import sys
 
diff --git a/tests/run_tests.py b/tests/run_tests.py
index 06227f836e..f477021e00 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -81,7 +81,16 @@ def _test_python(args, default_dir="python"):
         print(f"Due to how pytest-xdist is implemented, the -s option does not work with multiple thread...")
     else:
         if int(threads) > 1:
-            pytest_args += ["-n", str(threads), "--dist=worksteal"]
+            # We intentionally kill workers on test failure (see conftest.py)
+            # to reset GPU state.  Stock xdist counts each kill toward
+            # --max-worker-restart and shuts down the session when the cap is
+            # reached, so we set a very high cap to prevent that.
+            pytest_args += [
+                "-n",
+                str(threads),
+                "--dist=worksteal",
+                "--max-worker-restart=999999",
+            ]
     import pytest  # pylint: disable=C0415
 
     return int(pytest.main(pytest_args))

From fbd787f30d3dbe1012d5a547db7063e40a4bf9af Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Fri, 24 Apr 2026 00:29:21 -0700
Subject: [PATCH 06/11] Add tests for xdist worker retirement hooks

Uses pytester to verify in a subprocess that:
- pytest_handlecrashitem suppresses synthetic "worker crashed" duplicates
  (failures counted exactly once, not doubled)
- --max-worker-restart=999999 prevents premature session shutdown when
  many tests fail and workers are intentionally killed

Also fix pytest_handlecrashitem to set report.when="teardown" so the
suppressed crash report doesn't inflate the passed count.
---
 tests/python/conftest.py              |   1 +
 tests/test_xdist_worker_retirement.py | 124 ++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 tests/test_xdist_worker_retirement.py

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 4631434dc3..925ff4b32b 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -106,6 +106,7 @@ def pytest_handlecrashitem(crashitem, report, sched):
     the default handler from running.
     """
     report.outcome = "passed"
+    report.when = "teardown"
     report.longrepr = None
 
 
diff --git a/tests/test_xdist_worker_retirement.py b/tests/test_xdist_worker_retirement.py
new file mode 100644
index 0000000000..960a6d120a
--- /dev/null
+++ b/tests/test_xdist_worker_retirement.py
@@ -0,0 +1,124 @@
+"""Tests for the xdist worker retirement hooks in conftest.py.
+
+Verifies that when a worker is killed via os._exit(1) after a test failure:
+1. Failures are not double-counted (no synthetic "worker crashed" report)
+2. The session completes even with many failures (--max-worker-restart cap
+   does not trigger premature shutdown)
+
+These tests use pytester to run pytest-xdist in a subprocess, so they do
+not require GPU hardware.
+"""
+
+import pytest
+
+pytest_plugins = ["pytester"]
+
+SUBPROCESS_ARGS = [
+    "-p",
+    "no:retry",
+    "-p",
+    "no:rerunfailures",
+    "-p",
+    "no:nbmake",
+    "-p",
+    "no:timeout",
+    "-p",
+    "no:cacheprovider",
+    "-o",
+    "addopts=",
+]
+
+
+@pytest.fixture
+def xdist_project(pytester):
+    """Write a minimal conftest that reproduces our worker-retirement hooks."""
+    pytester.makeconftest(
+        """
+        import os
+        import pytest
+
+        @pytest.hookimpl(trylast=True)
+        def pytest_runtest_logreport(report):
+            if not os.environ.get("PYTEST_XDIST_WORKER"):
+                return
+            if report.outcome not in ("error", "failed"):
+                return
+            os._exit(1)
+
+        def pytest_handlecrashitem(crashitem, report, sched):
+            report.outcome = "passed"
+            report.when = "teardown"
+            report.longrepr = None
+        """
+    )
+    return pytester
+
+
+class TestNoDuplicateFailures:
+    def test_single_failure_counted_once(self, xdist_project):
+        """A single failing test should appear exactly once in the summary."""
+        xdist_project.makepyfile(
+            """
+            def test_pass():
+                pass
+
+            def test_fail():
+                assert False, "intentional failure"
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v"
+        )
+        result.assert_outcomes(passed=1, failed=1)
+
+    def test_multiple_failures_counted_correctly(self, xdist_project):
+        """Each failing test should be counted exactly once."""
+        xdist_project.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("i", range(4))
+            def test_fail(i):
+                assert False, f"failure {i}"
+
+            def test_pass():
+                pass
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n",
+            "2",
+            "--dist=worksteal",
+            "--max-worker-restart=999999",
+            *SUBPROCESS_ARGS,
+            "-v",
+        )
+        result.assert_outcomes(passed=1, failed=4)
+
+
+class TestSessionCompletesWithManyFailures:
+    def test_no_premature_shutdown(self, xdist_project):
+        """With a high --max-worker-restart, all tests should run even if many fail."""
+        xdist_project.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("i", range(20))
+            def test_fail(i):
+                assert False, f"failure {i}"
+
+            @pytest.mark.parametrize("i", range(5))
+            def test_pass(i):
+                pass
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n",
+            "2",
+            "--dist=worksteal",
+            "--max-worker-restart=999999",
+            *SUBPROCESS_ARGS,
+            "-v",
+        )
+        result.assert_outcomes(passed=5, failed=20)
+        assert "maximum crashed workers reached" not in result.stdout.str()

From 2353f511c13e8491b4da3592306478b2683c9abb Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Fri, 24 Apr 2026 00:34:18 -0700
Subject: [PATCH 07/11] style: fix black formatting in
 test_xdist_worker_retirement.py

Made-with: Cursor
---
 tests/test_xdist_worker_retirement.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_xdist_worker_retirement.py b/tests/test_xdist_worker_retirement.py
index 960a6d120a..30730d8b4f 100644
--- a/tests/test_xdist_worker_retirement.py
+++ b/tests/test_xdist_worker_retirement.py
@@ -66,9 +66,7 @@ def test_fail():
                 assert False, "intentional failure"
             """
         )
-        result = xdist_project.runpytest_subprocess(
-            "-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v"
-        )
+        result = xdist_project.runpytest_subprocess("-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v")
         result.assert_outcomes(passed=1, failed=1)
 
     def test_multiple_failures_counted_correctly(self, xdist_project):

From b135892ed8e9488a10b817a698d7e757729b8112 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 27 Apr 2026 13:55:55 -0700
Subject: [PATCH 08/11] [Test] Fix pytest_handlecrashitem to distinguish
 intentional exits from genuine crashes

The previous implementation blindly suppressed all crash reports, causing:
1. Double-counted failures (1 failed + 1 passed for the same test)
2. Genuine crashes (segfaults, OOM) silently marked as passed

Use a marker-file protocol: workers write a marker before os._exit(1),
and the controller only suppresses crash reports when a marker is found.
A wrapper hookimpl on pytest_runtest_logreport swallows the suppressed
synthetic report so it never reaches the terminal reporter or stats.
---
 tests/python/conftest.py              | 98 +++++++++++++++++++++------
 tests/test_xdist_worker_retirement.py | 51 +++++++++++---
 2 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index 2fbe4c6345..bf189aa5fc 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -1,6 +1,7 @@
 import gc
 import os
 import sys
+import tempfile
 
 import pytest
 
@@ -112,37 +113,90 @@ def pytest_generate_tests(metafunc):
         metafunc.parametrize("req_arch,req_options", [(None, None)], ids=["none"])
 
 
-@pytest.hookimpl(trylast=True)
-def pytest_runtest_logreport(report):
-    """
-    Kill the xdist worker process after a test failure so it restarts with
-    clean GPU state.  The real test report is sent by xdist's own hook
-    (which runs before this trylast hook) before we exit.  The controller's
-    pytest_handlecrashitem hook below suppresses the synthetic "worker
-    crashed" duplicate.
+def _exit_marker_dir():
+    """Temp directory shared between xdist controller and workers for intentional-exit markers."""
+    return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR")
+
+
+def pytest_configure(config):
+    """On the xdist controller, create a temp directory for intentional-exit markers.
+
+    Workers inherit the ``_QD_XDIST_EXIT_MARKER_DIR`` env var and use the same directory.
     """
-    if not os.environ.get("PYTEST_XDIST_WORKER"):
+    if os.environ.get("PYTEST_XDIST_WORKER"):
+        return
+    if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"):
         return
+    d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}")
+    os.makedirs(d, exist_ok=True)
+    os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d
 
-    if report.outcome not in ("rerun", "error", "failed"):
+
+def pytest_unconfigure(config):
+    """Clean up the marker directory at session end."""
+    if os.environ.get("PYTEST_XDIST_WORKER"):
         return
+    d = _exit_marker_dir()
+    if d and os.path.isdir(d):
+        import shutil
+
+        shutil.rmtree(d, ignore_errors=True)
+
+
+@pytest.hookimpl(wrapper=True, tryfirst=True)
+def pytest_runtest_logreport(report):
+    """Handle xdist worker retirement and crash-report suppression.
 
-    os._exit(1)
+    On the controller: swallow synthetic crash reports that were already marked for suppression by
+    pytest_handlecrashitem.
+
+    On workers: after a test failure, write an intentional-exit marker and kill the process so it
+    restarts with clean GPU state.  The real test report is sent by inner hooks (including xdist's
+    report-forwarding hook) during ``yield`` before we exit.
+    """
+    if getattr(report, "_qd_suppress", False):
+        return None
+
+    result = yield
+
+    if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"):
+        d = _exit_marker_dir()
+        if d:
+            worker_id = os.environ["PYTEST_XDIST_WORKER"]
+            try:
+                with open(os.path.join(d, worker_id), "w") as f:
+                    f.write(report.nodeid)
+            except OSError:
+                pass
+        os._exit(1)
+
+    return result
 
 
 def pytest_handlecrashitem(crashitem, report, sched):
-    """Suppress the synthetic 'worker crashed while running ...' report.
-
-    When pytest_runtest_logreport above kills a worker via os._exit(1),
-    stock xdist treats it as a crash and synthesizes a duplicate failure
-    report.  The real report was already sent before the exit, so we
-    mark the synthetic one as passed to keep it out of the failure summary.
-    This hook is firstresult=True in xdist, so returning here prevents
-    the default handler from running.
+    """Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits.
+
+    When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker
+    file before exiting.  If the marker exists, we flag the synthetic report for suppression and
+    return a truthy value to stop the firstresult hook chain.  Genuine crashes (segfaults, OOM,
+    etc.) have no marker, so their reports pass through unmodified.
     """
-    report.outcome = "passed"
-    report.when = "teardown"
-    report.longrepr = None
+    d = _exit_marker_dir()
+    if not d:
+        return
+    node = getattr(report, "node", None)
+    if not node:
+        return
+    worker_id = node.gateway.id
+    marker = os.path.join(d, worker_id)
+    if not os.path.exists(marker):
+        return
+    try:
+        os.unlink(marker)
+    except OSError:
+        pass
+    report._qd_suppress = True
+    return True
 
 
 import importlib
diff --git a/tests/test_xdist_worker_retirement.py b/tests/test_xdist_worker_retirement.py
index 30730d8b4f..e91ad431fb 100644
--- a/tests/test_xdist_worker_retirement.py
+++ b/tests/test_xdist_worker_retirement.py
@@ -35,20 +35,55 @@ def xdist_project(pytester):
     pytester.makeconftest(
         """
         import os
+        import tempfile
         import pytest
 
-        @pytest.hookimpl(trylast=True)
-        def pytest_runtest_logreport(report):
-            if not os.environ.get("PYTEST_XDIST_WORKER"):
+        def _exit_marker_dir():
+            return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR")
+
+        def pytest_configure(config):
+            if os.environ.get("PYTEST_XDIST_WORKER"):
                 return
-            if report.outcome not in ("error", "failed"):
+            if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"):
                 return
-            os._exit(1)
+            d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}")
+            os.makedirs(d, exist_ok=True)
+            os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d
+
+        @pytest.hookimpl(wrapper=True, tryfirst=True)
+        def pytest_runtest_logreport(report):
+            if getattr(report, "_qd_suppress", False):
+                return None
+            result = yield
+            if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("error", "failed"):
+                d = _exit_marker_dir()
+                if d:
+                    worker_id = os.environ["PYTEST_XDIST_WORKER"]
+                    try:
+                        with open(os.path.join(d, worker_id), "w") as f:
+                            f.write(report.nodeid)
+                    except OSError:
+                        pass
+                os._exit(1)
+            return result
 
         def pytest_handlecrashitem(crashitem, report, sched):
-            report.outcome = "passed"
-            report.when = "teardown"
-            report.longrepr = None
+            d = _exit_marker_dir()
+            if not d:
+                return
+            node = getattr(report, "node", None)
+            if not node:
+                return
+            worker_id = node.gateway.id
+            marker = os.path.join(d, worker_id)
+            if not os.path.exists(marker):
+                return
+            try:
+                os.unlink(marker)
+            except OSError:
+                pass
+            report._qd_suppress = True
+            return True
         """
     )
     return pytester

From 7fa8f34f7036e0685e21d278f580e52868e6401c Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 27 Apr 2026 13:56:16 -0700
Subject: [PATCH 09/11] [Test] Move xdist worker retirement tests into
 tests/python/ for CI discovery

CI runs pytest rooted at tests/python/ via run_tests.py, so tests placed
as siblings of that directory are never collected.
---
 tests/{ => python}/test_xdist_worker_retirement.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{ => python}/test_xdist_worker_retirement.py (100%)

diff --git a/tests/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py
similarity index 100%
rename from tests/test_xdist_worker_retirement.py
rename to tests/python/test_xdist_worker_retirement.py

From 55dac704788bb30e9530638cc092e94c18386e83 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 27 Apr 2026 14:02:33 -0700
Subject: [PATCH 10/11] [Test] Remove duplicate import sys in conftest.py

The top-level `import sys` (line 3) and `import pytest` already cover all
usages, so the duplicate `import pytest` is removed as well.
---
 tests/python/conftest.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index bf189aa5fc..6b9f302204 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -200,9 +200,6 @@ def pytest_handlecrashitem(crashitem, report, sched):
 
 
 import importlib
-import sys
-
-import pytest
 
 
 @pytest.fixture

From f63261e64a2b5b1967623152d2a320a566cf7773 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 27 Apr 2026 14:12:00 -0700
Subject: [PATCH 11/11] [CI] Remove stale requirements_test_xdist.txt reference
 from linux.yml; scrub PYTEST_XDIST_WORKER in pytester fixture to prevent env
 var leak

The linux.yml test-cuda job still referenced the deleted
requirements_test_xdist.txt. Also, pytester propagates the outer
worker's PYTEST_XDIST_WORKER into inner subprocesses, causing the
inner controller to misidentify itself as a worker and call os._exit(1).
---
 .github/workflows/linux.yml                  | 1 -
 tests/python/test_xdist_worker_retirement.py | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index abca17810f..fd71349f45 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -86,7 +86,6 @@ jobs:
       - name: Install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: Run CUDA tests with coverage
         run: |
           bash .github/workflows/scripts_new/linux/4_test_cuda.sh
diff --git a/tests/python/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py
index e91ad431fb..c471db3a2e 100644
--- a/tests/python/test_xdist_worker_retirement.py
+++ b/tests/python/test_xdist_worker_retirement.py
@@ -30,8 +30,9 @@
 
 
 @pytest.fixture
-def xdist_project(pytester):
+def xdist_project(pytester, monkeypatch):
     """Write a minimal conftest that reproduces our worker-retirement hooks."""
+    monkeypatch.delenv("PYTEST_XDIST_WORKER", raising=False)
     pytester.makeconftest(
         """
         import os