Genesis-Embodied-AI · hughperkins · Apr 23, 2026 · Apr 23, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -86,7 +86,6 @@ jobs:
       - name: Install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: Run CUDA tests with coverage
         run: |
           bash .github/workflows/scripts_new/linux/4_test_cuda.sh

diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
 ./build/quadrants_cpp_tests  --gtest_filter=-AMDGPU.*
 

diff --git a/.github/workflows/scripts_new/macosx/4_test.sh b/.github/workflows/scripts_new/macosx/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --prefer-binary --group test
-pip install -r requirements_test_xdist.txt
 find . -name '*.bc'
 ls -lh build/
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"

diff --git a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 # Phase 1: run all tests except torch-dependent ones
 python tests/run_tests.py -v -r 1 -m "not needs_torch"

diff --git a/.github/workflows/scripts_new/win/3_test.ps1 b/.github/workflows/scripts_new/win/3_test.ps1
@@ -6,7 +6,6 @@ python -c 'import gstaichi as ti; ti.init();'
 $env:QD_LIB_DIR="python/gstaichi/_lib/runtime"
 Get-ChildItem -Path build -Recurse
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 # Phase 1: run all tests except torch-dependent ones
 python .\tests\run_tests.py -v -r 1 -m "not needs_torch"

diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml
@@ -113,7 +113,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch cuda -m "not needs_torch"
@@ -158,7 +157,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch vulkan -m "not needs_torch"
@@ -189,7 +187,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           export QD_AMDGPU_V520=1

diff --git a/pyproject.toml b/pyproject.toml
@@ -76,15 +76,15 @@ dev = [
     "ruamel.yaml",
 ]
 test = [
-    # You also need to:
-    # pip install -r requirements_test_xdist.txt
     "Pillow",
     "pytest",
     # 16.0 upgrade broke xfail, caused fatal errors, see
     # https://github.com/Genesis-Embodied-AI/quadrants/pull/162
     "pytest-rerunfailures<16.0",
     "pytest-cov",
     "pytest-retry",
+    "pytest-xdist>=3.7.0",
+
     "numpy>=2.0.0",  # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13
     "psutil",
     "autograd",

diff --git a/requirements_test_xdist.txt b/requirements_test_xdist.txt
diff --git a/tests/pytest_hardtle.py b/tests/pytest_hardtle.py
@@ -1,4 +1,14 @@
 # -*- coding: utf-8 -*-
+#
+# Hard-kill timeout plugin (drop-in replacement for pytest-timeout).
+# Uses CFFI to compile a native C watchdog that calls _exit(1) on timeout.
+# Unlike stock pytest-timeout (which uses Python-level SIGALRM handlers),
+# this can kill tests that hang inside native CUDA/HIP kernel calls or
+# C extensions that don't release the GIL.
+#
+# Stock pytest-timeout must be suppressed (`-p no:timeout`) when this
+# plugin is loaded, because both register the same hook specs and pytest
+# will raise a ValueError on the duplicate.
 
 # -- stdlib --
 import importlib

diff --git a/tests/python/conftest.py b/tests/python/conftest.py
@@ -1,19 +1,12 @@
 import gc
 import os
 import sys
-import time
+import tempfile
 
 import pytest
 
-# rerunfailures use xdist version number to determine if it is compatible
-# but we are using a forked version of xdist(with git hash as it's version),
-# so we need to override it
-import pytest_rerunfailures
-
 import quadrants as qd
 
-pytest_rerunfailures.works_with_current_xdist = lambda: True
-
 
 @pytest.fixture(autouse=True)
 def run_gc_after_test():
@@ -120,43 +113,93 @@ def pytest_generate_tests(metafunc):
         metafunc.parametrize("req_arch,req_options", [(None, None)], ids=["none"])
 
 
-@pytest.hookimpl(trylast=True)
-def pytest_runtest_logreport(report):
-    """
-    Retire test workers when a test fails, to avoid the failing test
-    leaving a corrupted GPU state for the following tests.
-    """
+def _exit_marker_dir():
+    """Temp directory shared between xdist controller and workers for intentional-exit markers."""
+    return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR")
+
 
-    interactor = getattr(sys, "xdist_interactor", None)
-    if not interactor:
+def pytest_configure(config):
+    """On the xdist controller, create a private temp directory for intentional-exit markers.
+
+    Workers inherit the ``_QD_XDIST_EXIT_MARKER_DIR`` env var and use the same directory.
+    """
+    if os.environ.get("PYTEST_XDIST_WORKER"):
         return
+    if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"):
+        return
+    # Fresh, uniquely named directory: stale markers from an aborted earlier run cannot leak in.
+    d = tempfile.mkdtemp(prefix="qd_xdist_exits_")
+    os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d
+
 
-    if report.outcome not in ("rerun", "error", "failed"):
+def pytest_unconfigure(config):
+    """Clean up the marker directory at session end."""
+    if os.environ.get("PYTEST_XDIST_WORKER"):
         return
+    d = _exit_marker_dir()
+    if d and os.path.isdir(d):
+        import shutil
 
-    layoff = False
+        shutil.rmtree(d, ignore_errors=True)
 
-    chain = getattr(getattr(report, "longrepr", None), "chain", None)
-    if chain:
-        for _, loc, _ in chain:
-            msg = getattr(loc, "message", "") if loc else ""
-            if "CUDA_ERROR_OUT_OF_MEMORY" in msg:
-                layoff = True
-                break
 
-    # Don't call interactor.retire() — it uses os._exit(0) which kills
-    # the process before execnet's IO thread can flush the channel buffer.
-    # The test failure report (queued by xdist's own hook, which ran before
-    # this trylast hook) would be lost, hiding all error messages.
-    interactor.sendevent("workerretire", layoff=layoff)
-    time.sleep(0.2)
-    os._exit(0)
+@pytest.hookimpl(wrapper=True, tryfirst=True)
+def pytest_runtest_logreport(report):
+    """Handle xdist worker retirement and crash-report suppression.
 
+    On the controller: swallow synthetic crash reports already flagged by pytest_handlecrashitem.
+    NOTE(review): pluggy expects a wrapper to yield exactly once -- confirm this early return is OK.
 
-import importlib
-import sys
+    On workers: after a test failure, write an intentional-exit marker and kill the process so it
+    restarts with clean GPU state.  The real test report is sent by inner hooks (including xdist's
+    report-forwarding hook) during ``yield`` before we exit.
+    """
+    if getattr(report, "_qd_suppress", False):
+        return None
 
-import pytest
+    result = yield
+
+    if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"):
+        d = _exit_marker_dir()
+        if d:
+            worker_id = os.environ["PYTEST_XDIST_WORKER"]
+            try:
+                with open(os.path.join(d, worker_id), "w") as f:
+                    f.write(report.nodeid)
+            except OSError:
+                pass
+        os._exit(1)
+
+    return result
+
+
+def pytest_handlecrashitem(crashitem, report, sched):
+    """Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits.
+
+    When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker
+    file before exiting.  If the marker exists, we flag the synthetic report for suppression and
+    return a truthy value to stop the firstresult hook chain.  Genuine crashes (segfaults, OOM,
+    etc.) have no marker, so their reports pass through unmodified.
+    """
+    d = _exit_marker_dir()
+    if not d:
+        return
+    node = getattr(report, "node", None)
+    if not node:
+        return
+    worker_id = node.gateway.id
+    marker = os.path.join(d, worker_id)
+    if not os.path.exists(marker):
+        return
+    try:
+        os.unlink(marker)
+    except OSError:
+        pass
+    report._qd_suppress = True
+    return True
+
+
+import importlib
 
 
 @pytest.fixture