diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index abca17810f..fd71349f45 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -86,7 +86,6 @@ jobs:
       - name: Install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: Run CUDA tests with coverage
         run: |
           bash .github/workflows/scripts_new/linux/4_test_cuda.sh
diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh
index 630dc34783..5b7b604011 100644
--- a/.github/workflows/scripts_new/linux/4_test.sh
+++ b/.github/workflows/scripts_new/linux/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
 ./build/quadrants_cpp_tests --gtest_filter=-AMDGPU.*
diff --git a/.github/workflows/scripts_new/macosx/4_test.sh b/.github/workflows/scripts_new/macosx/4_test.sh
index 71037e2471..d5fa680770 100644
--- a/.github/workflows/scripts_new/macosx/4_test.sh
+++ b/.github/workflows/scripts_new/macosx/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --prefer-binary --group test
-pip install -r requirements_test_xdist.txt
 find . -name '*.bc'
 ls -lh build/
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
diff --git a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
index b14b9d7778..f045c2ccc7 100644
--- a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
+++ b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 # Phase 1: run all tests except torch-dependent ones
 python tests/run_tests.py -v -r 1 -m "not needs_torch"
diff --git a/.github/workflows/scripts_new/win/3_test.ps1 b/.github/workflows/scripts_new/win/3_test.ps1
index c7eae72395..9ebc71e437 100644
--- a/.github/workflows/scripts_new/win/3_test.ps1
+++ b/.github/workflows/scripts_new/win/3_test.ps1
@@ -6,7 +6,6 @@
 python -c 'import gstaichi as ti; ti.init();'
 $env:QD_LIB_DIR="python/gstaichi/_lib/runtime"
 Get-ChildItem -Path build -Recurse
 pip install --group test
-pip install -r requirements_test_xdist.txt
 # Phase 1: run all tests except torch-dependent ones
 python .\tests\run_tests.py -v -r 1 -m "not needs_torch"
diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml
index d98ff9f013..28fef6774b 100644
--- a/.github/workflows/test_gpu.yml
+++ b/.github/workflows/test_gpu.yml
@@ -113,7 +113,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch cuda -m "not needs_torch"
@@ -158,7 +157,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch vulkan -m "not needs_torch"
@@ -189,7 +187,6 @@ jobs:
      - name: install test requirements
        run: |
          pip install --group test
-         pip install -r requirements_test_xdist.txt
      - name: run tests (without torch)
        run: |
          export QD_AMDGPU_V520=1
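Aside (not part of the patch): every script above now installs test dependencies only via `pip install --group test`, i.e. a PEP 735 dependency group declared in pyproject.toml below (requires a recent pip), which is what lets the forked-xdist requirements file be deleted. A minimal, hypothetical post-install sanity check; the `packaging` import and the `>=3.7.0` floor are assumptions that simply mirror the pin added below:

# Hypothetical sanity check, not part of this patch: after `pip install
# --group test`, upstream pytest-xdist (a normal PEP 440 version) should be
# installed rather than the old taichi-dev fork (which used a git hash as
# its version).
from importlib.metadata import version
from packaging.version import Version

v = version("pytest-xdist")
print(f"pytest-xdist {v}")
# Version() raises InvalidVersion on a non-PEP-440 string such as a bare git
# hash, so this line both enforces the floor and catches a leftover fork.
assert Version(v) >= Version("3.7.0")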
diff --git a/pyproject.toml b/pyproject.toml
index 4c600131e2..0d1b33385f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,8 +76,6 @@ dev = [
     "ruamel.yaml",
 ]
 test = [
-    # You also need to:
-    # pip install -r requirements_test_xdist.txt
     "Pillow",
     "pytest",
     # 16.0 upgrade broke xfail, caused fatal errors, see
@@ -85,6 +83,8 @@ test = [
     "pytest-rerunfailures<16.0",
     "pytest-cov",
     "pytest-retry",
+    "pytest-xdist>=3.7.0",
+    "numpy>=2.0.0", # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13
     "psutil",
     "autograd",
diff --git a/requirements_test_xdist.txt b/requirements_test_xdist.txt
deleted file mode 100644
index 12b246a1bc..0000000000
--- a/requirements_test_xdist.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# This URL format is incompatible with pyproject.toml
-# taichi created a fork of pytest-xdist, to handle restarting gpu
-# workers when they crash, in order to reset the GPU state.
-# TODO: come up with some approach that is compatible with pyproject.toml
-# Ticketed here: https://linear.app/genesis-ai-company/issue/CMP-141/clean-up-requirements-test-xdisttxt
-git+https://github.com/taichi-dev/pytest-xdist@a3b5ad3038#egg=pytest-xdist
diff --git a/tests/pytest_hardtle.py b/tests/pytest_hardtle.py
index 68b6bf72e4..97f65d6ef0 100644
--- a/tests/pytest_hardtle.py
+++ b/tests/pytest_hardtle.py
@@ -1,4 +1,14 @@
 # -*- coding: utf-8 -*-
+#
+# Hard-kill timeout plugin (drop-in replacement for pytest-timeout).
+# Uses CFFI to compile a native C watchdog that calls _exit(1) on timeout.
+# Unlike stock pytest-timeout (which uses Python-level SIGALRM handlers),
+# this can kill tests that hang inside native CUDA/HIP kernel calls or
+# C extensions that don't release the GIL.
+#
+# Stock pytest-timeout must be suppressed (`-p no:timeout`) when this
+# plugin is loaded, because both register the same hook specs and pytest
+# will raise a ValueError on the duplicate.
 
 # -- stdlib --
 import importlib
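Aside (not part of the patch): a minimal sketch of the native-watchdog mechanism the new header comment describes, assuming CFFI's out-of-line API, a POSIX toolchain, and pthreads. The module and function names here are invented for illustration; the real pytest_hardtle.py additionally wires this into pytest's timeout hooks:

# Hypothetical, simplified illustration of a CFFI-compiled hard-kill watchdog.
# Assumptions: cffi installed, POSIX (pthread/unistd.h), a working C compiler.
import cffi

ffibuilder = cffi.FFI()
ffibuilder.cdef("void start_watchdog(int seconds); void cancel_watchdog(void);")
ffibuilder.set_source(
    "_hard_watchdog_demo",  # invented module name
    r"""
    #include <pthread.h>
    #include <unistd.h>

    static volatile int armed = 0;

    static void *watch(void *arg) {
        sleep((unsigned)(size_t)arg);
        if (armed)
            _exit(1);  /* native exit: fires even if the GIL is held */
        return NULL;
    }

    void start_watchdog(int seconds) {
        pthread_t tid;
        armed = 1;
        pthread_create(&tid, NULL, watch, (void *)(size_t)seconds);
        pthread_detach(tid);
    }

    void cancel_watchdog(void) { armed = 0; }
    """,
    libraries=["pthread"],
)

if __name__ == "__main__":
    ffibuilder.compile(verbose=True)  # builds the _hard_watchdog_demo extension

A consumer would then call `from _hard_watchdog_demo import lib; lib.start_watchdog(seconds)` before each test and `lib.cancel_watchdog()` on completion. Because `_exit(1)` runs on a native thread, it fires even while a hung C extension holds the GIL, which is exactly the case where a Python-level SIGALRM handler never gets a chance to run.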
+ """ + if os.environ.get("PYTEST_XDIST_WORKER"): return + if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"): + return + d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}") + os.makedirs(d, exist_ok=True) + os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d + - if report.outcome not in ("rerun", "error", "failed"): +def pytest_unconfigure(config): + """Clean up the marker directory at session end.""" + if os.environ.get("PYTEST_XDIST_WORKER"): return + d = _exit_marker_dir() + if d and os.path.isdir(d): + import shutil - layoff = False + shutil.rmtree(d, ignore_errors=True) - chain = getattr(getattr(report, "longrepr", None), "chain", None) - if chain: - for _, loc, _ in chain: - msg = getattr(loc, "message", "") if loc else "" - if "CUDA_ERROR_OUT_OF_MEMORY" in msg: - layoff = True - break - # Don't call interactor.retire() — it uses os._exit(0) which kills - # the process before execnet's IO thread can flush the channel buffer. - # The test failure report (queued by xdist's own hook, which ran before - # this trylast hook) would be lost, hiding all error messages. - interactor.sendevent("workerretire", layoff=layoff) - time.sleep(0.2) - os._exit(0) +@pytest.hookimpl(wrapper=True, tryfirst=True) +def pytest_runtest_logreport(report): + """Handle xdist worker retirement and crash-report suppression. + On the controller: swallow synthetic crash reports that were already marked for suppression by + pytest_handlecrashitem. -import importlib -import sys + On workers: after a test failure, write an intentional-exit marker and kill the process so it + restarts with clean GPU state. The real test report is sent by inner hooks (including xdist's + report-forwarding hook) during ``yield`` before we exit. + """ + if getattr(report, "_qd_suppress", False): + return None -import pytest + result = yield + + if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"): + d = _exit_marker_dir() + if d: + worker_id = os.environ["PYTEST_XDIST_WORKER"] + try: + with open(os.path.join(d, worker_id), "w") as f: + f.write(report.nodeid) + except OSError: + pass + os._exit(1) + + return result + + +def pytest_handlecrashitem(crashitem, report, sched): + """Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits. + + When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker + file before exiting. If the marker exists, we flag the synthetic report for suppression and + return a truthy value to stop the firstresult hook chain. Genuine crashes (segfaults, OOM, + etc.) have no marker, so their reports pass through unmodified. + """ + d = _exit_marker_dir() + if not d: + return + node = getattr(report, "node", None) + if not node: + return + worker_id = node.gateway.id + marker = os.path.join(d, worker_id) + if not os.path.exists(marker): + return + try: + os.unlink(marker) + except OSError: + pass + report._qd_suppress = True + return True + + +import importlib @pytest.fixture diff --git a/tests/python/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py new file mode 100644 index 0000000000..c471db3a2e --- /dev/null +++ b/tests/python/test_xdist_worker_retirement.py @@ -0,0 +1,158 @@ +"""Tests for the xdist worker retirement hooks in conftest.py. + +Verifies that when a worker is killed via os._exit(1) after a test failure: +1. Failures are not double-counted (no synthetic "worker crashed" report) +2. 
diff --git a/tests/python/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py
new file mode 100644
index 0000000000..c471db3a2e
--- /dev/null
+++ b/tests/python/test_xdist_worker_retirement.py
@@ -0,0 +1,158 @@
+"""Tests for the xdist worker retirement hooks in conftest.py.
+
+Verifies that when a worker is killed via os._exit(1) after a test failure:
+
+1. Failures are not double-counted (no synthetic "worker crashed" report)
+2. The session completes even with many failures (--max-worker-restart cap
+   does not trigger premature shutdown)
+
+These tests use pytester to run pytest-xdist in a subprocess, so they do
+not require GPU hardware.
+"""
+
+import pytest
+
+pytest_plugins = ["pytester"]
+
+SUBPROCESS_ARGS = [
+    "-p",
+    "no:retry",
+    "-p",
+    "no:rerunfailures",
+    "-p",
+    "no:nbmake",
+    "-p",
+    "no:timeout",
+    "-p",
+    "no:cacheprovider",
+    "-o",
+    "addopts=",
+]
+
+
+@pytest.fixture
+def xdist_project(pytester, monkeypatch):
+    """Write a minimal conftest that reproduces our worker-retirement hooks."""
+    monkeypatch.delenv("PYTEST_XDIST_WORKER", raising=False)
+    pytester.makeconftest(
+        """
+        import os
+        import tempfile
+        import pytest
+
+        def _exit_marker_dir():
+            return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR")
+
+        def pytest_configure(config):
+            if os.environ.get("PYTEST_XDIST_WORKER"):
+                return
+            if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"):
+                return
+            d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}")
+            os.makedirs(d, exist_ok=True)
+            os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d
+
+        @pytest.hookimpl(wrapper=True, tryfirst=True)
+        def pytest_runtest_logreport(report):
+            if getattr(report, "_qd_suppress", False):
+                return None
+            result = yield
+            if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("error", "failed"):
+                d = _exit_marker_dir()
+                if d:
+                    worker_id = os.environ["PYTEST_XDIST_WORKER"]
+                    try:
+                        with open(os.path.join(d, worker_id), "w") as f:
+                            f.write(report.nodeid)
+                    except OSError:
+                        pass
+                    os._exit(1)
+            return result
+
+        def pytest_handlecrashitem(crashitem, report, sched):
+            d = _exit_marker_dir()
+            if not d:
+                return
+            node = getattr(report, "node", None)
+            if not node:
+                return
+            worker_id = node.gateway.id
+            marker = os.path.join(d, worker_id)
+            if not os.path.exists(marker):
+                return
+            try:
+                os.unlink(marker)
+            except OSError:
+                pass
+            report._qd_suppress = True
+            return True
+        """
+    )
+    return pytester
+
+
+class TestNoDuplicateFailures:
+    def test_single_failure_counted_once(self, xdist_project):
+        """A single failing test should appear exactly once in the summary."""
+        xdist_project.makepyfile(
+            """
+            def test_pass():
+                pass
+
+            def test_fail():
+                assert False, "intentional failure"
+            """
+        )
+        result = xdist_project.runpytest_subprocess("-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v")
+        result.assert_outcomes(passed=1, failed=1)
+
+    def test_multiple_failures_counted_correctly(self, xdist_project):
+        """Each failing test should be counted exactly once."""
+        xdist_project.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("i", range(4))
+            def test_fail(i):
+                assert False, f"failure {i}"
+
+            def test_pass():
+                pass
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n",
+            "2",
+            "--dist=worksteal",
+            "--max-worker-restart=999999",
+            *SUBPROCESS_ARGS,
+            "-v",
+        )
+        result.assert_outcomes(passed=1, failed=4)
+
+
+class TestSessionCompletesWithManyFailures:
+    def test_no_premature_shutdown(self, xdist_project):
+        """With a high --max-worker-restart, all tests should run even if many fail."""
+        xdist_project.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("i", range(20))
+            def test_fail(i):
+                assert False, f"failure {i}"
+
+            @pytest.mark.parametrize("i", range(5))
+            def test_pass(i):
+                pass
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n",
+            "2",
+            "--dist=worksteal",
+            "--max-worker-restart=999999",
+            *SUBPROCESS_ARGS,
+            "-v",
+        )
+        result.assert_outcomes(passed=5, failed=20)
+        assert "maximum crashed workers reached" not in result.stdout.str()
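Aside (not part of the patch): the suite above only exercises intentional exits. A hypothetical companion test, reusing the `xdist_project` fixture and `SUBPROCESS_ARGS` from the file above, could pin down the other half of the contract: a genuine crash writes no marker file, so `pytest_handlecrashitem` must leave the synthetic failure report visible. Whether `failed=1` is exactly what stock xdist reports for a one-test crash is an assumption here, not something this patch asserts:

# Hypothetical extra test (assumes the fixture and constants defined in
# test_xdist_worker_retirement.py above).
def test_genuine_crash_not_suppressed(xdist_project):
    xdist_project.makepyfile(
        """
        import os

        def test_hard_crash():
            # Simulates a segfault: the worker dies mid-test, so the
            # logreport hook never runs and no marker file is written.
            os._exit(1)
        """
    )
    result = xdist_project.runpytest_subprocess("-n", "1", *SUBPROCESS_ARGS)
    # With no marker file, the synthetic "worker crashed" report should pass
    # through unsuppressed and be counted as a failure.
    result.assert_outcomes(failed=1)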
diff --git a/tests/run_tests.py b/tests/run_tests.py
index da7df93f9a..cf2349933b 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -87,7 +87,16 @@ def _test_python(args, default_dir="python"):
         print(f"Due to how pytest-xdist is implemented, the -s option does not work with multiple thread...")
     else:
         if int(threads) > 1:
-            pytest_args += ["-n", str(threads), "--dist=worksteal"]
+            # We intentionally kill workers on test failure (see conftest.py)
+            # to reset GPU state. Stock xdist counts each kill toward
+            # --max-worker-restart and shuts down the session when the cap is
+            # reached, so we set a very high cap to prevent that.
+            pytest_args += [
+                "-n",
+                str(threads),
+                "--dist=worksteal",
+                "--max-worker-restart=999999",
+            ]
 
     import pytest  # pylint: disable=C0415
     return int(pytest.main(pytest_args))
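Aside (not part of the patch): for threads=4 the branch above builds an invocation equivalent to the sketch below. The "tests/python" target is illustrative only; run_tests.py assembles the full argument list (markers, arch filters, etc.) elsewhere:

# Hypothetical equivalent of what _test_python() hands to pytest for
# threads=4; "tests/python" stands in for the real, longer argument list.
import pytest

pytest_args = [
    "tests/python",
    "-n", "4",                       # four xdist workers
    "--dist=worksteal",              # idle workers steal queued tests
    "--max-worker-restart=999999",   # intentional kills must not end the session
]
raise SystemExit(int(pytest.main(pytest_args)))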