Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ jobs:
- name: Install test requirements
run: |
pip install --group test
pip install -r requirements_test_xdist.txt
- name: Run CUDA tests with coverage
run: |
bash .github/workflows/scripts_new/linux/4_test_cuda.sh
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/scripts_new/linux/4_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -ex

pip install --group test
pip install -r requirements_test_xdist.txt
export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
Comment thread
claude[bot] marked this conversation as resolved.
./build/quadrants_cpp_tests --gtest_filter=-AMDGPU.*

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/scripts_new/macosx/4_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -ex

pip install --prefer-binary --group test
pip install -r requirements_test_xdist.txt
find . -name '*.bc'
ls -lh build/
export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/scripts_new/manylinux_wheel/5_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -ex

pip install --group test
pip install -r requirements_test_xdist.txt

# Phase 1: run all tests except torch-dependent ones
python tests/run_tests.py -v -r 1 -m "not needs_torch"
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/scripts_new/win/3_test.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ python -c 'import gstaichi as ti; ti.init();'
$env:QD_LIB_DIR="python/gstaichi/_lib/runtime"
Get-ChildItem -Path build -Recurse
pip install --group test
pip install -r requirements_test_xdist.txt

# Phase 1: run all tests except torch-dependent ones
python .\tests\run_tests.py -v -r 1 -m "not needs_torch"
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/test_gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ jobs:
- name: install test requirements
run: |
pip install --group test
pip install -r requirements_test_xdist.txt
- name: run tests (without torch)
run: |
python tests/run_tests.py -r 1 -v --arch cuda -m "not needs_torch"
Expand Down Expand Up @@ -158,7 +157,6 @@ jobs:
- name: install test requirements
run: |
pip install --group test
pip install -r requirements_test_xdist.txt
- name: run tests (without torch)
run: |
python tests/run_tests.py -r 1 -v --arch vulkan -m "not needs_torch"
Expand Down Expand Up @@ -189,7 +187,6 @@ jobs:
- name: install test requirements
run: |
pip install --group test
pip install -r requirements_test_xdist.txt
- name: run tests (without torch)
run: |
export QD_AMDGPU_V520=1
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,15 @@ dev = [
"ruamel.yaml",
]
test = [
# You also need to:
# pip install -r requirements_test_xdist.txt
"Pillow",
"pytest",
# 16.0 upgrade broke xfail, caused fatal errors, see
# https://github.com/Genesis-Embodied-AI/quadrants/pull/162
"pytest-rerunfailures<16.0",
"pytest-cov",
"pytest-retry",
"pytest-xdist>=3.7.0",

"numpy>=2.0.0", # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13
"psutil",
"autograd",
Expand Down
6 changes: 0 additions & 6 deletions requirements_test_xdist.txt

This file was deleted.

10 changes: 10 additions & 0 deletions tests/pytest_hardtle.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
# -*- coding: utf-8 -*-
#
# Hard-kill timeout plugin (drop-in replacement for pytest-timeout).
# Uses CFFI to compile a native C watchdog that calls _exit(1) on timeout.
# Unlike stock pytest-timeout (which uses Python-level SIGALRM handlers),
# this can kill tests that hang inside native CUDA/HIP kernel calls or
# C extensions that don't release the GIL.
#
# Stock pytest-timeout must be suppressed (`-p no:timeout`) when this
# plugin is loaded, because both register the same hook specs and pytest
# will raise a ValueError on the duplicate.

# -- stdlib --
import importlib
Expand Down
113 changes: 78 additions & 35 deletions tests/python/conftest.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import gc
import os
import sys
Comment thread
claude[bot] marked this conversation as resolved.
import time
import tempfile

import pytest

# rerunfailures use xdist version number to determine if it is compatible
# but we are using a forked version of xdist(with git hash as it's version),
# so we need to override it
import pytest_rerunfailures

import quadrants as qd

pytest_rerunfailures.works_with_current_xdist = lambda: True


@pytest.fixture(autouse=True)
def run_gc_after_test():
Expand Down Expand Up @@ -120,43 +113,93 @@ def pytest_generate_tests(metafunc):
metafunc.parametrize("req_arch,req_options", [(None, None)], ids=["none"])


@pytest.hookimpl(trylast=True)
def pytest_runtest_logreport(report):
"""
Retire test workers when a test fails, to avoid the failing test
leaving a corrupted GPU state for the following tests.
"""
def _exit_marker_dir():
"""Temp directory shared between xdist controller and workers for intentional-exit markers."""
return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR")


interactor = getattr(sys, "xdist_interactor", None)
if not interactor:
def pytest_configure(config):
"""On the xdist controller, create a temp directory for intentional-exit markers.

Workers inherit the ``_QD_XDIST_EXIT_MARKER_DIR`` env var and use the same directory.
"""
if os.environ.get("PYTEST_XDIST_WORKER"):
return
if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"):
return
d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}")
os.makedirs(d, exist_ok=True)
os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d


if report.outcome not in ("rerun", "error", "failed"):
def pytest_unconfigure(config):
"""Clean up the marker directory at session end."""
if os.environ.get("PYTEST_XDIST_WORKER"):
return
d = _exit_marker_dir()
if d and os.path.isdir(d):
import shutil

layoff = False
shutil.rmtree(d, ignore_errors=True)
Comment on lines +135 to +143
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Latent regression of the duplicate-failure protection on the second pytest.main() invocation in the same process. pytest_unconfigure at conftest.py:135-143 deletes the marker dir but never clears os.environ['_QD_XDIST_EXIT_MARKER_DIR'], so a second pytest.main() call sees the stale env var, skips creating a fresh dir at the early-return on lines 128-129, and any subsequent worker failure cannot write its marker — pytest_handlecrashitem then leaves _qd_suppress unset and the synthetic 'worker crashed' report is double-counted, the exact regression this PR was written to prevent. Triggers locally on tests/run_tests.py --with-offline-cache (run_count=2). Fix: add os.environ.pop('_QD_XDIST_EXIT_MARKER_DIR', None) to pytest_unconfigure (controller-only) so the next session creates a fresh directory.

Extended reasoning...

What the bug is

pytest_configure at conftest.py:121-132 creates a per-process marker directory and sets os.environ['_QD_XDIST_EXIT_MARKER_DIR'] so workers (via env-var inheritance) and the controller share the same path. The function early-returns at lines 128-129 if the env var is already set.

pytest_unconfigure at conftest.py:135-143 calls shutil.rmtree(d, ignore_errors=True) on the directory, but never clears the env var. There is no symmetric os.environ.pop('_QD_XDIST_EXIT_MARKER_DIR', None). So after the first session ends, the env var is still set in the running Python process, pointing at a now-deleted path.

How it manifests

tests/run_tests.py:test() invokes pytest.main(...) in a loop:

run_count = 1
if args.with_offline_cache:
    run_count += args.rerun_with_offline_cache  # default 1, so run_count = 2
...
for _ in range(run_count):
    ret = _test_python(args)   # calls pytest.main(...)
    if ret == 5: ret = 0
    if ret != 0: exit(ret)

When --with-offline-cache is passed (or QD_TEST_OFFLINE_CACHE=1 is set), pytest.main() is called twice in the same Python process.

Step-by-step proof

  1. Iter 1, pytest_configure: env var unset → creates /tmp/qd_xdist_exits_<pid>, sets env var.
  2. Iter 1, pytest_unconfigure: shutil.rmtree(d) removes the dir; env var still set.
  3. Iter 2, pytest_configure: line 128 sees env var, returns early. No dir is recreated.
  4. Iter 2, worker fails a test: pytest_runtest_logreport enters the wrapper hook, real report is sent via inner hooks during yield, then the worker tries open(os.path.join(d, worker_id), 'w')d points to the deleted dir, so this raises FileNotFoundError, which is swallowed by except OSError: pass (lines 169-170). os._exit(1) runs anyway.
  5. Iter 2, controller pytest_handlecrashitem: d = _exit_marker_dir() returns the stale path (truthy, so doesn't early-return at line 201). os.path.exists(marker) is False (whole dir is gone) → returns None without setting report._qd_suppress = True.
  6. Iter 2, dsession.py:452: stock xdist unconditionally calls pytest_runtest_logreport(report=rep) on the synthetic crash report. The controller's wrapper hook sees _qd_suppress is False, so yield lets the report flow through. The real failed report (already counted in step 4 from the worker) plus the synthetic worker 'gw0' crashed while running X report are both counted — exactly the double-counted-failure regression this PR was written to prevent.

Why existing code doesn't prevent it

The two early-return guards in pytest_configure (lines 126-129) check the env var to detect (a) running inside a worker process, and (b) re-entry — but the second guard implicitly assumes the env var is cleared between sessions. pytest_unconfigure does not honor that assumption. The pytest_handlecrashitem helper (lines 199-218) similarly assumes that if _exit_marker_dir() is truthy, the directory exists and any missing marker means a genuine crash, not 'we forgot to recreate the directory'.

Scope and severity

CI lanes do not exercise --with-offline-cache (no matches under .github/ for with-offline-cache or QD_TEST_OFFLINE_CACHE), so no CI workflow regresses today. The bug bites any developer running python tests/run_tests.py --with-offline-cache locally — a documented developer flag (see arg help in run_tests.py). On iteration ≥ 2 the deduplication mechanism the PR adds is silently disabled, and any test failure is double-counted in the summary.

Fix

One line in pytest_unconfigure (conftest.py:143), after the rmtree:

os.environ.pop('_QD_XDIST_EXIT_MARKER_DIR', None)

This is the symmetric counterpart to the os.environ['_QD_XDIST_EXIT_MARKER_DIR'] = d set on line 132. The PYTEST_XDIST_WORKER early-return on line 137 already ensures only the controller clears it, so workers (which inherited the env var via fork/spawn) are unaffected.


chain = getattr(getattr(report, "longrepr", None), "chain", None)
if chain:
for _, loc, _ in chain:
msg = getattr(loc, "message", "") if loc else ""
if "CUDA_ERROR_OUT_OF_MEMORY" in msg:
layoff = True
break

# Don't call interactor.retire() — it uses os._exit(0) which kills
# the process before execnet's IO thread can flush the channel buffer.
# The test failure report (queued by xdist's own hook, which ran before
# this trylast hook) would be lost, hiding all error messages.
interactor.sendevent("workerretire", layoff=layoff)
time.sleep(0.2)
os._exit(0)
@pytest.hookimpl(wrapper=True, tryfirst=True)
def pytest_runtest_logreport(report):
"""Handle xdist worker retirement and crash-report suppression.

On the controller: swallow synthetic crash reports that were already marked for suppression by
pytest_handlecrashitem.

import importlib
import sys
On workers: after a test failure, write an intentional-exit marker and kill the process so it
restarts with clean GPU state. The real test report is sent by inner hooks (including xdist's
report-forwarding hook) during ``yield`` before we exit.
"""
if getattr(report, "_qd_suppress", False):
return None

import pytest
result = yield

if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"):
d = _exit_marker_dir()
if d:
worker_id = os.environ["PYTEST_XDIST_WORKER"]
try:
with open(os.path.join(d, worker_id), "w") as f:
f.write(report.nodeid)
except OSError:
pass
os._exit(1)

return result
Comment on lines 145 to +173
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 The wrapper hook at tests/python/conftest.py:146-173 (@pytest.hookimpl(wrapper=True, tryfirst=True)) has if getattr(report, "_qd_suppress", False): return None at line 158, BEFORE the result = yield at line 160 — pluggy treats any function containing yield as a generator, so the early return raises StopIteration on next(), which pluggy converts to RuntimeError("wrap_controller at pytest_runtest_logreport ... did not yield"). This fires every time a worker dies via os._exit(1) after a test failure: pytest_handlecrashitem at line 176 sets report._qd_suppress=True and then xdist/dsession.py:452 unconditionally re-invokes pytest_runtest_logreport(report=rep), which enters the wrapper, hits the early return, and raises. The RuntimeError surfaces as INTERNALERROR and aborts the test loop, silently dropping the remaining tests — strictly worse than the failure-visibility issue this PR set out to fix. Fix: move the _qd_suppress check after the yield (yield first, then discard the result), or convert the hook to a non-wrapper @pytest.hookimpl(tryfirst=True) that mutates the report in place.

Extended reasoning...

What the bug is

The hook at tests/python/conftest.py:146-173:

@pytest.hookimpl(wrapper=True, tryfirst=True)
def pytest_runtest_logreport(report):
    if getattr(report, "_qd_suppress", False):
        return None        # <-- early return BEFORE yield

    result = yield

    if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"):
        ...
        os._exit(1)

    return result

A wrapper=True hookimpl whose body contains yield is a generator. Pluggy drives it via next() to run the pre-yield code, calls .send(value) to resume after the inner hooks, and expects the generator to either return after the yield (StopIteration with a value) or raise. If the generator returns before yielding, the very first next() raises StopIteration; pluggy detects this in _callers.py and re-raises as RuntimeError: wrap_controller at pytest_runtest_logreport ... did not yield. That RuntimeError surfaces as INTERNALERROR in pytest_runtestloop, which aborts the entire session.

How the bad path is triggered

The PR introduces two cooperating pieces:

  1. The wrapper hook above — fires on every pytest_runtest_logreport invocation.
  2. pytest_handlecrashitem at conftest.py:176-200 — sets report._qd_suppress = True on the synthetic crash report and returns truthy.

The trigger chain (verified against xdist/dsession.py from pytest-xdist 3.x):

  1. Worker fails a test → real TestReport is sent over the channel, conftest wrapper runs cleanly to its os._exit(1).
  2. Controller detects errordown, calls handle_crashitem(crashitem, errors).
  3. handle_crashitem invokes the firstresult hook pytest_handlecrashitem. The PRs impl sets report._qd_suppress = True and returns True.
  4. The very next line in dsession.py:452 unconditionally calls self.config.hook.pytest_runtest_logreport(report=rep) on the same synthetic report.
  5. Pluggy enters the PRs wrapper hook with tryfirst=True. Body sees _qd_suppress flag → return None on line 158.
  6. The function never reaches yield on line 160. Pluggy raises RuntimeError: ... did not yield.
  7. The error propagates out of _pytest_runtest_logreport → INTERNALERROR → pytest_runtestloop aborts.

Why the existing code does not prevent it

  • dsession.py:452 has no opt-out path: the pytest_runtest_logreport call is unconditional after the firstresult hook returns.
  • The firstresult=True decorator on pytest_handlecrashitem only short-circuits that hook chain; it does not gate the unrelated logreport call on the next line.
  • pluggys wrapper protocol has no "yielded zero times" mode — wrappers must yield exactly once.

Impact

Every xdist run with a worker-dying-after-failure (which is the central path of this PRs design) hits the bug on the first such failure and aborts the test loop with INTERNALERROR. With N>=2 failing tests under -n 2 --dist=worksteal --max-worker-restart=999999, only the first one or two failures are reported before the loop dies; remaining tests are silently dropped. This is strictly worse than the original PR-author-acknowledged double-counting issue: instead of a noisy 2x in the summary, real test failures vanish.

It also means the new regression tests at tests/python/test_xdist_worker_retirement.py::TestNoDuplicateFailures::test_multiple_failures_counted_correctly (expects failed=4) and TestSessionCompletesWithManyFailures::test_no_premature_shutdown (expects failed=20) will themselves fail when actually run — the conftest they install via pytester.makeconftest has the identical wrapper-without-pre-yield check and will INTERNALERROR before reaching their expected counts.

Step-by-step concrete proof

Setup: pytest 9.x + pytest-xdist 3.8+ + pluggy 1.6, exact PR conftest, three tests of which two always fail, run with -n 2 --dist=worksteal --max-worker-restart=999999.

  1. gw0 runs test_fail_1, asserts False → real failed report sent to controller.
  2. Conftest wrapper resumes after yield, takes the outcome in ("rerun","error","failed") branch, writes the marker, calls os._exit(1). gw0 dies.
  3. Controller emits FAILED test_fail_1 from the real report. Schedules errordown.
  4. handle_crashitem(node="gw0", crashitem="test_fail_1") runs:
    • Synthesizes TestReport(outcome="failed", when="???", longrepr="worker gw0 crashed while running test_fail_1").
    • Calls pytest_handlecrashitem(...) → PR impl reads marker, sets report._qd_suppress=True, returns True.
    • Next line: self.config.hook.pytest_runtest_logreport(report=rep).
  5. Pluggy invokes PR wrapper with tryfirst=True:
    • First statement: if getattr(report, "_qd_suppress", False): return None.
    • Function is a generator (because of yield further down) → next(gen) raises StopIteration.
    • Pluggy _callers.py detects this and calls _raise_wrapfail(gen, "did not yield")RuntimeError.
  6. RuntimeError propagates up _multicallpytest_runtestloop catches and reports INTERNALERROR> RuntimeError: wrap_controller at pytest_runtest_logreport ... did not yield.
  7. Test loop aborts. test_fail_2 (and any further failing test) is never reported. Summary shows 1 failed in X.Xs despite multiple real failures pending.

This was independently reproduced by three verifiers using the exact PR code with stock pytest+xdist+pluggy.

How to fix

Two clean options:

  1. Yield first, gate the post-yield logic on the flag — preserves wrapper semantics and still suppresses the synthetic report:

    @pytest.hookimpl(wrapper=True, tryfirst=True)
    def pytest_runtest_logreport(report):
        result = yield
        if getattr(report, "_qd_suppress", False):
            return None
        if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"):
            ...
            os._exit(1)
        return result

    Note this still lets inner hooks observe the synthetic report; for stronger suppression, mutate the report in pytest_handlecrashitem (e.g. report.outcome="passed"; report.when="teardown") instead, which the existing terminal categorisation will collapse to the "" bucket.

  2. Drop the wrapper entirely — use a non-wrapper @pytest.hookimpl(tryfirst=True) that mutates the report (the suppression-via-mutation pattern an earlier review iteration on this PR already validated empirically with report.when = "teardown").

The same identical bug exists in the embedded conftest at tests/python/test_xdist_worker_retirement.py:53-68 (written via pytester.makeconftest), so any fix needs to be applied there too — those are the regression tests for this exact mechanism, and they currently encode the broken pattern.



def pytest_handlecrashitem(crashitem, report, sched):
"""Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits.

When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker
file before exiting. If the marker exists, we flag the synthetic report for suppression and
return a truthy value to stop the firstresult hook chain. Genuine crashes (segfaults, OOM,
etc.) have no marker, so their reports pass through unmodified.
"""
d = _exit_marker_dir()
if not d:
return
node = getattr(report, "node", None)
if not node:
return
worker_id = node.gateway.id
marker = os.path.join(d, worker_id)
if not os.path.exists(marker):
return
try:
os.unlink(marker)
except OSError:
pass
report._qd_suppress = True
return True


import importlib


@pytest.fixture
Expand Down
Loading
Loading