From cca0a2a2aec2df23aff1016828d5f0bece39e7ad Mon Sep 17 00:00:00 2001 From: Alexis Duburcq Date: Tue, 21 Apr 2026 00:07:43 +0200 Subject: [PATCH] [AutoDiff] Surface LLVM adstack push/pop overflow as a Python exception --- quadrants/codegen/llvm/codegen_llvm.cpp | 2 +- quadrants/program/program.cpp | 5 + quadrants/program/program_impl.h | 6 + .../runtime/llvm/llvm_runtime_executor.cpp | 19 +++ .../runtime/llvm/llvm_runtime_executor.h | 11 ++ .../llvm/runtime_module/internal_functions.h | 51 +++++- .../runtime/llvm/runtime_module/runtime.cpp | 43 ++++- .../runtime/program_impls/llvm/llvm_program.h | 19 +++ tests/python/test_adstack.py | 158 ++++++++++++++++++ 9 files changed, 304 insertions(+), 10 deletions(-) diff --git a/quadrants/codegen/llvm/codegen_llvm.cpp b/quadrants/codegen/llvm/codegen_llvm.cpp index 1905f33531..35156d8b6a 100644 --- a/quadrants/codegen/llvm/codegen_llvm.cpp +++ b/quadrants/codegen/llvm/codegen_llvm.cpp @@ -2117,7 +2117,7 @@ void TaskCodeGenLLVM::visit(AdStackPopStmt *stmt) { void TaskCodeGenLLVM::visit(AdStackPushStmt *stmt) { auto stack = stmt->stack->as<AdStackAllocaStmt>(); - call("stack_push", llvm_val[stack], tlctx->get_constant(stack->max_size), + call("stack_push", get_runtime(), llvm_val[stack], tlctx->get_constant(stack->max_size), tlctx->get_constant(stack->element_size_in_bytes())); auto primal_ptr = call("stack_top_primal", llvm_val[stack], tlctx->get_constant(stack->element_size_in_bytes())); primal_ptr = builder->CreateBitCast(primal_ptr, llvm::PointerType::get(tlctx->get_data_type(stmt->ret_type), 0)); diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 64c8a08067..06ec1a1d8e 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -307,6 +307,11 @@ void Program::finalize() { return; } + // Notify the backend that teardown has started before the two teardown syncs below. 
On LLVM this flips + // `LlvmProgramImpl::finalizing_` so `check_adstack_overflow()` short-circuits: otherwise a pending overflow + // flag from a kernel the user never synced explicitly would throw into the Program destructor path. + program_impl_->pre_finalize(); + synchronize(); QD_TRACE("Program finalizing..."); diff --git a/quadrants/program/program_impl.h b/quadrants/program/program_impl.h index 8a43c62eb7..bafbcfdc1d 100644 --- a/quadrants/program/program_impl.h +++ b/quadrants/program/program_impl.h @@ -128,6 +128,12 @@ class ProgramImpl { virtual void finalize() { } + // Hook invoked by `Program::finalize()` before any teardown sync. Lets backends flip state (e.g. the LLVM + // `finalizing_` flag used to suppress adstack-overflow polling) so the two `Program::synchronize()` calls that + // precede `finalize()` do not throw into the Program destructor path. + virtual void pre_finalize() { + } + virtual uint64 fetch_result_uint64(int i, uint64 *result_buffer) { return result_buffer[i]; } diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 3ce56b0b78..f794db4a54 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -238,6 +238,24 @@ std::size_t LlvmRuntimeExecutor::get_snode_num_dynamically_allocated(SNode *snod return (std::size_t)runtime_query("ListManager_get_num_elements", result_buffer, data_list); } +void LlvmRuntimeExecutor::check_adstack_overflow() { + // Called from `synchronize()` on every sync so adstack overflow surfaces as a Python exception regardless of + // `compile_config.debug`. The runtime / result buffer may not exist yet (e.g. a C++ test that constructs Program + // without materializing the runtime and then triggers Program::finalize -> synchronize), so no-op in that case. 
+ if (llvm_runtime_ == nullptr || result_buffer_cache_ == nullptr) { + return; + } + auto *runtime_jit_module = get_runtime_jit_module(); + runtime_jit_module->call("runtime_retrieve_and_reset_adstack_overflow", llvm_runtime_); + auto flag = fetch_result<int64>(quadrants_result_buffer_error_id, result_buffer_cache_); + if (flag != 0) { + throw QuadrantsAssertionError( + "Adstack overflow: a reverse-mode autodiff kernel pushed more elements than the adstack capacity " + "allows. Raised at the next qd.sync() rather than at the offending kernel launch. Pass " + "ad_stack_size=N to qd.init() to raise the capacity."); + } +} + void LlvmRuntimeExecutor::check_runtime_error(uint64 *result_buffer) { synchronize(); auto *runtime_jit_module = get_runtime_jit_module(); @@ -617,6 +635,7 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, uint QD_TRACE("LLVMRuntime initialized (excluding `root`)"); llvm_runtime_ = fetch_result<void *>(quadrants_result_buffer_ret_value_id, *result_buffer_ptr); + result_buffer_cache_ = *result_buffer_ptr; QD_TRACE("LLVMRuntime pointer fetched"); // Preallocate for runtime memory and update to LLVMRuntime diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.h b/quadrants/runtime/llvm/llvm_runtime_executor.h index d0732434e3..9ace49b1fd 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.h +++ b/quadrants/runtime/llvm/llvm_runtime_executor.h @@ -53,6 +53,12 @@ class LlvmRuntimeExecutor { void check_runtime_error(uint64 *result_buffer); + // Poll the runtime's adstack-overflow flag and raise if set. Unlike check_runtime_error, this runs + // unconditionally at every synchronize() (not gated on `compile_config.debug`) because adstack overflow silently + // corrupts gradients and we do not want to hide it. Safe to call before materialize_runtime() -- no-op when the + // cached result buffer is not yet populated. 
+ void check_adstack_overflow(); + uint64_t *get_device_alloc_info_ptr(const DeviceAllocation &alloc); const CompileConfig &get_config() const { @@ -132,6 +138,11 @@ class LlvmRuntimeExecutor { std::unique_ptr<JITSession> jit_session_{nullptr}; JITModule *runtime_jit_module_{nullptr}; void *llvm_runtime_{nullptr}; + // Non-owning cache of the Program-owned result buffer so internal polls (adstack overflow, etc.) can be + // invoked from `synchronize()` without threading the pointer through the public API. Ownership stays with + // `Program` for its lifetime; reallocating or repointing `Program::result_buffer` mid-run would invalidate + // this cache, so avoid that. + uint64 *result_buffer_cache_{nullptr}; std::unique_ptr<ThreadPool> thread_pool_{nullptr}; std::shared_ptr<Device> device_{nullptr};
+ QD_TEST_CHECK(stack_top_primal(stack, 4) == stack + sizeof(u64) + 3 * 2 * 4, runtime); + + stack_pop(stack); + stack_pop(stack); + stack_pop(stack); + stack_pop(stack); + QD_TEST_CHECK(*(u64 *)stack == 0, runtime); + + // stack_pop underflow guard: extra pops past n == 0 must not wrap `n` into UINT_MAX. The runtime silently + // clamps at 0 instead of trapping, so the reverse pass can over-pop without corrupting subsequent kernels. + stack_pop(stack); + stack_pop(stack); + QD_TEST_CHECK(*(u64 *)stack == 0, runtime); + + // stack_top_primal clamping: on an empty stack the top-of-stack pointer must index slot 0 (not `-1` + // * 2 * element_size, which would point into header territory and crash on read). + QD_TEST_CHECK(stack_top_primal(stack, 4) == stack + sizeof(u64), runtime); + + // Push past capacity: `n` stops at max_num_elements and `adstack_overflow_flag` flips to 1. + for (int i = 0; i < 16; i++) { + stack_push(runtime, stack, 16, 4); + } + QD_TEST_CHECK(*(u64 *)stack == 16, runtime); + QD_TEST_CHECK(runtime->adstack_overflow_flag == 0, runtime); + stack_push(runtime, stack, 16, 4); // overflow push + QD_TEST_CHECK(*(u64 *)stack == 16, runtime); + QD_TEST_CHECK(runtime->adstack_overflow_flag == 1, runtime); + // Reset the flag so subsequent tests in the same fixture are not poisoned. + runtime->adstack_overflow_flag = 0; + + delete[] stack; return 0; } diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index 7459f17f8c..41ee960dd4 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -577,6 +577,10 @@ struct LLVMRuntime { uint64 error_message_arguments[quadrants_error_message_max_num_arguments]; i32 error_message_lock = 0; i64 error_code = 0; + // Dedicated flag for adstack-overflow-specific errors. 
Separate from `error_code` so assertions (which set + // error_code=1 and are only surfaced when `compile_config.debug` is on) do not leak through the always-on poll + // that Program::synchronize runs. + i64 adstack_overflow_flag = 0; Ptr result_buffer; i32 allocator_lock; @@ -709,6 +713,14 @@ void runtime_retrieve_and_reset_error_code(LLVMRuntime *runtime) { runtime->error_code = 0; } +void runtime_retrieve_and_reset_adstack_overflow(LLVMRuntime *runtime) { + // Paired with the relaxed atomic write in `stack_push`. The host calls this only after the thread pool has + // joined, so strictly no synchronization is required here, but use `__atomic_exchange_n` anyway to keep the + // read/reset symmetric with the write and to avoid annotating the single shared field as half-atomic. + i64 flag = __atomic_exchange_n(&runtime->adstack_overflow_flag, (i64)0, __ATOMIC_RELAXED); + runtime->set_result(quadrants_result_buffer_error_id, flag); +} + void runtime_retrieve_error_message(LLVMRuntime *runtime, int i) { runtime->set_result(quadrants_result_buffer_error_id, runtime->error_message_template[i]); } @@ -1953,9 +1965,17 @@ void quadrants_printf(LLVMRuntime *runtime, const char *format, Args &&...args) extern "C" { // local stack operations +// The stack index `n` is clamped on read so that overflow (push past capacity) does not let subsequent pops and +// top-accesses underflow it and index far out of bounds. The corresponding stack_push sets +// `runtime->adstack_overflow_flag` and skips the increment instead of trapping, so the host-side launcher +// surfaces the failure as a Python exception rather than killing the process via __builtin_trap. When n == 0 +// (pop-after-overflow underflow path) we return a pointer to slot 0 - an uninitialized-but-in-bounds slot. The +// caller will read garbage from it, but the host raises on `runtime->adstack_overflow_flag` before any such +// value reaches user code. 
Ptr stack_top_primal(Ptr stack, std::size_t element_size) { auto n = *(u64 *)stack; - return stack + sizeof(u64) + (n - 1) * 2 * element_size; + std::size_t idx = n > 0 ? n - 1 : 0; + return stack + sizeof(u64) + idx * 2 * element_size; } Ptr stack_top_adjoint(Ptr stack, std::size_t element_size) { @@ -1968,13 +1988,28 @@ void stack_init(Ptr stack) { void stack_pop(Ptr stack) { auto &n = *(u64 *)stack; - n--; + if (n > 0) { + n--; + } } -void stack_push(Ptr stack, size_t max_num_elements, std::size_t element_size) { +void stack_push(LLVMRuntime *runtime, Ptr stack, size_t max_num_elements, std::size_t element_size) { u64 &n = *(u64 *)stack; + if (n + 1 > max_num_elements) { + // Overflow: the loop has more iterations than the adstack capacity. Skip the push and flip the dedicated + // overflow flag so the host launcher throws at sync. Multiple CPU threads can hit this branch concurrently + // (thread pool dispatch over a multi-element field), so write the sentinel through `__atomic_store_n` with + // relaxed ordering: on x86-64/ARM64 this compiles to a regular naturally-aligned store, but it satisfies the + // C++11 memory model (plain non-atomic writes from multiple threads to the same object are a data race, even + // when every writer stores the same value). The host only reads the flag from `check_adstack_overflow()` + // after the thread pool has joined, so no ordering beyond "happens eventually" is required. + // `locked_task` was avoided because the AMDGPU JIT cannot retarget its host-side machinery + // (`hipErrorNoBinaryForGpu`). Using a separate field (not `error_code`) keeps this check distinct from + // assertion machinery, which is debug-gated. 
+ __atomic_store_n(&runtime->adstack_overflow_flag, (i64)1, __ATOMIC_RELAXED); + return; + } n += 1; - // TODO: assert n <= max_elements std::memset(stack_top_primal(stack, element_size), 0, element_size * 2); } diff --git a/quadrants/runtime/program_impls/llvm/llvm_program.h b/quadrants/runtime/program_impls/llvm/llvm_program.h index 25f7722856..a7ff25fed2 100644 --- a/quadrants/runtime/program_impls/llvm/llvm_program.h +++ b/quadrants/runtime/program_impls/llvm/llvm_program.h @@ -99,7 +99,17 @@ class LlvmProgramImpl : public ProgramImpl { return runtime_exec_->fetch_result(i, result_buffer); } + // Skip the adstack-overflow poll from this point on: `Program::finalize()` invokes `pre_finalize()` before the + // two teardown `synchronize()` calls, and we do not want `check_adstack_overflow()` to throw into a + // `~Program()` unwinding path - that would terminate the process with a bare `QuadrantsAssertionError` instead + // of letting the user handle it at their own `qd.sync()` site. The flag only affects the internal poll; the + // user can still call `qd.sync()` explicitly before finalize to observe the raise. + void pre_finalize() override { + finalizing_ = true; + } + void finalize() override { + finalizing_ = true; runtime_exec_->finalize(); } @@ -150,6 +160,9 @@ class LlvmProgramImpl : public ProgramImpl { void synchronize() override { runtime_exec_->synchronize(); + if (!finalizing_) { + runtime_exec_->check_adstack_overflow(); + } } LLVMRuntime *get_llvm_runtime() { @@ -250,6 +263,12 @@ class LlvmProgramImpl : public ProgramImpl { std::size_t num_snode_trees_processed_{0}; std::unique_ptr runtime_exec_; std::unique_ptr cache_data_; + // Flipped on by `pre_finalize()` (with a defensive re-assignment in `finalize()`) so the `synchronize()` + // override stops polling the adstack-overflow flag during teardown. 
`Program::finalize()` invokes + // `pre_finalize()` before its two teardown syncs, so the flag is already true when those syncs run; moving + // the assignment back into `finalize()` alone would silently re-introduce the `std::terminate()` teardown bug + // this field was introduced to fix. + bool finalizing_{false}; }; LlvmProgramImpl *get_llvm_program(Program *prog); diff --git a/tests/python/test_adstack.py b/tests/python/test_adstack.py index 13c56db3df..fa0a8f193e 100644 --- a/tests/python/test_adstack.py +++ b/tests/python/test_adstack.py @@ -1,10 +1,14 @@ import math import pathlib import re +import subprocess +import sys +import textwrap import pytest import quadrants as qd +from quadrants.lang.misc import is_extension_supported from tests import test_utils @@ -427,6 +431,160 @@ def compute(): compute.grad() +def _overflowing_compute(n_elements=1, n_iter=64): + # Shared kernel for the overflow tests. Builds `compute`, loads inputs, seeds the output gradient, and returns + # `(compute, x, y)` so each test can drive the grad launch and read back assertions itself. `n_iter=64` + 2 + # adstack preamble pushes = 66 pushes, comfortably above `default_ad_stack_size=32`; `n_elements` controls how + # many threads run the overflowing loop in parallel. 
+ x = qd.field(qd.f32) + y = qd.field(qd.f32) + qd.root.dense(qd.i, n_elements).place(x, x.grad) + qd.root.place(y, y.grad) + + @qd.kernel + def compute(): + for i in x: + v = x[i] + for _ in range(n_iter): + y[None] += qd.sin(v) + v = v + 1.0 + + for i in range(n_elements): + x[i] = 0.1 + 0.01 * i + y[None] = 0.0 + compute() + y.grad[None] = 1.0 + for i in range(n_elements): + x.grad[i] = 0.0 + return compute, x, y + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=32) +def test_adstack_overflow_raises(): + # Runs a backward pass with a for-loop longer than the adstack can hold, and asserts the overflow surfaces as a + # regular Python exception on the next `qd.sync()` - not a silent wrong gradient and not a process crash. This + # is what users see when their differentiable kernel is too deep for the current `ad_stack_size`, and the error + # message should tell them how to raise the capacity. + # + # Internal detail: both LLVM and SPIR-V defer the error to the next `qd.sync()` (same pattern as CUDA async + # errors) so we do not pay a sync-per-launch. LLVM polls `runtime->adstack_overflow_flag` from + # `LlvmProgramImpl::synchronize()` via `check_adstack_overflow()`; SPIR-V's gfx runtime raises via `QD_ERROR` + # on sync. The test launches the overflowing grad kernel and calls `qd.sync()` inside the same `pytest.raises` + # block so the deferred surfacing point is caught. + compute, _, _ = _overflowing_compute() + # On LLVM the runtime raises QuadrantsAssertionError (subclass of AssertionError) from + # check_adstack_overflow; on SPIR-V the gfx runtime raises RuntimeError via QD_ERROR. We accept either, + # matching only the message prefix. 
+ with pytest.raises((AssertionError, RuntimeError), match=r"[Aa]dstack overflow"): + compute.grad() + qd.sync() + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=32) +def test_adstack_overflow_flag_resets_after_catch(): + # Once `check_adstack_overflow()` raises, the runtime must clear its overflow flag so a subsequent `qd.sync()` + # (with no new overflowing grad launch in between) returns normally. Without the reset the user would see a + # stale overflow exception every time they sync after the first one, which makes diagnosis and recovery + # impossible. + compute, _, _ = _overflowing_compute() + with pytest.raises((AssertionError, RuntimeError), match=r"[Aa]dstack overflow"): + compute.grad() + qd.sync() + # No new grad launch here - the flag must already be back to zero. + qd.sync() + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=1024) +def test_adstack_large_capacity_resolves_overflow(): + # Same kernel shape as `test_adstack_overflow_raises`, but with `ad_stack_size=1024` explicitly passed to + # `qd.init()`. Asserts that raising the capacity (rather than shrinking the loop) is a valid workaround and + # that the backward pass runs to completion with a correct gradient. This is the remediation path the overflow + # error message points users at. + compute, x, _ = _overflowing_compute() + compute.grad() + qd.sync() + + # y += sin(v) iterated with v = x[0] + k for k = 0..63, so dy/dx[0] = sum_k cos(x[0] + k). + expected = sum(math.cos(0.1 + k) for k in range(64)) + assert x.grad[0] == test_utils.approx(expected, rel=1e-3) + + +@test_utils.test(require=qd.extension.adstack, ad_stack_size=32) +def test_adstack_overflow_multithreaded(): + # Multi-element field so several threads execute the overflowing grad body in parallel. Asserts the overflow + # still surfaces as a single Python exception rather than deadlocking, crashing, or racing on the flag. 
Every + # thread writes the same flag value (non-zero), so a race on the write is benign; this test pins that the + # read side is also safe (one raise per sync regardless of how many threads flipped the bit). + compute, _, _ = _overflowing_compute(n_elements=16) + with pytest.raises((AssertionError, RuntimeError), match=r"[Aa]dstack overflow"): + compute.grad() + qd.sync() + + +def test_adstack_overflow_during_teardown_does_not_abort(tmp_path): + # This test runs the kernel in a child process (not via `@test_utils.test`, which iterates arches), so it + # cannot rely on the decorator's `require=qd.extension.adstack` skip. Guard manually: skip if the CPU backend + # was not built with the adstack extension, matching what the sibling overflow tests get from the decorator. + if not is_extension_supported(qd.cpu, qd.extension.adstack): + pytest.skip("adstack extension not available on cpu") + + # If a user launches an overflowing grad kernel and never calls `qd.sync()` before the process exits, the + # adstack-overflow flag is still set when Python interpreter teardown invokes `Program::finalize()`. The two + # teardown syncs inside `Program::finalize()` must not re-raise a `QuadrantsAssertionError` into the + # destructor path - doing so would terminate the process with `std::terminate()` instead of returning a clean + # exit code. A subprocess runs the overflowing-grad kernel without calling `qd.sync()` at all and exits; this + # test asserts that the child returns with exit code 0 rather than SIGABRT (-6) or any other non-zero code. + # + # Internal details: `Program::finalize()` invokes `program_impl_->pre_finalize()` before the two teardown + # `synchronize()` calls. `LlvmProgramImpl::pre_finalize()` sets `finalizing_ = true` so + # `LlvmProgramImpl::synchronize()` short-circuits `check_adstack_overflow()`. Note the flag must be set + # *before* those syncs run - setting it only inside `LlvmProgramImpl::finalize()` (which is dispatched after + # them) is too late. 
The subprocess is launched from a temp file because `python -c ""` breaks + # Quadrants' kernel source-inspect (`getsourcelines` cannot find the source of an inlined `-c` string); the + # grad call is deliberately left unsynced so this is the teardown path, not the user-catch path. + child_script = textwrap.dedent( + """ + import quadrants as qd + + qd.init(arch=qd.cpu, ad_stack_experimental_enabled=True, ad_stack_size=32) + + x = qd.field(qd.f32) + y = qd.field(qd.f32) + qd.root.dense(qd.i, 1).place(x, x.grad) + qd.root.place(y, y.grad) + + @qd.kernel + def compute(): + for i in x: + v = x[i] + for _ in range(64): + y[None] += qd.sin(v) + v = v + 1.0 + + x[0] = 0.1 + y[None] = 0.0 + compute() + y.grad[None] = 1.0 + x.grad[0] = 0.0 + compute.grad() + # Intentionally no qd.sync() and no try/except here: the adstack-overflow flag is left set when the + # process exits, so teardown must swallow it via the `finalizing_` guard rather than re-raising. + """ + ) + script_path = tmp_path / "overflow_teardown_child.py" + script_path.write_text(child_script) + # No `timeout=` on subprocess.run: pytest's own per-test timeout (`--timeout=...` in CI and locally) already + # terminates the whole test if the child deadlocks. Adding a second timeout here would only duplicate that + # safety net with a different failure mode (TimeoutExpired vs pytest-timeout's clean teardown). + result = subprocess.run([sys.executable, str(script_path)], capture_output=True, check=False) + if result.returncode != 0: + raise AssertionError( + f"child exited with {result.returncode}\n" + f"stdout:\n{result.stdout.decode()}\n" + f"stderr:\n{result.stderr.decode()}" + ) + + def _run_sum_linear( qd_dtype, use_static_loop, use_varying_coeff, n_iter, rel_tol, approx=test_utils.approx, abs_tol=None ):