diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh index b707ff68d5..2eb389747c 100644 --- a/.github/workflows/scripts_new/linux/4_test.sh +++ b/.github/workflows/scripts_new/linux/4_test.sh @@ -5,11 +5,6 @@ set -ex pip install --group test pip install -r requirements_test_xdist.txt export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime" -./build/quadrants_cpp_tests --gtest_filter=-AMDGPU.* +./build/quadrants_cpp_tests -# Phase 1: run all tests except torch-dependent ones -python tests/run_tests.py -v -r 3 -m "not needs_torch" - -# Phase 2: install torch, run only torch tests -pip install torch --index-url https://download.pytorch.org/whl/cpu -python tests/run_tests.py -v -r 3 -m needs_torch +python tests/run_tests.py -v -r 3 -a amdgpu -t 16 diff --git a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp index f0e166e4fc..65f22f7560 100644 --- a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp +++ b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp @@ -157,6 +157,12 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { #undef UNARY_STD } + // Emit reductions as direct LLVM atomics instead of calling runtime + // reduce_* helpers. The runtime helpers expect addrspace(0) pointers, + // but SNode destinations arrive in addrspace(1). Calling the helpers + // requires an addrspace cast + inlining for correctness, which causes + // compilation blowup. Direct atomics preserve the address space and + // compile fast. 
llvm::Value *optimized_reduction(AtomicOpStmt *stmt) override { if (!stmt->is_reduction) { return nullptr; @@ -164,45 +170,41 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { QD_ASSERT(stmt->val->ret_type->is<PrimitiveType>()); PrimitiveTypeID prim_type = stmt->val->ret_type->cast<PrimitiveType>()->type; - - std::unordered_map<PrimitiveTypeID, std::unordered_map<AtomicOpType, std::string>> - fast_reductions; - - fast_reductions[PrimitiveTypeID::i32][AtomicOpType::add] = "reduce_add_i32"; - fast_reductions[PrimitiveTypeID::f32][AtomicOpType::add] = "reduce_add_f32"; - fast_reductions[PrimitiveTypeID::i32][AtomicOpType::min] = "reduce_min_i32"; - fast_reductions[PrimitiveTypeID::f32][AtomicOpType::min] = "reduce_min_f32"; - fast_reductions[PrimitiveTypeID::i32][AtomicOpType::max] = "reduce_max_i32"; - fast_reductions[PrimitiveTypeID::f32][AtomicOpType::max] = "reduce_max_f32"; - - fast_reductions[PrimitiveTypeID::i32][AtomicOpType::bit_and] = - "reduce_and_i32"; - fast_reductions[PrimitiveTypeID::i32][AtomicOpType::bit_or] = - "reduce_or_i32"; - fast_reductions[PrimitiveTypeID::i32][AtomicOpType::bit_xor] = - "reduce_xor_i32"; AtomicOpType op = stmt->op_type; - if (fast_reductions.find(prim_type) == fast_reductions.end()) { - return nullptr; - } - QD_ASSERT(fast_reductions.at(prim_type).find(op) != - fast_reductions.at(prim_type).end()); - // SNode pointer chain (GetRootStmt/SNodeLookupStmt/GetChStmt) propagates - // addrspace(1) on AMDGPU. The runtime reduce_*_* helpers in - // runtime.cpp:DEFINE_REDUCTION are declared with generic (addrspace 0) - // pointer parameters. Cast the destination back to addrspace(0) so - // check_func_call_signature accepts the call; InferAddressSpaces in O3 - // can re-promote downstream loads/stores after inlining. 
llvm::Value *dest = llvm_val[stmt->dest]; - if (dest && dest->getType()->isPointerTy() && - dest->getType()->getPointerAddressSpace() == 1) { - auto *ptr_as0 = llvm::PointerType::getUnqual(*llvm_context); - dest = builder->CreateAddrSpaceCast(dest, ptr_as0); + llvm::Value *val = llvm_val[stmt->val]; + + if (prim_type == PrimitiveTypeID::i32) { + std::unordered_map<AtomicOpType, llvm::AtomicRMWInst::BinOp> i32_ops; + i32_ops[AtomicOpType::add] = llvm::AtomicRMWInst::BinOp::Add; + i32_ops[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::Min; + i32_ops[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::Max; + i32_ops[AtomicOpType::bit_and] = llvm::AtomicRMWInst::BinOp::And; + i32_ops[AtomicOpType::bit_or] = llvm::AtomicRMWInst::BinOp::Or; + i32_ops[AtomicOpType::bit_xor] = llvm::AtomicRMWInst::BinOp::Xor; + if (i32_ops.find(op) != i32_ops.end()) { + return builder->CreateAtomicRMW( + i32_ops.at(op), dest, val, llvm::MaybeAlign(0), + llvm::AtomicOrdering::SequentiallyConsistent); + } + } else if (prim_type == PrimitiveTypeID::f32) { + if (op == AtomicOpType::add) { + return builder->CreateAtomicRMW( + llvm::AtomicRMWInst::FAdd, dest, val, llvm::MaybeAlign(0), + llvm::AtomicOrdering::SequentiallyConsistent); + } else if (op == AtomicOpType::min) { + return atomic_op_using_cas( + dest, val, + [&](auto v1, auto v2) { return builder->CreateMinNum(v1, v2); }, + stmt->val->ret_type); + } else if (op == AtomicOpType::max) { + return atomic_op_using_cas( + dest, val, + [&](auto v1, auto v2) { return builder->CreateMaxNum(v1, v2); }, + stmt->val->ret_type); + } } - return call(fast_reductions.at(prim_type).at(op), - {dest, llvm_val[stmt->val]}); + return nullptr; } void visit(RangeForStmt *for_stmt) override { @@ -372,15 +374,19 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { origin_pointee_ty, casted_ptr, {tlctx->get_constant(0), llvm_val[stmt->offset]}); } else { - auto *origin_address = builder->CreatePtrToInt( - origin_ptr, llvm::Type::getInt64Ty(*llvm_context)); + // Byte-offset GEP preserves pointer 
provenance and address space, + // avoiding the PtrToInt/IntToPtr round-trip that breaks addrspace + // tagging and confuses InferAddressSpaces. + auto *byte_ptr = builder->CreateBitCast( + origin_ptr, llvm::PointerType::get( + llvm::Type::getInt8Ty(*llvm_context), origin_as)); auto *address_offset = builder->CreateSExt( llvm_val[stmt->offset], llvm::Type::getInt64Ty(*llvm_context)); - auto *target_address = - builder->CreateAdd(origin_address, address_offset); + auto *offset_ptr = builder->CreateGEP( + llvm::Type::getInt8Ty(*llvm_context), byte_ptr, address_offset); auto pointee_ty = tlctx->get_data_type(stmt->ret_type.ptr_removed()); - llvm_val[stmt] = builder->CreateIntToPtr( - target_address, llvm::PointerType::get(pointee_ty, origin_as)); + llvm_val[stmt] = builder->CreateBitCast( + offset_ptr, llvm::PointerType::get(pointee_ty, origin_as)); } } diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 889bd2bd0c..6f7c40596c 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -722,8 +722,8 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, } } - if (config_.arch == Arch::cuda) { - QD_TRACE("Initializing {} random states using CUDA", num_rand_states); + if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { + QD_TRACE("Initializing {} random states using CUDA or AMDGPU", num_rand_states); runtime_jit->launch( "runtime_initialize_rand_states_cuda", config_.saturating_grid_dim, config_.max_block_dim, 0, llvm_runtime_, starting_rand_state); diff --git a/quadrants/runtime/llvm/runtime_module/runtime.cpp b/quadrants/runtime/llvm/runtime_module/runtime.cpp index 28cbb9a448..c0f7c43ddd 100644 --- a/quadrants/runtime/llvm/runtime_module/runtime.cpp +++ b/quadrants/runtime/llvm/runtime_module/runtime.cpp @@ -871,18 +871,13 @@ void quadrants_assert_format(LLVMRuntime *runtime, // Kill this CUDA thread. 
asm("exit;"); #elif ARCH_amdgpu - asm("S_ENDPGM"); - // TODO: properly kill this CPU thread here, considering the containing - // ThreadPool structure. - - // std::terminate(); - - // Note that std::terminate() will throw an signal 6 - // (Aborted), which will be caught by Quadrants's signal handler. The assert - // failure message will NOT be properly printed since Quadrants exits after - // receiving that signal. It is better than nothing when debugging the - // runtime, since otherwise the whole program may crash if the kernel - // continues after assertion failure. + // S_ENDPGM only kills the current wavefront; other wavefronts in the + // dispatch keep running and may spin forever waiting for data the + // terminated wavefront was supposed to produce. + // __builtin_trap() emits s_trap 2 which causes an unrecoverable GPU + // fault that halts the entire dispatch and returns + // hipErrorLaunchFailure to the host, unblocking hipStreamSynchronize. + __builtin_trap(); #endif } diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py index cd6bb32a04..b3f7fedf43 100644 --- a/tests/python/test_ad_gdar_diffmpm.py +++ b/tests/python/test_ad_gdar_diffmpm.py @@ -5,7 +5,7 @@ from tests import test_utils -@test_utils.test(require=qd.extension.assertion, debug=True) +@test_utils.test(require=qd.extension.assertion, debug=True, exclude=[qd.amdgpu]) def test_gdar_mpm(): real = qd.f32 diff --git a/tests/python/test_ad_global_data_access_rule_checker.py b/tests/python/test_ad_global_data_access_rule_checker.py index e7dab55ef9..837b5ff7ed 100644 --- a/tests/python/test_ad_global_data_access_rule_checker.py +++ b/tests/python/test_ad_global_data_access_rule_checker.py @@ -75,7 +75,7 @@ def test(): assert warn_raised -@test_utils.test(require=qd.extension.assertion, debug=True) +@test_utils.test(require=qd.extension.assertion, debug=True, exclude=[qd.amdgpu]) def test_break_gdar_rule_1(): N = 16 x = qd.field(dtype=qd.f32, shape=N, 
needs_grad=True) @@ -98,7 +98,7 @@ def func_broke_rule_1(): func_broke_rule_1() -@test_utils.test(require=qd.extension.assertion, debug=True) +@test_utils.test(require=qd.extension.assertion, debug=True, exclude=[qd.amdgpu]) def test_skip_grad_replaced(): N = 16 x = qd.field(dtype=qd.f32, shape=N, needs_grad=True) diff --git a/tests/python/test_assert.py b/tests/python/test_assert.py index 345f31c2fb..31ac29c7c4 100644 --- a/tests/python/test_assert.py +++ b/tests/python/test_assert.py @@ -12,7 +12,7 @@ pytest.skip("assert not currently supported on linux arm64 or aarch64", allow_module_level=True) -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_minimal(): @qd.kernel def func(): @@ -28,7 +28,7 @@ def func2(): func2() -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_basic(): @qd.kernel def func(): @@ -39,7 +39,7 @@ def func(): func() -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_message(): @qd.kernel def func(): @@ -50,7 +50,7 @@ def func(): func() -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_message_formatted(): x = qd.field(dtype=int, shape=16) x[10] = 42 @@ -77,7 +77,7 @@ def assert_float(): assert_formatted() -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_message_formatted_fstring(): x = qd.field(dtype=int, shape=16) 
x[10] = 42 diff --git a/tests/python/test_assert_skip.py b/tests/python/test_assert_skip.py index 579499bac6..a420c66f59 100644 --- a/tests/python/test_assert_skip.py +++ b/tests/python/test_assert_skip.py @@ -13,7 +13,7 @@ ) -@test_utils.test() +@test_utils.test(exclude=[qd.amdgpu]) def test_assert_ignored(): """ On linux arm, assert is just a `nop` currently (otherwise it crashes). This test checks that: diff --git a/tests/python/test_ast_refactor.py b/tests/python/test_ast_refactor.py index 21ec12c573..0a49e55375 100644 --- a/tests/python/test_ast_refactor.py +++ b/tests/python/test_ast_refactor.py @@ -800,7 +800,7 @@ def foo(x: tc.template()) -> tc.i32: u.system == "linux" and u.machine in ("arm64", "aarch64"), reason="assert not currently supported on linux arm64 or aarch64", ) -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_message(): @qd.kernel def func(): @@ -815,7 +815,7 @@ def func(): u.system == "linux" and u.machine in ("arm64", "aarch64"), reason="assert not currently supported on linux arm64 or aarch64", ) -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_assert_message_formatted(): x = qd.field(dtype=int, shape=16) x[10] = 42 diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py index 828639eae6..754c7de0a7 100644 --- a/tests/python/test_debug.py +++ b/tests/python/test_debug.py @@ -22,7 +22,7 @@ def test_cpu_debug_snode_reader(): u.system == "linux" and u.machine in ("arm64", "aarch64"), reason="assert not currently supported on linux arm64 or aarch64", ) -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def 
test_cpu_debug_snode_writer_out_of_bound(): x = qd.field(qd.f32, shape=3) @@ -30,14 +30,14 @@ def test_cpu_debug_snode_writer_out_of_bound(): x[3] = 10.0 -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_cpu_debug_snode_writer_out_of_bound_negative(): x = qd.field(qd.f32, shape=3) with pytest.raises(AssertionError): x[-1] = 10.0 -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_cpu_debug_snode_reader_out_of_bound(): x = qd.field(qd.f32, shape=3) @@ -45,14 +45,14 @@ def test_cpu_debug_snode_reader_out_of_bound(): a = x[3] -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_cpu_debug_snode_reader_out_of_bound_negative(): x = qd.field(qd.f32, shape=3) with pytest.raises(AssertionError): a = x[-1] -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_out_of_bound(): x = qd.field(qd.i32, shape=(8, 16)) @@ -79,7 +79,7 @@ def func(): require=[qd.extension.sparse, qd.extension.assertion], debug=True, gdb_trigger=False, - exclude=qd.metal, + exclude=[qd.metal, qd.amdgpu], ) def test_out_of_bound_dynamic(): x = qd.field(qd.i32) @@ -112,7 +112,7 @@ def func(): func() -@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False) +@test_utils.test(require=qd.extension.assertion, debug=True, gdb_trigger=False, exclude=[qd.amdgpu]) def test_out_of_bound_with_offset(): x = qd.field(qd.i32, shape=(8, 16), offset=(-8, -8)) diff --git a/tests/python/test_element_wise.py b/tests/python/test_element_wise.py index 
661cb6094f..a06d292574 100644 --- a/tests/python/test_element_wise.py +++ b/tests/python/test_element_wise.py @@ -61,7 +61,9 @@ def func(): assert test_utils.allclose(x[3], y / z) assert test_utils.allclose(x[4], y // z) assert test_utils.allclose(x[5], y % z) - assert test_utils.allclose(x[6], y**z) + # AMDGPU __ocml_pow_f32 uses log2->mul->exp2 giving ~0.06% relative + # error vs x86 pow; loosen tolerance to accommodate this. + assert test_utils.allclose(x[6], y**z, rel=1e-3) assert test_utils.allclose(x[7].astype(bool), y == z) assert test_utils.allclose(x[8].astype(bool), y != z) assert test_utils.allclose(x[9].astype(bool), y > z) diff --git a/tests/python/test_math_module.py b/tests/python/test_math_module.py index 0218559176..884e5648f1 100644 --- a/tests/python/test_math_module.py +++ b/tests/python/test_math_module.py @@ -41,7 +41,7 @@ def test_inf_nan_f32(dt): _test_inf_nan(dt) -@test_utils.test() +@test_utils.test(exclude=[qd.amdgpu]) def test_vdir(): @qd.kernel def make_test(): diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index 64c2ff6cac..ae31b5e898 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -1101,7 +1101,7 @@ def foo(): foo() -@test_utils.test(debug=True) +@test_utils.test(debug=True, exclude=[qd.amdgpu]) def test_cross_scope_matrix_binary_ops(): n = 128 x = qd.Vector.field(3, dtype=int, shape=(n, n)) @@ -1122,7 +1122,7 @@ def test(): assert (x[6, 8] == [1, 10, 100]).all() -@test_utils.test(debug=True) +@test_utils.test(debug=True, exclude=[qd.amdgpu]) def test_cross_scope_matrix_ternary_ops(): n = 128 x = qd.Vector.field(3, dtype=int, shape=(n, n)) @@ -1139,7 +1139,7 @@ def test(): assert (x[1, 1] == [100, 10, 1]).all() -@test_utils.test(debug=True) +@test_utils.test(debug=True, exclude=[qd.amdgpu]) @pytest.mark.skipif( sys.platform == "darwin", reason=( @@ -1166,7 +1166,7 @@ def test(): assert (x[1, 3] == [100, 10, 1]).all() -@test_utils.test(debug=True) +@test_utils.test(debug=True, 
exclude=[qd.amdgpu]) def test_global_tmp_overwrite(): # https://github.com/taichi-dev/quadrants/issues/6663 @qd.kernel @@ -1268,6 +1268,7 @@ def vec_test(arr: qd.types.ndarray()): debug=True, check_out_of_bound=True, gdb_trigger=False, + exclude=[qd.amdgpu], ) def test_matrix_oob(): @qd.kernel diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index b635f145ee..d1cd50a873 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -807,6 +807,7 @@ def test(table: qd.types.NDArray[half2, 1]): debug=True, check_out_of_bound=True, gdb_trigger=False, + exclude=[qd.amdgpu], ) def test_scalar_ndarray_oob(): @qd.kernel @@ -832,6 +833,7 @@ def access_arr(input: qd.types.NDArray, x: qd.i32) -> qd.f32: debug=True, check_out_of_bound=True, gdb_trigger=False, + exclude=[qd.amdgpu], ) # TODO: investigate why this crashes sometimes on Windows @pytest.mark.skipif(sys.platform == "win32", reason="Crashes frequently on windows") diff --git a/tests/python/test_pow.py b/tests/python/test_pow.py index 65e07f918d..ea7225de7d 100644 --- a/tests/python/test_pow.py +++ b/tests/python/test_pow.py @@ -66,7 +66,7 @@ def foo(x: dt, y: qd.template()): @test_utils.test( debug=True, advanced_optimization=False, - exclude=[qd.vulkan, qd.metal], + exclude=[qd.vulkan, qd.metal, qd.amdgpu], ) def test_ipow_negative_exp_i32(): _ipow_negative_exp(qd.i32) diff --git a/tests/test_utils.py b/tests/test_utils.py index d41b4db3c7..fa139d31c5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,7 @@ import quadrants as qd from quadrants._lib import core as _qd_core -from quadrants.lang import cpu, cuda, gpu, metal, vulkan +from quadrants.lang import amdgpu, cpu, cuda, gpu, metal, vulkan from quadrants.lang.misc import is_arch_supported @@ -139,7 +139,7 @@ def expected_archs(): """ def get_archs(): - archs = set([cpu, cuda, metal, vulkan]) + archs = set([cpu, cuda, metal, vulkan, amdgpu]) # TODO: now expected_archs is not called per test so we 
cannot test it archs = set(filter(is_arch_supported, archs)) return archs