diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py
index 4f62f1a8da26..372281dbabf1 100644
--- a/python/tvm/contrib/rocm.py
+++ b/python/tvm/contrib/rocm.py
@@ -141,9 +141,7 @@ def callback_rocm_bitcode_path(rocdl_dir=None):
     bitcode_names = [
         "oclc_daz_opt_on",
         "ocml",
-        "hc",
         "irif",  # this does not exist in rocm 3.9, drop eventually
-        "ockl",
         "oclc_correctly_rounded_sqrt_off",
         "oclc_correctly_rounded_sqrt_on",
         "oclc_daz_opt_off",
@@ -152,9 +150,11 @@ def callback_rocm_bitcode_path(rocdl_dir=None):
         "oclc_isa_version_803",  # todo (t-vi): an alternative might be to scan for the
         "oclc_isa_version_900",  # isa version files (if the linker throws out
         "oclc_isa_version_906",  # the unneeded ones or we filter for the arch we need)
+        "oclc_isa_version_1030",
         "oclc_unsafe_math_off",
         "oclc_unsafe_math_on",
         "oclc_wavefrontsize64_on",
+        "oclc_abi_version_500",
     ]

     bitcode_files = []
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index cc438092666a..fa295c93a19f 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -918,13 +918,16 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
             name="dense_int8.cuda",
         )
     else:
-        strategy.add_implementation(
-            wrap_compute_dense(topi.gpu.dense_small_batch),
-            wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
-            name="dense_small_batch.gpu",
-        )
+        # Some AMDGPU cards have accuracy issues with this schedule
+        # See https://github.com/apache/tvm/issues/13666
+        if target.kind.name != "rocm":
+            strategy.add_implementation(
+                wrap_compute_dense(topi.gpu.dense_small_batch),
+                wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
+                name="dense_small_batch.gpu",
+            )

-        with SpecializedCondition(b >= 32):
+        with SpecializedCondition(target.kind.name == "rocm" or b >= 32):
             strategy.add_implementation(
                 wrap_compute_dense(topi.gpu.dense_large_batch),
                 wrap_topi_schedule(topi.gpu.schedule_dense_large_batch),
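
Note on the cuda.py hunk: after this change, rocm targets never register dense_small_batch.gpu, and the condition guarding dense_large_batch.gpu is always satisfied for them, so dense on ROCm falls through to the large-batch schedule regardless of batch size. The snippet below is a minimal, self-contained sketch of that selection logic; it does not call TVM, and select_dense_schedules is a hypothetical name used only for illustration.

# Hypothetical illustration of the post-patch selection logic in
# dense_strategy_cuda (non-int8 path). Not TVM code; it only mirrors
# the two conditions shown in the hunk above.
def select_dense_schedules(target_kind, batch):
    names = []
    # dense_small_batch.gpu is skipped on rocm because some AMDGPU cards
    # give wrong results with it (https://github.com/apache/tvm/issues/13666).
    if target_kind != "rocm":
        names.append("dense_small_batch.gpu")
    # dense_large_batch.gpu is guarded by
    # SpecializedCondition(target.kind.name == "rocm" or b >= 32),
    # i.e. always applicable on rocm, and only for batch >= 32 elsewhere.
    if target_kind == "rocm" or batch >= 32:
        names.append("dense_large_batch.gpu")
    return names

assert select_dense_schedules("rocm", 1) == ["dense_large_batch.gpu"]
assert select_dense_schedules("cuda", 1) == ["dense_small_batch.gpu"]
assert select_dense_schedules("cuda", 64) == [
    "dense_small_batch.gpu",
    "dense_large_batch.gpu",
]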