From dce3438877481c518f2b41c86af3adea3f6ca55f Mon Sep 17 00:00:00 2001 From: masa Date: Sun, 6 Dec 2020 23:48:56 +0900 Subject: [PATCH 01/19] support atomic add on llvm --- python/tvm/topi/cuda/nms.py | 8 ++++ src/target/llvm/codegen_llvm.cc | 14 ++++++ tests/python/relay/test_op_level5.py | 46 ++++++++++---------- tests/python/topi/python/test_topi_vision.py | 21 ++++----- 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 46d7f9800c43..e99265906d5e 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -40,6 +40,10 @@ def opencl_atomic_add_rule(op): return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) raise RuntimeError("only support int32") +def llvm_atomic_add_rule(op): + if op.dtype == "int32": + return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) + raise RuntimeError("only support int32") tvm.target.intrin.register_intrin_rule("cuda", "atomic_add", cuda_atomic_add_rule, override=True) @@ -47,6 +51,10 @@ def opencl_atomic_add_rule(op): "opencl", "atomic_add", opencl_atomic_add_rule, override=True ) +tvm.target.intrin.register_intrin_rule( + "nvptx", "atomic_add", llvm_atomic_add_rule, override=True +) + tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index d10ed311949c..50e68f8c6012 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -955,6 +955,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); + } else if (op->op.same_as(builtin_call_extern_)) { + auto func = Downcast(op->args[0]); + if (func->value == "atomic_add") { + LOG(FATAL) << "atomic add found " << op->op; + llvm::Value* v0 = MakeValue(op->args[1]); + llvm::Value* v1 = MakeValue(op->args[2]); + } } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr; @@ -1185,6 +1192,13 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) { if (auto* ptr_op = op->op.as()) { auto call_op = GetRef(ptr_op); if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) { + auto func = Downcast(op->args[0]); + if (func->value == "atomic_add") { + llvm::Value* v0 = MakeValue(op->args[1]); + llvm::Value* v1 = MakeValue(op->args[2]); + auto old_val = builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); + return old_val; + } // call extern intrinsic ICHECK_GE(op->args.size(), 1U); auto global_symbol = Downcast(op->args[0]); diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 9e9aaf842669..ee7acc619f9d 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -393,8 +393,6 @@ def verify_nms( intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if target == "nvptx": - continue op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) @@ -1196,26 +1194,26 @@ def verify_batch_to_space_nd(dshape, block_shape, crops): if __name__ == "__main__": - test_resize_infer_type() - test_resize() - test_resize3d_infer_type() - test_resize3d() - test_crop_and_resize() - test_multibox_prior() - test_multibox_transform_loc() - test_get_valid_counts() - test_roi_align() - test_roi_pool() - test_proposal() - test_yolo_reorg_infer_shape() - test_yolo_reorg() + # test_resize_infer_type() + # test_resize() + # test_resize3d_infer_type() + # test_resize3d() + # test_crop_and_resize() + # test_multibox_prior() + # test_multibox_transform_loc() + # test_get_valid_counts() + # test_roi_align() + # test_roi_pool() + # test_proposal() + # test_yolo_reorg_infer_shape() + # test_yolo_reorg() test_non_max_suppression() - test_deformable_conv2d() - test_depth_to_space() - test_space_to_depth() - test_dilation2d_infer_type() - test_dilation2d_run() - test_affine_grid() - test_grid_sample() - test_space_to_batch_nd() - test_batch_to_space_nd() + # test_deformable_conv2d() + # test_depth_to_space() + # test_space_to_depth() + # test_dilation2d_infer_type() + # test_dilation2d_run() + # test_affine_grid() + # test_grid_sample() + # test_space_to_batch_nd() + # test_batch_to_space_nd() diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 6d6353eebce6..3fb02075dbc7 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -202,15 +202,16 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - if device in ["llvm", "cuda"]: + if device in ["nvptx"]: f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) - f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + # f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + print(f.imported_modules[0].get_source("llvm")) else: f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device) f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) - tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + # tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) - for device in ["llvm", "cuda", "opencl"]: + for device in ["nvptx"]: check_device(device) @@ -630,10 +631,10 @@ def test_proposal(): if __name__ == "__main__": - test_get_valid_counts() - test_multibox_prior() - test_multibox_detection() - test_roi_align() - test_roi_pool() - test_proposal() + # test_get_valid_counts() + # test_multibox_prior() + # test_multibox_detection() + # test_roi_align() + # test_roi_pool() + # test_proposal() test_non_max_suppression() From b8ae806e59b7b507cbb248ba0390413e4838a125 Mon Sep 17 00:00:00 2001 From: masa Date: Mon, 7 Dec 2020 00:12:28 +0900 Subject: [PATCH 02/19] make atomic builtin intrin --- include/tvm/tir/builtin.h | 2 ++ python/tvm/topi/cuda/nms.py | 10 ---------- src/target/llvm/codegen_llvm.cc | 18 ++++-------------- src/tir/op/builtin.cc | 3 +++ 4 files changed, 9 insertions(+), 24 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index bea53136fd54..a5ac378be581 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -549,6 +549,8 @@ TVM_DLL const Op& vectorlow(); */ TVM_DLL const Op& vectorcombine(); +TVM_DLL const Op& atomic_add(); + /*! \brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { // array head address diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index e99265906d5e..d51eb5ce1d11 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -40,10 +40,6 @@ def opencl_atomic_add_rule(op): return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) raise RuntimeError("only support int32") -def llvm_atomic_add_rule(op): - if op.dtype == "int32": - return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) - raise RuntimeError("only support int32") tvm.target.intrin.register_intrin_rule("cuda", "atomic_add", cuda_atomic_add_rule, override=True) @@ -51,12 +47,6 @@ def llvm_atomic_add_rule(op): "opencl", "atomic_add", opencl_atomic_add_rule, override=True ) -tvm.target.intrin.register_intrin_rule( - "nvptx", "atomic_add", llvm_atomic_add_rule, override=True -) - -tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) - def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 50e68f8c6012..bcca7ba5bc15 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -955,13 +955,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->op.same_as(builtin_call_extern_)) { - auto func = Downcast(op->args[0]); - if (func->value == "atomic_add") { - LOG(FATAL) << "atomic add found " << op->op; - llvm::Value* v0 = MakeValue(op->args[1]); - llvm::Value* v1 = MakeValue(op->args[2]); - } + } else if (op->op.same_as(Op::Get("tir.atomic_add"))) { + llvm::Value* v0 = MakeValue(op->args[0]); + llvm::Value* v1 = MakeValue(op->args[1]); + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr; @@ -1192,13 +1189,6 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) { if (auto* ptr_op = op->op.as()) { auto call_op = GetRef(ptr_op); if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) { - auto func = Downcast(op->args[0]); - if (func->value == "atomic_add") { - llvm::Value* v0 = MakeValue(op->args[1]); - llvm::Value* v1 = MakeValue(op->args[2]); - auto old_val = builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); - return old_val; - } // call extern intrinsic ICHECK_GE(op->args.size(), 1U); auto global_symbol = Downcast(op->args[0]); diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 3afb8810e774..796b113a4054 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -229,6 +229,9 @@ TIR_DEFINE_BUILTIN_FUNC(vectorlow).set_attr("TCallEffectKind", TIR_DEFINE_BUILTIN_FUNC(vectorcombine) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); +TIR_DEFINE_BUILTIN_FUNC(atomic_add) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace builtin } // namespace tir } // namespace tvm From e604240e2a84537253eae4df84b06bddb786b63a Mon Sep 17 00:00:00 2001 From: masa Date: Mon, 7 Dec 2020 00:13:40 +0900 Subject: [PATCH 03/19] test bincount on nvptx --- tests/python/frontend/pytorch/test_forward.py | 309 +++++++++--------- 1 file changed, 156 insertions(+), 153 deletions(-) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 2dda675c74f5..f31020483ccd 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -207,6 +207,7 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) relay_model.set_input(**relay_params) + for name, inp in compiled_input.items(): relay_model.set_input(name, inp) relay_model.run() @@ -1912,6 +1913,8 @@ def verify_model_vm(input_model, ishapes, idtype=torch.float, idata=None, target ctx = tvm.context(tgt, 0) executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=tgt) + print(executor.executable.lib.imported_modules[0].get_source("llvm")) + evaluator = executor.evaluate() # Inference @@ -3364,167 +3367,167 @@ def test_fn(x, weights=None): if __name__ == "__main__": - # some structural tests - test_forward_traced_function() - test_forward_dtypes() - test_weight_names() - test_duplicate_weight_use() - - # Single operator tests - test_forward_pixel_shuffle() - test_forward_add() - test_forward_subtract() - test_forward_multiply() - test_forward_matmul() - test_forward_rsub() - test_forward_onehot() - test_forward_embedding() - test_forward_reshape() - test_forward_reciprocal() - test_forward_repeat() - test_forward_repeat_interleave() - test_forward_squeeze() - test_forward_unsqueeze() - test_forward_concatenate() - test_forward_reduce_sum() - test_forward_reduce_prod() - test_forward_argmin() - test_forward_argmax() - test_forward_norm() - test_forward_frobenius_norm() - test_forward_std() - test_forward_variance() - test_forward_relu() - test_forward_prelu() - test_forward_leakyrelu() - test_forward_elu() - test_forward_celu() - test_forward_gelu() - test_forward_selu() - test_forward_log_sigmoid() - test_forward_adaptiveavgpool() - test_forward_maxpool2d() - test_forward_maxpool1d() - test_forward_maxpool3d() - test_forward_hardtanh() - test_forward_conv() - test_forward_conv_transpose() - test_forward_threshold() - test_forward_contiguous() - test_forward_batchnorm() - test_forward_instancenorm() - test_forward_layernorm() - test_forward_groupnorm() - test_forward_transpose() - test_forward_size() - test_forward_view() - test_forward_select() - test_forward_take() - test_forward_topk() - test_forward_where() - test_forward_addcdiv() - test_forward_addcmul() - test_forward_true_divide() - test_forward_clone() - test_forward_softplus() - test_forward_softsign() - test_forward_logsoftmax() - test_forward_sigmoid() - test_forward_dense() - test_forward_avgpool() - test_forward_avgpool3d() - test_forward_dropout() - test_forward_slice() - test_forward_mean() - test_forward_expand() - test_forward_pow() - test_forward_unary() - test_forward_clamp() - test_forward_clamp_() - test_forward_logical_not() - test_forward_bitwise_not() - test_forward_bitwise_xor() - test_forward_logical_xor() - test_forward_isfinite() - test_forward_isnan() - test_forward_isinf() - test_forward_ones() - test_forward_ones_like() - test_forward_zeros() - test_forward_zeros_like() - test_forward_full() - test_forward_full_like() - test_forward_linspace() - test_forward_arange() - test_forward_mesh_grid() - test_forward_chunk() - test_forward_split() - test_forward_gather() - test_upsample() - test_forward_upsample3d() - test_forward_nms() - test_forward_roi_align() - test_to() - test_flatten() - test_type_as() - test_forward_functional_pad() - test_forward_zero_pad2d() - test_forward_constant_pad1d() - test_forward_constant_pad2d() - test_forward_constant_pad3d() - test_forward_reflection_pad1d() - test_forward_reflection_pad2d() - test_forward_replication_pad1d() - test_forward_replication_pad2d() - test_forward_replication_pad3d() - test_adaptive_pool3d() - test_conv3d() - test_conv3d_transpose() - test_forward_index() - test_min_max() - test_logsumexp() - test_stack() - test_stack_dynamic() - test_forward_unbind() - test_forward_nonzero() - test_forward_scatter() - test_numel() + # # some structural tests + # test_forward_traced_function() + # test_forward_dtypes() + # test_weight_names() + # test_duplicate_weight_use() + + # # Single operator tests + # test_forward_pixel_shuffle() + # test_forward_add() + # test_forward_subtract() + # test_forward_multiply() + # test_forward_matmul() + # test_forward_rsub() + # test_forward_onehot() + # test_forward_embedding() + # test_forward_reshape() + # test_forward_reciprocal() + # test_forward_repeat() + # test_forward_repeat_interleave() + # test_forward_squeeze() + # test_forward_unsqueeze() + # test_forward_concatenate() + # test_forward_reduce_sum() + # test_forward_reduce_prod() + # test_forward_argmin() + # test_forward_argmax() + # test_forward_norm() + # test_forward_frobenius_norm() + # test_forward_std() + # test_forward_variance() + # test_forward_relu() + # test_forward_prelu() + # test_forward_leakyrelu() + # test_forward_elu() + # test_forward_celu() + # test_forward_gelu() + # test_forward_selu() + # test_forward_log_sigmoid() + # test_forward_adaptiveavgpool() + # test_forward_maxpool2d() + # test_forward_maxpool1d() + # test_forward_maxpool3d() + # test_forward_hardtanh() + # test_forward_conv() + # test_forward_conv_transpose() + # test_forward_threshold() + # test_forward_contiguous() + # test_forward_batchnorm() + # test_forward_instancenorm() + # test_forward_layernorm() + # test_forward_groupnorm() + # test_forward_transpose() + # test_forward_size() + # test_forward_view() + # test_forward_select() + # test_forward_take() + # test_forward_topk() + # test_forward_where() + # test_forward_addcdiv() + # test_forward_addcmul() + # test_forward_true_divide() + # test_forward_clone() + # test_forward_softplus() + # test_forward_softsign() + # test_forward_logsoftmax() + # test_forward_sigmoid() + # test_forward_dense() + # test_forward_avgpool() + # test_forward_avgpool3d() + # test_forward_dropout() + # test_forward_slice() + # test_forward_mean() + # test_forward_expand() + # test_forward_pow() + # test_forward_unary() + # test_forward_clamp() + # test_forward_clamp_() + # test_forward_logical_not() + # test_forward_bitwise_not() + # test_forward_bitwise_xor() + # test_forward_logical_xor() + # test_forward_isfinite() + # test_forward_isnan() + # test_forward_isinf() + # test_forward_ones() + # test_forward_ones_like() + # test_forward_zeros() + # test_forward_zeros_like() + # test_forward_full() + # test_forward_full_like() + # test_forward_linspace() + # test_forward_arange() + # test_forward_mesh_grid() + # test_forward_chunk() + # test_forward_split() + # test_forward_gather() + # test_upsample() + # test_forward_upsample3d() + # test_forward_nms() + # test_forward_roi_align() + # test_to() + # test_flatten() + # test_type_as() + # test_forward_functional_pad() + # test_forward_zero_pad2d() + # test_forward_constant_pad1d() + # test_forward_constant_pad2d() + # test_forward_constant_pad3d() + # test_forward_reflection_pad1d() + # test_forward_reflection_pad2d() + # test_forward_replication_pad1d() + # test_forward_replication_pad2d() + # test_forward_replication_pad3d() + # test_adaptive_pool3d() + # test_conv3d() + # test_conv3d_transpose() + # test_forward_index() + # test_min_max() + # test_logsumexp() + # test_stack() + # test_stack_dynamic() + # test_forward_unbind() + # test_forward_nonzero() + # test_forward_scatter() + # test_numel() test_bincount() - # Model tests - test_resnet18() - test_squeezenet1_0() - test_squeezenet1_1() - test_densenet121() - # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug - # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756 - # test_inception_v3() - test_googlenet() - test_mnasnet0_5() - test_mobilenet_v2() + # # Model tests + # test_resnet18() + # test_squeezenet1_0() + # test_squeezenet1_1() + # test_densenet121() + # # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug + # # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756 + # # test_inception_v3() + # test_googlenet() + # test_mnasnet0_5() + # test_mobilenet_v2() - test_custom_conversion_map() + # test_custom_conversion_map() - test_segmentaton_models() - test_3d_models() + # test_segmentaton_models() + # test_3d_models() - # Quantization test - from qnn_test import test_quantized_imagenet, test_quantized_modules + # # Quantization test + # from qnn_test import test_quantized_imagenet, test_quantized_modules - test_quantized_modules() - test_quantized_imagenet() + # test_quantized_modules() + # test_quantized_imagenet() - # Test simple conditionals and loop - test_control_flow() - test_simple_rnn() + # # Test simple conditionals and loop + # test_control_flow() + # test_simple_rnn() - # More complex recurrent models - from test_lstm import test_custom_lstm + # # More complex recurrent models + # from test_lstm import test_custom_lstm - test_custom_lstm() + # test_custom_lstm() - # Test bert model - test_forward_pretrained_bert_base_uncased() + # # Test bert model + # test_forward_pretrained_bert_base_uncased() - # Test convert torch script(jit) with specific inputs' types - test_convert_torch_script_with_input_types() + # # Test convert torch script(jit) with specific inputs' types + # test_convert_torch_script_with_input_types() From 30e88dc10ab9f5236111a40f72d5cf092c652e2c Mon Sep 17 00:00:00 2001 From: masa Date: Mon, 7 Dec 2020 00:21:20 +0900 Subject: [PATCH 04/19] use builtin::atomic_add --- src/target/llvm/codegen_llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index bcca7ba5bc15..b2767114815f 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -955,7 +955,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->op.same_as(Op::Get("tir.atomic_add"))) { + } else if (op->op.same_as(builtin::atomic_add())) { llvm::Value* v0 = MakeValue(op->args[0]); llvm::Value* v1 = MakeValue(op->args[1]); return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); From e37706f5f0130548f3fdf1406260a7019f8fdb02 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 7 Dec 2020 13:23:02 +0900 Subject: [PATCH 05/19] add atomic llvm codegen test, only works on int8 input somehow --- .../unittest/test_target_codegen_llvm.py | 76 +++++++++++++------ 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 162481bfdb6e..decf623f2e32 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -74,7 +74,6 @@ def use_llvm_intrinsic(A, C): C = tvm.te.extern( (1, 1), [A], lambda ins, outs: use_llvm_intrinsic(ins[0], outs[0]), name="C", dtype="int32" ) - s = tvm.te.create_schedule(C.op) f = tvm.build(s, [A, C], target="llvm") @@ -750,27 +749,56 @@ def test_llvm_crt_static_lib(): module.save("test.o") +def atomic_add(x, y): + return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) + + +@tvm.testing.requires_llvm +def test_llvm_lower_atomic(): + def do_atomic_add(A): + ib = tvm.tir.ir_builder.create() + n = A.shape[0] + atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local") + one = tvm.tir.const(1, A.dtype) + A_ptr = ib.buffer_ptr(A) + with ib.for_range(0, n, name="i", for_type="parallel") as i: + atomic_add_return[0] = atomic_add( + tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one) + return ib.get() + + A = tvm.te.placeholder((100,), dtype="int8", name="A") + C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32") + s = tvm.te.create_schedule(C.op) + f = tvm.build(s, [A], target="llvm") + a_np = np.zeros((100,), dtype=A.dtype) + a = tvm.nd.array(a_np) + f(a) + print(a) + print(tvm.lower(s, [A], simple_mode=True)) + + if __name__ == "__main__": - test_multiple_func() - test_llvm_large_uintimm() - test_llvm_import() - test_alignment() - test_rank_zero() - test_rank_zero_bound_checkers() - test_llvm_bool() - test_llvm_persist_parallel() - test_llvm_condition() - test_llvm_vadd_pipeline() - test_llvm_add_pipeline() - test_llvm_intrin() - test_llvm_overloaded_intrin() - test_llvm_flip_pipeline() - test_llvm_madd_pipeline() - test_llvm_temp_space() - test_llvm_lookup_intrin() - test_llvm_div() - test_llvm_fp_math() - test_dwarf_debug_information() - test_llvm_shuffle() - test_llvm_bf16() - test_llvm_crt_static_lib() + # test_multiple_func() + # test_llvm_large_uintimm() + # test_llvm_import() + # test_alignment() + # test_rank_zero() + # test_rank_zero_bound_checkers() + # test_llvm_bool() + # test_llvm_persist_parallel() + # test_llvm_condition() + # test_llvm_vadd_pipeline() + # test_llvm_add_pipeline() + # test_llvm_intrin() + # test_llvm_overloaded_intrin() + # test_llvm_flip_pipeline() + # test_llvm_madd_pipeline() + # test_llvm_temp_space() + # test_llvm_lookup_intrin() + # test_llvm_div() + # test_llvm_fp_math() + # test_dwarf_debug_information() + # test_llvm_shuffle() + # test_llvm_bf16() + # test_llvm_crt_static_lib() + test_llvm_lower_atomic() From faf20d63d0b4fbe7ff2d34ce836ea358e64db8d6 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 04:12:24 +0900 Subject: [PATCH 06/19] supports fp32 atomic --- src/target/llvm/codegen_llvm.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index b2767114815f..c0ec3255bab5 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -956,9 +956,15 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { } return builder_->CreateShuffleVector(v0, v1, indices); } else if (op->op.same_as(builtin::atomic_add())) { + ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now"; llvm::Value* v0 = MakeValue(op->args[0]); llvm::Value* v1 = MakeValue(op->args[1]); - return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); + if (op->args[1]->dtype.is_float()) { + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); + } + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::AtomicOrdering::Monotonic); } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr; From 07c50fa7b7364e1883504964c9ac525f4e4f7c5d Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 04:36:02 +0900 Subject: [PATCH 07/19] drop support for cpu atomic --- src/target/llvm/codegen_amdgpu.cc | 15 +++++++++++++++ src/target/llvm/codegen_llvm.cc | 10 ---------- src/target/llvm/codegen_nvptx.cc | 10 ++++++++++ tests/python/unittest/test_target_codegen_llvm.py | 10 +++------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 2890c1ce3e56..1e6c64abf655 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -183,6 +183,21 @@ class CodeGenAMDGPU : public CodeGenLLVM { unsigned GetGlobalAddressSpace() const final { return 1; } + llvm::Value* CreateIntrinsic(const CallNode* op) final { + if (op->op.same_as(builtin::atomic_add())) { + ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now"; + llvm::Value* v0 = MakeValue(op->args[0]); + llvm::Value* v1 = MakeValue(op->args[1]); + if (op->args[1]->dtype.is_float()) { + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); + } + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::AtomicOrdering::Monotonic); + } + return CodeGenLLVM::CreateIntrinsic(op); + } + protected: void InitTarget(llvm::TargetMachine* tm) final { // Maximum vector lane = float4 diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index c0ec3255bab5..d10ed311949c 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -955,16 +955,6 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->op.same_as(builtin::atomic_add())) { - ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now"; - llvm::Value* v0 = MakeValue(op->args[0]); - llvm::Value* v1 = MakeValue(op->args[1]); - if (op->args[1]->dtype.is_float()) { - return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, - llvm::AtomicOrdering::Monotonic); - } - return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, - llvm::AtomicOrdering::Monotonic); } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr; diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index 22e612b11090..06584eb5b189 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -232,6 +232,16 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) { auto fty = llvm::FunctionType::get(t_int32_, false); auto val = llvm::InlineAsm::get(fty, "activemask.b32 %0", "=r", true); return builder_->CreateCall(val); + } else if (op->op.same_as(builtin::atomic_add())) { + ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now"; + llvm::Value* v0 = MakeValue(op->args[0]); + llvm::Value* v1 = MakeValue(op->args[1]); + if (op->args[1]->dtype.is_float()) { + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); + } + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::AtomicOrdering::Monotonic); } return CodeGenLLVM::CreateIntrinsic(op); } diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index decf623f2e32..b6e54bccc513 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -766,15 +766,11 @@ def do_atomic_add(A): tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one) return ib.get() - A = tvm.te.placeholder((100,), dtype="int8", name="A") + A = tvm.te.placeholder((100,), dtype="int32", name="A") C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32") s = tvm.te.create_schedule(C.op) - f = tvm.build(s, [A], target="llvm") - a_np = np.zeros((100,), dtype=A.dtype) - a = tvm.nd.array(a_np) - f(a) - print(a) - print(tvm.lower(s, [A], simple_mode=True)) + # This does not work because of pointer type mismatch + # f = tvm.build(s, [A], target="llvm") if __name__ == "__main__": From e48563e0f52640c3bc1541ad61d564d8d2162892 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 04:38:10 +0900 Subject: [PATCH 08/19] add comment --- tests/python/unittest/test_target_codegen_llvm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index b6e54bccc513..b0f59339650b 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -770,6 +770,10 @@ def do_atomic_add(A): C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32") s = tvm.te.create_schedule(C.op) # This does not work because of pointer type mismatch + # TVMError: LLVM module verification failed with the following errors: + # Argument value type does not match pointer operand type! + # %21 = atomicrmw add i8* %7, i32 1 monotonic + # i8 # f = tvm.build(s, [A], target="llvm") From 8dc9fd66ee593aa9c13a0d20475674b27452154f Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 04:50:53 +0900 Subject: [PATCH 09/19] add atomic gpu unit test --- .../unittest/test_target_codegen_llvm.py | 85 +++++++++++++------ 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index b0f59339650b..8e3cb4a9a262 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -767,7 +767,7 @@ def do_atomic_add(A): return ib.get() A = tvm.te.placeholder((100,), dtype="int32", name="A") - C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32") + C = tvm.te.extern((100,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32") s = tvm.te.create_schedule(C.op) # This does not work because of pointer type mismatch # TVMError: LLVM module verification failed with the following errors: @@ -777,28 +777,63 @@ def do_atomic_add(A): # f = tvm.build(s, [A], target="llvm") +@tvm.testing.requires_llvm +@tvm.testing.requires_gpu +def test_llvm_gpu_lower_atomic(): + def do_atomic_add(A): + ib = tvm.tir.ir_builder.create() + n = A.shape[0] + atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local") + one = tvm.tir.const(1, A.dtype) + A_ptr = ib.buffer_ptr(A) + nthread_tx = 64 + with ib.new_scope(): + nthread_bx = (n + nthread_tx - 1) // nthread_tx + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + atomic_add_return[0] = atomic_add( + tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one) + return ib.get() + + size = 1024 + for dtype in ["int32", "float32"]: + A = tvm.te.placeholder((size,), dtype=dtype, name="A") + C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype) + s = tvm.te.create_schedule(C.op) + f = tvm.build(s, [A], target="nvptx") + + ctx = tvm.gpu() + a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), ctx) + f(a) + ref = np.zeros((size,)).astype(A.dtype) + ref[0] = size + tvm.testing.assert_allclose(a.asnumpy(), ref, rtol=1e-5) + + if __name__ == "__main__": - # test_multiple_func() - # test_llvm_large_uintimm() - # test_llvm_import() - # test_alignment() - # test_rank_zero() - # test_rank_zero_bound_checkers() - # test_llvm_bool() - # test_llvm_persist_parallel() - # test_llvm_condition() - # test_llvm_vadd_pipeline() - # test_llvm_add_pipeline() - # test_llvm_intrin() - # test_llvm_overloaded_intrin() - # test_llvm_flip_pipeline() - # test_llvm_madd_pipeline() - # test_llvm_temp_space() - # test_llvm_lookup_intrin() - # test_llvm_div() - # test_llvm_fp_math() - # test_dwarf_debug_information() - # test_llvm_shuffle() - # test_llvm_bf16() - # test_llvm_crt_static_lib() - test_llvm_lower_atomic() + test_multiple_func() + test_llvm_large_uintimm() + test_llvm_import() + test_alignment() + test_rank_zero() + test_rank_zero_bound_checkers() + test_llvm_bool() + test_llvm_persist_parallel() + test_llvm_condition() + test_llvm_vadd_pipeline() + test_llvm_add_pipeline() + test_llvm_intrin() + test_llvm_overloaded_intrin() + test_llvm_flip_pipeline() + test_llvm_madd_pipeline() + test_llvm_temp_space() + test_llvm_lookup_intrin() + test_llvm_div() + test_llvm_fp_math() + test_dwarf_debug_information() + test_llvm_shuffle() + test_llvm_bf16() + test_llvm_crt_static_lib() + test_llvm_gpu_lower_atomic() From 9147f9ef08284e4d660f0a50cbc0be35ac986528 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 04:57:42 +0900 Subject: [PATCH 10/19] reenable other tests --- tests/python/frontend/pytorch/test_forward.py | 309 +++++++++--------- tests/python/relay/test_op_level3.py | 3 - tests/python/relay/test_op_level5.py | 44 +-- tests/python/topi/python/test_topi_vision.py | 25 +- 4 files changed, 185 insertions(+), 196 deletions(-) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index f31020483ccd..2dda675c74f5 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -207,7 +207,6 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) relay_model.set_input(**relay_params) - for name, inp in compiled_input.items(): relay_model.set_input(name, inp) relay_model.run() @@ -1913,8 +1912,6 @@ def verify_model_vm(input_model, ishapes, idtype=torch.float, idata=None, target ctx = tvm.context(tgt, 0) executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=tgt) - print(executor.executable.lib.imported_modules[0].get_source("llvm")) - evaluator = executor.evaluate() # Inference @@ -3367,167 +3364,167 @@ def test_fn(x, weights=None): if __name__ == "__main__": - # # some structural tests - # test_forward_traced_function() - # test_forward_dtypes() - # test_weight_names() - # test_duplicate_weight_use() - - # # Single operator tests - # test_forward_pixel_shuffle() - # test_forward_add() - # test_forward_subtract() - # test_forward_multiply() - # test_forward_matmul() - # test_forward_rsub() - # test_forward_onehot() - # test_forward_embedding() - # test_forward_reshape() - # test_forward_reciprocal() - # test_forward_repeat() - # test_forward_repeat_interleave() - # test_forward_squeeze() - # test_forward_unsqueeze() - # test_forward_concatenate() - # test_forward_reduce_sum() - # test_forward_reduce_prod() - # test_forward_argmin() - # test_forward_argmax() - # test_forward_norm() - # test_forward_frobenius_norm() - # test_forward_std() - # test_forward_variance() - # test_forward_relu() - # test_forward_prelu() - # test_forward_leakyrelu() - # test_forward_elu() - # test_forward_celu() - # test_forward_gelu() - # test_forward_selu() - # test_forward_log_sigmoid() - # test_forward_adaptiveavgpool() - # test_forward_maxpool2d() - # test_forward_maxpool1d() - # test_forward_maxpool3d() - # test_forward_hardtanh() - # test_forward_conv() - # test_forward_conv_transpose() - # test_forward_threshold() - # test_forward_contiguous() - # test_forward_batchnorm() - # test_forward_instancenorm() - # test_forward_layernorm() - # test_forward_groupnorm() - # test_forward_transpose() - # test_forward_size() - # test_forward_view() - # test_forward_select() - # test_forward_take() - # test_forward_topk() - # test_forward_where() - # test_forward_addcdiv() - # test_forward_addcmul() - # test_forward_true_divide() - # test_forward_clone() - # test_forward_softplus() - # test_forward_softsign() - # test_forward_logsoftmax() - # test_forward_sigmoid() - # test_forward_dense() - # test_forward_avgpool() - # test_forward_avgpool3d() - # test_forward_dropout() - # test_forward_slice() - # test_forward_mean() - # test_forward_expand() - # test_forward_pow() - # test_forward_unary() - # test_forward_clamp() - # test_forward_clamp_() - # test_forward_logical_not() - # test_forward_bitwise_not() - # test_forward_bitwise_xor() - # test_forward_logical_xor() - # test_forward_isfinite() - # test_forward_isnan() - # test_forward_isinf() - # test_forward_ones() - # test_forward_ones_like() - # test_forward_zeros() - # test_forward_zeros_like() - # test_forward_full() - # test_forward_full_like() - # test_forward_linspace() - # test_forward_arange() - # test_forward_mesh_grid() - # test_forward_chunk() - # test_forward_split() - # test_forward_gather() - # test_upsample() - # test_forward_upsample3d() - # test_forward_nms() - # test_forward_roi_align() - # test_to() - # test_flatten() - # test_type_as() - # test_forward_functional_pad() - # test_forward_zero_pad2d() - # test_forward_constant_pad1d() - # test_forward_constant_pad2d() - # test_forward_constant_pad3d() - # test_forward_reflection_pad1d() - # test_forward_reflection_pad2d() - # test_forward_replication_pad1d() - # test_forward_replication_pad2d() - # test_forward_replication_pad3d() - # test_adaptive_pool3d() - # test_conv3d() - # test_conv3d_transpose() - # test_forward_index() - # test_min_max() - # test_logsumexp() - # test_stack() - # test_stack_dynamic() - # test_forward_unbind() - # test_forward_nonzero() - # test_forward_scatter() - # test_numel() + # some structural tests + test_forward_traced_function() + test_forward_dtypes() + test_weight_names() + test_duplicate_weight_use() + + # Single operator tests + test_forward_pixel_shuffle() + test_forward_add() + test_forward_subtract() + test_forward_multiply() + test_forward_matmul() + test_forward_rsub() + test_forward_onehot() + test_forward_embedding() + test_forward_reshape() + test_forward_reciprocal() + test_forward_repeat() + test_forward_repeat_interleave() + test_forward_squeeze() + test_forward_unsqueeze() + test_forward_concatenate() + test_forward_reduce_sum() + test_forward_reduce_prod() + test_forward_argmin() + test_forward_argmax() + test_forward_norm() + test_forward_frobenius_norm() + test_forward_std() + test_forward_variance() + test_forward_relu() + test_forward_prelu() + test_forward_leakyrelu() + test_forward_elu() + test_forward_celu() + test_forward_gelu() + test_forward_selu() + test_forward_log_sigmoid() + test_forward_adaptiveavgpool() + test_forward_maxpool2d() + test_forward_maxpool1d() + test_forward_maxpool3d() + test_forward_hardtanh() + test_forward_conv() + test_forward_conv_transpose() + test_forward_threshold() + test_forward_contiguous() + test_forward_batchnorm() + test_forward_instancenorm() + test_forward_layernorm() + test_forward_groupnorm() + test_forward_transpose() + test_forward_size() + test_forward_view() + test_forward_select() + test_forward_take() + test_forward_topk() + test_forward_where() + test_forward_addcdiv() + test_forward_addcmul() + test_forward_true_divide() + test_forward_clone() + test_forward_softplus() + test_forward_softsign() + test_forward_logsoftmax() + test_forward_sigmoid() + test_forward_dense() + test_forward_avgpool() + test_forward_avgpool3d() + test_forward_dropout() + test_forward_slice() + test_forward_mean() + test_forward_expand() + test_forward_pow() + test_forward_unary() + test_forward_clamp() + test_forward_clamp_() + test_forward_logical_not() + test_forward_bitwise_not() + test_forward_bitwise_xor() + test_forward_logical_xor() + test_forward_isfinite() + test_forward_isnan() + test_forward_isinf() + test_forward_ones() + test_forward_ones_like() + test_forward_zeros() + test_forward_zeros_like() + test_forward_full() + test_forward_full_like() + test_forward_linspace() + test_forward_arange() + test_forward_mesh_grid() + test_forward_chunk() + test_forward_split() + test_forward_gather() + test_upsample() + test_forward_upsample3d() + test_forward_nms() + test_forward_roi_align() + test_to() + test_flatten() + test_type_as() + test_forward_functional_pad() + test_forward_zero_pad2d() + test_forward_constant_pad1d() + test_forward_constant_pad2d() + test_forward_constant_pad3d() + test_forward_reflection_pad1d() + test_forward_reflection_pad2d() + test_forward_replication_pad1d() + test_forward_replication_pad2d() + test_forward_replication_pad3d() + test_adaptive_pool3d() + test_conv3d() + test_conv3d_transpose() + test_forward_index() + test_min_max() + test_logsumexp() + test_stack() + test_stack_dynamic() + test_forward_unbind() + test_forward_nonzero() + test_forward_scatter() + test_numel() test_bincount() - # # Model tests - # test_resnet18() - # test_squeezenet1_0() - # test_squeezenet1_1() - # test_densenet121() - # # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug - # # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756 - # # test_inception_v3() - # test_googlenet() - # test_mnasnet0_5() - # test_mobilenet_v2() + # Model tests + test_resnet18() + test_squeezenet1_0() + test_squeezenet1_1() + test_densenet121() + # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug + # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756 + # test_inception_v3() + test_googlenet() + test_mnasnet0_5() + test_mobilenet_v2() - # test_custom_conversion_map() + test_custom_conversion_map() - # test_segmentaton_models() - # test_3d_models() + test_segmentaton_models() + test_3d_models() - # # Quantization test - # from qnn_test import test_quantized_imagenet, test_quantized_modules + # Quantization test + from qnn_test import test_quantized_imagenet, test_quantized_modules - # test_quantized_modules() - # test_quantized_imagenet() + test_quantized_modules() + test_quantized_imagenet() - # # Test simple conditionals and loop - # test_control_flow() - # test_simple_rnn() + # Test simple conditionals and loop + test_control_flow() + test_simple_rnn() - # # More complex recurrent models - # from test_lstm import test_custom_lstm + # More complex recurrent models + from test_lstm import test_custom_lstm - # test_custom_lstm() + test_custom_lstm() - # # Test bert model - # test_forward_pretrained_bert_base_uncased() + # Test bert model + test_forward_pretrained_bert_base_uncased() - # # Test convert torch script(jit) with specific inputs' types - # test_convert_torch_script_with_input_types() + # Test convert torch script(jit) with specific inputs' types + test_convert_torch_script_with_input_types() diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index fc1929e9dc18..0c1f0463e6ff 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -1017,9 +1017,6 @@ def verify_scatter_add(dshape, ishape, axis=0): ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis) for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - if target == "nvptx": - # TODO(masahi): support atomic in LLVM codegen - continue intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index ee7acc619f9d..f114957f3cab 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -1194,26 +1194,26 @@ def verify_batch_to_space_nd(dshape, block_shape, crops): if __name__ == "__main__": - # test_resize_infer_type() - # test_resize() - # test_resize3d_infer_type() - # test_resize3d() - # test_crop_and_resize() - # test_multibox_prior() - # test_multibox_transform_loc() - # test_get_valid_counts() - # test_roi_align() - # test_roi_pool() - # test_proposal() - # test_yolo_reorg_infer_shape() - # test_yolo_reorg() + test_resize_infer_type() + test_resize() + test_resize3d_infer_type() + test_resize3d() + test_crop_and_resize() + test_multibox_prior() + test_multibox_transform_loc() + test_get_valid_counts() + test_roi_align() + test_roi_pool() + test_proposal() + test_yolo_reorg_infer_shape() + test_yolo_reorg() test_non_max_suppression() - # test_deformable_conv2d() - # test_depth_to_space() - # test_space_to_depth() - # test_dilation2d_infer_type() - # test_dilation2d_run() - # test_affine_grid() - # test_grid_sample() - # test_space_to_batch_nd() - # test_batch_to_space_nd() + test_deformable_conv2d() + test_depth_to_space() + test_space_to_depth() + test_dilation2d_infer_type() + test_dilation2d_run() + test_affine_grid() + test_grid_sample() + test_space_to_batch_nd() + test_batch_to_space_nd() diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 3fb02075dbc7..778843be37de 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -202,16 +202,11 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - if device in ["nvptx"]: - f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) - # f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) - print(f.imported_modules[0].get_source("llvm")) - else: - f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device) - f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) - # tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) - for device in ["nvptx"]: + for device in ["llvm", "cuda", "opencl", "nvptx"]: check_device(device) @@ -631,10 +626,10 @@ def test_proposal(): if __name__ == "__main__": - # test_get_valid_counts() - # test_multibox_prior() - # test_multibox_detection() - # test_roi_align() - # test_roi_pool() - # test_proposal() + test_get_valid_counts() + test_multibox_prior() + test_multibox_detection() + test_roi_align() + test_roi_pool() + test_proposal() test_non_max_suppression() From 552aa5fd45948aad3d715a409b40fa305f6333aa Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 05:04:19 +0900 Subject: [PATCH 11/19] add doc string --- include/tvm/tir/builtin.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index a5ac378be581..a150595ab551 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -549,6 +549,9 @@ TVM_DLL const Op& vectorlow(); */ TVM_DLL const Op& vectorcombine(); +/*! + * \brief atomic add instruction, corresponding e.g. to atomicAdd in CUDA + */ TVM_DLL const Op& atomic_add(); /*! \brief The kind of structure field info used in intrinsic */ From 337f20998b0cc3d3cb738a24115be2aad057afd1 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 8 Dec 2020 07:17:32 +0900 Subject: [PATCH 12/19] run black --- tests/python/unittest/test_target_codegen_llvm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 8e3cb4a9a262..bf8afc9e85dd 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -763,7 +763,8 @@ def do_atomic_add(A): A_ptr = ib.buffer_ptr(A) with ib.for_range(0, n, name="i", for_type="parallel") as i: atomic_add_return[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one) + tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one + ) return ib.get() A = tvm.te.placeholder((100,), dtype="int32", name="A") @@ -794,7 +795,8 @@ def do_atomic_add(A): ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) atomic_add_return[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one) + tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one + ) return ib.get() size = 1024 From c9f413acc359cc4a1e5904a19a2b479dd2d00023 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 09:23:33 +0900 Subject: [PATCH 13/19] fix build with llvm 8 and older --- src/target/llvm/codegen_amdgpu.cc | 8 ++++++-- src/target/llvm/codegen_nvptx.cc | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 1e6c64abf655..3ff686941f46 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -189,8 +189,12 @@ class CodeGenAMDGPU : public CodeGenLLVM { llvm::Value* v0 = MakeValue(op->args[0]); llvm::Value* v1 = MakeValue(op->args[1]); if (op->args[1]->dtype.is_float()) { - return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, - llvm::AtomicOrdering::Monotonic); +#if TVM_LLVM_VERSION >= 90 + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); +#else + LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer"; +#endif } return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index 06584eb5b189..d8002a2b58a6 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -237,8 +237,12 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) { llvm::Value* v0 = MakeValue(op->args[0]); llvm::Value* v1 = MakeValue(op->args[1]); if (op->args[1]->dtype.is_float()) { +#if TVM_LLVM_VERSION >= 90 return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::AtomicOrdering::Monotonic); +#else + LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer"; +#endif } return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic); From 0d6b4800409978e3c4269719d85cafcb54b79e43 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 09:26:18 +0900 Subject: [PATCH 14/19] fix format --- src/target/llvm/codegen_amdgpu.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 3ff686941f46..605870f48c52 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -190,10 +190,10 @@ class CodeGenAMDGPU : public CodeGenLLVM { llvm::Value* v1 = MakeValue(op->args[1]); if (op->args[1]->dtype.is_float()) { #if TVM_LLVM_VERSION >= 90 - return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, - llvm::AtomicOrdering::Monotonic); + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); #else - LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer"; + LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer"; #endif } return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, From 9dea22ad2e699d8cf0f85c603423100f70bdcbd8 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 09:44:09 +0900 Subject: [PATCH 15/19] do not run float32 atomic test on ci --- tests/python/unittest/test_target_codegen_llvm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index bf8afc9e85dd..4b67752367db 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -800,7 +800,8 @@ def do_atomic_add(A): return ib.get() size = 1024 - for dtype in ["int32", "float32"]: + # CI uses LLVM 8, which does not support float atomic + for dtype in ["int32"]: A = tvm.te.placeholder((size,), dtype=dtype, name="A") C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype) s = tvm.te.create_schedule(C.op) From 7551cbae8b256d1b58ca19b04fea7d63834d8115 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 10:43:40 +0900 Subject: [PATCH 16/19] do not run scatter_add 1d with float inputs on CI --- tests/python/relay/test_op_level3.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 0c1f0463e6ff..427b0a99a756 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -1002,27 +1002,33 @@ def ref_scatter_add(data, indices, updates, axis=0): output[tuple(new_index)] += updates[index] return output - def verify_scatter_add(dshape, ishape, axis=0): - d = relay.var("d", relay.TensorType(dshape, "float32")) + def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): + d = relay.var("d", relay.TensorType(dshape, dtype)) i = relay.var("i", relay.TensorType(ishape, "int64")) - u = relay.var("u", relay.TensorType(ishape, "float32")) + u = relay.var("u", relay.TensorType(ishape, dtype)) z = relay.op.scatter_add(d, i, u, axis) func = relay.Function([d, i, u], z) - data_np = np.random.uniform(size=dshape).astype("float32") - updates_np = np.random.uniform(size=ishape).astype("float32") + data_np = np.random.uniform(size=dshape).astype(dtype) + updates_np = np.random.uniform(size=ishape).astype(dtype) indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis) for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: + if target == "nvptx" and dtype == "float32" and len(dshape) == 1: + # scatter_add 1D on GPU is implemented via atomic. + # Floating point atomic requires LLVM 9 or newer for nvptx backend. + # But LLVM on CI is LLVM 8, + continue intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) - verify_scatter_add((10,), (10,), 0) - verify_scatter_add((1000,), (1000,), 0) + verify_scatter_add((10,), (10,), 0, dtype="int32") + verify_scatter_add((1000,), (1000,)) + verify_scatter_add((1000,), (1000,), 0, dtype="int32") verify_scatter_add((10, 5), (10, 5), -2) verify_scatter_add((10, 5), (10, 5), -1) verify_scatter_add((10, 5), (3, 5), 0) From 65498b0b5759f914f60d28b10eb41f7dc644ab37 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 8 Dec 2020 10:45:20 +0900 Subject: [PATCH 17/19] fix typo --- tests/python/relay/test_op_level3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 427b0a99a756..668285dfb882 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -1020,7 +1020,7 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): if target == "nvptx" and dtype == "float32" and len(dshape) == 1: # scatter_add 1D on GPU is implemented via atomic. # Floating point atomic requires LLVM 9 or newer for nvptx backend. - # But LLVM on CI is LLVM 8, + # But LLVM on CI is LLVM 8. continue intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) From 82a887c691079f083ba6419aff54e4169aaef522 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 8 Dec 2020 20:02:50 +0900 Subject: [PATCH 18/19] add todo comment for cpu backend --- src/target/llvm/codegen_llvm.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index d10ed311949c..12573e29c9de 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -955,6 +955,9 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); + } else if (op->op.same_as(builtin::atomic_add())) { + // TODO(masahi): Support atomic for CPU backend + LOG(FATAL) << "CPU backend does not support atomic add yet."; } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr; From e4eda99986839d9e654a7b5932597a4d0809d871 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 8 Dec 2020 20:32:57 +0900 Subject: [PATCH 19/19] fix build on ci --- src/target/llvm/codegen_llvm.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 12573e29c9de..70f094a186e7 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -958,6 +958,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { } else if (op->op.same_as(builtin::atomic_add())) { // TODO(masahi): Support atomic for CPU backend LOG(FATAL) << "CPU backend does not support atomic add yet."; + return nullptr; } else { LOG(FATAL) << "unknown intrinsic " << op->op; return nullptr;