From dce3438877481c518f2b41c86af3adea3f6ca55f Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Sun, 6 Dec 2020 23:48:56 +0900
Subject: [PATCH 01/19] support atomic add on llvm

---
 python/tvm/topi/cuda/nms.py                  |  8 ++++
 src/target/llvm/codegen_llvm.cc              | 14 ++++++
 tests/python/relay/test_op_level5.py         | 46 ++++++++++----------
 tests/python/topi/python/test_topi_vision.py | 21 ++++-----
 4 files changed, 55 insertions(+), 34 deletions(-)
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 46d7f9800c43..e99265906d5e 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -40,6 +40,10 @@ def opencl_atomic_add_rule(op):
         return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
     raise RuntimeError("only support int32")
 
+def llvm_atomic_add_rule(op):
+    if op.dtype == "int32":
+        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
+    raise RuntimeError("only support int32")
 
 tvm.target.intrin.register_intrin_rule("cuda", "atomic_add", cuda_atomic_add_rule, override=True)
 
@@ -47,6 +51,10 @@ def opencl_atomic_add_rule(op):
     "opencl", "atomic_add", opencl_atomic_add_rule, override=True
 )
 
+tvm.target.intrin.register_intrin_rule(
+    "nvptx", "atomic_add", llvm_atomic_add_rule, override=True
+)
+
 tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque)
 
 
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index d10ed311949c..50e68f8c6012 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -955,6 +955,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
       indices.push_back(i);
     }
     return builder_->CreateShuffleVector(v0, v1, indices);
+  } else if (op->op.same_as(builtin_call_extern_)) {
+    auto func = Downcast<StringImm>(op->args[0]);
+    if (func->value == "atomic_add") {
+      LOG(FATAL) << "atomic add found " << op->op;
+      llvm::Value* v0 = MakeValue(op->args[1]);
+      llvm::Value* v1 = MakeValue(op->args[2]);
+    }
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;
@@ -1185,6 +1192,13 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) {
   if (auto* ptr_op = op->op.as<OpNode>()) {
     auto call_op = GetRef<Op>(ptr_op);
     if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) {
+      auto func = Downcast<StringImm>(op->args[0]);
+      if (func->value == "atomic_add") {
+        llvm::Value* v0 = MakeValue(op->args[1]);
+        llvm::Value* v1 = MakeValue(op->args[2]);
+        auto old_val = builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic);
+        return old_val;
+      }
       // call extern intrinsic
       ICHECK_GE(op->args.size(), 1U);
       auto global_symbol = Downcast<StringImm>(op->args[0]);
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 9e9aaf842669..ee7acc619f9d 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -393,8 +393,6 @@ def verify_nms(
             intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
             op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
-            if target == "nvptx":
-                continue
             op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5)
             op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
@@ -1196,26 +1194,26 @@ def verify_batch_to_space_nd(dshape, block_shape, crops):
 
 
 if __name__ == "__main__":
-    test_resize_infer_type()
-    test_resize()
-    test_resize3d_infer_type()
-    test_resize3d()
-    test_crop_and_resize()
-    test_multibox_prior()
-    test_multibox_transform_loc()
-    test_get_valid_counts()
-    test_roi_align()
-    test_roi_pool()
-    test_proposal()
-    test_yolo_reorg_infer_shape()
-    test_yolo_reorg()
+    # test_resize_infer_type()
+    # test_resize()
+    # test_resize3d_infer_type()
+    # test_resize3d()
+    # test_crop_and_resize()
+    # test_multibox_prior()
+    # test_multibox_transform_loc()
+    # test_get_valid_counts()
+    # test_roi_align()
+    # test_roi_pool()
+    # test_proposal()
+    # test_yolo_reorg_infer_shape()
+    # test_yolo_reorg()
     test_non_max_suppression()
-    test_deformable_conv2d()
-    test_depth_to_space()
-    test_space_to_depth()
-    test_dilation2d_infer_type()
-    test_dilation2d_run()
-    test_affine_grid()
-    test_grid_sample()
-    test_space_to_batch_nd()
-    test_batch_to_space_nd()
+    # test_deformable_conv2d()
+    # test_depth_to_space()
+    # test_space_to_depth()
+    # test_dilation2d_infer_type()
+    # test_dilation2d_run()
+    # test_affine_grid()
+    # test_grid_sample()
+    # test_space_to_batch_nd()
+    # test_batch_to_space_nd()
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 6d6353eebce6..3fb02075dbc7 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -202,15 +202,16 @@ def check_device(device):
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
 
         tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx)
-        if device in ["llvm", "cuda"]:
+        if device in ["nvptx"]:
             f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device)
-            f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
+            # f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
+            print(f.imported_modules[0].get_source("llvm"))
         else:
             f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device)
             f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
-        tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
+        # tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
 
-    for device in ["llvm", "cuda", "opencl"]:
+    for device in ["nvptx"]:
         check_device(device)
 
 
@@ -630,10 +631,10 @@ def test_proposal():
 
 
 if __name__ == "__main__":
-    test_get_valid_counts()
-    test_multibox_prior()
-    test_multibox_detection()
-    test_roi_align()
-    test_roi_pool()
-    test_proposal()
+    # test_get_valid_counts()
+    # test_multibox_prior()
+    # test_multibox_detection()
+    # test_roi_align()
+    # test_roi_pool()
+    # test_proposal()
     test_non_max_suppression()

From b8ae806e59b7b507cbb248ba0390413e4838a125 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Mon, 7 Dec 2020 00:12:28 +0900
Subject: [PATCH 02/19] make atomic builtin intrin

---
 include/tvm/tir/builtin.h       |  2 ++
 python/tvm/topi/cuda/nms.py     | 10 ----------
 src/target/llvm/codegen_llvm.cc | 18 ++++--------------
 src/tir/op/builtin.cc           |  3 +++
 4 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index bea53136fd54..a5ac378be581 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -549,6 +549,8 @@ TVM_DLL const Op& vectorlow();
  */
 TVM_DLL const Op& vectorcombine();
 
+TVM_DLL const Op& atomic_add();
+
 /*! \brief The kind of structure field info used in intrinsic */
 enum TVMStructFieldKind : int {
   // array head address
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index e99265906d5e..d51eb5ce1d11 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -40,10 +40,6 @@ def opencl_atomic_add_rule(op):
         return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
     raise RuntimeError("only support int32")
 
-def llvm_atomic_add_rule(op):
-    if op.dtype == "int32":
-        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
-    raise RuntimeError("only support int32")
 
 tvm.target.intrin.register_intrin_rule("cuda", "atomic_add", cuda_atomic_add_rule, override=True)
 
@@ -51,12 +47,6 @@ def llvm_atomic_add_rule(op):
     "opencl", "atomic_add", opencl_atomic_add_rule, override=True
 )
 
-tvm.target.intrin.register_intrin_rule(
-    "nvptx", "atomic_add", llvm_atomic_add_rule, override=True
-)
-
-tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque)
-
 
 def atomic_add(x, y):
     return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 50e68f8c6012..bcca7ba5bc15 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -955,13 +955,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
       indices.push_back(i);
     }
     return builder_->CreateShuffleVector(v0, v1, indices);
-  } else if (op->op.same_as(builtin_call_extern_)) {
-    auto func = Downcast<StringImm>(op->args[0]);
-    if (func->value == "atomic_add") {
-      LOG(FATAL) << "atomic add found " << op->op;
-      llvm::Value* v0 = MakeValue(op->args[1]);
-      llvm::Value* v1 = MakeValue(op->args[2]);
-    }
+  } else if (op->op.same_as(Op::Get("tir.atomic_add"))) {
+    llvm::Value* v0 = MakeValue(op->args[0]);
+    llvm::Value* v1 = MakeValue(op->args[1]);
+    return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic);
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;
@@ -1192,13 +1189,6 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) {
   if (auto* ptr_op = op->op.as<OpNode>()) {
     auto call_op = GetRef<Op>(ptr_op);
     if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) {
-      auto func = Downcast<StringImm>(op->args[0]);
-      if (func->value == "atomic_add") {
-        llvm::Value* v0 = MakeValue(op->args[1]);
-        llvm::Value* v1 = MakeValue(op->args[2]);
-        auto old_val = builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic);
-        return old_val;
-      }
       // call extern intrinsic
       ICHECK_GE(op->args.size(), 1U);
       auto global_symbol = Downcast<StringImm>(op->args[0]);
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 3afb8810e774..796b113a4054 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -229,6 +229,9 @@ TIR_DEFINE_BUILTIN_FUNC(vectorlow).set_attr<TCallEffectKind>("TCallEffectKind",
 TIR_DEFINE_BUILTIN_FUNC(vectorcombine)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
 
+TIR_DEFINE_BUILTIN_FUNC(atomic_add)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
+
 }  // namespace builtin
 }  // namespace tir
 }  // namespace tvm

From e604240e2a84537253eae4df84b06bddb786b63a Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Mon, 7 Dec 2020 00:13:40 +0900
Subject: [PATCH 03/19] test bincount on nvptx

---
 tests/python/frontend/pytorch/test_forward.py | 309 +++++++++---------
 1 file changed, 156 insertions(+), 153 deletions(-)

diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 2dda675c74f5..f31020483ccd 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -207,6 +207,7 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
             relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params)
             relay_model = graph_runtime.create(relay_graph, relay_lib, ctx)
             relay_model.set_input(**relay_params)
+
             for name, inp in compiled_input.items():
                 relay_model.set_input(name, inp)
             relay_model.run()
@@ -1912,6 +1913,8 @@ def verify_model_vm(input_model, ishapes, idtype=torch.float, idata=None, target
         ctx = tvm.context(tgt, 0)
 
         executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=tgt)
+        print(executor.executable.lib.imported_modules[0].get_source("llvm"))
+
         evaluator = executor.evaluate()
 
         # Inference
@@ -3364,167 +3367,167 @@ def test_fn(x, weights=None):
 
 
 if __name__ == "__main__":
-    # some structural tests
-    test_forward_traced_function()
-    test_forward_dtypes()
-    test_weight_names()
-    test_duplicate_weight_use()
-
-    # Single operator tests
-    test_forward_pixel_shuffle()
-    test_forward_add()
-    test_forward_subtract()
-    test_forward_multiply()
-    test_forward_matmul()
-    test_forward_rsub()
-    test_forward_onehot()
-    test_forward_embedding()
-    test_forward_reshape()
-    test_forward_reciprocal()
-    test_forward_repeat()
-    test_forward_repeat_interleave()
-    test_forward_squeeze()
-    test_forward_unsqueeze()
-    test_forward_concatenate()
-    test_forward_reduce_sum()
-    test_forward_reduce_prod()
-    test_forward_argmin()
-    test_forward_argmax()
-    test_forward_norm()
-    test_forward_frobenius_norm()
-    test_forward_std()
-    test_forward_variance()
-    test_forward_relu()
-    test_forward_prelu()
-    test_forward_leakyrelu()
-    test_forward_elu()
-    test_forward_celu()
-    test_forward_gelu()
-    test_forward_selu()
-    test_forward_log_sigmoid()
-    test_forward_adaptiveavgpool()
-    test_forward_maxpool2d()
-    test_forward_maxpool1d()
-    test_forward_maxpool3d()
-    test_forward_hardtanh()
-    test_forward_conv()
-    test_forward_conv_transpose()
-    test_forward_threshold()
-    test_forward_contiguous()
-    test_forward_batchnorm()
-    test_forward_instancenorm()
-    test_forward_layernorm()
-    test_forward_groupnorm()
-    test_forward_transpose()
-    test_forward_size()
-    test_forward_view()
-    test_forward_select()
-    test_forward_take()
-    test_forward_topk()
-    test_forward_where()
-    test_forward_addcdiv()
-    test_forward_addcmul()
-    test_forward_true_divide()
-    test_forward_clone()
-    test_forward_softplus()
-    test_forward_softsign()
-    test_forward_logsoftmax()
-    test_forward_sigmoid()
-    test_forward_dense()
-    test_forward_avgpool()
-    test_forward_avgpool3d()
-    test_forward_dropout()
-    test_forward_slice()
-    test_forward_mean()
-    test_forward_expand()
-    test_forward_pow()
-    test_forward_unary()
-    test_forward_clamp()
-    test_forward_clamp_()
-    test_forward_logical_not()
-    test_forward_bitwise_not()
-    test_forward_bitwise_xor()
-    test_forward_logical_xor()
-    test_forward_isfinite()
-    test_forward_isnan()
-    test_forward_isinf()
-    test_forward_ones()
-    test_forward_ones_like()
-    test_forward_zeros()
-    test_forward_zeros_like()
-    test_forward_full()
-    test_forward_full_like()
-    test_forward_linspace()
-    test_forward_arange()
-    test_forward_mesh_grid()
-    test_forward_chunk()
-    test_forward_split()
-    test_forward_gather()
-    test_upsample()
-    test_forward_upsample3d()
-    test_forward_nms()
-    test_forward_roi_align()
-    test_to()
-    test_flatten()
-    test_type_as()
-    test_forward_functional_pad()
-    test_forward_zero_pad2d()
-    test_forward_constant_pad1d()
-    test_forward_constant_pad2d()
-    test_forward_constant_pad3d()
-    test_forward_reflection_pad1d()
-    test_forward_reflection_pad2d()
-    test_forward_replication_pad1d()
-    test_forward_replication_pad2d()
-    test_forward_replication_pad3d()
-    test_adaptive_pool3d()
-    test_conv3d()
-    test_conv3d_transpose()
-    test_forward_index()
-    test_min_max()
-    test_logsumexp()
-    test_stack()
-    test_stack_dynamic()
-    test_forward_unbind()
-    test_forward_nonzero()
-    test_forward_scatter()
-    test_numel()
+    # # some structural tests
+    # test_forward_traced_function()
+    # test_forward_dtypes()
+    # test_weight_names()
+    # test_duplicate_weight_use()
+
+    # # Single operator tests
+    # test_forward_pixel_shuffle()
+    # test_forward_add()
+    # test_forward_subtract()
+    # test_forward_multiply()
+    # test_forward_matmul()
+    # test_forward_rsub()
+    # test_forward_onehot()
+    # test_forward_embedding()
+    # test_forward_reshape()
+    # test_forward_reciprocal()
+    # test_forward_repeat()
+    # test_forward_repeat_interleave()
+    # test_forward_squeeze()
+    # test_forward_unsqueeze()
+    # test_forward_concatenate()
+    # test_forward_reduce_sum()
+    # test_forward_reduce_prod()
+    # test_forward_argmin()
+    # test_forward_argmax()
+    # test_forward_norm()
+    # test_forward_frobenius_norm()
+    # test_forward_std()
+    # test_forward_variance()
+    # test_forward_relu()
+    # test_forward_prelu()
+    # test_forward_leakyrelu()
+    # test_forward_elu()
+    # test_forward_celu()
+    # test_forward_gelu()
+    # test_forward_selu()
+    # test_forward_log_sigmoid()
+    # test_forward_adaptiveavgpool()
+    # test_forward_maxpool2d()
+    # test_forward_maxpool1d()
+    # test_forward_maxpool3d()
+    # test_forward_hardtanh()
+    # test_forward_conv()
+    # test_forward_conv_transpose()
+    # test_forward_threshold()
+    # test_forward_contiguous()
+    # test_forward_batchnorm()
+    # test_forward_instancenorm()
+    # test_forward_layernorm()
+    # test_forward_groupnorm()
+    # test_forward_transpose()
+    # test_forward_size()
+    # test_forward_view()
+    # test_forward_select()
+    # test_forward_take()
+    # test_forward_topk()
+    # test_forward_where()
+    # test_forward_addcdiv()
+    # test_forward_addcmul()
+    # test_forward_true_divide()
+    # test_forward_clone()
+    # test_forward_softplus()
+    # test_forward_softsign()
+    # test_forward_logsoftmax()
+    # test_forward_sigmoid()
+    # test_forward_dense()
+    # test_forward_avgpool()
+    # test_forward_avgpool3d()
+    # test_forward_dropout()
+    # test_forward_slice()
+    # test_forward_mean()
+    # test_forward_expand()
+    # test_forward_pow()
+    # test_forward_unary()
+    # test_forward_clamp()
+    # test_forward_clamp_()
+    # test_forward_logical_not()
+    # test_forward_bitwise_not()
+    # test_forward_bitwise_xor()
+    # test_forward_logical_xor()
+    # test_forward_isfinite()
+    # test_forward_isnan()
+    # test_forward_isinf()
+    # test_forward_ones()
+    # test_forward_ones_like()
+    # test_forward_zeros()
+    # test_forward_zeros_like()
+    # test_forward_full()
+    # test_forward_full_like()
+    # test_forward_linspace()
+    # test_forward_arange()
+    # test_forward_mesh_grid()
+    # test_forward_chunk()
+    # test_forward_split()
+    # test_forward_gather()
+    # test_upsample()
+    # test_forward_upsample3d()
+    # test_forward_nms()
+    # test_forward_roi_align()
+    # test_to()
+    # test_flatten()
+    # test_type_as()
+    # test_forward_functional_pad()
+    # test_forward_zero_pad2d()
+    # test_forward_constant_pad1d()
+    # test_forward_constant_pad2d()
+    # test_forward_constant_pad3d()
+    # test_forward_reflection_pad1d()
+    # test_forward_reflection_pad2d()
+    # test_forward_replication_pad1d()
+    # test_forward_replication_pad2d()
+    # test_forward_replication_pad3d()
+    # test_adaptive_pool3d()
+    # test_conv3d()
+    # test_conv3d_transpose()
+    # test_forward_index()
+    # test_min_max()
+    # test_logsumexp()
+    # test_stack()
+    # test_stack_dynamic()
+    # test_forward_unbind()
+    # test_forward_nonzero()
+    # test_forward_scatter()
+    # test_numel()
     test_bincount()
 
-    # Model tests
-    test_resnet18()
-    test_squeezenet1_0()
-    test_squeezenet1_1()
-    test_densenet121()
-    # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug
-    # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756
-    # test_inception_v3()
-    test_googlenet()
-    test_mnasnet0_5()
-    test_mobilenet_v2()
+    # # Model tests
+    # test_resnet18()
+    # test_squeezenet1_0()
+    # test_squeezenet1_1()
+    # test_densenet121()
+    # # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug
+    # # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756
+    # # test_inception_v3()
+    # test_googlenet()
+    # test_mnasnet0_5()
+    # test_mobilenet_v2()
 
-    test_custom_conversion_map()
+    # test_custom_conversion_map()
 
-    test_segmentaton_models()
-    test_3d_models()
+    # test_segmentaton_models()
+    # test_3d_models()
 
-    # Quantization test
-    from qnn_test import test_quantized_imagenet, test_quantized_modules
+    # # Quantization test
+    # from qnn_test import test_quantized_imagenet, test_quantized_modules
 
-    test_quantized_modules()
-    test_quantized_imagenet()
+    # test_quantized_modules()
+    # test_quantized_imagenet()
 
-    # Test simple conditionals and loop
-    test_control_flow()
-    test_simple_rnn()
+    # # Test simple conditionals and loop
+    # test_control_flow()
+    # test_simple_rnn()
 
-    # More complex recurrent models
-    from test_lstm import test_custom_lstm
+    # # More complex recurrent models
+    # from test_lstm import test_custom_lstm
 
-    test_custom_lstm()
+    # test_custom_lstm()
 
-    # Test bert model
-    test_forward_pretrained_bert_base_uncased()
+    # # Test bert model
+    # test_forward_pretrained_bert_base_uncased()
 
-    # Test convert torch script(jit) with specific inputs' types
-    test_convert_torch_script_with_input_types()
+    # # Test convert torch script(jit) with specific inputs' types
+    # test_convert_torch_script_with_input_types()

From 30e88dc10ab9f5236111a40f72d5cf092c652e2c Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Mon, 7 Dec 2020 00:21:20 +0900
Subject: [PATCH 04/19] use builtin::atomic_add

---
 src/target/llvm/codegen_llvm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index bcca7ba5bc15..b2767114815f 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -955,7 +955,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
       indices.push_back(i);
     }
     return builder_->CreateShuffleVector(v0, v1, indices);
-  } else if (op->op.same_as(Op::Get("tir.atomic_add"))) {
+  } else if (op->op.same_as(builtin::atomic_add())) {
     llvm::Value* v0 = MakeValue(op->args[0]);
     llvm::Value* v1 = MakeValue(op->args[1]);
     return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic);

From e37706f5f0130548f3fdf1406260a7019f8fdb02 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Mon, 7 Dec 2020 13:23:02 +0900
Subject: [PATCH 05/19] add atomic llvm codegen test, only works on int8 input
 somehow

---
 .../unittest/test_target_codegen_llvm.py      | 76 +++++++++++++------
 1 file changed, 52 insertions(+), 24 deletions(-)

diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index 162481bfdb6e..decf623f2e32 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -74,7 +74,6 @@ def use_llvm_intrinsic(A, C):
     C = tvm.te.extern(
         (1, 1), [A], lambda ins, outs: use_llvm_intrinsic(ins[0], outs[0]), name="C", dtype="int32"
     )
-
     s = tvm.te.create_schedule(C.op)
     f = tvm.build(s, [A, C], target="llvm")
 
@@ -750,27 +749,56 @@ def test_llvm_crt_static_lib():
     module.save("test.o")
 
 
+def atomic_add(x, y):
+    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
+
+
+@tvm.testing.requires_llvm
+def test_llvm_lower_atomic():
+    def do_atomic_add(A):
+        ib = tvm.tir.ir_builder.create()
+        n = A.shape[0]
+        atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
+        one = tvm.tir.const(1, A.dtype)
+        A_ptr = ib.buffer_ptr(A)
+        with ib.for_range(0, n, name="i", for_type="parallel") as i:
+            atomic_add_return[0] = atomic_add(
+                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one)
+        return ib.get()
+
+    A = tvm.te.placeholder((100,), dtype="int8", name="A")
+    C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
+    s = tvm.te.create_schedule(C.op)
+    f = tvm.build(s, [A], target="llvm")
+    a_np = np.zeros((100,), dtype=A.dtype)
+    a = tvm.nd.array(a_np)
+    f(a)
+    print(a)
+    print(tvm.lower(s, [A], simple_mode=True))
+
+
 if __name__ == "__main__":
-    test_multiple_func()
-    test_llvm_large_uintimm()
-    test_llvm_import()
-    test_alignment()
-    test_rank_zero()
-    test_rank_zero_bound_checkers()
-    test_llvm_bool()
-    test_llvm_persist_parallel()
-    test_llvm_condition()
-    test_llvm_vadd_pipeline()
-    test_llvm_add_pipeline()
-    test_llvm_intrin()
-    test_llvm_overloaded_intrin()
-    test_llvm_flip_pipeline()
-    test_llvm_madd_pipeline()
-    test_llvm_temp_space()
-    test_llvm_lookup_intrin()
-    test_llvm_div()
-    test_llvm_fp_math()
-    test_dwarf_debug_information()
-    test_llvm_shuffle()
-    test_llvm_bf16()
-    test_llvm_crt_static_lib()
+    # test_multiple_func()
+    # test_llvm_large_uintimm()
+    # test_llvm_import()
+    # test_alignment()
+    # test_rank_zero()
+    # test_rank_zero_bound_checkers()
+    # test_llvm_bool()
+    # test_llvm_persist_parallel()
+    # test_llvm_condition()
+    # test_llvm_vadd_pipeline()
+    # test_llvm_add_pipeline()
+    # test_llvm_intrin()
+    # test_llvm_overloaded_intrin()
+    # test_llvm_flip_pipeline()
+    # test_llvm_madd_pipeline()
+    # test_llvm_temp_space()
+    # test_llvm_lookup_intrin()
+    # test_llvm_div()
+    # test_llvm_fp_math()
+    # test_dwarf_debug_information()
+    # test_llvm_shuffle()
+    # test_llvm_bf16()
+    # test_llvm_crt_static_lib()
+    test_llvm_lower_atomic()

From faf20d63d0b4fbe7ff2d34ce836ea358e64db8d6 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 04:12:24 +0900
Subject: [PATCH 06/19] supports fp32 atomic

---
 src/target/llvm/codegen_llvm.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index b2767114815f..c0ec3255bab5 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -956,9 +956,15 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     }
     return builder_->CreateShuffleVector(v0, v1, indices);
   } else if (op->op.same_as(builtin::atomic_add())) {
+    ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
     llvm::Value* v0 = MakeValue(op->args[0]);
     llvm::Value* v1 = MakeValue(op->args[1]);
-    return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::AtomicOrdering::Monotonic);
+    if (op->args[1]->dtype.is_float()) {
+      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                       llvm::AtomicOrdering::Monotonic);
+    }
+    return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                     llvm::AtomicOrdering::Monotonic);
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;

From 07c50fa7b7364e1883504964c9ac525f4e4f7c5d Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 04:36:02 +0900
Subject: [PATCH 07/19] drop support for cpu atomic

---
 src/target/llvm/codegen_amdgpu.cc                 | 15 +++++++++++++++
 src/target/llvm/codegen_llvm.cc                   | 10 ----------
 src/target/llvm/codegen_nvptx.cc                  | 10 ++++++++++
 tests/python/unittest/test_target_codegen_llvm.py | 10 +++-------
 4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc
index 2890c1ce3e56..1e6c64abf655 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -183,6 +183,21 @@ class CodeGenAMDGPU : public CodeGenLLVM {
 
   unsigned GetGlobalAddressSpace() const final { return 1; }
 
+  llvm::Value* CreateIntrinsic(const CallNode* op) final {
+    if (op->op.same_as(builtin::atomic_add())) {
+      ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
+      llvm::Value* v0 = MakeValue(op->args[0]);
+      llvm::Value* v1 = MakeValue(op->args[1]);
+      if (op->args[1]->dtype.is_float()) {
+        return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                         llvm::AtomicOrdering::Monotonic);
+      }
+      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                       llvm::AtomicOrdering::Monotonic);
+    }
+    return CodeGenLLVM::CreateIntrinsic(op);
+  }
+
  protected:
   void InitTarget(llvm::TargetMachine* tm) final {
     // Maximum vector lane = float4
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index c0ec3255bab5..d10ed311949c 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -955,16 +955,6 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
       indices.push_back(i);
     }
     return builder_->CreateShuffleVector(v0, v1, indices);
-  } else if (op->op.same_as(builtin::atomic_add())) {
-    ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
-    llvm::Value* v0 = MakeValue(op->args[0]);
-    llvm::Value* v1 = MakeValue(op->args[1]);
-    if (op->args[1]->dtype.is_float()) {
-      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
-                                       llvm::AtomicOrdering::Monotonic);
-    }
-    return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
-                                     llvm::AtomicOrdering::Monotonic);
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;
diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc
index 22e612b11090..06584eb5b189 100644
--- a/src/target/llvm/codegen_nvptx.cc
+++ b/src/target/llvm/codegen_nvptx.cc
@@ -232,6 +232,16 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) {
     auto fty = llvm::FunctionType::get(t_int32_, false);
     auto val = llvm::InlineAsm::get(fty, "activemask.b32 %0", "=r", true);
     return builder_->CreateCall(val);
+  } else if (op->op.same_as(builtin::atomic_add())) {
+    ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
+    llvm::Value* v0 = MakeValue(op->args[0]);
+    llvm::Value* v1 = MakeValue(op->args[1]);
+    if (op->args[1]->dtype.is_float()) {
+      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                       llvm::AtomicOrdering::Monotonic);
+    }
+    return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                     llvm::AtomicOrdering::Monotonic);
   }
   return CodeGenLLVM::CreateIntrinsic(op);
 }
diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index decf623f2e32..b6e54bccc513 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -766,15 +766,11 @@ def do_atomic_add(A):
                 tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one)
         return ib.get()
 
-    A = tvm.te.placeholder((100,), dtype="int8", name="A")
+    A = tvm.te.placeholder((100,), dtype="int32", name="A")
     C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
     s = tvm.te.create_schedule(C.op)
-    f = tvm.build(s, [A], target="llvm")
-    a_np = np.zeros((100,), dtype=A.dtype)
-    a = tvm.nd.array(a_np)
-    f(a)
-    print(a)
-    print(tvm.lower(s, [A], simple_mode=True))
+    # This does not work because of pointer type mismatch
+    # f = tvm.build(s, [A], target="llvm")
 
 
 if __name__ == "__main__":

From e48563e0f52640c3bc1541ad61d564d8d2162892 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 04:38:10 +0900
Subject: [PATCH 08/19] add comment

---
 tests/python/unittest/test_target_codegen_llvm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index b6e54bccc513..b0f59339650b 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -770,6 +770,10 @@ def do_atomic_add(A):
     C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
     s = tvm.te.create_schedule(C.op)
     # This does not work because of pointer type mismatch
+    # TVMError: LLVM module verification failed with the following errors:
+    # Argument value type does not match pointer operand type!
+    # %21 = atomicrmw add i8* %7, i32 1 monotonic
+    # i8
     # f = tvm.build(s, [A], target="llvm")
 
 

From 8dc9fd66ee593aa9c13a0d20475674b27452154f Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 04:50:53 +0900
Subject: [PATCH 09/19] add atomic gpu unit test

---
 .../unittest/test_target_codegen_llvm.py      | 85 +++++++++++++------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index b0f59339650b..8e3cb4a9a262 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -767,7 +767,7 @@ def do_atomic_add(A):
         return ib.get()
 
     A = tvm.te.placeholder((100,), dtype="int32", name="A")
-    C = tvm.te.extern((1000,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
+    C = tvm.te.extern((100,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
     s = tvm.te.create_schedule(C.op)
     # This does not work because of pointer type mismatch
     # TVMError: LLVM module verification failed with the following errors:
@@ -777,28 +777,63 @@ def do_atomic_add(A):
     # f = tvm.build(s, [A], target="llvm")
 
 
+@tvm.testing.requires_llvm
+@tvm.testing.requires_gpu
+def test_llvm_gpu_lower_atomic():
+    def do_atomic_add(A):
+        ib = tvm.tir.ir_builder.create()
+        n = A.shape[0]
+        atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
+        one = tvm.tir.const(1, A.dtype)
+        A_ptr = ib.buffer_ptr(A)
+        nthread_tx = 64
+        with ib.new_scope():
+            nthread_bx = (n + nthread_tx - 1) // nthread_tx
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            atomic_add_return[0] = atomic_add(
+                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one)
+        return ib.get()
+
+    size = 1024
+    for dtype in ["int32", "float32"]:
+        A = tvm.te.placeholder((size,), dtype=dtype, name="A")
+        C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype)
+        s = tvm.te.create_schedule(C.op)
+        f = tvm.build(s, [A], target="nvptx")
+
+        ctx = tvm.gpu()
+        a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), ctx)
+        f(a)
+        ref = np.zeros((size,)).astype(A.dtype)
+        ref[0] = size
+        tvm.testing.assert_allclose(a.asnumpy(), ref, rtol=1e-5)
+
+
 if __name__ == "__main__":
-    # test_multiple_func()
-    # test_llvm_large_uintimm()
-    # test_llvm_import()
-    # test_alignment()
-    # test_rank_zero()
-    # test_rank_zero_bound_checkers()
-    # test_llvm_bool()
-    # test_llvm_persist_parallel()
-    # test_llvm_condition()
-    # test_llvm_vadd_pipeline()
-    # test_llvm_add_pipeline()
-    # test_llvm_intrin()
-    # test_llvm_overloaded_intrin()
-    # test_llvm_flip_pipeline()
-    # test_llvm_madd_pipeline()
-    # test_llvm_temp_space()
-    # test_llvm_lookup_intrin()
-    # test_llvm_div()
-    # test_llvm_fp_math()
-    # test_dwarf_debug_information()
-    # test_llvm_shuffle()
-    # test_llvm_bf16()
-    # test_llvm_crt_static_lib()
-    test_llvm_lower_atomic()
+    test_multiple_func()
+    test_llvm_large_uintimm()
+    test_llvm_import()
+    test_alignment()
+    test_rank_zero()
+    test_rank_zero_bound_checkers()
+    test_llvm_bool()
+    test_llvm_persist_parallel()
+    test_llvm_condition()
+    test_llvm_vadd_pipeline()
+    test_llvm_add_pipeline()
+    test_llvm_intrin()
+    test_llvm_overloaded_intrin()
+    test_llvm_flip_pipeline()
+    test_llvm_madd_pipeline()
+    test_llvm_temp_space()
+    test_llvm_lookup_intrin()
+    test_llvm_div()
+    test_llvm_fp_math()
+    test_dwarf_debug_information()
+    test_llvm_shuffle()
+    test_llvm_bf16()
+    test_llvm_crt_static_lib()
+    test_llvm_gpu_lower_atomic()

From 9147f9ef08284e4d660f0a50cbc0be35ac986528 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 04:57:42 +0900
Subject: [PATCH 10/19] reenable other tests

---
 tests/python/frontend/pytorch/test_forward.py | 309 +++++++++---------
 tests/python/relay/test_op_level3.py          |   3 -
 tests/python/relay/test_op_level5.py          |  44 +--
 tests/python/topi/python/test_topi_vision.py  |  25 +-
 4 files changed, 185 insertions(+), 196 deletions(-)

diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index f31020483ccd..2dda675c74f5 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -207,7 +207,6 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
             relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params)
             relay_model = graph_runtime.create(relay_graph, relay_lib, ctx)
             relay_model.set_input(**relay_params)
-
             for name, inp in compiled_input.items():
                 relay_model.set_input(name, inp)
             relay_model.run()
@@ -1913,8 +1912,6 @@ def verify_model_vm(input_model, ishapes, idtype=torch.float, idata=None, target
         ctx = tvm.context(tgt, 0)
 
         executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=tgt)
-        print(executor.executable.lib.imported_modules[0].get_source("llvm"))
-
         evaluator = executor.evaluate()
 
         # Inference
@@ -3367,167 +3364,167 @@ def test_fn(x, weights=None):
 
 
 if __name__ == "__main__":
-    # # some structural tests
-    # test_forward_traced_function()
-    # test_forward_dtypes()
-    # test_weight_names()
-    # test_duplicate_weight_use()
-
-    # # Single operator tests
-    # test_forward_pixel_shuffle()
-    # test_forward_add()
-    # test_forward_subtract()
-    # test_forward_multiply()
-    # test_forward_matmul()
-    # test_forward_rsub()
-    # test_forward_onehot()
-    # test_forward_embedding()
-    # test_forward_reshape()
-    # test_forward_reciprocal()
-    # test_forward_repeat()
-    # test_forward_repeat_interleave()
-    # test_forward_squeeze()
-    # test_forward_unsqueeze()
-    # test_forward_concatenate()
-    # test_forward_reduce_sum()
-    # test_forward_reduce_prod()
-    # test_forward_argmin()
-    # test_forward_argmax()
-    # test_forward_norm()
-    # test_forward_frobenius_norm()
-    # test_forward_std()
-    # test_forward_variance()
-    # test_forward_relu()
-    # test_forward_prelu()
-    # test_forward_leakyrelu()
-    # test_forward_elu()
-    # test_forward_celu()
-    # test_forward_gelu()
-    # test_forward_selu()
-    # test_forward_log_sigmoid()
-    # test_forward_adaptiveavgpool()
-    # test_forward_maxpool2d()
-    # test_forward_maxpool1d()
-    # test_forward_maxpool3d()
-    # test_forward_hardtanh()
-    # test_forward_conv()
-    # test_forward_conv_transpose()
-    # test_forward_threshold()
-    # test_forward_contiguous()
-    # test_forward_batchnorm()
-    # test_forward_instancenorm()
-    # test_forward_layernorm()
-    # test_forward_groupnorm()
-    # test_forward_transpose()
-    # test_forward_size()
-    # test_forward_view()
-    # test_forward_select()
-    # test_forward_take()
-    # test_forward_topk()
-    # test_forward_where()
-    # test_forward_addcdiv()
-    # test_forward_addcmul()
-    # test_forward_true_divide()
-    # test_forward_clone()
-    # test_forward_softplus()
-    # test_forward_softsign()
-    # test_forward_logsoftmax()
-    # test_forward_sigmoid()
-    # test_forward_dense()
-    # test_forward_avgpool()
-    # test_forward_avgpool3d()
-    # test_forward_dropout()
-    # test_forward_slice()
-    # test_forward_mean()
-    # test_forward_expand()
-    # test_forward_pow()
-    # test_forward_unary()
-    # test_forward_clamp()
-    # test_forward_clamp_()
-    # test_forward_logical_not()
-    # test_forward_bitwise_not()
-    # test_forward_bitwise_xor()
-    # test_forward_logical_xor()
-    # test_forward_isfinite()
-    # test_forward_isnan()
-    # test_forward_isinf()
-    # test_forward_ones()
-    # test_forward_ones_like()
-    # test_forward_zeros()
-    # test_forward_zeros_like()
-    # test_forward_full()
-    # test_forward_full_like()
-    # test_forward_linspace()
-    # test_forward_arange()
-    # test_forward_mesh_grid()
-    # test_forward_chunk()
-    # test_forward_split()
-    # test_forward_gather()
-    # test_upsample()
-    # test_forward_upsample3d()
-    # test_forward_nms()
-    # test_forward_roi_align()
-    # test_to()
-    # test_flatten()
-    # test_type_as()
-    # test_forward_functional_pad()
-    # test_forward_zero_pad2d()
-    # test_forward_constant_pad1d()
-    # test_forward_constant_pad2d()
-    # test_forward_constant_pad3d()
-    # test_forward_reflection_pad1d()
-    # test_forward_reflection_pad2d()
-    # test_forward_replication_pad1d()
-    # test_forward_replication_pad2d()
-    # test_forward_replication_pad3d()
-    # test_adaptive_pool3d()
-    # test_conv3d()
-    # test_conv3d_transpose()
-    # test_forward_index()
-    # test_min_max()
-    # test_logsumexp()
-    # test_stack()
-    # test_stack_dynamic()
-    # test_forward_unbind()
-    # test_forward_nonzero()
-    # test_forward_scatter()
-    # test_numel()
+    # some structural tests
+    test_forward_traced_function()
+    test_forward_dtypes()
+    test_weight_names()
+    test_duplicate_weight_use()
+
+    # Single operator tests
+    test_forward_pixel_shuffle()
+    test_forward_add()
+    test_forward_subtract()
+    test_forward_multiply()
+    test_forward_matmul()
+    test_forward_rsub()
+    test_forward_onehot()
+    test_forward_embedding()
+    test_forward_reshape()
+    test_forward_reciprocal()
+    test_forward_repeat()
+    test_forward_repeat_interleave()
+    test_forward_squeeze()
+    test_forward_unsqueeze()
+    test_forward_concatenate()
+    test_forward_reduce_sum()
+    test_forward_reduce_prod()
+    test_forward_argmin()
+    test_forward_argmax()
+    test_forward_norm()
+    test_forward_frobenius_norm()
+    test_forward_std()
+    test_forward_variance()
+    test_forward_relu()
+    test_forward_prelu()
+    test_forward_leakyrelu()
+    test_forward_elu()
+    test_forward_celu()
+    test_forward_gelu()
+    test_forward_selu()
+    test_forward_log_sigmoid()
+    test_forward_adaptiveavgpool()
+    test_forward_maxpool2d()
+    test_forward_maxpool1d()
+    test_forward_maxpool3d()
+    test_forward_hardtanh()
+    test_forward_conv()
+    test_forward_conv_transpose()
+    test_forward_threshold()
+    test_forward_contiguous()
+    test_forward_batchnorm()
+    test_forward_instancenorm()
+    test_forward_layernorm()
+    test_forward_groupnorm()
+    test_forward_transpose()
+    test_forward_size()
+    test_forward_view()
+    test_forward_select()
+    test_forward_take()
+    test_forward_topk()
+    test_forward_where()
+    test_forward_addcdiv()
+    test_forward_addcmul()
+    test_forward_true_divide()
+    test_forward_clone()
+    test_forward_softplus()
+    test_forward_softsign()
+    test_forward_logsoftmax()
+    test_forward_sigmoid()
+    test_forward_dense()
+    test_forward_avgpool()
+    test_forward_avgpool3d()
+    test_forward_dropout()
+    test_forward_slice()
+    test_forward_mean()
+    test_forward_expand()
+    test_forward_pow()
+    test_forward_unary()
+    test_forward_clamp()
+    test_forward_clamp_()
+    test_forward_logical_not()
+    test_forward_bitwise_not()
+    test_forward_bitwise_xor()
+    test_forward_logical_xor()
+    test_forward_isfinite()
+    test_forward_isnan()
+    test_forward_isinf()
+    test_forward_ones()
+    test_forward_ones_like()
+    test_forward_zeros()
+    test_forward_zeros_like()
+    test_forward_full()
+    test_forward_full_like()
+    test_forward_linspace()
+    test_forward_arange()
+    test_forward_mesh_grid()
+    test_forward_chunk()
+    test_forward_split()
+    test_forward_gather()
+    test_upsample()
+    test_forward_upsample3d()
+    test_forward_nms()
+    test_forward_roi_align()
+    test_to()
+    test_flatten()
+    test_type_as()
+    test_forward_functional_pad()
+    test_forward_zero_pad2d()
+    test_forward_constant_pad1d()
+    test_forward_constant_pad2d()
+    test_forward_constant_pad3d()
+    test_forward_reflection_pad1d()
+    test_forward_reflection_pad2d()
+    test_forward_replication_pad1d()
+    test_forward_replication_pad2d()
+    test_forward_replication_pad3d()
+    test_adaptive_pool3d()
+    test_conv3d()
+    test_conv3d_transpose()
+    test_forward_index()
+    test_min_max()
+    test_logsumexp()
+    test_stack()
+    test_stack_dynamic()
+    test_forward_unbind()
+    test_forward_nonzero()
+    test_forward_scatter()
+    test_numel()
     test_bincount()
 
-    # # Model tests
-    # test_resnet18()
-    # test_squeezenet1_0()
-    # test_squeezenet1_1()
-    # test_densenet121()
-    # # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug
-    # # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756
-    # # test_inception_v3()
-    # test_googlenet()
-    # test_mnasnet0_5()
-    # test_mobilenet_v2()
+    # Model tests
+    test_resnet18()
+    test_squeezenet1_0()
+    test_squeezenet1_1()
+    test_densenet121()
+    # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug
+    # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756
+    # test_inception_v3()
+    test_googlenet()
+    test_mnasnet0_5()
+    test_mobilenet_v2()
 
-    # test_custom_conversion_map()
+    test_custom_conversion_map()
 
-    # test_segmentaton_models()
-    # test_3d_models()
+    test_segmentaton_models()
+    test_3d_models()
 
-    # # Quantization test
-    # from qnn_test import test_quantized_imagenet, test_quantized_modules
+    # Quantization test
+    from qnn_test import test_quantized_imagenet, test_quantized_modules
 
-    # test_quantized_modules()
-    # test_quantized_imagenet()
+    test_quantized_modules()
+    test_quantized_imagenet()
 
-    # # Test simple conditionals and loop
-    # test_control_flow()
-    # test_simple_rnn()
+    # Test simple conditionals and loop
+    test_control_flow()
+    test_simple_rnn()
 
-    # # More complex recurrent models
-    # from test_lstm import test_custom_lstm
+    # More complex recurrent models
+    from test_lstm import test_custom_lstm
 
-    # test_custom_lstm()
+    test_custom_lstm()
 
-    # # Test bert model
-    # test_forward_pretrained_bert_base_uncased()
+    # Test bert model
+    test_forward_pretrained_bert_base_uncased()
 
-    # # Test convert torch script(jit) with specific inputs' types
-    # test_convert_torch_script_with_input_types()
+    # Test convert torch script(jit) with specific inputs' types
+    test_convert_torch_script_with_input_types()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index fc1929e9dc18..0c1f0463e6ff 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1017,9 +1017,6 @@ def verify_scatter_add(dshape, ishape, axis=0):
         ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis)
         for target, ctx in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                if target == "nvptx":
-                    # TODO(masahi): support atomic in LLVM codegen
-                    continue
                 intrp = relay.create_executor(kind, ctx=ctx, target=target)
                 op_res = intrp.evaluate(func)(data_np, indices_np, updates_np)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index ee7acc619f9d..f114957f3cab 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -1194,26 +1194,26 @@ def verify_batch_to_space_nd(dshape, block_shape, crops):
 
 
 if __name__ == "__main__":
-    # test_resize_infer_type()
-    # test_resize()
-    # test_resize3d_infer_type()
-    # test_resize3d()
-    # test_crop_and_resize()
-    # test_multibox_prior()
-    # test_multibox_transform_loc()
-    # test_get_valid_counts()
-    # test_roi_align()
-    # test_roi_pool()
-    # test_proposal()
-    # test_yolo_reorg_infer_shape()
-    # test_yolo_reorg()
+    test_resize_infer_type()
+    test_resize()
+    test_resize3d_infer_type()
+    test_resize3d()
+    test_crop_and_resize()
+    test_multibox_prior()
+    test_multibox_transform_loc()
+    test_get_valid_counts()
+    test_roi_align()
+    test_roi_pool()
+    test_proposal()
+    test_yolo_reorg_infer_shape()
+    test_yolo_reorg()
     test_non_max_suppression()
-    # test_deformable_conv2d()
-    # test_depth_to_space()
-    # test_space_to_depth()
-    # test_dilation2d_infer_type()
-    # test_dilation2d_run()
-    # test_affine_grid()
-    # test_grid_sample()
-    # test_space_to_batch_nd()
-    # test_batch_to_space_nd()
+    test_deformable_conv2d()
+    test_depth_to_space()
+    test_space_to_depth()
+    test_dilation2d_infer_type()
+    test_dilation2d_run()
+    test_affine_grid()
+    test_grid_sample()
+    test_space_to_batch_nd()
+    test_batch_to_space_nd()
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 3fb02075dbc7..778843be37de 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -202,16 +202,11 @@ def check_device(device):
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
 
         tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx)
-        if device in ["nvptx"]:
-            f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device)
-            # f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
-            print(f.imported_modules[0].get_source("llvm"))
-        else:
-            f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device)
-            f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
-        # tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
+        f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device)
+        f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
+        tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
 
-    for device in ["nvptx"]:
+    for device in ["llvm", "cuda", "opencl", "nvptx"]:
         check_device(device)
 
 
@@ -631,10 +626,10 @@ def test_proposal():
 
 
 if __name__ == "__main__":
-    # test_get_valid_counts()
-    # test_multibox_prior()
-    # test_multibox_detection()
-    # test_roi_align()
-    # test_roi_pool()
-    # test_proposal()
+    test_get_valid_counts()
+    test_multibox_prior()
+    test_multibox_detection()
+    test_roi_align()
+    test_roi_pool()
+    test_proposal()
     test_non_max_suppression()

From 552aa5fd45948aad3d715a409b40fa305f6333aa Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 05:04:19 +0900
Subject: [PATCH 11/19] add doc string

---
 include/tvm/tir/builtin.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index a5ac378be581..a150595ab551 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -549,6 +549,9 @@ TVM_DLL const Op& vectorlow();
  */
 TVM_DLL const Op& vectorcombine();
 
+/*!
+ * \brief atomic add instruction, corresponding e.g. to atomicAdd in CUDA
+ */
 TVM_DLL const Op& atomic_add();
 
 /*! \brief The kind of structure field info used in intrinsic */

From 337f20998b0cc3d3cb738a24115be2aad057afd1 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Tue, 8 Dec 2020 07:17:32 +0900
Subject: [PATCH 12/19] run black

---
 tests/python/unittest/test_target_codegen_llvm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index 8e3cb4a9a262..bf8afc9e85dd 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -763,7 +763,8 @@ def do_atomic_add(A):
         A_ptr = ib.buffer_ptr(A)
         with ib.for_range(0, n, name="i", for_type="parallel") as i:
             atomic_add_return[0] = atomic_add(
-                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one)
+                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
+            )
         return ib.get()
 
     A = tvm.te.placeholder((100,), dtype="int32", name="A")
@@ -794,7 +795,8 @@ def do_atomic_add(A):
             ib.scope_attr(tx, "thread_extent", nthread_tx)
             ib.scope_attr(bx, "thread_extent", nthread_bx)
             atomic_add_return[0] = atomic_add(
-                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one)
+                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
+            )
         return ib.get()
 
     size = 1024

From c9f413acc359cc4a1e5904a19a2b479dd2d00023 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 09:23:33 +0900
Subject: [PATCH 13/19] fix build with llvm 8 and older

---
 src/target/llvm/codegen_amdgpu.cc | 8 ++++++--
 src/target/llvm/codegen_nvptx.cc  | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc
index 1e6c64abf655..3ff686941f46 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -189,8 +189,12 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       llvm::Value* v0 = MakeValue(op->args[0]);
       llvm::Value* v1 = MakeValue(op->args[1]);
       if (op->args[1]->dtype.is_float()) {
-        return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
-                                         llvm::AtomicOrdering::Monotonic);
+#if TVM_LLVM_VERSION >= 90
+      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                       llvm::AtomicOrdering::Monotonic);
+#else
+      LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer";
+#endif
       }
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
                                        llvm::AtomicOrdering::Monotonic);
diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc
index 06584eb5b189..d8002a2b58a6 100644
--- a/src/target/llvm/codegen_nvptx.cc
+++ b/src/target/llvm/codegen_nvptx.cc
@@ -237,8 +237,12 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) {
     llvm::Value* v0 = MakeValue(op->args[0]);
     llvm::Value* v1 = MakeValue(op->args[1]);
     if (op->args[1]->dtype.is_float()) {
+#if TVM_LLVM_VERSION >= 90
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
                                        llvm::AtomicOrdering::Monotonic);
+#else
+      LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer";
+#endif
     }
     return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
                                      llvm::AtomicOrdering::Monotonic);

From 0d6b4800409978e3c4269719d85cafcb54b79e43 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 09:26:18 +0900
Subject: [PATCH 14/19] fix format

---
 src/target/llvm/codegen_amdgpu.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc
index 3ff686941f46..605870f48c52 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -190,10 +190,10 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       llvm::Value* v1 = MakeValue(op->args[1]);
       if (op->args[1]->dtype.is_float()) {
 #if TVM_LLVM_VERSION >= 90
-      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
-                                       llvm::AtomicOrdering::Monotonic);
+        return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                         llvm::AtomicOrdering::Monotonic);
 #else
-      LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer";
+        LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer";
 #endif
       }
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,

From 9dea22ad2e699d8cf0f85c603423100f70bdcbd8 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 09:44:09 +0900
Subject: [PATCH 15/19] do not run float32 atomic test on ci

---
 tests/python/unittest/test_target_codegen_llvm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index bf8afc9e85dd..4b67752367db 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -800,7 +800,8 @@ def do_atomic_add(A):
         return ib.get()
 
     size = 1024
-    for dtype in ["int32", "float32"]:
+    # CI uses LLVM 8, which does not support float atomic
+    for dtype in ["int32"]:
         A = tvm.te.placeholder((size,), dtype=dtype, name="A")
         C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype)
         s = tvm.te.create_schedule(C.op)

From 7551cbae8b256d1b58ca19b04fea7d63834d8115 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 10:43:40 +0900
Subject: [PATCH 16/19] do not run scatter_add 1d with float inputs on CI

---
 tests/python/relay/test_op_level3.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 0c1f0463e6ff..427b0a99a756 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1002,27 +1002,33 @@ def ref_scatter_add(data, indices, updates, axis=0):
             output[tuple(new_index)] += updates[index]
         return output
 
-    def verify_scatter_add(dshape, ishape, axis=0):
-        d = relay.var("d", relay.TensorType(dshape, "float32"))
+    def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"):
+        d = relay.var("d", relay.TensorType(dshape, dtype))
         i = relay.var("i", relay.TensorType(ishape, "int64"))
-        u = relay.var("u", relay.TensorType(ishape, "float32"))
+        u = relay.var("u", relay.TensorType(ishape, dtype))
         z = relay.op.scatter_add(d, i, u, axis)
 
         func = relay.Function([d, i, u], z)
 
-        data_np = np.random.uniform(size=dshape).astype("float32")
-        updates_np = np.random.uniform(size=ishape).astype("float32")
+        data_np = np.random.uniform(size=dshape).astype(dtype)
+        updates_np = np.random.uniform(size=ishape).astype(dtype)
         indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64")
 
         ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis)
         for target, ctx in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
+                if target == "nvptx" and dtype == "float32" and len(dshape) == 1:
+                    # scatter_add 1D on GPU is implemented via atomic.
+                    # Floating point atomic requires LLVM 9 or newer for nvptx backend.
+                    # But LLVM on CI is LLVM 8,
+                    continue
                 intrp = relay.create_executor(kind, ctx=ctx, target=target)
                 op_res = intrp.evaluate(func)(data_np, indices_np, updates_np)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
-    verify_scatter_add((10,), (10,), 0)
-    verify_scatter_add((1000,), (1000,), 0)
+    verify_scatter_add((10,), (10,), 0, dtype="int32")
+    verify_scatter_add((1000,), (1000,))
+    verify_scatter_add((1000,), (1000,), 0, dtype="int32")
     verify_scatter_add((10, 5), (10, 5), -2)
     verify_scatter_add((10, 5), (10, 5), -1)
     verify_scatter_add((10, 5), (3, 5), 0)

From 65498b0b5759f914f60d28b10eb41f7dc644ab37 Mon Sep 17 00:00:00 2001
From: masa <masa@pop-os.localdomain>
Date: Tue, 8 Dec 2020 10:45:20 +0900
Subject: [PATCH 17/19] fix typo

---
 tests/python/relay/test_op_level3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 427b0a99a756..668285dfb882 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1020,7 +1020,7 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"):
                 if target == "nvptx" and dtype == "float32" and len(dshape) == 1:
                     # scatter_add 1D on GPU is implemented via atomic.
                     # Floating point atomic requires LLVM 9 or newer for nvptx backend.
-                    # But LLVM on CI is LLVM 8,
+                    # But LLVM on CI is LLVM 8.
                     continue
                 intrp = relay.create_executor(kind, ctx=ctx, target=target)
                 op_res = intrp.evaluate(func)(data_np, indices_np, updates_np)

From 82a887c691079f083ba6419aff54e4169aaef522 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Tue, 8 Dec 2020 20:02:50 +0900
Subject: [PATCH 18/19] add todo comment for cpu backend

---
 src/target/llvm/codegen_llvm.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index d10ed311949c..12573e29c9de 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -955,6 +955,9 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
       indices.push_back(i);
     }
     return builder_->CreateShuffleVector(v0, v1, indices);
+  } else if (op->op.same_as(builtin::atomic_add())) {
+    // TODO(masahi): Support atomic for CPU backend
+    LOG(FATAL) << "CPU backend does not support atomic add yet.";
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;

From e4eda99986839d9e654a7b5932597a4d0809d871 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Tue, 8 Dec 2020 20:32:57 +0900
Subject: [PATCH 19/19] fix build on ci

---
 src/target/llvm/codegen_llvm.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 12573e29c9de..70f094a186e7 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -958,6 +958,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
   } else if (op->op.same_as(builtin::atomic_add())) {
     // TODO(masahi): Support atomic for CPU backend
     LOG(FATAL) << "CPU backend does not support atomic add yet.";
+    return nullptr;
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->op;
     return nullptr;