From 73726a89e6afb4c9aa118c943674b01c31597886 Mon Sep 17 00:00:00 2001 From: tobe Date: Tue, 3 Dec 2019 19:49:22 +0800 Subject: [PATCH 01/30] Add implementation of TVMDSOOp --- CMakeLists.txt | 2 + cmake/config.cmake | 3 + cmake/modules/contrib/TFOP.cmake | 45 ++++ python/tvm/contrib/tf_op/__init__.py | 20 ++ python/tvm/contrib/tf_op/module.py | 104 +++++++++ src/contrib/tf_op/index_seq.h | 44 ++++ src/contrib/tf_op/tvm_dso_op_kernels.cc | 285 ++++++++++++++++++++++++ src/contrib/tf_op/tvm_dso_ops.cc | 91 ++++++++ 8 files changed, 594 insertions(+) create mode 100644 cmake/modules/contrib/TFOP.cmake create mode 100644 python/tvm/contrib/tf_op/__init__.py create mode 100644 python/tvm/contrib/tf_op/module.py create mode 100644 src/contrib/tf_op/index_seq.h create mode 100644 src/contrib/tf_op/tvm_dso_op_kernels.cc create mode 100644 src/contrib/tf_op/tvm_dso_ops.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index bf18ffc9e856..2ec625d0dbed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,7 @@ tvm_option(USE_MSVC_MT "Build with MT" OFF) tvm_option(USE_MICRO "Build with Micro" OFF) tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF) tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF) +tvm_option(USE_TFOP "Build with TensorFlow TVMDSOOp" OFF) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -257,6 +258,7 @@ include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) include(cmake/modules/contrib/HybridDump.cmake) +include(cmake/modules/contrib/TFOP.cmake) if(NOT MSVC) include(CheckCXXCompilerFlag) diff --git a/cmake/config.cmake b/cmake/config.cmake index dba1b44e676e..01918fb2f974 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -184,3 +184,6 @@ set(USE_VTA_FPGA OFF) # Whether to build the example external runtime module set(USE_EXAMPLE_EXT_RUNTIME OFF) + +# Whether to build the TensorFlow TVMDSOOp module +set(USE_TFOP OFF) diff --git a/cmake/modules/contrib/TFOP.cmake b/cmake/modules/contrib/TFOP.cmake new file mode 100644 index 000000000000..52d017c08e16 --- /dev/null +++ b/cmake/modules/contrib/TFOP.cmake @@ -0,0 +1,45 @@ + +if(NOT USE_TFOP STREQUAL "OFF") + + if ("${TVM_HOME}" STREQUAL "") + message(FATAL_ERROR "TVM_HOME is not defined") + else() + message("Use TVM_HOME=\"${TVM_HOME}\"") + endif() + + include_directories(${TVM_HOME}/include) + include_directories(${TVM_HOME}/3rdparty/dlpack/include) + include_directories(${TVM_HOME}/3rdparty/dmlc-core/include) + + execute_process(COMMAND python -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" + OUTPUT_VARIABLE TF_COMPILE_FLAGS_STR + RESULT_VARIABLE TF_STATUS) + if (NOT ${TF_STATUS} EQUAL 0) + message(FATAL_ERROR "Fail to get TensorFlow compile flags") + endif() + + execute_process(COMMAND python -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags()))" + OUTPUT_VARIABLE TF_LINK_FLAGS_STR + RESULT_VARIABLE TF_STATUS) + if (NOT ${TF_STATUS} EQUAL 0) + message(FATAL_ERROR "Fail to get TensorFlow link flags") + endif() + + string(REGEX REPLACE "\n" " " TF_FLAGS "${TF_COMPILE_FLAGS} ${TF_LINK_FLAGS}") + message("Use TensorFlow flags=\"${TF_FLAGS}\"") + separate_arguments(TF_COMPILE_FLAGS UNIX_COMMAND ${TF_COMPILE_FLAGS_STR}) + separate_arguments(TF_LINK_FLAGS UNIX_COMMAND ${TF_LINK_FLAGS_STR}) + + + set(OP_LIBRARY_NAME tvm_dso_op) + file(GLOB_RECURSE TFTVM_SRCS src/*.cc) + add_library(${OP_LIBRARY_NAME} SHARED 
${TFTVM_SRCS}) + set_target_properties(${OP_LIBRARY_NAME} PROPERTIES PREFIX "") + + set(TFTVM_COMPILE_FLAGS -O2 -ldl -g) + set(TFTVM_LINK_FLAGS -ltvm_runtime -L${TVM_HOME}/build) + target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS}) + target_link_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS}) + +endif() + diff --git a/python/tvm/contrib/tf_op/__init__.py b/python/tvm/contrib/tf_op/__init__.py new file mode 100644 index 000000000000..9f80b266cb91 --- /dev/null +++ b/python/tvm/contrib/tf_op/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from . import module + +Module = module.Module diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py new file mode 100644 index 000000000000..9a9e1f386fc0 --- /dev/null +++ b/python/tvm/contrib/tf_op/module.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import tensorflow as tf
+from tensorflow.python.framework import load_library
+
+
+class Module():
+
+    def __init__(self, lib_path):
+        self.lib_path = lib_path
+
+    def func(self, name, output_dtype=None, output_shape=None):
+        return Func(self.lib_path, name, output_dtype, output_shape)
+
+    def __getitem__(self, func_name):
+        return self.func(func_name)
+
+
+class Func():
+
+    def __init__(self, lib_path, func_name, output_dtype, output_shape):
+        self.lib_path = lib_path
+        self.func_name = func_name
+        self.output_dtype = output_dtype
+
+        # const(0) indicate invalid dynamic shape
+        self.dynamic_output_shape = tf.constant(0, tf.int64)
+        self.static_output_shape = None
+        self.has_static_output_shape = False  # extra flag is required
+
+        if self._is_static_shape(output_shape):
+            self.static_output_shape = output_shape
+            self.has_static_output_shape = True
+        elif output_shape is not None:
+            self.dynamic_output_shape = self._pack_shape_tensor(output_shape)
+
+        # TODO: support non-xpu device
+        #self.device = device
+        # delay initialization to called first time, where num input arguments is known
+        self.tvm_dso_op = None
+        self.module = load_library.load_op_library('tvm_dso_op.so')
+
+    def apply(self, *params):
+        if self.tvm_dso_op is None:
+            num_inputs = len(params)
+            self.tvm_dso_op = getattr(self.module, "tvm_dso_op%s" % num_inputs)
+
+        return self.tvm_dso_op(*params,
+                               dynamic_output_shape=self.dynamic_output_shape,
+                               static_output_shape=self.static_output_shape,
+                               has_static_output_shape=self.has_static_output_shape,
+                               lib_path=self.lib_path,
+                               func_name=self.func_name,
+                               output_dtype=self.output_dtype)
+
+    def __call__(self, *params):
+        return self.apply(*params)
+
+    def _is_static_shape(self, shape):
+        if shape is None or not isinstance(shape, list):
+            return False
+        for d in shape:
+            if not isinstance(d, int):
+                return False
+            if d < 0:
+                raise Exception("Negative dimension is illegal: %d" % d)
+        return True
+
+    def _pack_shape_tensor(self, shape):
+        if isinstance(shape, tf.Tensor):
+            if shape.dtype == tf.int32:
+                shape = tf.cast(shape, tf.int64)
+            return shape
+        elif isinstance(shape, list):
+            shape_dims = []
+            for d in shape:
+                if isinstance(d, int):
+                    shape_dims.append(tf.constant(d, tf.int64))
+                elif isinstance(d, tf.Tensor) and len(d.shape) == 0:
+                    if d.dtype == tf.int32:
+                        d = tf.cast(d, tf.int64)
+                    shape_dims.append(d)
+                else:
+                    raise TypeError("Input shape dimension is neither scala tensor nor int")
+            return tf.stack(shape_dims)
+        else:
+            raise TypeError("Input shape is neither tensor nor list")
+
+
+
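For orientation, a usage sketch of the Module/Func API added above (illustrative only, not part of the patch): the library name "ops.so" and the function name "vector_add" are hypothetical stand-ins for whatever the user exported from TVM; the rest is the API exactly as defined in module.py.

    import tensorflow as tf
    from tvm.contrib import tf_op

    # Hypothetical kernel library exported from TVM ahead of time, e.g.
    # fadd = tvm.build(s, [A, B, C], "llvm", name="vector_add")
    # fadd.export_library("ops.so")
    module = tf_op.Module("ops.so")
    vector_add = module.func("vector_add", output_shape=[4], output_dtype=tf.float32)

    x = tf.constant([1.0, 2.0, 3.0, 4.0])
    y = tf.constant([4.0, 3.0, 2.0, 1.0])
    z = vector_add(x, y)  # dispatches to the TvmDsoOp2 kernel registered below

With two tensor arguments, Func.apply() resolves getattr(self.module, "tvm_dso_op2"), and because output_shape=[4] is a static list it is passed through the static_output_shape attribute rather than the dynamic shape input.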
diff --git a/src/contrib/tf_op/index_seq.h b/src/contrib/tf_op/index_seq.h
new file mode 100644
index 000000000000..3d01e405d239
--- /dev/null
+++ b/src/contrib/tf_op/index_seq.h
@@ -0,0 +1,44 @@
+/**
+ * Refer to std::index_sequence (since c++14)
+ * Utilities to invoke variadic function with template
+ */
+#ifndef __TFTVM_INDEX_SEQ__
+#define __TFTVM_INDEX_SEQ__
+
+template <std::size_t ... Idx>
+struct IndexSeq {};
+
+template <std::size_t N, std::size_t ... Tail>
+struct IndexSeqHelper : public IndexSeqHelper<N - 1U, N - 1U, Tail ...> {};
+
+template <std::size_t ... Tail>
+struct IndexSeqHelper<0U, Tail ...> {
+  using type = IndexSeq<Tail ...>;
+};
+
+template <std::size_t N>
+using make_index_sequence = typename IndexSeqHelper<N>::type;
+
+
+template <typename F, typename T, std::size_t N, std::size_t ... Idx>
+decltype(auto) apply_variadic_impl(F f, T(&t)[N], IndexSeq<Idx ...>) {
+  return f(t[Idx]...);
+}
+
+template <typename F, typename T, std::size_t N>
+decltype(auto) apply_variadic(F f, T(&t)[N]) {
+  return apply_variadic_impl(f, t, make_index_sequence<N>{});
+}
+
+template <typename F, typename T, std::size_t N, std::size_t ... Idx>
+decltype(auto) apply_variadic_by_ptrs_impl(F f, T(&t)[N], IndexSeq<Idx ...>) {
+  return f(&t[Idx]...);
+}
+
+template <typename F, typename T, std::size_t N>
+decltype(auto) apply_variadic_by_ptrs(F f, T(&t)[N]) {
+  return apply_variadic_by_ptrs_impl(f, t, make_index_sequence<N>{});
+}
+
+#endif
+
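These helpers exist so a function can be called with every element of a fixed-size array as a separate argument. A self-contained sketch of the expansion, where sum3 is a made-up stand-in for the packed function that the kernel file below actually passes in:

    #include <cstddef>  // std::size_t, used by index_seq.h
    #include <cstdio>
    #include "index_seq.h"

    // Stand-in callee: takes three pointers, the way the TVM packed
    // function ultimately receives three DLTensor* arguments.
    int sum3(int* a, int* b, int* c) { return *a + *b + *c; }

    int main() {
        int values[3] = {1, 2, 3};
        // make_index_sequence<3> is IndexSeq<0, 1, 2>, so the call below
        // expands to sum3(&values[0], &values[1], &values[2]).
        printf("%d\n", apply_variadic_by_ptrs(sum3, values));  // prints 6
        return 0;
    }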
diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc
new file mode 100644
index 000000000000..178b8abc210e
--- /dev/null
+++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cuda_runtime.h>
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/op_kernel.h"
+
+#include "index_seq.h"
+
+using namespace tensorflow;
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+typedef gtl::InlinedVector<int64, 4> ShapeContainer;
+
+
+template <typename DEVICE_TYPE>
+class TVMDSOOpTrait;
+
+
+class TensorAsBuf {
+  public:
+    Tensor inline_tensor;
+    Tensor* tensor;
+
+    size_t size;
+    size_t offset;
+
+    int device_type;
+
+    char* origin_buf;
+    char* buf;
+
+    void CopyToOrigin() {
+        if (buf == origin_buf) {
+            return;
+        }
+        if (device_type == kDLCPU) {
+            memcpy(origin_buf, buf + offset, size);
+        } else {
+            cudaMemcpy(origin_buf, buf + offset, size, cudaMemcpyDeviceToDevice);
+        }
+    }
+
+    void CopyFromOrigin() {
+        if (buf == origin_buf) {
+            return;
+        }
+        if (device_type == kDLCPU) {
+            memcpy(buf + offset, origin_buf, size);
+        } else {
+            cudaMemcpy(buf + offset, origin_buf, size, cudaMemcpyDeviceToDevice);
+        }
+    }
+};
+
+
+int GetDLPackDtype(const Tensor& tf_tensor, DLDataType* res) {
+    auto dtype = tf_tensor.dtype();
+    if (dtype == DT_FLOAT) {
+        res->code = kDLFloat;
+        res->bits = 32;
+        res->lanes = 1;
+    } else if (dtype == DT_INT64) {
+        res->code = kDLInt;
+        res->bits = 64;
+        res->lanes = 1;
+    } else if (dtype == DT_INT32) {
+        res->code = kDLInt;
+        res->bits = 32;
+        res->lanes = 1;
+    } else {
+        return -1;
+    }
+    return 0;
+}
+
+
+void EnsureAlignment(OpKernelContext* ctx, const Tensor& tensor, TensorAsBuf* out) {
+    char* buf = (char*) tensor.tensor_data().data();
+    out->origin_buf = buf;
+    out->size = tensor.TotalBytes();
+
+    int alignment = 64;
+    char* aligned = (char*)(((uint64_t)buf + alignment - 1) & (~ (alignment - 1)));
+    if (buf == aligned) {
+        out->tensor = const_cast<Tensor*>(&tensor);
+        out->buf = buf;
+        out->offset = 0;
+    } else {
+        TensorShape buf_shape;
+        int64 dims[1] = { (int64)(tensor.TotalBytes() + alignment) };
+        TensorShapeUtils::MakeShape(dims, 1, &buf_shape);
+
+        out->tensor = &out->inline_tensor;
+        ctx->allocate_temp(tensor.dtype(), buf_shape, out->tensor);
+
+        buf = (char*)(out->tensor->tensor_data().data());
+        char* buf_aligned = (char*)(((uint64_t)buf + alignment) & (~ (alignment - 1)));
+        out->buf = buf;
+        out->offset = buf_aligned - buf;
+    }
+}
+
+
+int MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, DLTensor* out) {
+    DLDataType dlpack_type;
+    const Tensor& tensor = *src.tensor;
+
+    int status = GetDLPackDtype(tensor, &dlpack_type);
+    if (status != 0) {
+        return status;
+    }
+    out->ctx = ctx;
+    out->ndim = tensor.shape().dims();
+    out->shape = tf_shape;
+    out->strides = NULL;
+    out->byte_offset = 0;
+    out->dtype = dlpack_type;
+    out->data = src.buf + src.offset;
+    return 0;
+}
+
+
+template <>
+class TVMDSOOpTrait<CPUDevice> {
+  public:
+    static const int device_type = kDLCPU;
+
+    static int device_id(OpKernelContext* context) {
+        return 0;
+    }
+
+};
+
+
+template <>
+class TVMDSOOpTrait<GPUDevice> {
+  public:
+    static const int device_type = kDLGPU;
+
+    static int device_id(OpKernelContext* context) {
+        auto device_base = context->device();
+        auto gpu_device_info = device_base->tensorflow_gpu_device_info();
+        return gpu_device_info->gpu_id;
+    }
+};
+
+
+template <typename DEVICE_TYPE, int NUM_INPUTS>
+class TVMDSOOp : public OpKernel {
+
+private:
+    tvm::runtime::PackedFunc tvm_func;
+    string lib_path;
+    string func_name;
+
+    DataType output_dtype;
+
+    bool has_static_output_shape;
+    std::vector<int64> static_output_shape;
+
+    void initAttributes(OpKernelConstruction* context) {
+        context->GetAttr("lib_path", &lib_path);
+        context->GetAttr("func_name", &func_name);
+        context->GetAttr("output_dtype", &output_dtype);
+
+        context->GetAttr("has_static_output_shape", &has_static_output_shape);
+        context->GetAttr("static_output_shape", &static_output_shape);
+    }
+
+  public:
+    explicit TVMDSOOp(OpKernelConstruction* context) : OpKernel(context) {
+
+        // Get attr
+        initAttributes(context);
+
+        // Load TVM function from dynamic library
+        tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile(lib_path);
+        LOG(INFO) << "Verify dynamic loading from " << lib_path << " device_type=" << TVMDSOOpTrait<DEVICE_TYPE>::device_type;
+        tvm_func = mod_dylib.GetFunction(func_name);
+        CHECK(tvm_func != nullptr);
+    }
+
+    void Compute(OpKernelContext* context) override {
+
+        DLTensor args[NUM_INPUTS + 1];
+        TensorAsBuf buf_info[NUM_INPUTS];
+        ShapeContainer shapes[NUM_INPUTS];
+
+        int status;
+        int device_id = TVMDSOOpTrait<DEVICE_TYPE>::device_id(context);
+        int device_type = TVMDSOOpTrait<DEVICE_TYPE>::device_type;
+
+        DLContext dl_ctx = { DLDeviceType(device_type), device_id };
+
+        // Get output shape
+        TensorShape output_shape;
+        auto& output_shape_tensor = context->input(NUM_INPUTS);
+        if (has_static_output_shape) {
+            // use static output shape
+            const int64* dims = static_output_shape.data();
+            TensorShapeUtils::MakeShape(dims, static_output_shape.size(), &output_shape);
+        } else if (output_shape_tensor.dims() == 1) {
+            // use shape tensor values as output shape
+            const int64* dims = output_shape_tensor.flat<int64>().data();
+            TensorShapeUtils::MakeShape(dims, 1, &output_shape);
+        } else {
+            // use input tensor shape by default
+            output_shape = context->input(0).shape();
+        }
+
+        for (int i = 0; i < NUM_INPUTS; ++i) {
+            // Grab the input tensor
+            auto& input_tensor = context->input(i);
+
+            // Create shape container, should keep ref during execution
+            shapes[i] = input_tensor.shape().dim_sizes();
+            auto shape_ptr = (int64_t*) shapes[i].data();
+
+            TensorAsBuf& input = buf_info[i];
+            input.device_type = device_type;
+
+            EnsureAlignment(context, input_tensor, &input);
+            input.CopyFromOrigin();
+
+            status = MakeDLTensor(input, dl_ctx, shape_ptr, &args[i]);
+            OP_REQUIRES(context, status == 0, Status(error::INTERNAL, "Fail to create dlpack tensor for input"));
+        }
+
+        // Allocate output tensor
+        Tensor* output_tensor;
+        OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor));
+        auto output_shape_dim_buf = output_tensor->shape().dim_sizes();  // should keep alive on stack
+        auto output_shape_ptr = (int64_t*) output_shape_dim_buf.data();
+
+        TensorAsBuf output;
+        output.device_type = device_type;
+        EnsureAlignment(context, *output_tensor, &output);
+
+        status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[NUM_INPUTS]);
+        OP_REQUIRES(context, status == 0, Status(error::INTERNAL, "Fail to create dlpack tensor for output"));
+
+        apply_variadic_by_ptrs(tvm_func, args);
+
+        output.CopyToOrigin();
+    }
+};
+
+
+
+#define REGISTER_TFTVM_KERNEL(n) \
+    REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(DEVICE_CPU), TVMDSOOp<CPUDevice, n>); \
+    REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(DEVICE_GPU), TVMDSOOp<GPUDevice, n>); \
+
+REGISTER_TFTVM_KERNEL(1)
+REGISTER_TFTVM_KERNEL(2)
+REGISTER_TFTVM_KERNEL(3)
+REGISTER_TFTVM_KERNEL(4)
+REGISTER_TFTVM_KERNEL(5)
+REGISTER_TFTVM_KERNEL(6)
+REGISTER_TFTVM_KERNEL(7)
+REGISTER_TFTVM_KERNEL(8)
+
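A note on the pointer arithmetic in EnsureAlignment above: rounding up to a 64-byte boundary is the usual power-of-two mask trick, and the temporary-buffer branch allocates TotalBytes() plus a full alignment's worth of slack precisely so that advancing by `+ alignment` cannot run past the end of the allocation. The rounding itself, in isolation:

    #include <cassert>
    #include <cstdint>

    // Round addr up to the next alignment-byte boundary; alignment must
    // be a power of two (the kernel above uses 64).
    uint64_t align_up(uint64_t addr, uint64_t alignment) {
        return (addr + alignment - 1) & ~(alignment - 1);
    }

    int main() {
        assert(align_up(128, 64) == 128);  // already aligned: unchanged
        assert(align_up(129, 64) == 192);  // rounded up to next boundary
        return 0;
    }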
diff --git a/src/contrib/tf_op/tvm_dso_ops.cc b/src/contrib/tf_op/tvm_dso_ops.cc
new file mode 100644
index 000000000000..8369e928599b
--- /dev/null
+++ b/src/contrib/tf_op/tvm_dso_ops.cc
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "tensorflow/core/framework/op.h"
+
+using namespace tensorflow;
+
+#define REGISTER_TFTVM_OP(n) REGISTER_OP("TvmDsoOp" #n) \
+    .Output("output: output_dtype") \
+    .Attr("lib_path: string") \
+    .Attr("func_name: string") \
+    .Attr("output_dtype: {int32, int64, float} = DT_FLOAT") \
+    .Attr("static_output_shape: list(int) >= 0 = []") \
+    .Attr("has_static_output_shape: bool") \
+
+
+REGISTER_TFTVM_OP(1)
+    .Input("input: T").Attr("T: type") \
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(2)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(3)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("input3: T3").Attr("T3: type")
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(4)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("input3: T3").Attr("T3: type")
+    .Input("input4: T4").Attr("T4: type")
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(5)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("input3: T3").Attr("T3: type")
+    .Input("input4: T4").Attr("T4: type")
+    .Input("input5: T5").Attr("T5: type")
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(6)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("input3: T3").Attr("T3: type")
+    .Input("input4: T4").Attr("T4: type")
+    .Input("input5: T5").Attr("T5: type")
+    .Input("input6: T6").Attr("T6: type")
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(7)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("input3: T3").Attr("T3: type")
+    .Input("input4: T4").Attr("T4: type")
+    .Input("input5: T5").Attr("T5: type")
+    .Input("input6: T6").Attr("T6: type")
+    .Input("input7: T7").Attr("T7: type")
+    .Input("dynamic_output_shape: int64");
+
+REGISTER_TFTVM_OP(8)
+    .Input("input1: T1").Attr("T1: type")
+    .Input("input2: T2").Attr("T2: type")
+    .Input("input3: T3").Attr("T3: type")
+    .Input("input4: T4").Attr("T4: type")
+    .Input("input5: T5").Attr("T5: type")
+    .Input("input6: T6").Attr("T6: type")
+    .Input("input7: T7").Attr("T7: type")
+    .Input("input8: T8").Attr("T8: type")
+    .Input("dynamic_output_shape: int64");
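To make the macro above easier to read, this is REGISTER_TFTVM_OP(2) written out by hand after expansion — the same registration the code above already performs, shown only for reference, not an additional op:

    REGISTER_OP("TvmDsoOp2")
        .Output("output: output_dtype")
        .Attr("lib_path: string")
        .Attr("func_name: string")
        .Attr("output_dtype: {int32, int64, float} = DT_FLOAT")
        .Attr("static_output_shape: list(int) >= 0 = []")
        .Attr("has_static_output_shape: bool")
        .Input("input1: T1").Attr("T1: type")
        .Input("input2: T2").Attr("T2: type")
        .Input("dynamic_output_shape: int64");

Every variant thus carries a trailing int64 shape input in addition to its data inputs, which is why the Python wrapper always supplies dynamic_output_shape even when the static-shape attribute is used.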
From 9c3f732d1a57179ab3e3f24ec02de55e5d9d7ddf Mon Sep 17 00:00:00 2001
From: baoxinqi
Date: Wed, 4 Dec 2019 12:04:24 +0800
Subject: [PATCH 02/30] feat: Update cmake script to work with c++11 and in-repo build

---
 cmake/modules/contrib/TFOP.cmake | 24 ++++++++++++------------
 src/contrib/tf_op/index_seq.h    | 16 ++++++++--------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/cmake/modules/contrib/TFOP.cmake b/cmake/modules/contrib/TFOP.cmake
index 52d017c08e16..8720d4b308dd 100644
--- a/cmake/modules/contrib/TFOP.cmake
+++ b/cmake/modules/contrib/TFOP.cmake
@@ -1,15 +1,16 @@
 
 if(NOT USE_TFOP STREQUAL "OFF")
 
-  if ("${TVM_HOME}" STREQUAL "")
-    message(FATAL_ERROR "TVM_HOME is not defined")
-  else()
-    message("Use TVM_HOME=\"${TVM_HOME}\"")
-  endif()
-
-  include_directories(${TVM_HOME}/include)
-  include_directories(${TVM_HOME}/3rdparty/dlpack/include)
-  include_directories(${TVM_HOME}/3rdparty/dmlc-core/include)
+  # If want build this directly comment out below lines.
+  # if ("${TVM_HOME}" STREQUAL "")
+  #   message(FATAL_ERROR "TVM_HOME is not defined")
+  # else()
+  #   message("Use TVM_HOME=\"${TVM_HOME}\"")
+  #endif()
+  # include_directories(${TVM_HOME}/include)
+  # include_directories(${TVM_HOME}/3rdparty/dlpack/include)
+  # include_directories(${TVM_HOME}/3rdparty/dmlc-core/include)
+  # set(TFTVM_LINK_FLAGS -ltvm_runtime -L${TVM_HOME}/build)
 
   execute_process(COMMAND python -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))"
     OUTPUT_VARIABLE TF_COMPILE_FLAGS_STR
     RESULT_VARIABLE TF_STATUS)
@@ -26,18 +27,17 @@ if(NOT USE_TFOP STREQUAL "OFF")
   endif()
 
   string(REGEX REPLACE "\n" " " TF_FLAGS "${TF_COMPILE_FLAGS} ${TF_LINK_FLAGS}")
-  message("Use TensorFlow flags=\"${TF_FLAGS}\"")
   separate_arguments(TF_COMPILE_FLAGS UNIX_COMMAND ${TF_COMPILE_FLAGS_STR})
   separate_arguments(TF_LINK_FLAGS UNIX_COMMAND ${TF_LINK_FLAGS_STR})
 
 
   set(OP_LIBRARY_NAME tvm_dso_op)
-  file(GLOB_RECURSE TFTVM_SRCS src/*.cc)
+  file(GLOB_RECURSE TFTVM_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/src/contrib/tf_op/*.cc)
   add_library(${OP_LIBRARY_NAME} SHARED ${TFTVM_SRCS})
   set_target_properties(${OP_LIBRARY_NAME} PROPERTIES PREFIX "")
 
   set(TFTVM_COMPILE_FLAGS -O2 -ldl -g)
-  set(TFTVM_LINK_FLAGS -ltvm_runtime -L${TVM_HOME}/build)
+  set(TFTVM_LINK_FLAGS -ltvm_runtime)
   target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS})
   target_link_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS})
 
diff --git a/src/contrib/tf_op/index_seq.h b/src/contrib/tf_op/index_seq.h
index 3d01e405d239..6a0479e609c3 100644
--- a/src/contrib/tf_op/index_seq.h
+++ b/src/contrib/tf_op/index_seq.h
@@ -21,23 +21,23 @@ using make_index_sequence = typename IndexSeqHelper<N>::type;
 
 
 template <typename F, typename T, std::size_t N, std::size_t ... Idx>
-decltype(auto) apply_variadic_impl(F f, T(&t)[N], IndexSeq<Idx ...>) {
-  return f(t[Idx]...);
+void apply_variadic_impl(F f, T(&t)[N], IndexSeq<Idx ...>) {
+  f(t[Idx]...);
 }
 
 template <typename F, typename T, std::size_t N>
-decltype(auto) apply_variadic(F f, T(&t)[N]) {
-  return apply_variadic_impl(f, t, make_index_sequence<N>{});
+void apply_variadic(F f, T(&t)[N]) {
+  apply_variadic_impl(f, t, make_index_sequence<N>{});
 }
 
 template <typename F, typename T, std::size_t N, std::size_t ... Idx>
-decltype(auto) apply_variadic_by_ptrs_impl(F f, T(&t)[N], IndexSeq<Idx ...>) {
-  return f(&t[Idx]...);
+void apply_variadic_by_ptrs_impl(F f, T(&t)[N], IndexSeq<Idx ...>) {
+  f(&t[Idx]...);
 }
 
 template <typename F, typename T, std::size_t N>
-decltype(auto) apply_variadic_by_ptrs(F f, T(&t)[N]) {
-  return apply_variadic_by_ptrs_impl(f, t, make_index_sequence<N>{});
+void apply_variadic_by_ptrs(F f, T(&t)[N]) {
+  apply_variadic_by_ptrs_impl(f, t, make_index_sequence<N>{});
 }
 
 #endif

From 83612178f95400f284a5e658600f583960c3352f Mon Sep 17 00:00:00 2001
From: baoxinqi
Date: Wed, 4 Dec 2019 12:32:39 +0800
Subject: [PATCH 03/30] feat: Use libtvm as oplib dependency

---
 cmake/modules/contrib/TFOP.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/contrib/TFOP.cmake b/cmake/modules/contrib/TFOP.cmake
index 8720d4b308dd..2517414208fe 100644
--- a/cmake/modules/contrib/TFOP.cmake
+++ b/cmake/modules/contrib/TFOP.cmake
@@ -35,9 +35,9 @@ if(NOT USE_TFOP STREQUAL "OFF")
   file(GLOB_RECURSE TFTVM_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/src/contrib/tf_op/*.cc)
   add_library(${OP_LIBRARY_NAME} SHARED ${TFTVM_SRCS})
   set_target_properties(${OP_LIBRARY_NAME} PROPERTIES PREFIX "")
+  add_dependencies(${OP_LIBRARY_NAME} tvm)
 
   set(TFTVM_COMPILE_FLAGS -O2 -ldl -g)
-  set(TFTVM_LINK_FLAGS -ltvm_runtime)
   target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS})
   target_link_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS})

From
ef387fd607c1ae0e87215d4f82b0bca791ddcb33 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Wed, 4 Dec 2019 15:42:02 +0800 Subject: [PATCH 04/30] fix: Add missing link dependency to libtvm --- cmake/modules/contrib/TFOP.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/modules/contrib/TFOP.cmake b/cmake/modules/contrib/TFOP.cmake index 2517414208fe..46ae422b1bf2 100644 --- a/cmake/modules/contrib/TFOP.cmake +++ b/cmake/modules/contrib/TFOP.cmake @@ -35,6 +35,7 @@ if(NOT USE_TFOP STREQUAL "OFF") file(GLOB_RECURSE TFTVM_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/src/contrib/tf_op/*.cc) add_library(${OP_LIBRARY_NAME} SHARED ${TFTVM_SRCS}) set_target_properties(${OP_LIBRARY_NAME} PROPERTIES PREFIX "") + set(TFTVM_LINK_FLAGS -ltvm -L${CMAKE_CURRENT_BINARY_DIR}) add_dependencies(${OP_LIBRARY_NAME} tvm) set(TFTVM_COMPILE_FLAGS -O2 -ldl -g) From 056d4b1f1dcfce94c698e4b435d66176441c3f23 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 2 Jan 2020 18:10:36 +0800 Subject: [PATCH 05/30] feat: Update tf tvmdso op by review comments --- CMakeLists.txt | 4 +- .../contrib/{TFOP.cmake => TF_TVMDSOOP.cmake} | 10 +- python/tvm/contrib/tf_op/module.py | 18 ++++ src/codegen/build_module.cc | 1 + src/contrib/tf_op/index_seq.h | 21 +++- src/contrib/tf_op/tvm_dso_op_kernels.cc | 96 +++++++++---------- 6 files changed, 91 insertions(+), 59 deletions(-) rename cmake/modules/contrib/{TFOP.cmake => TF_TVMDSOOP.cmake} (80%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ec625d0dbed..2afd78424437 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,7 +42,7 @@ tvm_option(USE_MSVC_MT "Build with MT" OFF) tvm_option(USE_MICRO "Build with Micro" OFF) tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF) tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF) -tvm_option(USE_TFOP "Build with TensorFlow TVMDSOOp" OFF) +tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -258,7 +258,7 @@ include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) include(cmake/modules/contrib/HybridDump.cmake) -include(cmake/modules/contrib/TFOP.cmake) +include(cmake/modules/contrib/TF_TVMDSOOP.cmake) if(NOT MSVC) include(CheckCXXCompilerFlag) diff --git a/cmake/modules/contrib/TFOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake similarity index 80% rename from cmake/modules/contrib/TFOP.cmake rename to cmake/modules/contrib/TF_TVMDSOOP.cmake index 46ae422b1bf2..f104ee618f1b 100644 --- a/cmake/modules/contrib/TFOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -1,5 +1,5 @@ -if(NOT USE_TFOP STREQUAL "OFF") +if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") # If want build this directly comment out below lines. 
# if ("${TVM_HOME}" STREQUAL "") @@ -12,14 +12,16 @@ if(NOT USE_TFOP STREQUAL "OFF") # include_directories(${TVM_HOME}/3rdparty/dmlc-core/include) # set(TFTVM_LINK_FLAGS -ltvm_runtime -L${TVM_HOME}/build) - execute_process(COMMAND python -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" + find_package(Python COMPONENTS Interpreter) + + execute_process(COMMAND ${Python_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_COMPILE_FLAGS_STR RESULT_VARIABLE TF_STATUS) if (NOT ${TF_STATUS} EQUAL 0) message(FATAL_ERROR "Fail to get TensorFlow compile flags") endif() - execute_process(COMMAND python -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags()))" + execute_process(COMMAND ${Python_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LINK_FLAGS_STR RESULT_VARIABLE TF_STATUS) if (NOT ${TF_STATUS} EQUAL 0) @@ -38,7 +40,7 @@ if(NOT USE_TFOP STREQUAL "OFF") set(TFTVM_LINK_FLAGS -ltvm -L${CMAKE_CURRENT_BINARY_DIR}) add_dependencies(${OP_LIBRARY_NAME} tvm) - set(TFTVM_COMPILE_FLAGS -O2 -ldl -g) + # set(TFTVM_COMPILE_FLAGS ${CMAKE_CXX_FLAGS}) target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS}) target_link_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS}) diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index 9a9e1f386fc0..e31778cf06f8 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -19,11 +19,28 @@ class Module(): + """Module container of TensorFlow TVMDSO op which wraps exported + TVM op implementation library to be called on TensorFlow side""" def __init__(self, lib_path): self.lib_path = lib_path def func(self, name, output_dtype=None, output_shape=None): + """Get tvm op function wrapped as TensorFlow tensor to tensor function + + Parameters + ---------- + name: str + function name + output_dtype: str or TensorFlow datatype + Output datatype, default is float32 + output_shape: List of integer/tf scalar tensor or tf shape tensor + Output shape, default the same with first input's shape + + Returns + ---------- + Func object that act as TensorFlow tensor to tensor function. + """ return Func(self.lib_path, name, output_dtype, output_shape) def __getitem__(self, func_name): @@ -31,6 +48,7 @@ def __getitem__(self, func_name): class Func(): + """Function object that act as TensorFlow tensor to tensor function.""" def __init__(self, lib_path, func_name, output_dtype, output_shape): self.lib_path = lib_path diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 80fd57af66f9..0fb761d768b0 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -572,6 +572,7 @@ runtime::Module build(const Map>& inputs, auto& fhost = host_dev_funcs[0]; auto& fdevice = host_dev_funcs[1]; // Get the module for a certain target. + printf("build device module: %s\n", it.first->device_name.c_str()); runtime::Module mdev = DeviceBuild(fdevice, it.first); for (const auto& it : fhost) { fhost_all.push_back(it); diff --git a/src/contrib/tf_op/index_seq.h b/src/contrib/tf_op/index_seq.h index 6a0479e609c3..98fcd717b224 100644 --- a/src/contrib/tf_op/index_seq.h +++ b/src/contrib/tf_op/index_seq.h @@ -1,4 +1,23 @@ -/** +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* * Refer to std::index_sequence (since c++14) * Utilities to invoke variadic function with template */ diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 178b8abc210e..a1c991c1d67e 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -28,11 +28,10 @@ #include "index_seq.h" -using namespace tensorflow; typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -typedef gtl::InlinedVector ShapeContainer; +typedef tensorflow::gtl::InlinedVector ShapeContainer; template @@ -41,8 +40,8 @@ class TVMDSOOpTrait; class TensorAsBuf { public: - Tensor inline_tensor; - Tensor* tensor; + tensorflow::Tensor inline_tensor; + tensorflow::Tensor* tensor; size_t size; size_t offset; @@ -76,28 +75,22 @@ class TensorAsBuf { }; -int GetDLPackDtype(const Tensor& tf_tensor, DLDataType* res) { +tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, DLDataType* res) { auto dtype = tf_tensor.dtype(); - if (dtype == DT_FLOAT) { - res->code = kDLFloat; - res->bits = 32; - res->lanes = 1; - } else if (dtype == DT_INT64) { - res->code = kDLInt; - res->bits = 64; - res->lanes = 1; - } else if (dtype == DT_INT32) { - res->code = kDLInt; - res->bits = 32; - res->lanes = 1; + if (dtype == tensorflow::DT_FLOAT) { + *res = {kDLFloat, 32, 1}; + } else if (dtype == tensorflow::DT_INT64) { + *res = {kDLInt, 64, 1}; + } else if (dtype == tensorflow::DT_INT32) { + *res = {kDLInt, 32, 1}; } else { - return -1; + return tensorflow::Status(tensorflow::error::INTERNAL, "Fail to get dlpack datatype"); } - return 0; + return tensorflow::Status::OK(); } -void EnsureAlignment(OpKernelContext* ctx, const Tensor& tensor, TensorAsBuf* out) { +void EnsureAlignment(tensorflow::OpKernelContext* ctx, const tensorflow::Tensor& tensor, TensorAsBuf* out) { char* buf = (char*) tensor.tensor_data().data(); out->origin_buf = buf; out->size = tensor.TotalBytes(); @@ -105,13 +98,13 @@ void EnsureAlignment(OpKernelContext* ctx, const Tensor& tensor, TensorAsBuf* ou int alignment = 64; char* aligned = (char*)(((uint64_t)buf + alignment - 1) & (~ (alignment - 1))); if (buf == aligned) { - out->tensor = const_cast(&tensor); + out->tensor = const_cast(&tensor); out->buf = buf; out->offset = 0; } else { - TensorShape buf_shape; - int64 dims[1] = { (int64)(tensor.TotalBytes() + alignment) }; - TensorShapeUtils::MakeShape(dims, 1, &buf_shape); + tensorflow::TensorShape buf_shape; + tensorflow::int64 dims[1] = { (tensorflow::int64)(tensor.TotalBytes() + alignment) }; + tensorflow::TensorShapeUtils::MakeShape(dims, 1, &buf_shape); out->tensor = &out->inline_tensor; ctx->allocate_temp(tensor.dtype(), buf_shape, out->tensor); @@ -124,22 +117,22 @@ void EnsureAlignment(OpKernelContext* ctx, const Tensor& tensor, TensorAsBuf* ou } 
-int MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, DLTensor* out) { +tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, DLTensor* out) { DLDataType dlpack_type; - const Tensor& tensor = *src.tensor; + const tensorflow::Tensor& tensor = *src.tensor; - int status = GetDLPackDtype(tensor, &dlpack_type); - if (status != 0) { + auto status = GetDLPackDtype(tensor, &dlpack_type); + if (! status.ok()) { return status; } out->ctx = ctx; out->ndim = tensor.shape().dims(); out->shape = tf_shape; - out->strides = NULL; + out->strides = nullptr; out->byte_offset = 0; out->dtype = dlpack_type; out->data = src.buf + src.offset; - return 0; + return tensorflow::Status::OK(); } @@ -148,7 +141,7 @@ class TVMDSOOpTrait { public: static const int device_type = kDLCPU; - static int device_id(OpKernelContext* context) { + static int device_id(tensorflow::OpKernelContext* context) { return 0; } @@ -160,7 +153,7 @@ class TVMDSOOpTrait { public: static const int device_type = kDLGPU; - static int device_id(OpKernelContext* context) { + static int device_id(tensorflow::OpKernelContext* context) { auto device_base = context->device(); auto gpu_device_info = device_base->tensorflow_gpu_device_info(); return gpu_device_info->gpu_id; @@ -169,19 +162,19 @@ class TVMDSOOpTrait { template -class TVMDSOOp : public OpKernel { +class TVMDSOOp : public tensorflow::OpKernel { private: tvm::runtime::PackedFunc tvm_func; - string lib_path; - string func_name; + std::string lib_path; + std::string func_name; - DataType output_dtype; + tensorflow::DataType output_dtype; bool has_static_output_shape; - std::vector static_output_shape; + std::vector static_output_shape; - void initAttributes(OpKernelConstruction* context) { + void initAttributes(tensorflow::OpKernelConstruction* context) { context->GetAttr("lib_path", &lib_path); context->GetAttr("func_name", &func_name); context->GetAttr("output_dtype", &output_dtype); @@ -191,41 +184,40 @@ class TVMDSOOp : public OpKernel { } public: - explicit TVMDSOOp(OpKernelConstruction* context) : OpKernel(context) { + explicit TVMDSOOp(tensorflow::OpKernelConstruction* context) : tensorflow::OpKernel(context) { // Get attr initAttributes(context); // Load TVM function from dynamic library tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile(lib_path); - LOG(INFO) << "Verify dynamic loading from " << lib_path << " device_type=" << TVMDSOOpTrait::device_type; tvm_func = mod_dylib.GetFunction(func_name); CHECK(tvm_func != nullptr); } - void Compute(OpKernelContext* context) override { + void Compute(tensorflow::OpKernelContext* context) override { DLTensor args[NUM_INPUTS + 1]; TensorAsBuf buf_info[NUM_INPUTS]; ShapeContainer shapes[NUM_INPUTS]; - int status; + tensorflow::Status status; int device_id = TVMDSOOpTrait::device_id(context); int device_type = TVMDSOOpTrait::device_type; DLContext dl_ctx = { DLDeviceType(device_type), device_id }; // Get output shape - TensorShape output_shape; + tensorflow::TensorShape output_shape; auto& output_shape_tensor = context->input(NUM_INPUTS); if (has_static_output_shape) { // use static output shape - const int64* dims = static_output_shape.data(); - TensorShapeUtils::MakeShape(dims, static_output_shape.size(), &output_shape); + const tensorflow::int64* dims = static_output_shape.data(); + tensorflow::TensorShapeUtils::MakeShape(dims, static_output_shape.size(), &output_shape); } else if (output_shape_tensor.dims() == 1) { // use shape tensor values as output 
shape - const int64* dims = output_shape_tensor.flat().data(); - TensorShapeUtils::MakeShape(dims, 1, &output_shape); + const tensorflow::int64* dims = output_shape_tensor.flat().data(); + tensorflow::TensorShapeUtils::MakeShape(dims, 1, &output_shape); } else { // use input tensor shape by default output_shape = context->input(0).shape(); @@ -246,11 +238,11 @@ class TVMDSOOp : public OpKernel { input.CopyFromOrigin(); status = MakeDLTensor(input, dl_ctx, shape_ptr, &args[i]); - OP_REQUIRES(context, status == 0, Status(error::INTERNAL, "Fail to create dlpack tensor for input")); + OP_REQUIRES_OK(context, status); } // Allocate output tensor - Tensor* output_tensor; + tensorflow::Tensor* output_tensor; OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); auto output_shape_dim_buf = output_tensor->shape().dim_sizes(); // should keep alive on stack auto output_shape_ptr = (int64_t*) output_shape_dim_buf.data(); @@ -260,7 +252,7 @@ class TVMDSOOp : public OpKernel { EnsureAlignment(context, *output_tensor, &output); status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[NUM_INPUTS]); - OP_REQUIRES(context, status == 0, Status(error::INTERNAL, "Fail to create dlpack tensor for output")); + OP_REQUIRES_OK(context, status); apply_variadic_by_ptrs(tvm_func, args); @@ -271,8 +263,8 @@ class TVMDSOOp : public OpKernel { #define REGISTER_TFTVM_KERNEL(n) \ - REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(DEVICE_CPU), TVMDSOOp); \ - REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(DEVICE_GPU), TVMDSOOp); \ + REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_CPU), TVMDSOOp); \ + REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_GPU), TVMDSOOp); \ REGISTER_TFTVM_KERNEL(1) REGISTER_TFTVM_KERNEL(2) From 69df54930b6393a16dfd4b07641d64519db0a26c Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 16 Jan 2020 16:21:08 +0800 Subject: [PATCH 06/30] fix: Update with pr comments --- cmake/modules/contrib/TF_TVMDSOOP.cmake | 28 ++++++++++++++----------- python/tvm/contrib/tf_op/module.py | 6 ++---- src/contrib/tf_op/tvm_dso_op_kernels.cc | 18 ++++++++++++++-- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake index f104ee618f1b..97e5dd4d2190 100644 --- a/cmake/modules/contrib/TF_TVMDSOOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -1,17 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") - - # If want build this directly comment out below lines. 
- # if ("${TVM_HOME}" STREQUAL "") - # message(FATAL_ERROR "TVM_HOME is not defined") - # else() - # message("Use TVM_HOME=\"${TVM_HOME}\"") - #endif() - # include_directories(${TVM_HOME}/include) - # include_directories(${TVM_HOME}/3rdparty/dlpack/include) - # include_directories(${TVM_HOME}/3rdparty/dmlc-core/include) - # set(TFTVM_LINK_FLAGS -ltvm_runtime -L${TVM_HOME}/build) - find_package(Python COMPONENTS Interpreter) execute_process(COMMAND ${Python_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index e31778cf06f8..fd977bd218f8 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -66,9 +66,7 @@ def __init__(self, lib_path, func_name, output_dtype, output_shape): elif output_shape is not None: self.dynamic_output_shape = self._pack_shape_tensor(output_shape) - # TODO: support non-xpu device - #self.device = device - # delay initialization to called first time, where num input arguments is known + # delay op initialization to where Func.apply() get called first time self.tvm_dso_op = None self.module = load_library.load_op_library('tvm_dso_op.so') @@ -113,7 +111,7 @@ def _pack_shape_tensor(self, shape): d = tf.cast(d, tf.int64) shape_dims.append(d) else: - raise TypeError("Input shape dimension is neither scala tensor nor int") + raise TypeError("Input shape dimension is neither scalar tensor nor int") return tf.stack(shape_dims) else: raise TypeError("Input shape is neither tensor nor list") diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index a1c991c1d67e..9f6b1d0c285b 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -34,10 +34,16 @@ typedef Eigen::GpuDevice GPUDevice; typedef tensorflow::gtl::InlinedVector ShapeContainer; +// Op utility trait for diffrent device type template template class TVMDSOOpTrait; +// Buffer information used for actual computation. +// Each buffer is associated with one TensorFlow tensor +// whose underlying buffer is record into "origin_buf". +// For input tensor, we copy data from origin_buf to buf +// and for output tensor, copy data from buf to origin_buf class TensorAsBuf { public: tensorflow::Tensor inline_tensor; @@ -57,8 +63,11 @@ class TensorAsBuf { } if (device_type == kDLCPU) { memcpy(origin_buf, buf + offset, size); - } else { + } else if (device_type == kDLGPU) { cudaMemcpy(origin_buf, buf + offset, size, cudaMemcpyDeviceToDevice); + } else { + LOG(FATAL) << "Only support CPU and CUDA now. Device " + << device_type << " is not implemented currently"; } } @@ -68,8 +77,11 @@ class TensorAsBuf { } if (device_type == kDLCPU) { memcpy(buf + offset, origin_buf, size); - } else { + } else if (device_type == kDLGPU) { cudaMemcpy(buf + offset, origin_buf, size, cudaMemcpyDeviceToDevice); + } else { + LOG(FATAL) << "Only support CPU and CUDA now. 
Device " + << device_type << " is not implemented currently"; } } }; @@ -90,6 +102,7 @@ tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, DLDataTyp } +// Ensure buffer used for actual computation take 64byte alignment void EnsureAlignment(tensorflow::OpKernelContext* ctx, const tensorflow::Tensor& tensor, TensorAsBuf* out) { char* buf = (char*) tensor.tensor_data().data(); out->origin_buf = buf; @@ -117,6 +130,7 @@ void EnsureAlignment(tensorflow::OpKernelContext* ctx, const tensorflow::Tensor& } +// Create DLPack tensor from TensorFlow tensor tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, DLTensor* out) { DLDataType dlpack_type; const tensorflow::Tensor& tensor = *src.tensor; From c2e72a6a2e7426af690bbef6004c1f8cca285154 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 16 Jan 2020 17:23:17 +0800 Subject: [PATCH 07/30] fix: Fix lint --- cmake/config.cmake | 2 +- python/tvm/contrib/tf_op/__init__.py | 2 +- python/tvm/contrib/tf_op/module.py | 178 ++++++++++++------------ src/contrib/tf_op/index_seq.h | 6 +- src/contrib/tf_op/tvm_dso_op_kernels.cc | 124 ++++++++++------- src/contrib/tf_op/tvm_dso_ops.cc | 2 - 6 files changed, 164 insertions(+), 150 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index 65556998d185..bece89261f5a 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -195,4 +195,4 @@ set(USE_VTA_FPGA OFF) set(USE_EXAMPLE_EXT_RUNTIME OFF) # Whether to build the TensorFlow TVMDSOOp module -set(USE_TFOP OFF) +set(USE_TF_TVMDSOOP OFF) diff --git a/python/tvm/contrib/tf_op/__init__.py b/python/tvm/contrib/tf_op/__init__.py index 9f80b266cb91..cabd8b716e7d 100644 --- a/python/tvm/contrib/tf_op/__init__.py +++ b/python/tvm/contrib/tf_op/__init__.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Module container of TensorFlow TVMDSO op""" from . import module Module = module.Module diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index fd977bd218f8..ed2622fd3cff 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -14,107 +14,105 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Module container of TensorFlow TVMDSO op""" import tensorflow as tf from tensorflow.python.framework import load_library class Module(): - """Module container of TensorFlow TVMDSO op which wraps exported - TVM op implementation library to be called on TensorFlow side""" + """Module container of TensorFlow TVMDSO op which wraps exported + TVM op implementation library to be called on TensorFlow side""" - def __init__(self, lib_path): - self.lib_path = lib_path + def __init__(self, lib_path): + self.lib_path = lib_path - def func(self, name, output_dtype=None, output_shape=None): - """Get tvm op function wrapped as TensorFlow tensor to tensor function + def func(self, name, output_dtype=None, output_shape=None): + """Get tvm op function wrapped as TensorFlow tensor to tensor function - Parameters - ---------- - name: str - function name - output_dtype: str or TensorFlow datatype - Output datatype, default is float32 - output_shape: List of integer/tf scalar tensor or tf shape tensor - Output shape, default the same with first input's shape + Parameters + ---------- + name: str + function name + output_dtype: str or TensorFlow datatype + Output datatype, default is float32 + output_shape: List of integer/tf scalar tensor or tf shape tensor + Output shape, default the same with first input's shape - Returns - ---------- - Func object that act as TensorFlow tensor to tensor function. - """ - return Func(self.lib_path, name, output_dtype, output_shape) + Returns + ---------- + Func object that act as TensorFlow tensor to tensor function. + """ + return Func(self.lib_path, name, output_dtype, output_shape) - def __getitem__(self, func_name): - return self.func(func_name) + def __getitem__(self, func_name): + return self.func(func_name) class Func(): - """Function object that act as TensorFlow tensor to tensor function.""" - - def __init__(self, lib_path, func_name, output_dtype, output_shape): - self.lib_path = lib_path - self.func_name = func_name - self.output_dtype = output_dtype - - # const(0) indicate invalid dynamic shape - self.dynamic_output_shape = tf.constant(0, tf.int64) - self.static_output_shape = None - self.has_static_output_shape = False # extra flag is required - - if self._is_static_shape(output_shape): - self.static_output_shape = output_shape - self.has_static_output_shape = True - elif output_shape is not None: - self.dynamic_output_shape = self._pack_shape_tensor(output_shape) - - # delay op initialization to where Func.apply() get called first time - self.tvm_dso_op = None - self.module = load_library.load_op_library('tvm_dso_op.so') - - def apply(self, *params): - if self.tvm_dso_op is None: - num_inputs = len(params) - self.tvm_dso_op = getattr(self.module, "tvm_dso_op%s" % num_inputs) - - return self.tvm_dso_op(*params, - dynamic_output_shape=self.dynamic_output_shape, - static_output_shape=self.static_output_shape, - has_static_output_shape=self.has_static_output_shape, - lib_path=self.lib_path, - func_name=self.func_name, - output_dtype=self.output_dtype) - - def __call__(self, *params): - return self.apply(*params) - - def _is_static_shape(self, shape): - if shape is None or not isinstance(shape, list): - return False - for d in shape: - if not isinstance(d, int): - return False - if d < 0: - raise Exception("Negative dimension is illegal: %d" % d) - return True - - def _pack_shape_tensor(self, shape): - if isinstance(shape, tf.Tensor): - if shape.dtype == tf.int32: - shape = tf.cast(shape, tf.int64) - return shape - elif isinstance(shape, list): - shape_dims = [] 
- for d in shape: - if isinstance(d, int): - shape_dims.append(tf.constant(d, tf.int64)) - elif isinstance(d, tf.Tensor) and len(d.shape) == 0: - if d.dtype == tf.int32: - d = tf.cast(d, tf.int64) - shape_dims.append(d) + """Function object that act as TensorFlow tensor to tensor function.""" + + def __init__(self, lib_path, func_name, output_dtype, output_shape): + self.lib_path = lib_path + self.func_name = func_name + self.output_dtype = output_dtype + + # const(0) indicate invalid dynamic shape + self.dynamic_output_shape = tf.constant(0, tf.int64) + self.static_output_shape = None + self.has_static_output_shape = False # extra flag is required + + if self._is_static_shape(output_shape): + self.static_output_shape = output_shape + self.has_static_output_shape = True + elif output_shape is not None: + self.dynamic_output_shape = self._pack_shape_tensor(output_shape) + + # delay op initialization to where Func.apply() get called first time + self.tvm_dso_op = None + self.module = load_library.load_op_library('tvm_dso_op.so') + + def apply(self, *params): + if self.tvm_dso_op is None: + num_inputs = len(params) + self.tvm_dso_op = getattr(self.module, "tvm_dso_op%s" % num_inputs) + + return self.tvm_dso_op(*params, + dynamic_output_shape=self.dynamic_output_shape, + static_output_shape=self.static_output_shape, + has_static_output_shape=self.has_static_output_shape, + lib_path=self.lib_path, + func_name=self.func_name, + output_dtype=self.output_dtype) + + def __call__(self, *params): + return self.apply(*params) + + def _is_static_shape(self, shape): + if shape is None or not isinstance(shape, list): + return False + for dim_value in shape: + if not isinstance(dim_value, int): + return False + if dim_value < 0: + raise Exception("Negative dimension is illegal: %d" % dim_value) + return True + + def _pack_shape_tensor(self, shape): + if isinstance(shape, tf.Tensor): + if shape.dtype == tf.int32: + shape = tf.cast(shape, tf.int64) + return shape + elif isinstance(shape, list): + shape_dims = [] + for dim_value in shape: + if isinstance(dim_value, int): + shape_dims.append(tf.constant(dim_value, tf.int64)) + elif isinstance(dim_value, tf.Tensor) and dim_value.shape.rank == 0: + if dim_value.dtype == tf.int32: + dim_value = tf.cast(dim_value, tf.int64) + shape_dims.append(dim_value) + else: + raise TypeError("Input shape dimension is neither scalar tensor nor int") + return tf.stack(shape_dims) else: - raise TypeError("Input shape dimension is neither scalar tensor nor int") - return tf.stack(shape_dims) - else: - raise TypeError("Input shape is neither tensor nor list") - - - + raise TypeError("Input shape is neither tensor nor list") diff --git a/src/contrib/tf_op/index_seq.h b/src/contrib/tf_op/index_seq.h index 98fcd717b224..7a9a423a5977 100644 --- a/src/contrib/tf_op/index_seq.h +++ b/src/contrib/tf_op/index_seq.h @@ -21,8 +21,8 @@ * Refer to std::index_sequence (since c++14) * Utilities to invoke variadic function with template */ -#ifndef __TFTVM_INDEX_SEQ__ -#define __TFTVM_INDEX_SEQ__ +#ifndef TVM_CONTRIB_TF_OP_INDEX_SEQ_H_ +#define TVM_CONTRIB_TF_OP_INDEX_SEQ_H_ template struct IndexSeq {}; @@ -59,5 +59,5 @@ void apply_variadic_by_ptrs(F f, T(&t)[N]) { apply_variadic_by_ptrs_impl(f, t, make_index_sequence{}); } -#endif +#endif // TVM_CONTRIB_TF_OP_INDEX_SEQ_H_ diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 9f6b1d0c285b..363de7199307 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ 
b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -17,15 +17,15 @@ * under the License. */ -#include #include #include + #include #include #include #include -#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" #include "index_seq.h" @@ -33,6 +33,10 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; typedef tensorflow::gtl::InlinedVector<tensorflow::int64, 4> ShapeContainer; +using tensorflow::OpKernel; +using tensorflow::OpKernelConstruction; +using tensorflow::OpKernelContext; + // Op utility trait for different device type template template <typename DEVICE_TYPE> @@ -45,7 +49,7 @@ class TVMDSOOpTrait; // For input tensor, we copy data from origin_buf to buf // and for output tensor, copy data from buf to origin_buf class TensorAsBuf { - public: + public: tensorflow::Tensor inline_tensor; tensorflow::Tensor* tensor; @@ -54,7 +58,7 @@ class TensorAsBuf { int device_type; - char* origin_buf; + char* origin_buf; char* buf; void CopyToOrigin() { @@ -62,9 +66,10 @@ class TensorAsBuf { return; } if (device_type == kDLCPU) { - memcpy(origin_buf, buf + offset, size); + memcpy(origin_buf, buf + offset, size); } else if (device_type == kDLGPU) { - cudaMemcpy(origin_buf, buf + offset, size, cudaMemcpyDeviceToDevice); + cudaMemcpy(origin_buf, buf + offset, + size, cudaMemcpyDeviceToDevice); } else { LOG(FATAL) << "Only support CPU and CUDA now. Device " << device_type << " is not implemented currently"; @@ -76,9 +81,10 @@ class TensorAsBuf { return; } if (device_type == kDLCPU) { - memcpy(buf + offset, origin_buf, size); + memcpy(buf + offset, origin_buf, size); } else if (device_type == kDLGPU) { - cudaMemcpy(buf + offset, origin_buf, size, cudaMemcpyDeviceToDevice); + cudaMemcpy(buf + offset, origin_buf, + size, cudaMemcpyDeviceToDevice); } else { LOG(FATAL) << "Only support CPU and CUDA now. 
Device " << device_type << " is not implemented currently"; @@ -87,7 +93,8 @@ class TensorAsBuf { }; -tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, DLDataType* res) { +tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, + DLDataType* res) { auto dtype = tf_tensor.dtype(); if (dtype == tensorflow::DT_FLOAT) { *res = {kDLFloat, 32, 1}; @@ -96,34 +103,40 @@ tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, DLDataTyp } else if (dtype == tensorflow::DT_INT32) { *res = {kDLInt, 32, 1}; } else { - return tensorflow::Status(tensorflow::error::INTERNAL, "Fail to get dlpack datatype"); + return tensorflow::Status(tensorflow::error::INTERNAL, + "Fail to get dlpack datatype"); } return tensorflow::Status::OK(); } -// Ensure buffer used for actual computation takes 64-byte alignment -void EnsureAlignment(tensorflow::OpKernelContext* ctx, const tensorflow::Tensor& tensor, TensorAsBuf* out) { - char* buf = (char*) tensor.tensor_data().data(); +// Ensure buffer used for actual computation takes 64-byte alignment +void EnsureAlignment(OpKernelContext* ctx, + const tensorflow::Tensor& tensor, + TensorAsBuf* out) { + char* buf = const_cast<char*>(tensor.tensor_data().data()); out->origin_buf = buf; - out->size = tensor.TotalBytes(); + out->size = tensor.TotalBytes(); int alignment = 64; - char* aligned = (char*)(((uint64_t)buf + alignment - 1) & (~ (alignment - 1))); + char* aligned = reinterpret_cast<char*>( + ((uint64_t)buf + alignment - 1) & (~(alignment - 1))); if (buf == aligned) { out->tensor = const_cast<tensorflow::Tensor*>(&tensor); out->buf = buf; out->offset = 0; } else { tensorflow::TensorShape buf_shape; - tensorflow::int64 dims[1] = { (tensorflow::int64)(tensor.TotalBytes() + alignment) }; + tensorflow::int64 dims[1] = + { (tensorflow::int64)(tensor.TotalBytes() + alignment) }; tensorflow::TensorShapeUtils::MakeShape(dims, 1, &buf_shape); - + out->tensor = &out->inline_tensor; ctx->allocate_temp(tensor.dtype(), buf_shape, out->tensor); - - buf = (char*)(out->tensor->tensor_data().data()); - char* buf_aligned = (char*)(((uint64_t)buf + alignment) & (~ (alignment - 1))); + + buf = const_cast<char*>(out->tensor->tensor_data().data()); + char* buf_aligned = reinterpret_cast<char*>( + ((uint64_t)buf + alignment) & (~(alignment - 1))); out->buf = buf; out->offset = buf_aligned - buf; } @@ -131,12 +144,13 @@ void EnsureAlignment(tensorflow::OpKernelContext* ctx, const tensorflow::Tensor& // Create DLPack tensor from TensorFlow tensor -tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, DLTensor* out) { +tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, + int64_t* tf_shape, DLTensor* out) { DLDataType dlpack_type; const tensorflow::Tensor& tensor = *src.tensor; auto status = GetDLPackDtype(tensor, &dlpack_type); - if (! 
status.ok()) { + if (!status.ok()) { return status; } out->ctx = ctx; @@ -144,7 +158,7 @@ tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, in out->shape = tf_shape; out->strides = nullptr; out->byte_offset = 0; - out->dtype = dlpack_type; + out->dtype = dlpack_type; out->data = src.buf + src.offset; return tensorflow::Status::OK(); } @@ -152,22 +166,21 @@ tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, in template <> class TVMDSOOpTrait<CPUDevice> { - public: + public: static const int device_type = kDLCPU; - static int device_id(tensorflow::OpKernelContext* context) { + static int device_id(OpKernelContext* context) { return 0; } - }; template <> class TVMDSOOpTrait<GPUDevice> { - public: + public: static const int device_type = kDLGPU; - static int device_id(tensorflow::OpKernelContext* context) { + static int device_id(OpKernelContext* context) { auto device_base = context->device(); auto gpu_device_info = device_base->tensorflow_gpu_device_info(); return gpu_device_info->gpu_id; } @@ -176,41 +189,39 @@ class TVMDSOOpTrait<GPUDevice> { template <typename DEVICE_TYPE, int NUM_INPUTS> -class TVMDSOOp : public tensorflow::OpKernel { - -private: +class TVMDSOOp : public OpKernel { + private: tvm::runtime::PackedFunc tvm_func; std::string lib_path; std::string func_name; tensorflow::DataType output_dtype; - + bool has_static_output_shape; std::vector<tensorflow::int64> static_output_shape; - void initAttributes(tensorflow::OpKernelConstruction* context) { + void initAttributes(OpKernelConstruction* context) { context->GetAttr("lib_path", &lib_path); context->GetAttr("func_name", &func_name); context->GetAttr("output_dtype", &output_dtype); - + context->GetAttr("has_static_output_shape", &has_static_output_shape); context->GetAttr("static_output_shape", &static_output_shape); } public: - explicit TVMDSOOp(tensorflow::OpKernelConstruction* context) : tensorflow::OpKernel(context) { - + explicit TVMDSOOp(OpKernelConstruction* context) : OpKernel(context) { // Get attr initAttributes(context); // Load TVM function from dynamic library - tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile(lib_path); + tvm::runtime::Module mod_dylib = + tvm::runtime::Module::LoadFromFile(lib_path); tvm_func = mod_dylib.GetFunction(func_name); CHECK(tvm_func != nullptr); } - - void Compute(tensorflow::OpKernelContext* context) override { + void Compute(tensorflow::OpKernelContext* context) override { DLTensor args[NUM_INPUTS + 1]; TensorAsBuf buf_info[NUM_INPUTS]; ShapeContainer shapes[NUM_INPUTS]; @@ -218,7 +229,7 @@ class TVMDSOOp : public tensorflow::OpKernel { tensorflow::Status status; int device_id = TVMDSOOpTrait<DEVICE_TYPE>::device_id(context); int device_type = TVMDSOOpTrait<DEVICE_TYPE>::device_type; - + DLContext dl_ctx = { DLDeviceType(device_type), device_id }; // Get output shape @@ -227,23 +238,25 @@ class TVMDSOOp : public tensorflow::OpKernel { if (has_static_output_shape) { // use static output shape const tensorflow::int64* dims = static_output_shape.data(); - tensorflow::TensorShapeUtils::MakeShape(dims, static_output_shape.size(), &output_shape); + tensorflow::TensorShapeUtils::MakeShape( + dims, static_output_shape.size(), &output_shape); } else if (output_shape_tensor.dims() == 1) { // use shape tensor values as output shape - const tensorflow::int64* dims = output_shape_tensor.flat<tensorflow::int64>().data(); + const tensorflow::int64* dims = + output_shape_tensor.flat<tensorflow::int64>().data(); tensorflow::TensorShapeUtils::MakeShape(dims, 1, &output_shape); } else { // use input tensor shape by default output_shape = context->input(0).shape(); } - + for (int i = 0; 
i < NUM_INPUTS; ++i) { // Grab the input tensor auto& input_tensor = context->input(i); // Create shape container, should keep ref during execution shapes[i] = input_tensor.shape().dim_sizes(); - auto shape_ptr = (int64_t*) shapes[i].data(); + auto shape_ptr = reinterpret_cast<int64_t*>(shapes[i].data()); TensorAsBuf& input = buf_info[i]; input.device_type = device_type; @@ -257,28 +270,33 @@ class TVMDSOOp : public tensorflow::OpKernel { // Allocate output tensor tensorflow::Tensor* output_tensor; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); - auto output_shape_dim_buf = output_tensor->shape().dim_sizes(); // should keep alive on stack - auto output_shape_ptr = (int64_t*) output_shape_dim_buf.data(); - + OP_REQUIRES_OK(context, context->allocate_output( + 0, output_shape, &output_tensor)); + // shape dimension buf should keep alive on stack + auto output_shape_dim_buf = output_tensor->shape().dim_sizes(); + auto output_shape_ptr = reinterpret_cast<int64_t*>( + output_shape_dim_buf.data()); + TensorAsBuf output; output.device_type = device_type; EnsureAlignment(context, *output_tensor, &output); status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[NUM_INPUTS]); OP_REQUIRES_OK(context, status); - + apply_variadic_by_ptrs(tvm_func, args); - - output.CopyToOrigin(); + + output.CopyToOrigin(); } }; #define REGISTER_TFTVM_KERNEL(n) \ - REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice, n>); \ - REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_GPU), TVMDSOOp<GPUDevice, n>); \ + REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \ + .Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice, n>); \ + REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \ + .Device(tensorflow::DEVICE_GPU), TVMDSOOp<GPUDevice, n>); \ REGISTER_TFTVM_KERNEL(1) REGISTER_TFTVM_KERNEL(2) diff --git a/src/contrib/tf_op/tvm_dso_ops.cc b/src/contrib/tf_op/tvm_dso_ops.cc index 8369e928599b..dfb21b204b81 100644 --- a/src/contrib/tf_op/tvm_dso_ops.cc +++ b/src/contrib/tf_op/tvm_dso_ops.cc @@ -19,8 +19,6 @@ #include "tensorflow/core/framework/op.h" -using namespace tensorflow; #define REGISTER_TFTVM_OP(n) REGISTER_OP("TvmDsoOp" #n) \ .Output("output: output_dtype") \ .Attr("lib_path: string") \
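A note on the EnsureAlignment helper reworked above: TensorFlow hands the kernel whatever address its allocator produced, while the buffer passed to the TVM function is promised to sit on a 64-byte boundary, so a misaligned input is staged through a temporary tensor and copied back afterwards. Below is a minimal sketch of the rounding arithmetic, with plain Python integers standing in for pointer values; the names are illustrative and not part of the patch.

    # Power-of-two round-up, mirroring the C++ expression
    # (addr + alignment - 1) & ~(alignment - 1) in EnsureAlignment.
    ALIGNMENT = 64

    def round_up(addr):
        return (addr + ALIGNMENT - 1) & ~(ALIGNMENT - 1)

    assert round_up(128) == 128  # already aligned: use the buffer in place, offset 0
    assert round_up(130) == 192  # misaligned: stage through a temp tensor

When the address is already aligned the kernel uses the TensorFlow buffer directly; otherwise CopyFromOrigin and CopyToOrigin shuttle the bytes between the original buffer and the aligned scratch area before and after the TVM call.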
From 729a5dc60970dfc915f60c1430e69e7c2c8681ec Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 5 Mar 2020 14:34:32 +0800 Subject: [PATCH 08/30] feat: Add test script and fix gpu shape --- python/tvm/contrib/tf_op/test_tfop_module.py | 100 +++++++++++++++++++ src/codegen/build_module.cc | 1 - src/contrib/tf_op/tvm_dso_op_kernels.cc | 19 +++- 3 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 python/tvm/contrib/tf_op/test_tfop_module.py diff --git a/python/tvm/contrib/tf_op/test_tfop_module.py b/python/tvm/contrib/tf_op/test_tfop_module.py new file mode 100644 index 000000000000..8f1f147b7968 --- /dev/null +++ b/python/tvm/contrib/tf_op/test_tfop_module.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test script for tf op module""" +import tempfile +import os +import tensorflow as tf +import numpy as np +import tvm +from tvm.contrib import tf_op + + +def export_cpu_add_lib(): + """create cpu add op lib""" + n = tvm.var("n") + ph_a = tvm.placeholder((n,), name='ph_a') + ph_b = tvm.placeholder((n,), name='ph_b') + ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = tvm.create_schedule(ph_c.op) + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "llvm", name="vector_add") + lib_path = tempfile.mktemp("tvm_add_dll.so") + fadd_dylib.export_library(lib_path) + return lib_path + + +def export_gpu_add_lib(): + """create gpu add op lib""" + n = tvm.var("n") + ph_a = tvm.placeholder((n,), name='ph_a') + ph_b = tvm.placeholder((n,), name='ph_b') + ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = tvm.create_schedule(ph_c.op) + b_axis, t_axis = sched[ph_c].split(ph_c.op.axis[0], factor=64) + sched[ph_c].bind(b_axis, tvm.thread_axis("blockIdx.x")) + sched[ph_c].bind(t_axis, tvm.thread_axis("threadIdx.x")) + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "cuda", name="vector_add") + lib_path = tempfile.mktemp("tvm_add_cuda_dll.so") + fadd_dylib.export_library(lib_path) + return lib_path + + +def test_add(session, lib_path, tf_device): + """test add lib with TensorFlow wrapper""" + module = tf_op.Module(lib_path) + + left = tf.placeholder("float32", shape=[4]) + right = tf.placeholder("float32", shape=[4]) + + feed_dict = {left: [1.0, 2.0, 3.0, 4.0], right: [5.0, 6.0, 7.0, 8.0]} + expect = np.asarray([6.0, 8.0, 10.0, 12.0]) + + add1 = module.func("vector_add", output_shape=[4], output_dtype="float") + add2 = module.func("vector_add", output_shape=tf.shape(left), output_dtype="float") + add3 = module.func("vector_add", output_shape=[tf.shape(left)[0]], output_dtype="float") + + with tf.device(tf_device): + output1 = session.run(add1(left, right), feed_dict) + np.testing.assert_equal(output1, expect) + + output2 = session.run(add2(left, right), feed_dict) + np.testing.assert_equal(output2, expect) + + output3 = session.run(add3(left, right), feed_dict) + np.testing.assert_equal(output3, expect) + + +def main(): + """main test function""" + cpu_lib = None + gpu_lib = None + try: + cpu_lib = export_cpu_add_lib() + gpu_lib = export_gpu_add_lib() + with tf.Session() as session: + test_add(session, cpu_lib, "/cpu:0") + test_add(session, gpu_lib, "/gpu:0") + finally: + if cpu_lib is not None: + os.remove(cpu_lib) + if gpu_lib is not None: + os.remove(gpu_lib) + + +if __name__ == "__main__": + main() diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 0fb761d768b0..80fd57af66f9 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -572,7 +572,6 @@ runtime::Module build(const Map<Target, Array<LoweredFunc>>& inputs, auto& fhost = host_dev_funcs[0]; auto& fdevice = host_dev_funcs[1]; // Get the module for a certain target. 
- printf("build device module: %s\n", it.first->device_name.c_str()); runtime::Module mdev = DeviceBuild(fdevice, it.first); for (const auto& it : fhost) { fhost_all.push_back(it); diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 363de7199307..24cb8531ee4f 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -242,9 +242,22 @@ class TVMDSOOp : public OpKernel { dims, static_output_shape.size(), &output_shape); } else if (output_shape_tensor.dims() == 1) { // use shape tensor values as output shape - const tensorflow::int64* dims = - output_shape_tensor.flat<tensorflow::int64>().data(); - tensorflow::TensorShapeUtils::MakeShape(dims, 1, &output_shape); + tensorflow::int64 num_dims = output_shape_tensor.NumElements(); + if (TVMDSOOpTrait<DEVICE_TYPE>::device_type == kDLGPU) { + const tensorflow::int64* flat = + output_shape_tensor.flat<tensorflow::int64>().data(); + tensorflow::int64* dims = new tensorflow::int64[num_dims]; + cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, + cudaMemcpyDeviceToHost); + tensorflow::TensorShapeUtils::MakeShape( + dims, num_dims, &output_shape); + delete[] dims; + } else { + const tensorflow::int64* dims = + output_shape_tensor.flat<tensorflow::int64>().data(); + tensorflow::TensorShapeUtils::MakeShape( + dims, num_dims, &output_shape); + } } else { // use input tensor shape by default output_shape = context->input(0).shape();
+"""Test script for tf op module""" +import tempfile +import os +import tensorflow as tf +import numpy as np +import tvm +from tvm.contrib import tf_op + + +def export_cpu_add_lib(): + """create cpu add op lib""" + n = tvm.var("n") + ph_a = tvm.placeholder((n,), name='ph_a') + ph_b = tvm.placeholder((n,), name='ph_b') + ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = tvm.create_schedule(ph_c.op) + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "llvm", name="vector_add") + lib_path = tempfile.mktemp("tvm_add_dll.so") + fadd_dylib.export_library(lib_path) + return lib_path + + +def export_gpu_add_lib(): + """create gpu add op lib""" + n = tvm.var("n") + ph_a = tvm.placeholder((n,), name='ph_a') + ph_b = tvm.placeholder((n,), name='ph_b') + ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = tvm.create_schedule(ph_c.op) + b_axis, t_axis = sched[ph_c].split(ph_c.op.axis[0], factor=64) + sched[ph_c].bind(b_axis, tvm.thread_axis("blockIdx.x")) + sched[ph_c].bind(t_axis, tvm.thread_axis("threadIdx.x")) + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "cuda", name="vector_add") + lib_path = tempfile.mktemp("tvm_add_cuda_dll.so") + fadd_dylib.export_library(lib_path) + return lib_path + + +def test_add(session, lib_path, tf_device): + """test add lib with TensorFlow wrapper""" + module = tf_op.Module(lib_path) + + left = tf.placeholder("float32", shape=[4]) + right = tf.placeholder("float32", shape=[4]) + + feed_dict = {left: [1.0, 2.0, 3.0, 4.0], right: [5.0, 6.0, 7.0, 8.0]} + expect = np.asarray([6.0, 8.0, 10.0, 12.0]) + + add1 = module.func("vector_add", output_shape=[4], output_dtype="float") + add2 = module.func("vector_add", output_shape=tf.shape(left), output_dtype="float") + add3 = module.func("vector_add", output_shape=[tf.shape(left)[0]], output_dtype="float") + + with tf.device(tf_device): + output1 = session.run(add1(left, right), feed_dict) + np.testing.assert_equal(output1, expect) + + output2 = session.run(add2(left, right), feed_dict) + np.testing.assert_equal(output2, expect) + + output3 = session.run(add3(left, right), feed_dict) + np.testing.assert_equal(output3, expect) + + +def main(): + """main test function""" + cpu_lib = None + gpu_lib = None + try: + cpu_lib = export_cpu_add_lib() + gpu_lib = export_gpu_add_lib() + with tf.Session() as session: + test_add(session, cpu_lib, "/cpu:0") + test_add(session, gpu_lib, "/gpu:0") + finally: + if cpu_lib is not None: + os.remove(cpu_lib) + if gpu_lib is not None: + os.remove(gpu_lib) + + +if __name__ == "__main__": + main() diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 0fb761d768b0..80fd57af66f9 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -572,7 +572,6 @@ runtime::Module build(const Map>& inputs, auto& fhost = host_dev_funcs[0]; auto& fdevice = host_dev_funcs[1]; // Get the module for a certain target. 
- printf("build device module: %s\n", it.first->device_name.c_str()); runtime::Module mdev = DeviceBuild(fdevice, it.first); for (const auto& it : fhost) { fhost_all.push_back(it); diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 363de7199307..24cb8531ee4f 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -242,9 +242,22 @@ class TVMDSOOp : public OpKernel { dims, static_output_shape.size(), &output_shape); } else if (output_shape_tensor.dims() == 1) { // use shape tensor values as output shape - const tensorflow::int64* dims = - output_shape_tensor.flat().data(); - tensorflow::TensorShapeUtils::MakeShape(dims, 1, &output_shape); + tensorflow::int64 num_dims = output_shape_tensor.NumElements(); + if (TVMDSOOpTrait::device_type == kDLGPU) { + const tensorflow::int64* flat = + output_shape_tensor.flat().data(); + tensorflow::int64* dims = new tensorflow::int64[num_dims]; + cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, + cudaMemcpyDeviceToHost); + tensorflow::TensorShapeUtils::MakeShape( + dims, num_dims, &output_shape); + delete dims; + } else { + const tensorflow::int64* dims = + output_shape_tensor.flat().data(); + tensorflow::TensorShapeUtils::MakeShape( + dims, num_dims, &output_shape); + } } else { // use input tensor shape by default output_shape = context->input(0).shape(); From 0debcd682e99b4c04d1be091b0b0e5f9383d93f2 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Fri, 6 Mar 2020 11:57:49 +0800 Subject: [PATCH 10/30] fix: Conditional build tftvm op for gpu --- cmake/modules/contrib/TF_TVMDSOOP.cmake | 4 ++ python/tvm/contrib/tf_op/module.py | 2 +- src/contrib/tf_op/tvm_dso_op_kernels.cc | 60 +++++++++++++++++-------- 3 files changed, 46 insertions(+), 20 deletions(-) diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake index 97e5dd4d2190..ddd3b9c31c3d 100644 --- a/cmake/modules/contrib/TF_TVMDSOOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -24,6 +24,10 @@ if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") if (NOT ${TF_STATUS} EQUAL 0) message(FATAL_ERROR "Fail to get TensorFlow compile flags") endif() + + if(NOT USE_CUDA STREQUAL "OFF") + add_definitions(-DTF_TVMDSOOP_ENABLE_GPU) + endif() execute_process(COMMAND ${Python_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LINK_FLAGS_STR diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index ed2622fd3cff..08384ad8b0ba 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -102,7 +102,7 @@ def _pack_shape_tensor(self, shape): if shape.dtype == tf.int32: shape = tf.cast(shape, tf.int64) return shape - elif isinstance(shape, list): + if isinstance(shape, list): shape_dims = [] for dim_value in shape: if isinstance(dim_value, int): diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 24cb8531ee4f..c82d60db0e70 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -17,7 +17,9 @@ * under the License. 
*/ +#ifdef TF_TVMDSOOP_ENABLE_GPU #include <cuda_runtime.h> +#endif #include #include @@ -67,9 +69,11 @@ class TensorAsBuf { } if (device_type == kDLCPU) { memcpy(origin_buf, buf + offset, size); +#ifdef TF_TVMDSOOP_ENABLE_GPU } else if (device_type == kDLGPU) { cudaMemcpy(origin_buf, buf + offset, size, cudaMemcpyDeviceToDevice); +#endif } else { LOG(FATAL) << "Only support CPU and CUDA now. Device " << device_type << " is not implemented currently"; @@ -82,9 +86,11 @@ } if (device_type == kDLCPU) { memcpy(buf + offset, origin_buf, size); +#ifdef TF_TVMDSOOP_ENABLE_GPU } else if (device_type == kDLGPU) { cudaMemcpy(buf + offset, origin_buf, size, cudaMemcpyDeviceToDevice); +#endif } else { LOG(FATAL) << "Only support CPU and CUDA now. Device " << device_type << " is not implemented currently"; @@ -172,9 +178,19 @@ class TVMDSOOpTrait<CPUDevice> { static int device_id(OpKernelContext* context) { return 0; } -}; + static void make_shape_from_tensor( + const tensorflow::Tensor& shape_tensor, + tensorflow::TensorShape* output_shape) { + tensorflow::int64 num_dims = shape_tensor.NumElements(); + const tensorflow::int64* dims = + shape_tensor.flat<tensorflow::int64>().data(); + tensorflow::TensorShapeUtils::MakeShape( + dims, num_dims, output_shape); + } +}; +#ifdef TF_TVMDSOOP_ENABLE_GPU template <> class TVMDSOOpTrait<GPUDevice> { public: @@ -185,7 +201,22 @@ class TVMDSOOpTrait<GPUDevice> { auto device_base = context->device(); auto gpu_device_info = device_base->tensorflow_gpu_device_info(); return gpu_device_info->gpu_id; } + + static void make_shape_from_tensor( + const tensorflow::Tensor& shape_tensor, + tensorflow::TensorShape* output_shape) { + tensorflow::int64 num_dims = shape_tensor.NumElements(); + const tensorflow::int64* flat = + shape_tensor.flat<tensorflow::int64>().data(); + tensorflow::int64* dims = new tensorflow::int64[num_dims]; + cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, + cudaMemcpyDeviceToHost); + tensorflow::TensorShapeUtils::MakeShape( + dims, num_dims, output_shape); + delete[] dims; + } }; +#endif template <typename DEVICE_TYPE, int NUM_INPUTS> @@ -242,22 +273,8 @@ class TVMDSOOp : public OpKernel { dims, static_output_shape.size(), &output_shape); } else if (output_shape_tensor.dims() == 1) { // use shape tensor values as output shape - tensorflow::int64 num_dims = output_shape_tensor.NumElements(); - if (TVMDSOOpTrait<DEVICE_TYPE>::device_type == kDLGPU) { - const tensorflow::int64* flat = - output_shape_tensor.flat<tensorflow::int64>().data(); - tensorflow::int64* dims = new tensorflow::int64[num_dims]; - cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, - cudaMemcpyDeviceToHost); - tensorflow::TensorShapeUtils::MakeShape( - dims, num_dims, &output_shape); - delete[] dims; - } else { - const tensorflow::int64* dims = - output_shape_tensor.flat<tensorflow::int64>().data(); - tensorflow::TensorShapeUtils::MakeShape( - dims, num_dims, &output_shape); - } + TVMDSOOpTrait<DEVICE_TYPE>::make_shape_from_tensor( + output_shape_tensor, &output_shape); } else { // use input tensor shape by default output_shape = context->input(0).shape(); @@ -304,12 +321,17 @@ class TVMDSOOp : public OpKernel { }; - +#ifdef TF_TVMDSOOP_ENABLE_GPU #define REGISTER_TFTVM_KERNEL(n) \ REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \ .Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice, n>); \ REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \ - .Device(tensorflow::DEVICE_GPU), TVMDSOOp<GPUDevice, n>); \ + .Device(tensorflow::DEVICE_GPU), TVMDSOOp<GPUDevice, n>); +#else +#define REGISTER_TFTVM_KERNEL(n) \ + REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \ + .Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice, n>); +#endif REGISTER_TFTVM_KERNEL(1) REGISTER_TFTVM_KERNEL(2)
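With the GPU paths now fenced behind TF_TVMDSOOP_ENABLE_GPU, a CPU-only build registers just the DEVICE_CPU kernels while a CUDA build registers both. A quick way to sanity-check what a given build exposes is to load the library directly; this sketch assumes the shared object kept the prefix-less target name tvm_dso_op.so set by the CMake module:

    import tensorflow as tf

    lib = tf.load_op_library("tvm_dso_op.so")  # path/name assumption
    # REGISTER_OP("TvmDsoOp1") surfaces as a snake_case wrapper on the stub:
    print(lib.tvm_dso_op1)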
From 7eae33d2014a0e3cfa43ae2057f89b3ebfc013f7 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Mon, 9 Mar 2020 17:44:04 +0800 Subject: [PATCH 12/30] fix: Fix pylint of tf_op module.py --- python/tvm/contrib/tf_op/module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index 08384ad8b0ba..5e8892d6746b 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -101,8 +101,7 @@ def _pack_shape_tensor(self, shape): if isinstance(shape, tf.Tensor): if shape.dtype == tf.int32: shape = tf.cast(shape, tf.int64) - return shape - if isinstance(shape, list): + elif isinstance(shape, list): shape_dims = [] for dim_value in shape: if isinstance(dim_value, int): @@ -113,6 +112,7 @@ def
_pack_shape_tensor(self, shape): shape_dims.append(dim_value) else: raise TypeError("Input shape dimension is neither scalar tensor nor int") - return tf.stack(shape_dims) + shape = tf.stack(shape_dims) else: raise TypeError("Input shape is neither tensor nor list") + return shape
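The _pack_shape_tensor method patched above is what lets module.func take an output shape in several forms and normalize each to the int64 shape tensor the op expects. As a usage sketch, the three forms the test script exercises (module and left as defined there):

    add_static = module.func("vector_add", output_shape=[4],
                             output_dtype="float")   # plain list of ints
    add_dynamic = module.func("vector_add", output_shape=tf.shape(left),
                              output_dtype="float")  # int32/int64 shape tensor
    add_mixed = module.func("vector_add", output_shape=[tf.shape(left)[0]],
                            output_dtype="float")    # ints mixed with scalar tensors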
From 1fc54ecad79765d01c29af77747028b801f5a76d Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Tue, 10 Mar 2020 12:53:48 +0800 Subject: [PATCH 14/30] feat: Conditional enable gpu test for tftvm op --- python/tvm/contrib/tf_op/test_tfop_module.py | 33 +++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/python/tvm/contrib/tf_op/test_tfop_module.py b/python/tvm/contrib/tf_op/test_tfop_module.py index 8f1f147b7968..b75736e841c2 100644 --- a/python/tvm/contrib/tf_op/test_tfop_module.py +++ b/python/tvm/contrib/tf_op/test_tfop_module.py @@ -19,6 +19,7 @@ """Test script for tf op module""" import tempfile import os +import logging import tensorflow as tf import numpy as np import tvm @@ -32,7 +33,7 @@ def export_cpu_add_lib(): ph_b = tvm.placeholder((n,), name='ph_b') ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') sched = tvm.create_schedule(ph_c.op) - fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "llvm", name="vector_add") + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "c", name="vector_add") lib_path = tempfile.mktemp("tvm_add_dll.so") fadd_dylib.export_library(lib_path) return lib_path @@ -79,22 +80,38 @@ def test_add(session, lib_path, tf_device): np.testing.assert_equal(output3, expect) -def main(): - """main test function""" +def cpu_test(session): + """test function for cpu""" cpu_lib = None - gpu_lib = None try: cpu_lib = export_cpu_add_lib() - gpu_lib = export_gpu_add_lib() - with tf.Session() as session: - test_add(session, cpu_lib, "/cpu:0") - test_add(session, gpu_lib, "/gpu:0") + test_add(session, cpu_lib, "/cpu:0") finally: if cpu_lib is not None: os.remove(cpu_lib) + + +def gpu_test(session): + """test function for gpu""" + gpu_lib = None + try: + gpu_lib = export_gpu_add_lib() + test_add(session, gpu_lib, "/gpu:0") + finally: if gpu_lib is not None: os.remove(gpu_lib) +def main(): + """main test function""" + with tf.Session() as session: + if tvm.module.enabled("cpu"): + logging.info("Test TensorFlow op on cpu kernel") + cpu_test(session) + if tvm.module.enabled("gpu"): + logging.info("Test TensorFlow op on gpu kernel") + gpu_test(session) + + if __name__ == "__main__": main()
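Gating the GPU half of the test on how TVM was built keeps the script runnable on CPU-only checkouts. Under pytest the same intent could also be expressed with a skip marker; a sketch, assuming tvm.runtime.enabled, the spelling the later patches in this series settle on:

    import pytest
    import tvm

    @pytest.mark.skipif(not tvm.runtime.enabled("gpu"),
                        reason="TVM was built without GPU support")
    def test_gpu_add():
        ...  # build the CUDA lib and run test_add on /gpu:0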
From 864c4a5265bc97c8a8c60abf04ca34dbfdcbfa0e Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Mon, 16 Mar 2020 18:37:13 +0800 Subject: [PATCH 16/30] feat: Add tf_tvmdsoop test script as an app test --- apps/tf_tvmdsoop/CMakeLists.txt | 32 ++++++++++++++++++ apps/tf_tvmdsoop/prepare_tfop_module.sh | 33 +++++++++++++++++++ .../tf_tvmdsoop/tests}/test_tfop_module.py | 0 cmake/modules/contrib/TF_TVMDSOOP.cmake | 5 ++- tests/scripts/task_python_integration.sh | 6 ++++ 5 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 apps/tf_tvmdsoop/CMakeLists.txt create mode 100644 apps/tf_tvmdsoop/prepare_tfop_module.sh rename {python/tvm/contrib/tf_op => apps/tf_tvmdsoop/tests}/test_tfop_module.py (100%) diff --git a/apps/tf_tvmdsoop/CMakeLists.txt b/apps/tf_tvmdsoop/CMakeLists.txt new file mode 100644 index 000000000000..ab9cc34e6fe0 --- /dev/null +++ b/apps/tf_tvmdsoop/CMakeLists.txt @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +cmake_minimum_required(VERSION 3.2) +project(tf_tvmdsoop C CXX) + +set(CMAKE_CURRENT_SOURCE_DIR ${TVM_ROOT}) + +include_directories(${TVM_ROOT}/3rdparty/dlpack/include/) +include_directories(${TVM_ROOT}/3rdparty/dmlc-core/include/) +include_directories(${TVM_ROOT}/include) + +link_directories(${TVM_ROOT}/build) + +include(${TVM_ROOT}/cmake/util/FindCUDA.cmake) +include(${TVM_ROOT}/cmake/modules/CUDA.cmake) + +set(BUILD_TVMDSOOP_ONLY ON) +include(${TVM_ROOT}/cmake/modules/contrib/TF_TVMDSOOP.cmake) diff --git a/apps/tf_tvmdsoop/prepare_tfop_module.sh b/apps/tf_tvmdsoop/prepare_tfop_module.sh new file mode 100644 index 000000000000..1081f3b5bf52 --- /dev/null +++ b/apps/tf_tvmdsoop/prepare_tfop_module.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +TVM_ROOT=$(cd $(dirname $0)/../..; pwd) +echo "TVM_ROOT=${TVM_ROOT}" + +export PYTHONPATH=${TVM_ROOT}/python + +python -c "import tvm; print(tvm.module.enabled('gpu'))" | grep -e 1 +if [[ $? -eq 0 ]]; then + echo "Build TF_TVMDSOOP with gpu support" + CMAKE_OPTIONS="-DUSE_CUDA=ON -DTVM_ROOT=${TVM_ROOT}" +else + CMAKE_OPTIONS="-DUSE_CUDA=OFF -DTVM_ROOT=${TVM_ROOT}" +fi + +mkdir -p build; cd build; cmake ..
${CMAKE_OPTIONS}; make + diff --git a/python/tvm/contrib/tf_op/test_tfop_module.py b/apps/tf_tvmdsoop/tests/test_tfop_module.py similarity index 100% rename from python/tvm/contrib/tf_op/test_tfop_module.py rename to apps/tf_tvmdsoop/tests/test_tfop_module.py diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake index ddd3b9c31c3d..c5dd122dfb36 100644 --- a/cmake/modules/contrib/TF_TVMDSOOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -46,7 +46,10 @@ if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") add_library(${OP_LIBRARY_NAME} SHARED ${TFTVM_SRCS}) set_target_properties(${OP_LIBRARY_NAME} PROPERTIES PREFIX "") set(TFTVM_LINK_FLAGS -ltvm -L${CMAKE_CURRENT_BINARY_DIR}) - add_dependencies(${OP_LIBRARY_NAME} tvm) + + if (NOT BUILD_TVMDSOOP_ONLY STREQUAL "ON") + add_dependencies(${OP_LIBRARY_NAME} tvm) + endif() # set(TFTVM_COMPILE_FLAGS ${CMAKE_CXX_FLAGS}) target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS}) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 5c00fd9c8896..8b08c4fa8573 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -53,6 +53,12 @@ cd ../.. TVM_FFI=cython python3 -m pytest -v apps/dso_plugin_module TVM_FFI=ctypes python3 -m pytest -v apps/dso_plugin_module +# Test TensorFlow TVMDSOOP +cd apps/tf_tvmdsoop +sh prepare_tfop_module.sh +cd ../.. +TVM_FFI=ctypes LD_LIBRARY_PATH=apps/tf_tvmdsoop/build:$LD_LIBRARY_PATH \ + python3 -m pytest -v apps/tf_tvmdsoop/tests TVM_FFI=ctypes python3 -m pytest -v tests/python/integration TVM_FFI=ctypes python3 -m pytest -v tests/python/contrib From ec055117c7580a25c9687fe3df7bd2d4415a5eb8 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Tue, 17 Mar 2020 17:59:29 +0800 Subject: [PATCH 17/30] fix: Fix gpu/cpu enabled check on tvm in test script --- apps/tf_tvmdsoop/prepare_tfop_module.sh | 4 +-- apps/tf_tvmdsoop/tests/test_tfop_module.py | 29 +++++++++++----------- tests/scripts/task_python_integration.sh | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/apps/tf_tvmdsoop/prepare_tfop_module.sh b/apps/tf_tvmdsoop/prepare_tfop_module.sh index 1081f3b5bf52..3d1caf0c7274 100644 --- a/apps/tf_tvmdsoop/prepare_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_tfop_module.sh @@ -21,8 +21,8 @@ echo "TVM_ROOT=${TVM_ROOT}" export PYTHONPATH=${TVM_ROOT}/python -python -c "import tvm; print(tvm.module.enabled('gpu'))" | grep -e 1 -if [[ $? -eq 0 ]]; then +python3 -c "import tvm; print(tvm.runtime.enabled('gpu'))" | grep -e 1 +if [ "$?" 
-eq 0 ]; then echo "Build TF_TVMDSOOP with gpu support" CMAKE_OPTIONS="-DUSE_CUDA=ON -DTVM_ROOT=${TVM_ROOT}" else diff --git a/apps/tf_tvmdsoop/tests/test_tfop_module.py b/apps/tf_tvmdsoop/tests/test_tfop_module.py index b75736e841c2..00c9370b2f4f 100644 --- a/apps/tf_tvmdsoop/tests/test_tfop_module.py +++ b/apps/tf_tvmdsoop/tests/test_tfop_module.py @@ -23,16 +23,17 @@ import tensorflow as tf import numpy as np import tvm +from tvm import te from tvm.contrib import tf_op def export_cpu_add_lib(): """create cpu add op lib""" - n = tvm.var("n") - ph_a = tvm.placeholder((n,), name='ph_a') - ph_b = tvm.placeholder((n,), name='ph_b') - ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') - sched = tvm.create_schedule(ph_c.op) + n = te.var("n") + ph_a = te.placeholder((n,), name='ph_a') + ph_b = te.placeholder((n,), name='ph_b') + ph_c = te.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = te.create_schedule(ph_c.op) fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "c", name="vector_add") lib_path = tempfile.mktemp("tvm_add_dll.so") fadd_dylib.export_library(lib_path) @@ -41,14 +42,14 @@ def export_cpu_add_lib(): def export_gpu_add_lib(): """create gpu add op lib""" - n = tvm.var("n") - ph_a = tvm.placeholder((n,), name='ph_a') - ph_b = tvm.placeholder((n,), name='ph_b') - ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') - sched = tvm.create_schedule(ph_c.op) + n = te.var("n") + ph_a = te.placeholder((n,), name='ph_a') + ph_b = te.placeholder((n,), name='ph_b') + ph_c = te.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = te.create_schedule(ph_c.op) b_axis, t_axis = sched[ph_c].split(ph_c.op.axis[0], factor=64) - sched[ph_c].bind(b_axis, tvm.thread_axis("blockIdx.x")) - sched[ph_c].bind(t_axis, tvm.thread_axis("threadIdx.x")) + sched[ph_c].bind(b_axis, te.thread_axis("blockIdx.x")) + sched[ph_c].bind(t_axis, te.thread_axis("threadIdx.x")) fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "cuda", name="vector_add") lib_path = tempfile.mktemp("tvm_add_cuda_dll.so") fadd_dylib.export_library(lib_path) @@ -105,10 +106,10 @@ def gpu_test(session): def main(): """main test function""" with tf.Session() as session: - if tvm.module.enabled("cpu"): + if tvm.runtime.enabled("cpu"): logging.info("Test TensorFlow op on cpu kernel") cpu_test(session) - if tvm.module.enabled("gpu"): + if tvm.runtime.enabled("gpu"): logging.info("Test TensorFlow op on gpu kernel") gpu_test(session) diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 8b08c4fa8573..b7c3ba1e18ad 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -57,7 +57,7 @@ TVM_FFI=ctypes python3 -m pytest -v apps/dso_plugin_module cd apps/tf_tvmdsoop sh prepare_tfop_module.sh cd ../.. 
-TVM_FFI=ctypes LD_LIBRARY_PATH=apps/tf_tvmdsoop/build:$LD_LIBRARY_PATH \ +LD_LIBRARY_PATH=apps/tf_tvmdsoop/build:$LD_LIBRARY_PATH \ python3 -m pytest -v apps/tf_tvmdsoop/tests TVM_FFI=ctypes python3 -m pytest -v tests/python/integration From 51ed77956316793a8eed85486b91506020f89df0 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Tue, 17 Mar 2020 18:45:50 +0800 Subject: [PATCH 18/30] fix: Make tf tvmdso op test script runnable with pytest --- ...ule.sh => prepare_and_test_tfop_module.sh} | 13 +- apps/tf_tvmdsoop/tests/test_tfop_module.py | 156 +++++++++--------- tests/scripts/task_python_integration.sh | 5 +- 3 files changed, 88 insertions(+), 86 deletions(-) rename apps/tf_tvmdsoop/{prepare_tfop_module.sh => prepare_and_test_tfop_module.sh} (82%) diff --git a/apps/tf_tvmdsoop/prepare_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh similarity index 82% rename from apps/tf_tvmdsoop/prepare_tfop_module.sh rename to apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh index 3d1caf0c7274..3a4a6937de7f 100644 --- a/apps/tf_tvmdsoop/prepare_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh @@ -23,11 +23,14 @@ export PYTHONPATH=${TVM_ROOT}/python python3 -c "import tvm; print(tvm.runtime.enabled('gpu'))" | grep -e 1 if [ "$?" -eq 0 ]; then - echo "Build TF_TVMDSOOP with gpu support" + echo "Build TF_TVMDSOOP with gpu support and execute tests" CMAKE_OPTIONS="-DUSE_CUDA=ON -DTVM_ROOT=${TVM_ROOT}" -else - CMAKE_OPTIONS="-DUSE_CUDA=OFF -DTVM_ROOT=${TVM_ROOT}" -fi + + mkdir -p build + pushd build + cmake .. ${CMAKE_OPTIONS} && make + popd -mkdir -p build; cd build; cmake .. ${CMAKE_OPTIONS}; make + LD_LIBRARY_PATH=./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests +fi diff --git a/apps/tf_tvmdsoop/tests/test_tfop_module.py b/apps/tf_tvmdsoop/tests/test_tfop_module.py index 00c9370b2f4f..f2dee98ee01c 100644 --- a/apps/tf_tvmdsoop/tests/test_tfop_module.py +++ b/apps/tf_tvmdsoop/tests/test_tfop_module.py @@ -27,84 +27,84 @@ from tvm.contrib import tf_op -def export_cpu_add_lib(): - """create cpu add op lib""" - n = te.var("n") - ph_a = te.placeholder((n,), name='ph_a') - ph_b = te.placeholder((n,), name='ph_b') - ph_c = te.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') - sched = te.create_schedule(ph_c.op) - fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "c", name="vector_add") - lib_path = tempfile.mktemp("tvm_add_dll.so") - fadd_dylib.export_library(lib_path) - return lib_path - - -def export_gpu_add_lib(): - """create gpu add op lib""" - n = te.var("n") - ph_a = te.placeholder((n,), name='ph_a') - ph_b = te.placeholder((n,), name='ph_b') - ph_c = te.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') - sched = te.create_schedule(ph_c.op) - b_axis, t_axis = sched[ph_c].split(ph_c.op.axis[0], factor=64) - sched[ph_c].bind(b_axis, te.thread_axis("blockIdx.x")) - sched[ph_c].bind(t_axis, te.thread_axis("threadIdx.x")) - fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "cuda", name="vector_add") - lib_path = tempfile.mktemp("tvm_add_cuda_dll.so") - fadd_dylib.export_library(lib_path) - return lib_path - - -def test_add(session, lib_path, tf_device): - """test add lib with TensorFlow wrapper""" - module = tf_op.Module(lib_path) - - left = tf.placeholder("float32", shape=[4]) - right = tf.placeholder("float32", shape=[4]) - - feed_dict = {left: [1.0, 2.0, 3.0, 4.0], right: [5.0, 6.0, 7.0, 8.0]} - expect = np.asarray([6.0, 8.0, 10.0, 12.0]) - - add1 = module.func("vector_add", output_shape=[4], output_dtype="float") - add2 = 
module.func("vector_add", output_shape=tf.shape(left), output_dtype="float") - add3 = module.func("vector_add", output_shape=[tf.shape(left)[0]], output_dtype="float") - - with tf.device(tf_device): - output1 = session.run(add1(left, right), feed_dict) - np.testing.assert_equal(output1, expect) - - output2 = session.run(add2(left, right), feed_dict) - np.testing.assert_equal(output2, expect) - - output3 = session.run(add3(left, right), feed_dict) - np.testing.assert_equal(output3, expect) - - -def cpu_test(session): - """test function for cpu""" - cpu_lib = None - try: - cpu_lib = export_cpu_add_lib() - test_add(session, cpu_lib, "/cpu:0") - finally: - if cpu_lib is not None: - os.remove(cpu_lib) - - -def gpu_test(session): - """test function for gpu""" - gpu_lib = None - try: - gpu_lib = export_gpu_add_lib() - test_add(session, gpu_lib, "/gpu:0") - finally: - if gpu_lib is not None: - os.remove(gpu_lib) - - -def main(): +def test_use_tvmdso_op(): """main test function""" + + def export_cpu_add_lib(): + """create cpu add op lib""" + n = te.var("n") + ph_a = te.placeholder((n,), name='ph_a') + ph_b = te.placeholder((n,), name='ph_b') + ph_c = te.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = te.create_schedule(ph_c.op) + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "c", name="vector_add") + lib_path = tempfile.mktemp("tvm_add_dll.so") + fadd_dylib.export_library(lib_path) + return lib_path + + + def export_gpu_add_lib(): + """create gpu add op lib""" + n = te.var("n") + ph_a = te.placeholder((n,), name='ph_a') + ph_b = te.placeholder((n,), name='ph_b') + ph_c = te.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') + sched = te.create_schedule(ph_c.op) + b_axis, t_axis = sched[ph_c].split(ph_c.op.axis[0], factor=64) + sched[ph_c].bind(b_axis, te.thread_axis("blockIdx.x")) + sched[ph_c].bind(t_axis, te.thread_axis("threadIdx.x")) + fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "cuda", name="vector_add") + lib_path = tempfile.mktemp("tvm_add_cuda_dll.so") + fadd_dylib.export_library(lib_path) + return lib_path + + + def test_add(session, lib_path, tf_device): + """test add lib with TensorFlow wrapper""" + module = tf_op.Module(lib_path) + + left = tf.placeholder("float32", shape=[4]) + right = tf.placeholder("float32", shape=[4]) + + feed_dict = {left: [1.0, 2.0, 3.0, 4.0], right: [5.0, 6.0, 7.0, 8.0]} + expect = np.asarray([6.0, 8.0, 10.0, 12.0]) + + add1 = module.func("vector_add", output_shape=[4], output_dtype="float") + add2 = module.func("vector_add", output_shape=tf.shape(left), output_dtype="float") + add3 = module.func("vector_add", output_shape=[tf.shape(left)[0]], output_dtype="float") + + with tf.device(tf_device): + output1 = session.run(add1(left, right), feed_dict) + np.testing.assert_equal(output1, expect) + + output2 = session.run(add2(left, right), feed_dict) + np.testing.assert_equal(output2, expect) + + output3 = session.run(add3(left, right), feed_dict) + np.testing.assert_equal(output3, expect) + + + def cpu_test(session): + """test function for cpu""" + cpu_lib = None + try: + cpu_lib = export_cpu_add_lib() + test_add(session, cpu_lib, "/cpu:0") + finally: + if cpu_lib is not None: + os.remove(cpu_lib) + + + def gpu_test(session): + """test function for gpu""" + gpu_lib = None + try: + gpu_lib = export_gpu_add_lib() + test_add(session, gpu_lib, "/gpu:0") + finally: + if gpu_lib is not None: + os.remove(gpu_lib) + with tf.Session() as session: if tvm.runtime.enabled("cpu"): logging.info("Test TensorFlow op on cpu kernel") @@ 
-115,4 +115,4 @@ def main(): if __name__ == "__main__": - main() + test_use_tvmdso_op() diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index b7c3ba1e18ad..3569505cb551 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -55,10 +55,9 @@ TVM_FFI=ctypes python3 -m pytest -v apps/dso_plugin_module # Test TensorFlow TVMDSOOP cd apps/tf_tvmdsoop -sh prepare_tfop_module.sh +TVM_FFI=cython sh prepare_and_test_tfop_module.sh +TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh cd ../.. -LD_LIBRARY_PATH=apps/tf_tvmdsoop/build:$LD_LIBRARY_PATH \ - python3 -m pytest -v apps/tf_tvmdsoop/tests TVM_FFI=ctypes python3 -m pytest -v tests/python/integration TVM_FFI=ctypes python3 -m pytest -v tests/python/contrib From 685f7d049ed00cf4af1d1652c34e24c4368490de Mon Sep 17 00:00:00 2001 From: wrongtest Date: Tue, 17 Mar 2020 19:08:10 +0800 Subject: [PATCH 19/30] remove unused test script test_tfop_module.py --- python/tvm/contrib/tf_op/test_tfop_module.py | 117 ------------------- 1 file changed, 117 deletions(-) delete mode 100644 python/tvm/contrib/tf_op/test_tfop_module.py diff --git a/python/tvm/contrib/tf_op/test_tfop_module.py b/python/tvm/contrib/tf_op/test_tfop_module.py deleted file mode 100644 index b75736e841c2..000000000000 --- a/python/tvm/contrib/tf_op/test_tfop_module.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Test script for tf op module""" -import tempfile -import os -import logging -import tensorflow as tf -import numpy as np -import tvm -from tvm.contrib import tf_op - - -def export_cpu_add_lib(): - """create cpu add op lib""" - n = tvm.var("n") - ph_a = tvm.placeholder((n,), name='ph_a') - ph_b = tvm.placeholder((n,), name='ph_b') - ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') - sched = tvm.create_schedule(ph_c.op) - fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "c", name="vector_add") - lib_path = tempfile.mktemp("tvm_add_dll.so") - fadd_dylib.export_library(lib_path) - return lib_path - - -def export_gpu_add_lib(): - """create gpu add op lib""" - n = tvm.var("n") - ph_a = tvm.placeholder((n,), name='ph_a') - ph_b = tvm.placeholder((n,), name='ph_b') - ph_c = tvm.compute(ph_a.shape, lambda i: ph_a[i] + ph_b[i], name='ph_c') - sched = tvm.create_schedule(ph_c.op) - b_axis, t_axis = sched[ph_c].split(ph_c.op.axis[0], factor=64) - sched[ph_c].bind(b_axis, tvm.thread_axis("blockIdx.x")) - sched[ph_c].bind(t_axis, tvm.thread_axis("threadIdx.x")) - fadd_dylib = tvm.build(sched, [ph_a, ph_b, ph_c], "cuda", name="vector_add") - lib_path = tempfile.mktemp("tvm_add_cuda_dll.so") - fadd_dylib.export_library(lib_path) - return lib_path - - -def test_add(session, lib_path, tf_device): - """test add lib with TensorFlow wrapper""" - module = tf_op.Module(lib_path) - - left = tf.placeholder("float32", shape=[4]) - right = tf.placeholder("float32", shape=[4]) - - feed_dict = {left: [1.0, 2.0, 3.0, 4.0], right: [5.0, 6.0, 7.0, 8.0]} - expect = np.asarray([6.0, 8.0, 10.0, 12.0]) - - add1 = module.func("vector_add", output_shape=[4], output_dtype="float") - add2 = module.func("vector_add", output_shape=tf.shape(left), output_dtype="float") - add3 = module.func("vector_add", output_shape=[tf.shape(left)[0]], output_dtype="float") - - with tf.device(tf_device): - output1 = session.run(add1(left, right), feed_dict) - np.testing.assert_equal(output1, expect) - - output2 = session.run(add2(left, right), feed_dict) - np.testing.assert_equal(output2, expect) - - output3 = session.run(add3(left, right), feed_dict) - np.testing.assert_equal(output3, expect) - - -def cpu_test(session): - """test function for cpu""" - cpu_lib = None - try: - cpu_lib = export_cpu_add_lib() - test_add(session, cpu_lib, "/cpu:0") - finally: - if cpu_lib is not None: - os.remove(cpu_lib) - - -def gpu_test(session): - """test function for gpu""" - gpu_lib = None - try: - gpu_lib = export_gpu_add_lib() - test_add(session, gpu_lib, "/gpu:0") - finally: - if gpu_lib is not None: - os.remove(gpu_lib) - - -def main(): - """main test function""" - with tf.Session() as session: - if tvm.module.enabled("cpu"): - logging.info("Test TensorFlow op on cpu kernel") - cpu_test(session) - if tvm.module.enabled("gpu"): - logging.info("Test TensorFlow op on gpu kernel") - gpu_test(session) - - -if __name__ == "__main__": - main() From ea6328b92d775a756232c3e1918339b1cfb9df63 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Wed, 18 Mar 2020 11:09:16 +0800 Subject: [PATCH 20/30] fix: Remove pushd & popd in tfdsoop test script --- apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh index 3a4a6937de7f..857bd9ed2b02 100644 --- a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh @@ -27,10 +27,8 @@ if [ "$?" 
-eq 0 ]; then CMAKE_OPTIONS="-DUSE_CUDA=ON -DTVM_ROOT=${TVM_ROOT}" mkdir -p build - pushd build - cmake .. ${CMAKE_OPTIONS} && make - popd - + cd build; cmake .. ${CMAKE_OPTIONS} && make + cd .. LD_LIBRARY_PATH=./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests fi From 0ae094261c2279888c76b29e59f8e740162c87f2 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Wed, 18 Mar 2020 13:16:51 +0800 Subject: [PATCH 21/30] fix: Upgrade tftvmop use python3 to find TensorFlow --- cmake/modules/contrib/TF_TVMDSOOP.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake index c5dd122dfb36..a7a6b497ca9c 100644 --- a/cmake/modules/contrib/TF_TVMDSOOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -16,9 +16,9 @@ # under the License. if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") - find_package(Python COMPONENTS Interpreter) + find_package(Python3 COMPONENTS Interpreter) - execute_process(COMMAND ${Python_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" + execute_process(COMMAND ${Python3_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_COMPILE_FLAGS_STR RESULT_VARIABLE TF_STATUS) if (NOT ${TF_STATUS} EQUAL 0) @@ -29,7 +29,7 @@ if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") add_definitions(-DTF_TVMDSOOP_ENABLE_GPU) endif() - execute_process(COMMAND ${Python_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags()))" + execute_process(COMMAND ${Python3_EXECUTABLE} -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LINK_FLAGS_STR RESULT_VARIABLE TF_STATUS) if (NOT ${TF_STATUS} EQUAL 0) From 95947006ab8eff1161f320371c0ad26503e941de Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Wed, 18 Mar 2020 13:16:51 +0800 Subject: [PATCH 22/30] fix: Upgrade tftvmop use python3 to find TensorFlow --- apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh index 857bd9ed2b02..be33b57de67b 100644 --- a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh @@ -24,7 +24,7 @@ export PYTHONPATH=${TVM_ROOT}/python python3 -c "import tvm; print(tvm.runtime.enabled('gpu'))" | grep -e 1 if [ "$?" -eq 0 ]; then echo "Build TF_TVMDSOOP with gpu support and execute tests" - CMAKE_OPTIONS="-DUSE_CUDA=ON -DTVM_ROOT=${TVM_ROOT}" + CMAKE_OPTIONS="-DUSE_CUDA=ON -DPython3_EXECUTABLE=python3 -DTVM_ROOT=${TVM_ROOT}" mkdir -p build cd build; cmake .. 
${CMAKE_OPTIONS} && make From 8a5d2fbd444b5cd09762c6a4c235282fa8fcee4e Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Wed, 18 Mar 2020 16:20:05 +0800 Subject: [PATCH 23/30] fix: Change target_link_options to target_link_libraries --- cmake/modules/contrib/TF_TVMDSOOP.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake index a7a6b497ca9c..bf716267ed5e 100644 --- a/cmake/modules/contrib/TF_TVMDSOOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -53,7 +53,7 @@ if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") # set(TFTVM_COMPILE_FLAGS ${CMAKE_CXX_FLAGS}) target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS}) - target_link_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS}) + target_link_libraries(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS}) endif() From b8fbd2e2c17e30df68e927f3768fa3571b07056f Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 19 Mar 2020 10:56:53 +0800 Subject: [PATCH 24/30] fix: Add tftvmop build script's c++ option --- apps/tf_tvmdsoop/CMakeLists.txt | 1 + cmake/modules/contrib/TF_TVMDSOOP.cmake | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/tf_tvmdsoop/CMakeLists.txt b/apps/tf_tvmdsoop/CMakeLists.txt index ab9cc34e6fe0..a445d0108fd3 100644 --- a/apps/tf_tvmdsoop/CMakeLists.txt +++ b/apps/tf_tvmdsoop/CMakeLists.txt @@ -28,5 +28,6 @@ link_directories(${TVM_ROOT}/build) include(${TVM_ROOT}/cmake/util/FindCUDA.cmake) include(${TVM_ROOT}/cmake/modules/CUDA.cmake) +set(TFTVM_COMPILE_FLAGS -std=c++11) set(BUILD_TVMDSOOP_ONLY ON) include(${TVM_ROOT}/cmake/modules/contrib/TF_TVMDSOOP.cmake) diff --git a/cmake/modules/contrib/TF_TVMDSOOP.cmake b/cmake/modules/contrib/TF_TVMDSOOP.cmake index bf716267ed5e..e92822a397ae 100644 --- a/cmake/modules/contrib/TF_TVMDSOOP.cmake +++ b/cmake/modules/contrib/TF_TVMDSOOP.cmake @@ -51,7 +51,6 @@ if(NOT USE_TF_TVMDSOOP STREQUAL "OFF") add_dependencies(${OP_LIBRARY_NAME} tvm) endif() - # set(TFTVM_COMPILE_FLAGS ${CMAKE_CXX_FLAGS}) target_compile_options(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_COMPILE_FLAGS} ${TF_COMPILE_FLAGS}) target_link_libraries(${OP_LIBRARY_NAME} PUBLIC ${TFTVM_LINK_FLAGS} ${TF_LINK_FLAGS}) From 380e1d7f6176e13df0501642c87502fa6aaaea2a Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 19 Mar 2020 11:50:00 +0800 Subject: [PATCH 25/30] fix: Add tvm library path to tf op test library path --- apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh index be33b57de67b..ef1b63771526 100644 --- a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh @@ -29,6 +29,6 @@ if [ "$?" -eq 0 ]; then mkdir -p build cd build; cmake .. ${CMAKE_OPTIONS} && make cd .. 
- LD_LIBRARY_PATH=./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests + LD_LIBRARY_PATH=${TVM_ROOT}/build:./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests fi From 0b3884d165ec6528df44768d0f0969b5120eb22c Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 19 Mar 2020 12:41:04 +0800 Subject: [PATCH 26/30] fix: Debug ci build for tftvm dso op --- apps/tf_tvmdsoop/CMakeLists.txt | 5 +++-- apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/tf_tvmdsoop/CMakeLists.txt b/apps/tf_tvmdsoop/CMakeLists.txt index a445d0108fd3..cb601ef6d30d 100644 --- a/apps/tf_tvmdsoop/CMakeLists.txt +++ b/apps/tf_tvmdsoop/CMakeLists.txt @@ -17,7 +17,10 @@ cmake_minimum_required(VERSION 3.2) project(tf_tvmdsoop C CXX) +set(TFTVM_COMPILE_FLAGS -std=c++11) +set(BUILD_TVMDSOOP_ONLY ON) set(CMAKE_CURRENT_SOURCE_DIR ${TVM_ROOT}) +set(CMAKE_CURRENT_BINARY_DIR ${TVM_ROOT}/build) include_directories(${TVM_ROOT}/3rdparty/dlpack/include/) include_directories(${TVM_ROOT}/3rdparty/dmlc-core/include/) @@ -28,6 +31,4 @@ link_directories(${TVM_ROOT}/build) include(${TVM_ROOT}/cmake/util/FindCUDA.cmake) include(${TVM_ROOT}/cmake/modules/CUDA.cmake) -set(TFTVM_COMPILE_FLAGS -std=c++11) -set(BUILD_TVMDSOOP_ONLY ON) include(${TVM_ROOT}/cmake/modules/contrib/TF_TVMDSOOP.cmake) diff --git a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh index ef1b63771526..f5bcb4797aa2 100644 --- a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh @@ -29,6 +29,10 @@ if [ "$?" -eq 0 ]; then mkdir -p build cd build; cmake .. ${CMAKE_OPTIONS} && make cd .. + + ldd ./build/tvm_dso_op.so + ls ${TVM_ROOT}/build + LD_LIBRARY_PATH=${TVM_ROOT}/build:./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests fi From 9fd18b823c5a2ff1ac9bfaad091bda84b19e00bf Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Wed, 1 Apr 2020 11:05:35 +0800 Subject: [PATCH 27/30] fix: Fix cmake error and skip tfop test --- cmake/config.cmake | 5 +++-- tests/scripts/task_python_integration.sh | 8 +++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index 4e3f56116a1b..65c60df60d6d 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -118,7 +118,7 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF) # - ON: enable llvm with cmake's find search # - OFF: disable llvm # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. -set(USE_LLVM OFF) +set(USE_LLVM ON) #--------------------------------------------- # Contrib libraries @@ -203,4 +203,5 @@ set(USE_VTA_FPGA OFF) set(USE_EXAMPLE_EXT_RUNTIME OFF) # Whether to build the TensorFlow TVMDSOOp module -set(USE_TF_TVMDSOOP OFF) +set(USE_TF_TVMDSOOP ON) + diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 3569505cb551..dcd8139abd81 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -53,11 +53,9 @@ cd ../.. TVM_FFI=cython python3 -m pytest -v apps/dso_plugin_module TVM_FFI=ctypes python3 -m pytest -v apps/dso_plugin_module -# Test TensorFlow TVMDSOOP -cd apps/tf_tvmdsoop -TVM_FFI=cython sh prepare_and_test_tfop_module.sh -TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh -cd ../.. 
+# Do not enable TensorFlow op +# TVM_FFI=cython sh prepare_and_test_tfop_module.sh +# TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh TVM_FFI=ctypes python3 -m pytest -v tests/python/integration TVM_FFI=ctypes python3 -m pytest -v tests/python/contrib From 8ac182f0b6c589fa952d9806704386836725a2e4 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Thu, 2 Apr 2020 12:11:57 +0800 Subject: [PATCH 28/30] fix: Fix typo and indentation issues --- .../prepare_and_test_tfop_module.sh | 3 - python/tvm/contrib/tf_op/__init__.py | 2 +- python/tvm/contrib/tf_op/module.py | 10 +- src/contrib/tf_op/index_seq.h | 32 +- src/contrib/tf_op/tvm_dso_op_kernels.cc | 308 ++++++++---------- src/contrib/tf_op/tvm_dso_ops.cc | 129 +++++--- 6 files changed, 239 insertions(+), 245 deletions(-) diff --git a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh index f5bcb4797aa2..2bde4f87c84e 100644 --- a/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh +++ b/apps/tf_tvmdsoop/prepare_and_test_tfop_module.sh @@ -30,9 +30,6 @@ if [ "$?" -eq 0 ]; then cd build; cmake .. ${CMAKE_OPTIONS} && make cd .. - ldd ./build/tvm_dso_op.so - ls ${TVM_ROOT}/build - LD_LIBRARY_PATH=${TVM_ROOT}/build:./build:$LD_LIBRARY_PATH python3 -m pytest -v ./tests fi diff --git a/python/tvm/contrib/tf_op/__init__.py b/python/tvm/contrib/tf_op/__init__.py index cabd8b716e7d..05d0ecc1ddc1 100644 --- a/python/tvm/contrib/tf_op/__init__.py +++ b/python/tvm/contrib/tf_op/__init__.py @@ -17,4 +17,4 @@ """Module container of TensorFlow TVMDSO op""" from . import module -Module = module.Module +OpModule = module.OpModule diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index 5e8892d6746b..446800c82a03 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -19,7 +19,7 @@ from tensorflow.python.framework import load_library -class Module(): +class OpModule(): """Module container of TensorFlow TVMDSO op which wraps exported TVM op implementation library to be called on TensorFlow side""" @@ -40,16 +40,16 @@ def func(self, name, output_dtype=None, output_shape=None): Returns ---------- - Func object that act as TensorFlow tensor to tensor function. + Func object that acts as TensorFlow tensor to tensor function. 
""" - return Func(self.lib_path, name, output_dtype, output_shape) + return TensorFunc(self.lib_path, name, output_dtype, output_shape) def __getitem__(self, func_name): return self.func(func_name) -class Func(): - """Function object that act as TensorFlow tensor to tensor function.""" +class TensorFunc(): + """Function object that acts as TensorFlow tensor to tensor function.""" def __init__(self, lib_path, func_name, output_dtype, output_shape): self.lib_path = lib_path diff --git a/src/contrib/tf_op/index_seq.h b/src/contrib/tf_op/index_seq.h index 7a9a423a5977..5448c1f5f42d 100644 --- a/src/contrib/tf_op/index_seq.h +++ b/src/contrib/tf_op/index_seq.h @@ -19,45 +19,43 @@ /* * Refer to std::index_sequence (since c++14) - * Utilities to invoke variadic function with template + * Utilities to invoke variadic function with template */ #ifndef TVM_CONTRIB_TF_OP_INDEX_SEQ_H_ #define TVM_CONTRIB_TF_OP_INDEX_SEQ_H_ -template +template struct IndexSeq {}; -template -struct IndexSeqHelper : public IndexSeqHelper {}; +template +struct IndexSeqHelper : public IndexSeqHelper {}; -template -struct IndexSeqHelper<0U, Tail ...> { - using type = IndexSeq; +template +struct IndexSeqHelper<0U, Tail...> { + using type = IndexSeq; }; template using make_index_sequence = typename IndexSeqHelper::type; - template -void apply_variadic_impl(F f, T(&t)[N], IndexSeq) { - f(t[Idx]...); +void apply_variadic_impl(F f, T (&t)[N], IndexSeq) { + f(t[Idx]...); } template -void apply_variadic(F f, T(&t)[N]) { - apply_variadic_impl(f, t, make_index_sequence{}); +void apply_variadic(F f, T (&t)[N]) { + apply_variadic_impl(f, t, make_index_sequence{}); } template -void apply_variadic_by_ptrs_impl(F f, T(&t)[N], IndexSeq) { - f(&t[Idx]...); +void apply_variadic_by_ptrs_impl(F f, T (&t)[N], IndexSeq) { + f(&t[Idx]...); } template -void apply_variadic_by_ptrs(F f, T(&t)[N]) { - apply_variadic_by_ptrs_impl(f, t, make_index_sequence{}); +void apply_variadic_by_ptrs(F f, T (&t)[N]) { + apply_variadic_by_ptrs_impl(f, t, make_index_sequence{}); } #endif // TVM_CONTRIB_TF_OP_INDEX_SEQ_H_ - diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index c82d60db0e70..03024da9a478 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -21,15 +21,13 @@ #include #endif #include - -#include #include -#include +#include #include +#include -#include "tensorflow/core/framework/op_kernel.h" #include "index_seq.h" - +#include "tensorflow/core/framework/op_kernel.h" typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; @@ -39,12 +37,10 @@ using tensorflow::OpKernel; using tensorflow::OpKernelConstruction; using tensorflow::OpKernelContext; - // Op utility trait for diffrent device type template template class TVMDSOOpTrait; - // Buffer information used for actual computation. // Each buffer is associated with one TensorFlow tensor // whose underlying buffer is record into "origin_buf". 
@@ -52,173 +48,150 @@ class TVMDSOOpTrait; // and for output tensor, copy data from buf to origin_buf class TensorAsBuf { public: - tensorflow::Tensor inline_tensor; - tensorflow::Tensor* tensor; + tensorflow::Tensor inline_tensor; + tensorflow::Tensor* tensor; - size_t size; - size_t offset; + size_t size; + size_t offset; - int device_type; + int device_type; - char* origin_buf; - char* buf; + char* origin_buf; + char* buf; - void CopyToOrigin() { - if (buf == origin_buf) { - return; - } - if (device_type == kDLCPU) { - memcpy(origin_buf, buf + offset, size); + void CopyToOrigin() { + if (buf == origin_buf) { + return; + } + if (device_type == kDLCPU) { + memcpy(origin_buf, buf + offset, size); #ifdef TF_TVMDSOOP_ENABLE_GPU - } else if (device_type == kDLGPU) { - cudaMemcpy(origin_buf, buf + offset, - size, cudaMemcpyDeviceToDevice); + } else if (device_type == kDLGPU) { + cudaMemcpy(origin_buf, buf + offset, size, cudaMemcpyDeviceToDevice); #endif - } else { - LOG(FATAL) << "Only support CPU and CUDA now. Device " - << device_type << " is not implemented currently"; - } + } else { + LOG(FATAL) << "Only support CPU and CUDA now. Device " << device_type + << " is not implemented currently"; } + } - void CopyFromOrigin() { - if (buf == origin_buf) { - return; - } - if (device_type == kDLCPU) { - memcpy(buf + offset, origin_buf, size); + void CopyFromOrigin() { + if (buf == origin_buf) { + return; + } + if (device_type == kDLCPU) { + memcpy(buf + offset, origin_buf, size); #ifdef TF_TVMDSOOP_ENABLE_GPU - } else if (device_type == kDLGPU) { - cudaMemcpy(buf + offset, origin_buf, - size, cudaMemcpyDeviceToDevice); + } else if (device_type == kDLGPU) { + cudaMemcpy(buf + offset, origin_buf, size, cudaMemcpyDeviceToDevice); #endif - } else { - LOG(FATAL) << "Only support CPU and CUDA now. Device " - << device_type << " is not implemented currently"; - } + } else { + LOG(FATAL) << "Only support CPU and CUDA now. 
Device " << device_type + << " is not implemented currently"; } + } }; - -tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, - DLDataType* res) { - auto dtype = tf_tensor.dtype(); - if (dtype == tensorflow::DT_FLOAT) { - *res = {kDLFloat, 32, 1}; - } else if (dtype == tensorflow::DT_INT64) { - *res = {kDLInt, 64, 1}; - } else if (dtype == tensorflow::DT_INT32) { - *res = {kDLInt, 32, 1}; - } else { - return tensorflow::Status(tensorflow::error::INTERNAL, - "Fail to get dlpack datatype"); - } - return tensorflow::Status::OK(); +tensorflow::Status GetDLPackDtype(const tensorflow::Tensor& tf_tensor, DLDataType* res) { + auto dtype = tf_tensor.dtype(); + if (dtype == tensorflow::DT_FLOAT) { + *res = {kDLFloat, 32, 1}; + } else if (dtype == tensorflow::DT_INT64) { + *res = {kDLInt, 64, 1}; + } else if (dtype == tensorflow::DT_INT32) { + *res = {kDLInt, 32, 1}; + } else { + return tensorflow::Status(tensorflow::error::INTERNAL, "Fail to get dlpack datatype"); + } + return tensorflow::Status::OK(); } - // Ensure buffer used for actual computation take 64byte alignment -void EnsureAlignment(OpKernelContext* ctx, - const tensorflow::Tensor& tensor, - TensorAsBuf* out) { - char* buf = const_cast(tensor.tensor_data().data()); - out->origin_buf = buf; - out->size = tensor.TotalBytes(); - - int alignment = 64; - char* aligned = reinterpret_cast( - ((uint64_t)buf + alignment - 1) & (~(alignment - 1))); - if (buf == aligned) { - out->tensor = const_cast(&tensor); - out->buf = buf; - out->offset = 0; - } else { - tensorflow::TensorShape buf_shape; - tensorflow::int64 dims[1] = - { (tensorflow::int64)(tensor.TotalBytes() + alignment) }; - tensorflow::TensorShapeUtils::MakeShape(dims, 1, &buf_shape); - - out->tensor = &out->inline_tensor; - ctx->allocate_temp(tensor.dtype(), buf_shape, out->tensor); - - buf = const_cast(out->tensor->tensor_data().data()); - char* buf_aligned = reinterpret_cast( - ((uint64_t)buf + alignment) & (~(alignment - 1))); - out->buf = buf; - out->offset = buf_aligned - buf; - } +void EnsureAlignment(OpKernelContext* ctx, const tensorflow::Tensor& tensor, TensorAsBuf* out) { + char* buf = const_cast(tensor.tensor_data().data()); + out->origin_buf = buf; + out->size = tensor.TotalBytes(); + + int alignment = 64; + char* aligned = reinterpret_cast(((uint64_t)buf + alignment - 1) & (~(alignment - 1))); + if (buf == aligned) { + out->tensor = const_cast(&tensor); + out->buf = buf; + out->offset = 0; + } else { + tensorflow::TensorShape buf_shape; + tensorflow::int64 dims[1] = {(tensorflow::int64)(tensor.TotalBytes() + alignment)}; + tensorflow::TensorShapeUtils::MakeShape(dims, 1, &buf_shape); + + out->tensor = &out->inline_tensor; + ctx->allocate_temp(tensor.dtype(), buf_shape, out->tensor); + + buf = const_cast(out->tensor->tensor_data().data()); + char* buf_aligned = reinterpret_cast(((uint64_t)buf + alignment) & (~(alignment - 1))); + out->buf = buf; + out->offset = buf_aligned - buf; + } } - // Create DLPack tensor from TensorFlow tensor -tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, - int64_t* tf_shape, DLTensor* out) { - DLDataType dlpack_type; - const tensorflow::Tensor& tensor = *src.tensor; - - auto status = GetDLPackDtype(tensor, &dlpack_type); - if (!status.ok()) { - return status; - } - out->ctx = ctx; - out->ndim = tensor.shape().dims(); - out->shape = tf_shape; - out->strides = nullptr; - out->byte_offset = 0; - out->dtype = dlpack_type; - out->data = src.buf + src.offset; - return tensorflow::Status::OK(); 
+tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, + DLTensor* out) { + DLDataType dlpack_type; + const tensorflow::Tensor& tensor = *src.tensor; + + auto status = GetDLPackDtype(tensor, &dlpack_type); + if (!status.ok()) { + return status; + } + out->ctx = ctx; + out->ndim = tensor.shape().dims(); + out->shape = tf_shape; + out->strides = nullptr; + out->byte_offset = 0; + out->dtype = dlpack_type; + out->data = src.buf + src.offset; + return tensorflow::Status::OK(); } - template <> class TVMDSOOpTrait { public: - static const int device_type = kDLCPU; + static const int device_type = kDLCPU; - static int device_id(OpKernelContext* context) { - return 0; - } + static int device_id(OpKernelContext* context) { return 0; } - static void make_shape_from_tensor( - const tensorflow::Tensor& shape_tensor, - tensorflow::TensorShape* output_shape) { - tensorflow::int64 num_dims = shape_tensor.NumElements(); - const tensorflow::int64* dims = - shape_tensor.flat().data(); - tensorflow::TensorShapeUtils::MakeShape( - dims, num_dims, output_shape); - } + static void make_shape_from_tensor(const tensorflow::Tensor& shape_tensor, + tensorflow::TensorShape* output_shape) { + tensorflow::int64 num_dims = shape_tensor.NumElements(); + const tensorflow::int64* dims = shape_tensor.flat().data(); + tensorflow::TensorShapeUtils::MakeShape(dims, num_dims, output_shape); + } }; #ifdef TF_TVMDSOOP_ENABLE_GPU template <> class TVMDSOOpTrait { public: - static const int device_type = kDLGPU; + static const int device_type = kDLGPU; - static int device_id(OpKernelContext* context) { - auto device_base = context->device(); - auto gpu_device_info = device_base->tensorflow_gpu_device_info(); - return gpu_device_info->gpu_id; - } + static int device_id(OpKernelContext* context) { + auto device_base = context->device(); + auto gpu_device_info = device_base->tensorflow_gpu_device_info(); + return gpu_device_info->gpu_id; + } - static void make_shape_from_tensor( - const tensorflow::Tensor& shape_tensor, - tensorflow::TensorShape* output_shape) { - tensorflow::int64 num_dims = shape_tensor.NumElements(); - const tensorflow::int64* flat = - shape_tensor.flat().data(); - tensorflow::int64* dims = new tensorflow::int64[num_dims]; - cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, - cudaMemcpyDeviceToHost); - tensorflow::TensorShapeUtils::MakeShape( - dims, num_dims, output_shape); - delete dims; - } + static void make_shape_from_tensor(const tensorflow::Tensor& shape_tensor, + tensorflow::TensorShape* output_shape) { + tensorflow::int64 num_dims = shape_tensor.NumElements(); + const tensorflow::int64* flat = shape_tensor.flat().data(); + tensorflow::int64* dims = new tensorflow::int64[num_dims]; + cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, cudaMemcpyDeviceToHost); + tensorflow::TensorShapeUtils::MakeShape(dims, num_dims, output_shape); + delete dims; + } }; #endif - template class TVMDSOOp : public OpKernel { private: @@ -246,8 +219,7 @@ class TVMDSOOp : public OpKernel { initAttributes(context); // Load TVM function from dynamic library - tvm::runtime::Module mod_dylib = - tvm::runtime::Module::LoadFromFile(lib_path); + tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile(lib_path); tvm_func = mod_dylib.GetFunction(func_name); CHECK(tvm_func != nullptr); } @@ -261,7 +233,7 @@ class TVMDSOOp : public OpKernel { int device_id = TVMDSOOpTrait::device_id(context); int device_type = TVMDSOOpTrait::device_type; - DLContext dl_ctx = { 
DLDeviceType(device_type), device_id };
+    DLContext dl_ctx = {DLDeviceType(device_type), device_id};
 
     // Get output shape
     tensorflow::TensorShape output_shape;
@@ -269,43 +241,39 @@ class TVMDSOOp : public OpKernel {
     if (has_static_output_shape) {
       // use static output shape
       const tensorflow::int64* dims = static_output_shape.data();
-      tensorflow::TensorShapeUtils::MakeShape(
-          dims, static_output_shape.size(), &output_shape);
+      tensorflow::TensorShapeUtils::MakeShape(dims, static_output_shape.size(), &output_shape);
     } else if (output_shape_tensor.dims() == 1) {
       // use shape tensor values as output shape
-      TVMDSOOpTrait<DEVICE_TYPE>::make_shape_from_tensor(
-          output_shape_tensor, &output_shape);
+      TVMDSOOpTrait<DEVICE_TYPE>::make_shape_from_tensor(output_shape_tensor, &output_shape);
     } else {
       // use input tensor shape by default
       output_shape = context->input(0).shape();
     }
 
     for (int i = 0; i < NUM_INPUTS; ++i) {
-        // Grab the input tensor
-        auto& input_tensor = context->input(i);
+      // Grab the input tensor
+      auto& input_tensor = context->input(i);
 
-        // Create shape container, should keep ref during execution
-        shapes[i] = input_tensor.shape().dim_sizes();
-        auto shape_ptr = reinterpret_cast<int64_t*>(shapes[i].data());
+      // Create shape container, should keep ref during execution
+      shapes[i] = input_tensor.shape().dim_sizes();
+      auto shape_ptr = reinterpret_cast<int64_t*>(shapes[i].data());
 
-        TensorAsBuf& input = buf_info[i];
-        input.device_type = device_type;
+      TensorAsBuf& input = buf_info[i];
+      input.device_type = device_type;
 
-        EnsureAlignment(context, input_tensor, &input);
-        input.CopyFromOrigin();
+      EnsureAlignment(context, input_tensor, &input);
+      input.CopyFromOrigin();
 
-        status = MakeDLTensor(input, dl_ctx, shape_ptr, &args[i]);
-        OP_REQUIRES_OK(context, status);
+      status = MakeDLTensor(input, dl_ctx, shape_ptr, &args[i]);
+      OP_REQUIRES_OK(context, status);
     }
 
     // Allocate output tensor
     tensorflow::Tensor* output_tensor;
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, output_shape, &output_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor));
 
     // shape dimension buf should be kept alive on stack
     auto output_shape_dim_buf = output_tensor->shape().dim_sizes();
-    auto output_shape_ptr = reinterpret_cast<int64_t*>(
-        output_shape_dim_buf.data());
+    auto output_shape_ptr = reinterpret_cast<int64_t*>(output_shape_dim_buf.data());
 
     TensorAsBuf output;
     output.device_type = device_type;
@@ -320,17 +288,16 @@ class TVMDSOOp : public OpKernel {
     }
 };
 
-
 #ifdef TF_TVMDSOOP_ENABLE_GPU
-#define REGISTER_TFTVM_KERNEL(n) \
-    REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \
-        .Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice, n>); \
-    REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \
-        .Device(tensorflow::DEVICE_GPU), TVMDSOOp<GPUDevice, n>);
+#define REGISTER_TFTVM_KERNEL(n)                                              \
+  REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_CPU), \
+                          TVMDSOOp<CPUDevice, n>);                            \
+  REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_GPU), \
+                          TVMDSOOp<GPUDevice, n>);
 #else
-#define REGISTER_TFTVM_KERNEL(n) \
-    REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n) \
-        .Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice, n>);
+#define REGISTER_TFTVM_KERNEL(n)                                              \
+  REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_CPU), \
+                          TVMDSOOp<CPUDevice, n>);
 #endif
 
 REGISTER_TFTVM_KERNEL(1)
@@ -341,4 +308,3 @@ REGISTER_TFTVM_KERNEL(5)
 REGISTER_TFTVM_KERNEL(6)
 REGISTER_TFTVM_KERNEL(7)
 REGISTER_TFTVM_KERNEL(8)
-
diff --git a/src/contrib/tf_op/tvm_dso_ops.cc b/src/contrib/tf_op/tvm_dso_ops.cc
index dfb21b204b81..f228313949cb 100644
--- a/src/contrib/tf_op/tvm_dso_ops.cc
+++ 
b/src/contrib/tf_op/tvm_dso_ops.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,71 +19,104 @@ #include "tensorflow/core/framework/op.h" -#define REGISTER_TFTVM_OP(n) REGISTER_OP("TvmDsoOp" #n) \ - .Output("output: output_dtype") \ - .Attr("lib_path: string") \ - .Attr("func_name: string") \ - .Attr("output_dtype: {int32, int64, float} = DT_FLOAT") \ - .Attr("static_output_shape: list(int) >= 0 = []") \ - .Attr("has_static_output_shape: bool") \ +#define REGISTER_TFTVM_OP(n) \ + REGISTER_OP("TvmDsoOp" #n) \ + .Output("output: output_dtype") \ + .Attr("lib_path: string") \ + .Attr("func_name: string") \ + .Attr("output_dtype: {int32, int64, float} = DT_FLOAT") \ + .Attr("static_output_shape: list(int) >= 0 = []") \ + .Attr("has_static_output_shape: bool") - -REGISTER_TFTVM_OP(1) - .Input("input: T").Attr("T: type") \ - .Input("dynamic_output_shape: int64"); +REGISTER_TFTVM_OP(1).Input("input: T").Attr("T: type").Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(2) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") .Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(3) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") - .Input("input3: T3").Attr("T3: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") + .Input("input3: T3") + .Attr("T3: type") .Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(4) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") - .Input("input3: T3").Attr("T3: type") - .Input("input4: T4").Attr("T4: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") + .Input("input3: T3") + .Attr("T3: type") + .Input("input4: T4") + .Attr("T4: type") .Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(5) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") - .Input("input3: T3").Attr("T3: type") - .Input("input4: T4").Attr("T4: type") - .Input("input5: T5").Attr("T5: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") + .Input("input3: T3") + .Attr("T3: type") + .Input("input4: T4") + .Attr("T4: type") + .Input("input5: T5") + .Attr("T5: type") .Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(6) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") - .Input("input3: T3").Attr("T3: type") - .Input("input4: T4").Attr("T4: type") - .Input("input5: T5").Attr("T5: type") - .Input("input6: T6").Attr("T6: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") + .Input("input3: T3") + .Attr("T3: type") + .Input("input4: T4") + .Attr("T4: type") + .Input("input5: T5") + .Attr("T5: type") + .Input("input6: T6") + .Attr("T6: type") .Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(7) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") - .Input("input3: T3").Attr("T3: type") - .Input("input4: T4").Attr("T4: type") - .Input("input5: T5").Attr("T5: type") - .Input("input6: 
T6").Attr("T6: type") - .Input("input7: T7").Attr("T7: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") + .Input("input3: T3") + .Attr("T3: type") + .Input("input4: T4") + .Attr("T4: type") + .Input("input5: T5") + .Attr("T5: type") + .Input("input6: T6") + .Attr("T6: type") + .Input("input7: T7") + .Attr("T7: type") .Input("dynamic_output_shape: int64"); REGISTER_TFTVM_OP(8) - .Input("input1: T1").Attr("T1: type") - .Input("input2: T2").Attr("T2: type") - .Input("input3: T3").Attr("T3: type") - .Input("input4: T4").Attr("T4: type") - .Input("input5: T5").Attr("T5: type") - .Input("input6: T6").Attr("T6: type") - .Input("input7: T7").Attr("T7: type") - .Input("input8: T8").Attr("T8: type") + .Input("input1: T1") + .Attr("T1: type") + .Input("input2: T2") + .Attr("T2: type") + .Input("input3: T3") + .Attr("T3: type") + .Input("input4: T4") + .Attr("T4: type") + .Input("input5: T5") + .Attr("T5: type") + .Input("input6: T6") + .Attr("T6: type") + .Input("input7: T7") + .Attr("T7: type") + .Input("input8: T8") + .Attr("T8: type") .Input("dynamic_output_shape: int64"); From 93436693abb12b114e0bb023ffdc3663359c2c00 Mon Sep 17 00:00:00 2001 From: baoxinqi Date: Fri, 3 Apr 2020 13:24:39 +0800 Subject: [PATCH 29/30] feat: Use TF list input op def --- apps/tf_tvmdsoop/tests/test_tfop_module.py | 2 +- python/tvm/contrib/tf_op/module.py | 9 +- src/contrib/tf_op/index_seq.h | 61 ----------- src/contrib/tf_op/tvm_dso_op_kernels.cc | 52 +++++----- src/contrib/tf_op/tvm_dso_ops.cc | 111 ++------------------- 5 files changed, 39 insertions(+), 196 deletions(-) delete mode 100644 src/contrib/tf_op/index_seq.h diff --git a/apps/tf_tvmdsoop/tests/test_tfop_module.py b/apps/tf_tvmdsoop/tests/test_tfop_module.py index f2dee98ee01c..1672b58fd60a 100644 --- a/apps/tf_tvmdsoop/tests/test_tfop_module.py +++ b/apps/tf_tvmdsoop/tests/test_tfop_module.py @@ -61,7 +61,7 @@ def export_gpu_add_lib(): def test_add(session, lib_path, tf_device): """test add lib with TensorFlow wrapper""" - module = tf_op.Module(lib_path) + module = tf_op.OpModule(lib_path) left = tf.placeholder("float32", shape=[4]) right = tf.placeholder("float32", shape=[4]) diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index 446800c82a03..f67f715a1be6 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -67,16 +67,11 @@ def __init__(self, lib_path, func_name, output_dtype, output_shape): elif output_shape is not None: self.dynamic_output_shape = self._pack_shape_tensor(output_shape) - # delay op initialization to where Func.apply() get called first time - self.tvm_dso_op = None self.module = load_library.load_op_library('tvm_dso_op.so') + self.tvm_dso_op = self.module.tvm_dso_op def apply(self, *params): - if self.tvm_dso_op is None: - num_inputs = len(params) - self.tvm_dso_op = getattr(self.module, "tvm_dso_op%s" % num_inputs) - - return self.tvm_dso_op(*params, + return self.tvm_dso_op(params, dynamic_output_shape=self.dynamic_output_shape, static_output_shape=self.static_output_shape, has_static_output_shape=self.has_static_output_shape, diff --git a/src/contrib/tf_op/index_seq.h b/src/contrib/tf_op/index_seq.h deleted file mode 100644 index 5448c1f5f42d..000000000000 --- a/src/contrib/tf_op/index_seq.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Refer to std::index_sequence (since c++14)
- * Utilities to invoke variadic function with template
- */
-#ifndef TVM_CONTRIB_TF_OP_INDEX_SEQ_H_
-#define TVM_CONTRIB_TF_OP_INDEX_SEQ_H_
-
-template <std::size_t... Tail>
-struct IndexSeq {};
-
-template <std::size_t N, std::size_t... Tail>
-struct IndexSeqHelper : public IndexSeqHelper<N - 1U, N - 1U, Tail...> {};
-
-template <std::size_t... Tail>
-struct IndexSeqHelper<0U, Tail...> {
-  using type = IndexSeq<Tail...>;
-};
-
-template <std::size_t N>
-using make_index_sequence = typename IndexSeqHelper<N>::type;
-
-template <typename F, typename T, std::size_t N, std::size_t... Idx>
-void apply_variadic_impl(F f, T (&t)[N], IndexSeq<Idx...>) {
-  f(t[Idx]...);
-}
-
-template <typename F, typename T, std::size_t N>
-void apply_variadic(F f, T (&t)[N]) {
-  apply_variadic_impl(f, t, make_index_sequence<N>{});
-}
-
-template <typename F, typename T, std::size_t N, std::size_t... Idx>
-void apply_variadic_by_ptrs_impl(F f, T (&t)[N], IndexSeq<Idx...>) {
-  f(&t[Idx]...);
-}
-
-template <typename F, typename T, std::size_t N>
-void apply_variadic_by_ptrs(F f, T (&t)[N]) {
-  apply_variadic_by_ptrs_impl(f, t, make_index_sequence<N>{});
-}
-
-#endif  // TVM_CONTRIB_TF_OP_INDEX_SEQ_H_
diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc
index 03024da9a478..d74d8fb917e5 100644
--- a/src/contrib/tf_op/tvm_dso_op_kernels.cc
+++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc
@@ -26,7 +26,6 @@
 #include 
 #include 
 
-#include "index_seq.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -37,6 +36,10 @@ using tensorflow::OpKernel;
 using tensorflow::OpKernelConstruction;
 using tensorflow::OpKernelContext;
 
+using tvm::runtime::TVMArgs;
+using tvm::runtime::TVMArgsSetter;
+using tvm::runtime::TVMRetValue;
+
 // Op utility trait for different device type
 template <typename DEVICE_TYPE>
 class TVMDSOOpTrait;
@@ -192,7 +195,7 @@ class TVMDSOOpTrait<GPUDevice> {
 };
 #endif
 
-template <typename DEVICE_TYPE, int NUM_INPUTS>
+template <typename DEVICE_TYPE>
 class TVMDSOOp : public OpKernel {
  private:
   tvm::runtime::PackedFunc tvm_func;
@@ -225,9 +228,12 @@ class TVMDSOOp : public OpKernel {
   }
 
   void Compute(tensorflow::OpKernelContext* context) override {
-    DLTensor args[NUM_INPUTS + 1];
-    TensorAsBuf buf_info[NUM_INPUTS];
-    ShapeContainer shapes[NUM_INPUTS];
+    // the last input is output shape spec
+    const int num_inputs = context->num_inputs() - 1;
+    const int num_total_args = num_inputs + 1;
+    std::vector<DLTensor> args(num_total_args);
+    std::vector<TensorAsBuf> buf_info(num_inputs);
+    std::vector<ShapeContainer> shapes(num_inputs);
 
     tensorflow::Status status;
 
@@ -237,7 +243,7 @@ class TVMDSOOp : public OpKernel {
 
     // Get output shape
     tensorflow::TensorShape output_shape;
-    auto& output_shape_tensor = context->input(NUM_INPUTS);
+    auto& output_shape_tensor = context->input(num_inputs);
     if (has_static_output_shape) {
       // use static output shape
       const tensorflow::int64* dims = static_output_shape.data();
@@ -250,7 +256,7 @@ class TVMDSOOp : public OpKernel {
       output_shape = context->input(0).shape();
     }
 
-    for (int i = 0; i < NUM_INPUTS; ++i) {
+    for (int i = 0; i < num_inputs; ++i) {
       // Grab the input tensor
       auto& 
input_tensor = context->input(i);
 
@@ -279,32 +285,26 @@ class TVMDSOOp : public OpKernel {
     output.device_type = device_type;
     EnsureAlignment(context, *output_tensor, &output);
 
-    status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[NUM_INPUTS]);
+    status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[num_inputs]);
     OP_REQUIRES_OK(context, status);
 
-    apply_variadic_by_ptrs(tvm_func, args);
+    // Prepare PackedFunc arguments
+    std::vector<TVMValue> tvm_values(num_total_args);
+    std::vector<int> tvm_type_codes(num_total_args);
+    TVMArgsSetter setter(tvm_values.data(), tvm_type_codes.data());
+    for (int k = 0; k < num_total_args; ++k) {
+      setter(k, &args[k]);
+    }
+    TVMRetValue rv;
+    tvm_func.CallPacked(TVMArgs(tvm_values.data(), tvm_type_codes.data(), num_total_args), &rv);
 
     output.CopyToOrigin();
   }
 };
 
 #ifdef TF_TVMDSOOP_ENABLE_GPU
-#define REGISTER_TFTVM_KERNEL(n)                                              \
-  REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_CPU), \
-                          TVMDSOOp<CPUDevice, n>);                            \
-  REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_GPU), \
-                          TVMDSOOp<GPUDevice, n>);
+REGISTER_KERNEL_BUILDER(Name("TvmDsoOp").Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TvmDsoOp").Device(tensorflow::DEVICE_GPU), TVMDSOOp<GPUDevice>);
 #else
-#define REGISTER_TFTVM_KERNEL(n)                                              \
-  REGISTER_KERNEL_BUILDER(Name("TvmDsoOp" #n).Device(tensorflow::DEVICE_CPU), \
-                          TVMDSOOp<CPUDevice, n>);
+REGISTER_KERNEL_BUILDER(Name("TvmDsoOp").Device(tensorflow::DEVICE_CPU), TVMDSOOp<CPUDevice>);
 #endif
-
-REGISTER_TFTVM_KERNEL(1)
-REGISTER_TFTVM_KERNEL(2)
-REGISTER_TFTVM_KERNEL(3)
-REGISTER_TFTVM_KERNEL(4)
-REGISTER_TFTVM_KERNEL(5)
-REGISTER_TFTVM_KERNEL(6)
-REGISTER_TFTVM_KERNEL(7)
-REGISTER_TFTVM_KERNEL(8)
diff --git a/src/contrib/tf_op/tvm_dso_ops.cc b/src/contrib/tf_op/tvm_dso_ops.cc
index f228313949cb..1183b2ef34b5 100644
--- a/src/contrib/tf_op/tvm_dso_ops.cc
+++ b/src/contrib/tf_op/tvm_dso_ops.cc
@@ -19,71 +19,13 @@
 
 #include "tensorflow/core/framework/op.h"
 
-#define REGISTER_TFTVM_OP(n)                                  \
-  REGISTER_OP("TvmDsoOp" #n)                                  \
-      .Output("output: output_dtype")                         \
-      .Attr("lib_path: string")                               \
-      .Attr("func_name: string")                              \
-      .Attr("output_dtype: {int32, int64, float} = DT_FLOAT") \
-      .Attr("static_output_shape: list(int) >= 0 = []")       \
-      .Attr("has_static_output_shape: bool")
-
-REGISTER_TFTVM_OP(1).Input("input: T").Attr("T: type").Input("dynamic_output_shape: int64");
-
-REGISTER_TFTVM_OP(2)
-    .Input("input1: T1")
-    .Attr("T1: type")
-    .Input("input2: T2")
-    .Attr("T2: type")
-    .Input("dynamic_output_shape: int64");
-
-REGISTER_TFTVM_OP(3)
-    .Input("input1: T1")
-    .Attr("T1: type")
-    .Input("input2: T2")
-    .Attr("T2: type")
-    .Input("input3: T3")
-    .Attr("T3: type")
-    .Input("dynamic_output_shape: int64");
-
-REGISTER_TFTVM_OP(4)
-    .Input("input1: T1")
-    .Attr("T1: type")
-    .Input("input2: T2")
-    .Attr("T2: type")
-    .Input("input3: T3")
-    .Attr("T3: type")
-    .Input("input4: T4")
-    .Attr("T4: type")
-    .Input("dynamic_output_shape: int64");
-
-REGISTER_TFTVM_OP(5)
-    .Input("input1: T1")
-    .Attr("T1: type")
-    .Input("input2: T2")
-    .Attr("T2: type")
-    .Input("input3: T3")
-    .Attr("T3: type")
-    .Input("input4: T4")
-    .Attr("T4: type")
-    .Input("input5: T5")
-    .Attr("T5: type")
-    .Input("dynamic_output_shape: int64");
-
-REGISTER_TFTVM_OP(6)
-    .Input("input1: T1")
-    .Attr("T1: type")
-    .Input("input2: T2")
-    .Attr("T2: type")
-    .Input("input3: T3")
-    .Attr("T3: type")
-    .Input("input4: T4")
-    .Attr("T4: type")
-    .Input("input5: T5")
-    .Attr("T5: type")
-    .Input("input6: T6")
-    .Attr("T6: type")
-    .Input("dynamic_output_shape: int64");
-
-REGISTER_TFTVM_OP(7)
-    
.Input("input1: T1") - .Attr("T1: type") - .Input("input2: T2") - .Attr("T2: type") - .Input("input3: T3") - .Attr("T3: type") - .Input("input4: T4") - .Attr("T4: type") - .Input("input5: T5") - .Attr("T5: type") - .Input("input6: T6") - .Attr("T6: type") - .Input("input7: T7") - .Attr("T7: type") - .Input("dynamic_output_shape: int64"); - -REGISTER_TFTVM_OP(8) - .Input("input1: T1") - .Attr("T1: type") - .Input("input2: T2") - .Attr("T2: type") - .Input("input3: T3") - .Attr("T3: type") - .Input("input4: T4") - .Attr("T4: type") - .Input("input5: T5") - .Attr("T5: type") - .Input("input6: T6") - .Attr("T6: type") - .Input("input7: T7") - .Attr("T7: type") - .Input("input8: T8") - .Attr("T8: type") - .Input("dynamic_output_shape: int64"); +REGISTER_OP("TvmDsoOp") + .Input("input_args: ListT") + .Attr("ListT: list({int8, int32, int64, float16, float32})") + .Input("dynamic_output_shape: int64") + .Output("output: output_dtype") + .Attr("lib_path: string") + .Attr("func_name: string") + .Attr("output_dtype: {int8, int32, int64, float16, float32} = DT_FLOAT") + .Attr("static_output_shape: list(int) >= 0 = []") + .Attr("has_static_output_shape: bool"); From 38af1ceb99555b491f86fdd5d5bdd59b7a51e86e Mon Sep 17 00:00:00 2001 From: wrongtest Date: Tue, 7 Apr 2020 12:54:55 +0800 Subject: [PATCH 30/30] fix: Fix style and unexpected changes --- cmake/config.cmake | 2 +- python/tvm/contrib/tf_op/module.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index 55958e278be0..04b6c3b66ea4 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -118,7 +118,7 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF) # - ON: enable llvm with cmake's find search # - OFF: disable llvm # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. -set(USE_LLVM ON) +set(USE_LLVM OFF) #--------------------------------------------- # Contrib libraries diff --git a/python/tvm/contrib/tf_op/module.py b/python/tvm/contrib/tf_op/module.py index f67f715a1be6..f13670e39895 100644 --- a/python/tvm/contrib/tf_op/module.py +++ b/python/tvm/contrib/tf_op/module.py @@ -19,7 +19,7 @@ from tensorflow.python.framework import load_library -class OpModule(): +class OpModule: """Module container of TensorFlow TVMDSO op which wraps exported TVM op implementation library to be called on TensorFlow side""" @@ -48,7 +48,7 @@ def __getitem__(self, func_name): return self.func(func_name) -class TensorFunc(): +class TensorFunc: """Function object that acts as TensorFlow tensor to tensor function.""" def __init__(self, lib_path, func_name, output_dtype, output_shape):