From 3818b0ab32d94c8429c79839965bfaa7d8c1d451 Mon Sep 17 00:00:00 2001
From: reminisce <wujun.nju@gmail.com>
Date: Sat, 19 Jan 2019 10:05:25 -0800
Subject: [PATCH 1/4] Fix broadcast add and subtract grad (#2465)

---
 3rdparty/HalideIR | 2 +-
 3rdparty/dlpack   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
index 6e7c1f046fda..97efb11fff13 160000
--- a/3rdparty/HalideIR
+++ b/3rdparty/HalideIR
@@ -1 +1 @@
-Subproject commit 6e7c1f046fda536562dc80977e93324fee2324bd
+Subproject commit 97efb11fff13131480fcaa5adc65a0aef4a4cb5d
diff --git a/3rdparty/dlpack b/3rdparty/dlpack
index bee4d1dd8dc1..5c792cef3aee 160000
--- a/3rdparty/dlpack
+++ b/3rdparty/dlpack
@@ -1 +1 @@
-Subproject commit bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3
+Subproject commit 5c792cef3aee54ad8b7000111c9dc1797f327b59

From 473c71aa883ecf368628039edd567b30e2a92e11 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <slyubomirsky@gmail.com>
Date: Wed, 16 Jan 2019 15:07:02 -0800
Subject: [PATCH 2/4] [Relay] Unifier hotfix (#2437)

---
 src/relay/pass/type_solver.cc | 289 ++++++++++++++++++++++++++++++++++
 1 file changed, 289 insertions(+)

diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
index dafcaf56015a..786fe90c7b71 100644
--- a/src/relay/pass/type_solver.cc
+++ b/src/relay/pass/type_solver.cc
@@ -335,6 +335,295 @@ class TypeSolver::Merger : public TypeFunctor<void(const Type&)> {
   TypeNode* dst_;
 };
 
+class TypeSolver::OccursChecker : public TypeVisitor {
+ public:
+  explicit OccursChecker(TypeSolver* solver, TypeNode* var)
+    : solver_(solver), var_(var), found_(false) {}
+
+  bool Check(const Type& t) {
+    VisitType(t);
+    return found_;
+  }
+
+  void VisitType_(const IncompleteTypeNode* op) override {
+    IncompleteType t = GetRef<IncompleteType>(op);
+    TypeNode* node = solver_->GetTypeNode(t);
+    found_ = found_ || (var_->FindRoot() == node->FindRoot());
+  }
+
+ private:
+  TypeSolver* solver_;
+  TypeNode* var_;
+  bool found_;
+};
+
+class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
+ public:
+  explicit Unifier(TypeSolver* solver) : solver_(solver) {}
+
+  Type Unify(const Type& src, const Type& dst) {
+    // Known limitation
+    // - handle shape pattern matching
+    TypeNode* lhs = solver_->GetTypeNode(dst);
+    TypeNode* rhs = solver_->GetTypeNode(src);
+
+    // do occur check so we don't create self-referencing structure
+    if (lhs->FindRoot() == rhs->FindRoot()) {
+      return lhs->resolved_type;
+    }
+    if (lhs->resolved_type.as<IncompleteTypeNode>()) {
+      CHECK(!CheckOccurs(lhs, rhs->resolved_type))
+        << "Incomplete type " << lhs->resolved_type << " occurs in "
+        << rhs->resolved_type << ", cannot unify";
+      solver_->MergeFromTo(lhs, rhs);
+      return rhs->resolved_type;
+    } else if (rhs->resolved_type.as<IncompleteTypeNode>()) {
+      CHECK(!CheckOccurs(rhs, lhs->resolved_type))
+        << "Incomplete type " << rhs->resolved_type << " occurs in "
+        << lhs->resolved_type << ", cannot unify";
+      solver_->MergeFromTo(rhs, lhs);
+      return lhs->resolved_type;
+    } else {
+      Type resolved = this->VisitType(lhs->resolved_type, rhs->resolved_type);
+      CHECK(resolved.defined())
+        << "Unable to unify parent types: "
+        << lhs->resolved_type << " and " << rhs->resolved_type;
+      TypeNode* top = solver_->GetTypeNode(resolved);
+      solver_->MergeFromTo(lhs, top);
+      solver_->MergeFromTo(rhs, top);
+      return resolved;
+    }
+  }
+
+  // Checks whether lhs (taken to be a type var) occurs in t, meaning
+  // there is a recursive equality constraint, which should be rejected.
+  // N.b.: A tautology like ?a = ?a is okay and should be checked for
+  // *before* calling this method
+  bool CheckOccurs(TypeNode* lhs, const Type& t) {
+    OccursChecker rc(solver_, lhs);
+    return rc.Check(t);
+  }
+
+  // default: unify only if alpha-equal
+  Type VisitTypeDefault_(const Node* op, const Type& tn) override {
+    NodeRef nr = GetRef<NodeRef>(op);
+    Type t1 = GetRef<Type>(nr.as_derived<tvm::relay::TypeNode>());
+    if (!AlphaEqual(t1, tn)) {
+      return Type(nullptr);
+    }
+    return t1;
+  }
+
+  Type VisitType_(const TupleTypeNode* op, const Type& tn) override {
+    const auto* ttn = tn.as<TupleTypeNode>();
+    if (!ttn || op->fields.size() != ttn->fields.size()) {
+      return Type(nullptr);
+    }
+
+    TupleType tt1 = GetRef<TupleType>(op);
+    TupleType tt2 = GetRef<TupleType>(ttn);
+
+    std::vector<Type> new_fields;
+    for (size_t i = 0; i < tt1->fields.size(); i++) {
+      Type field = Unify(tt1->fields[i], tt2->fields[i]);
+      new_fields.push_back(field);
+    }
+    return TupleTypeNode::make(new_fields);
+  }
+
+  Type VisitType_(const FuncTypeNode* op, const Type& tn) override {
+    const auto* ftn = tn.as<FuncTypeNode>();
+    if (!ftn
+        || op->arg_types.size() != ftn->arg_types.size()
+        || op->type_params.size() != ftn->type_params.size()
+        || op->type_constraints.size() != ftn->type_constraints.size()) {
+      return Type(nullptr);
+    }
+
+    // remap type vars so they match
+    Map<TypeVar, Type> subst_map;
+    for (size_t i = 0; i < op->type_params.size(); i++) {
+      subst_map.Set(ftn->type_params[i], op->type_params[i]);
+    }
+
+    auto ft1 = GetRef<FuncType>(op);
+    auto ft2 = Downcast<FuncType>(Bind(GetRef<FuncType>(ftn), subst_map));
+
+    Type ret_type = Unify(ft1->ret_type, ft2->ret_type);
+
+    std::vector<Type> arg_types;
+    for (size_t i = 0; i < ft1->arg_types.size(); i++) {
+      Type arg_type = Unify(ft1->arg_types[i], ft2->arg_types[i]);
+      arg_types.push_back(arg_type);
+    }
+
+    std::vector<TypeConstraint> type_constraints;
+    for (size_t i = 0; i < ft1->type_constraints.size(); i++) {
+      Type unified_constraint = Unify(ft1->type_constraints[i],
+                                      ft2->type_constraints[i]);
+      const auto* tcn = unified_constraint.as<TypeConstraintNode>();
+      CHECK(tcn) << "Two type constraints unified into a non-constraint?"
+                 << ft1->type_constraints[i] << " and " << ft2->type_constraints[i];
+      type_constraints.push_back(GetRef<TypeConstraint>(tcn));
+    }
+
+    return FuncTypeNode::make(arg_types, ret_type, ft1->type_params, type_constraints);
+  }
+
+ private:
+  TypeSolver* solver_;
+};
+
+class TypeSolver::Resolver : public TypeMutator {
+ public:
+  explicit Resolver(TypeSolver* solver) : solver_(solver) {}
+
+  Type Resolve(const Type& t) {
+    if (!t.defined()) {
+      return t;
+    }
+    return VisitType(t);
+  }
+
+  Type VisitType_(const IncompleteTypeNode* op) override {
+    auto* node = solver_->GetTypeNode(GetRef<IncompleteType>(op));
+    return node->resolved_type;
+  }
+
+ private:
+  TypeSolver* solver_;
+};
+
+// It ends up being more compact to simply have TypeFunctor<void(const Type&) than
+// a TypeVisitor because we can use the default case to dispense with
+// most of the overrides.
+class TypeSolver::Propagator : public TypeFunctor<void(const Type&)> {
+ public:
+  explicit Propagator(TypeSolver* solver, const std::unordered_set<RelationNode*>* rels)
+    : solver_(solver), rels_(rels) {}
+
+  // adds the relation node to t and all child types of t
+  void Propagate(const Type& t) {
+    VisitType(t);
+  }
+
+  void UpdateRelSet(const Type& t) {
+    TypeNode* tnode = solver_->GetTypeNode(t);
+    for (auto* rel : *rels_) {
+      tnode->rel_set.insert(rel);
+    }
+  }
+
+  void VisitTypeDefault_(const Node* op) override {
+    NodeRef nr = GetRef<NodeRef>(op);
+    Type t = GetRef<Type>(nr.as_derived<tvm::relay::TypeNode>());
+    UpdateRelSet(t);
+  }
+
+  void VisitType_(const TupleTypeNode* op) override {
+    TupleType tt = GetRef<TupleType>(op);
+    UpdateRelSet(tt);
+
+    for (const Type& t : tt->fields) {
+      Propagate(t);
+    }
+  }
+
+  void VisitType_(const FuncTypeNode* op) override {
+    FuncType ft = GetRef<FuncType>(op);
+    UpdateRelSet(ft);
+
+    Propagate(ft->ret_type);
+    for (auto arg_type : ft->arg_types) {
+      Propagate(arg_type);
+    }
+
+    for (auto type_param : ft->type_params) {
+      Propagate(type_param);
+    }
+
+    for (auto type_cs : ft->type_constraints) {
+      Propagate(type_cs);
+    }
+  }
+
+ private:
+  TypeSolver* solver_;
+  const std::unordered_set<RelationNode*>* rels_;
+};
+
+// similarly, we use TypeFunctor<void(const Type&)> so we can use
+// the default visitor case to avoid more overrides
+class TypeSolver::Merger : public TypeFunctor<void(const Type&)> {
+ public:
+  explicit Merger(TypeSolver* solver) : solver_(solver) {}
+
+  // Merges src node to dst, ensures *all* type relations of all
+  // child nodes of src are transferred to dst.
+  void Merge(TypeNode* src, TypeNode* dst) {
+    if (src == dst) return;
+    dst_ = dst;
+    VisitType(src->resolved_type);
+    // set parent at the end so later calls to GetTypeNode go back to src
+    src->parent = dst;
+
+    // now propagate relations to child nodes, since change to
+    // a child node should update parent too
+    Propagator prop(solver_, &dst->rel_set);
+    prop.Propagate(dst->resolved_type);
+  }
+
+  // Transfers any relations linked to t to the stored dst.
+  // Any unresolved relations are added back to the queue, since
+  // there is now new information
+  void TransferLinks(const Type& t) {
+    TypeNode* src = solver_->GetTypeNode(t);
+    if (src == dst_) return;
+    for (auto* rel : src->rel_set) {
+      // if the relation is not yet resolved, add to queue
+      if (!rel->resolved) {
+        solver_->AddToQueue(rel);
+        dst_->rel_set.insert(rel);
+      }
+    }
+  }
+
+  void VisitTypeDefault_(const Node* op) override {
+    NodeRef nr = GetRef<NodeRef>(op);
+    Type t = GetRef<Type>(nr.as_derived<tvm::relay::TypeNode>());
+    TransferLinks(t);
+  }
+
+  void VisitType_(const TupleTypeNode* ttn) override {
+    auto tup = GetRef<TupleType>(ttn);
+    TransferLinks(tup);
+
+    for (auto field : tup->fields) {
+      VisitType(field);
+    }
+  }
+
+  void VisitType_(const FuncTypeNode* ftn) override {
+    auto func = GetRef<FuncType>(ftn);
+    TransferLinks(func);
+
+    VisitType(func->ret_type);
+    for (auto arg : func->arg_types) {
+      VisitType(arg);
+    }
+    for (auto param : func->type_params) {
+      VisitType(param);
+    }
+    for (auto constraint : func->type_constraints) {
+      VisitType(constraint);
+    }
+  }
+
+ private:
+  TypeSolver* solver_;
+  TypeNode* dst_;
+};
+
 // constructor
 TypeSolver::TypeSolver(const GlobalVar &current_func, ErrorReporter* err_reporter)
   : reporter_(make_node<Reporter>(this)),

From 24138d33bfa334c3a62e7b18a899aab316f369aa Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Fri, 25 Jan 2019 10:17:31 -0800
Subject: [PATCH 3/4] [Relay] Add generic & informative Relay error reporting
 (#2408)

---
 src/relay/pass/type_solver.cc | 289 ----------------------------------
 1 file changed, 289 deletions(-)

diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
index 786fe90c7b71..dafcaf56015a 100644
--- a/src/relay/pass/type_solver.cc
+++ b/src/relay/pass/type_solver.cc
@@ -335,295 +335,6 @@ class TypeSolver::Merger : public TypeFunctor<void(const Type&)> {
   TypeNode* dst_;
 };
 
-class TypeSolver::OccursChecker : public TypeVisitor {
- public:
-  explicit OccursChecker(TypeSolver* solver, TypeNode* var)
-    : solver_(solver), var_(var), found_(false) {}
-
-  bool Check(const Type& t) {
-    VisitType(t);
-    return found_;
-  }
-
-  void VisitType_(const IncompleteTypeNode* op) override {
-    IncompleteType t = GetRef<IncompleteType>(op);
-    TypeNode* node = solver_->GetTypeNode(t);
-    found_ = found_ || (var_->FindRoot() == node->FindRoot());
-  }
-
- private:
-  TypeSolver* solver_;
-  TypeNode* var_;
-  bool found_;
-};
-
-class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
- public:
-  explicit Unifier(TypeSolver* solver) : solver_(solver) {}
-
-  Type Unify(const Type& src, const Type& dst) {
-    // Known limitation
-    // - handle shape pattern matching
-    TypeNode* lhs = solver_->GetTypeNode(dst);
-    TypeNode* rhs = solver_->GetTypeNode(src);
-
-    // do occur check so we don't create self-referencing structure
-    if (lhs->FindRoot() == rhs->FindRoot()) {
-      return lhs->resolved_type;
-    }
-    if (lhs->resolved_type.as<IncompleteTypeNode>()) {
-      CHECK(!CheckOccurs(lhs, rhs->resolved_type))
-        << "Incomplete type " << lhs->resolved_type << " occurs in "
-        << rhs->resolved_type << ", cannot unify";
-      solver_->MergeFromTo(lhs, rhs);
-      return rhs->resolved_type;
-    } else if (rhs->resolved_type.as<IncompleteTypeNode>()) {
-      CHECK(!CheckOccurs(rhs, lhs->resolved_type))
-        << "Incomplete type " << rhs->resolved_type << " occurs in "
-        << lhs->resolved_type << ", cannot unify";
-      solver_->MergeFromTo(rhs, lhs);
-      return lhs->resolved_type;
-    } else {
-      Type resolved = this->VisitType(lhs->resolved_type, rhs->resolved_type);
-      CHECK(resolved.defined())
-        << "Unable to unify parent types: "
-        << lhs->resolved_type << " and " << rhs->resolved_type;
-      TypeNode* top = solver_->GetTypeNode(resolved);
-      solver_->MergeFromTo(lhs, top);
-      solver_->MergeFromTo(rhs, top);
-      return resolved;
-    }
-  }
-
-  // Checks whether lhs (taken to be a type var) occurs in t, meaning
-  // there is a recursive equality constraint, which should be rejected.
-  // N.b.: A tautology like ?a = ?a is okay and should be checked for
-  // *before* calling this method
-  bool CheckOccurs(TypeNode* lhs, const Type& t) {
-    OccursChecker rc(solver_, lhs);
-    return rc.Check(t);
-  }
-
-  // default: unify only if alpha-equal
-  Type VisitTypeDefault_(const Node* op, const Type& tn) override {
-    NodeRef nr = GetRef<NodeRef>(op);
-    Type t1 = GetRef<Type>(nr.as_derived<tvm::relay::TypeNode>());
-    if (!AlphaEqual(t1, tn)) {
-      return Type(nullptr);
-    }
-    return t1;
-  }
-
-  Type VisitType_(const TupleTypeNode* op, const Type& tn) override {
-    const auto* ttn = tn.as<TupleTypeNode>();
-    if (!ttn || op->fields.size() != ttn->fields.size()) {
-      return Type(nullptr);
-    }
-
-    TupleType tt1 = GetRef<TupleType>(op);
-    TupleType tt2 = GetRef<TupleType>(ttn);
-
-    std::vector<Type> new_fields;
-    for (size_t i = 0; i < tt1->fields.size(); i++) {
-      Type field = Unify(tt1->fields[i], tt2->fields[i]);
-      new_fields.push_back(field);
-    }
-    return TupleTypeNode::make(new_fields);
-  }
-
-  Type VisitType_(const FuncTypeNode* op, const Type& tn) override {
-    const auto* ftn = tn.as<FuncTypeNode>();
-    if (!ftn
-        || op->arg_types.size() != ftn->arg_types.size()
-        || op->type_params.size() != ftn->type_params.size()
-        || op->type_constraints.size() != ftn->type_constraints.size()) {
-      return Type(nullptr);
-    }
-
-    // remap type vars so they match
-    Map<TypeVar, Type> subst_map;
-    for (size_t i = 0; i < op->type_params.size(); i++) {
-      subst_map.Set(ftn->type_params[i], op->type_params[i]);
-    }
-
-    auto ft1 = GetRef<FuncType>(op);
-    auto ft2 = Downcast<FuncType>(Bind(GetRef<FuncType>(ftn), subst_map));
-
-    Type ret_type = Unify(ft1->ret_type, ft2->ret_type);
-
-    std::vector<Type> arg_types;
-    for (size_t i = 0; i < ft1->arg_types.size(); i++) {
-      Type arg_type = Unify(ft1->arg_types[i], ft2->arg_types[i]);
-      arg_types.push_back(arg_type);
-    }
-
-    std::vector<TypeConstraint> type_constraints;
-    for (size_t i = 0; i < ft1->type_constraints.size(); i++) {
-      Type unified_constraint = Unify(ft1->type_constraints[i],
-                                      ft2->type_constraints[i]);
-      const auto* tcn = unified_constraint.as<TypeConstraintNode>();
-      CHECK(tcn) << "Two type constraints unified into a non-constraint?"
-                 << ft1->type_constraints[i] << " and " << ft2->type_constraints[i];
-      type_constraints.push_back(GetRef<TypeConstraint>(tcn));
-    }
-
-    return FuncTypeNode::make(arg_types, ret_type, ft1->type_params, type_constraints);
-  }
-
- private:
-  TypeSolver* solver_;
-};
-
-class TypeSolver::Resolver : public TypeMutator {
- public:
-  explicit Resolver(TypeSolver* solver) : solver_(solver) {}
-
-  Type Resolve(const Type& t) {
-    if (!t.defined()) {
-      return t;
-    }
-    return VisitType(t);
-  }
-
-  Type VisitType_(const IncompleteTypeNode* op) override {
-    auto* node = solver_->GetTypeNode(GetRef<IncompleteType>(op));
-    return node->resolved_type;
-  }
-
- private:
-  TypeSolver* solver_;
-};
-
-// It ends up being more compact to simply have TypeFunctor<void(const Type&) than
-// a TypeVisitor because we can use the default case to dispense with
-// most of the overrides.
-class TypeSolver::Propagator : public TypeFunctor<void(const Type&)> {
- public:
-  explicit Propagator(TypeSolver* solver, const std::unordered_set<RelationNode*>* rels)
-    : solver_(solver), rels_(rels) {}
-
-  // adds the relation node to t and all child types of t
-  void Propagate(const Type& t) {
-    VisitType(t);
-  }
-
-  void UpdateRelSet(const Type& t) {
-    TypeNode* tnode = solver_->GetTypeNode(t);
-    for (auto* rel : *rels_) {
-      tnode->rel_set.insert(rel);
-    }
-  }
-
-  void VisitTypeDefault_(const Node* op) override {
-    NodeRef nr = GetRef<NodeRef>(op);
-    Type t = GetRef<Type>(nr.as_derived<tvm::relay::TypeNode>());
-    UpdateRelSet(t);
-  }
-
-  void VisitType_(const TupleTypeNode* op) override {
-    TupleType tt = GetRef<TupleType>(op);
-    UpdateRelSet(tt);
-
-    for (const Type& t : tt->fields) {
-      Propagate(t);
-    }
-  }
-
-  void VisitType_(const FuncTypeNode* op) override {
-    FuncType ft = GetRef<FuncType>(op);
-    UpdateRelSet(ft);
-
-    Propagate(ft->ret_type);
-    for (auto arg_type : ft->arg_types) {
-      Propagate(arg_type);
-    }
-
-    for (auto type_param : ft->type_params) {
-      Propagate(type_param);
-    }
-
-    for (auto type_cs : ft->type_constraints) {
-      Propagate(type_cs);
-    }
-  }
-
- private:
-  TypeSolver* solver_;
-  const std::unordered_set<RelationNode*>* rels_;
-};
-
-// similarly, we use TypeFunctor<void(const Type&)> so we can use
-// the default visitor case to avoid more overrides
-class TypeSolver::Merger : public TypeFunctor<void(const Type&)> {
- public:
-  explicit Merger(TypeSolver* solver) : solver_(solver) {}
-
-  // Merges src node to dst, ensures *all* type relations of all
-  // child nodes of src are transferred to dst.
-  void Merge(TypeNode* src, TypeNode* dst) {
-    if (src == dst) return;
-    dst_ = dst;
-    VisitType(src->resolved_type);
-    // set parent at the end so later calls to GetTypeNode go back to src
-    src->parent = dst;
-
-    // now propagate relations to child nodes, since change to
-    // a child node should update parent too
-    Propagator prop(solver_, &dst->rel_set);
-    prop.Propagate(dst->resolved_type);
-  }
-
-  // Transfers any relations linked to t to the stored dst.
-  // Any unresolved relations are added back to the queue, since
-  // there is now new information
-  void TransferLinks(const Type& t) {
-    TypeNode* src = solver_->GetTypeNode(t);
-    if (src == dst_) return;
-    for (auto* rel : src->rel_set) {
-      // if the relation is not yet resolved, add to queue
-      if (!rel->resolved) {
-        solver_->AddToQueue(rel);
-        dst_->rel_set.insert(rel);
-      }
-    }
-  }
-
-  void VisitTypeDefault_(const Node* op) override {
-    NodeRef nr = GetRef<NodeRef>(op);
-    Type t = GetRef<Type>(nr.as_derived<tvm::relay::TypeNode>());
-    TransferLinks(t);
-  }
-
-  void VisitType_(const TupleTypeNode* ttn) override {
-    auto tup = GetRef<TupleType>(ttn);
-    TransferLinks(tup);
-
-    for (auto field : tup->fields) {
-      VisitType(field);
-    }
-  }
-
-  void VisitType_(const FuncTypeNode* ftn) override {
-    auto func = GetRef<FuncType>(ftn);
-    TransferLinks(func);
-
-    VisitType(func->ret_type);
-    for (auto arg : func->arg_types) {
-      VisitType(arg);
-    }
-    for (auto param : func->type_params) {
-      VisitType(param);
-    }
-    for (auto constraint : func->type_constraints) {
-      VisitType(constraint);
-    }
-  }
-
- private:
-  TypeSolver* solver_;
-  TypeNode* dst_;
-};
-
 // constructor
 TypeSolver::TypeSolver(const GlobalVar &current_func, ErrorReporter* err_reporter)
   : reporter_(make_node<Reporter>(this)),

From 9214908e661f3089c2f3338d555140613ba82a7c Mon Sep 17 00:00:00 2001
From: Anthony Mai <mai_anthony@hotmail.com>
Date: Wed, 16 Jan 2019 11:17:35 -0800
Subject: [PATCH 4/4] [TVM] Reduce symbol visibility of shared modules (*.so
 files)

Default compilation of Linux shared library modules (*.so files)
exports all symbols. This creates large module files as the export
symbol table contains too many entries. The correct approach is
to export nothing by default. Anything that needs to be exported
must be explicitly specified. This is done by the following steps:

    In the build files (here, CMakeLists.txt), add the "-fvisibility=hidden"
    flag. You can search for "-fPIC" to find the appropriate place to add
    the flag. This hides symbols by default unless explicitly specified
    otherwise.

    To declare any symbol as exported, add this attribute:
      __attribute__((visibility("default")))
    The attribute string can be added using a macro definition. It
    should be added right before the return type for functions, or
    right after the 'class' or 'struct' keyword for class/struct.

    To suppress Doxygen parser warnings, modify docs/Doxyfile and
    add to PREDEFINED: TVM_DLL= NNVM_DLL= __attribute__(x)=

For more info on shared module export symbol visibility read:
    https://gcc.gnu.org/wiki/Visibility

Update submodule HalideIR to 7a3287d3883fdeac3aba2a7f3865c7ab78e1925c
and dlpack to 5c792cef3aee54ad8b7000111c9dc1797f327b59.

Explicitly export __gnu_f2h_ieee() which is needed in a unit test.

Move the visibility specifier to header files.
---
 CMakeLists.txt                      |   6 +-
 docs/Doxyfile                       |   2 +-
 include/tvm/ir_operator.h           |  20 -----
 include/tvm/ir_pass.h               |  12 +--
 include/tvm/ir_visitor.h            |   2 +-
 include/tvm/relay/pass.h            |  46 +++++-----
 include/tvm/runtime/c_runtime_api.h |   2 +-
 include/tvm/runtime/device_api.h    |  14 +--
 nnvm/include/nnvm/c_api.h           |   2 +-
 src/relay/backend/interpreter.cc    |   2 +-
 src/runtime/builtin_fp16.cc         |   4 +-
 src/runtime/workspace_pool.h        |   2 +-
 vta/include/vta/runtime.h           | 133 ++++++++++++++--------------
 vta/src/runtime.cc                  |   1 +
 14 files changed, 115 insertions(+), 133 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23dd58a2cd26..cb9b2df2f284 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,10 +84,10 @@ else(MSVC)
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++11"    SUPPORT_CXX11)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
-    add_compile_options(-O0 -Wall -fPIC -std=c++11)
+    add_compile_options(-O0 -Wall -fPIC -fvisibility=hidden -std=c++11)
   else()
-    set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11 ${CMAKE_CXX_FLAGS}")
   endif ()
   if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 7bb47ccab4c5..5f5a4dbf0ddf 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -1974,7 +1974,7 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED             = DMLC_USE_CXX11
+PREDEFINED             = DMLC_USE_CXX11 TVM_DLL= NNVM_DLL= __attribute__(x)=
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index f2c9c3d517a5..af5b23ed6552 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -332,26 +332,6 @@ TVM_DLL Expr max(Expr a, Expr b);
  *       index types(int32, int64) when possible.
  */
 TVM_DLL Expr min(Expr a, Expr b);
-/*!
- * \brief right shift
- *
- * \param a left operand
- * \param b right operand
- * \return The result expression.
- * \note this function does eager constant folding for
- *       index types(int32, int64) when possible.
- */
-TVM_DLL Expr operator>>(Expr a, Expr b);
-/*!
- * \brief left shift
- *
- * \param a left operand
- * \param b right operand
- * \return The result expression.
- * \note this function does eager constant folding for
- *       index types(int32, int64) when possible.
- */
-TVM_DLL Expr operator<<(Expr a, Expr b);
 /*!
  * \brief take bitwise and of two values
  *
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index 68bfe53407c8..83201b824a7e 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -27,7 +27,7 @@ namespace ir {
  * \param vrange The range information about the variable.
  * \return Canonicalized statement.
  */
-EXPORT Expr Simplify(Expr expr, Map<Var, Range> vrange = Map<Var, Range>());
+TVM_DLL Expr Simplify(Expr expr, Map<Var, Range> vrange = Map<Var, Range>());
 
 /*!
  * \brief Simplify the statement.
@@ -52,7 +52,7 @@ Stmt CanonicalSimplify(Stmt stmt,
  * \param vrange The range information about the variable.
  * \return Canonicalized expression.
  */
-EXPORT Expr CanonicalSimplify(Expr expr,
+TVM_DLL Expr CanonicalSimplify(Expr expr,
                               Map<Var, Range> vrange = Map<Var, Range>());
 
 /*!
@@ -61,7 +61,7 @@ EXPORT Expr CanonicalSimplify(Expr expr,
  * \param rhs The right operand
  * \return The comparison result.
  */
-EXPORT bool Equal(const Expr& lhs, const Expr& rhs);
+TVM_DLL bool Equal(const Expr& lhs, const Expr& rhs);
 
 /*!
  * \brief Deep compare lhs and rhs
@@ -92,13 +92,13 @@ int Compare(const Expr& lhs, const Expr& rhs);
  * \return Whether IR is in SSA form.
  * \note All the passes in this file uses SSA form and outputs SSA form.
  */
-bool VerifySSA(const Stmt& ir);
+TVM_DLL bool VerifySSA(const Stmt& ir);
 
 /*!
  * \brief Whether the expression have side effect.
  * \return whether expression have side effect
  */
-bool HasSideEffect(const Expr& e);
+TVM_DLL bool HasSideEffect(const Expr& e);
 
 /*!
  * \brief Whether e expression used var.
@@ -121,7 +121,7 @@ bool ExprUseVar(const Expr& e, const std::unordered_set<const Variable*>& vset);
  * \param stmt The source statement to be converted.
  * \return The converted form.
  */
-Stmt ConvertSSA(Stmt stmt);
+TVM_DLL Stmt ConvertSSA(Stmt stmt);
 
 /*!
  * \brief Substitute the var specified in key->var to be value.
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 755f15078ce2..c4fccfbe6b1b 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -131,7 +131,7 @@ class TVM_DLL IRVisitor {
  * \param node The ir to be visited.
  * \param fvisit The visitor function to be applied.
  */
-void PostOrderVisit(const NodeRef& node, std::function<void(const NodeRef&)> fvisit);
+TVM_DLL void PostOrderVisit(const NodeRef& node, std::function<void(const NodeRef&)> fvisit);
 
 }  // namespace ir
 }  // namespace tvm
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
index 38f6a805f131..1558e65a6b36 100644
--- a/include/tvm/relay/pass.h
+++ b/include/tvm/relay/pass.h
@@ -27,7 +27,7 @@ namespace relay {
  *
  * \return A type checked expression with its checked_type field populated.
  */
-Expr InferType(const Expr& expr, const Module& mod);
+TVM_DLL Expr InferType(const Expr& expr, const Module& mod);
 
 /*!
  * \brief Infer the type of a function as if it is mapped to var in the mod.
@@ -39,8 +39,8 @@ Expr InferType(const Expr& expr, const Module& mod);
  * \return A type checked Function with its checked_type field populated.
  * \note this function mutates mod and is not thread-safe.
  */
-Function InferType(const Function& f, const Module& mod,
-                   const GlobalVar& var);
+TVM_DLL Function InferType(const Function& f, const Module& mod,
+                           const GlobalVar& var);
 
 /*!
  * \brief Check that types are well kinded by applying "kinding rules".
@@ -58,7 +58,7 @@ Function InferType(const Function& f, const Module& mod,
  *
  * \return true if the rules are satisified otherwise false
  */
-bool KindCheck(const Type& t, const Module& mod);
+TVM_DLL bool KindCheck(const Type& t, const Module& mod);
 
 /*! \brief Compare two expressions for structural equivalence.
  *
@@ -75,7 +75,7 @@ bool KindCheck(const Type& t, const Module& mod);
  *
  *   \return true if equal, otherwise false
  */
-bool AlphaEqual(const Expr& e1, const Expr& e2);
+TVM_DLL bool AlphaEqual(const Expr& e1, const Expr& e2);
 
 /*! \brief Compare two types for structural equivalence.
  *
@@ -93,7 +93,7 @@ bool AlphaEqual(const Expr& e1, const Expr& e2);
  *
  * \return true if equal, otherwise false
  */
-bool AlphaEqual(const Type& t1, const Type& t2);
+TVM_DLL bool AlphaEqual(const Type& t1, const Type& t2);
 
 /*! \brief Check that each Var is only bound once.
  *
@@ -106,7 +106,7 @@ bool AlphaEqual(const Type& t1, const Type& t2);
  *
   * \return true iff all Var in expr is bound at most once.
  */
-bool WellFormed(const Expr& expr);
+TVM_DLL bool WellFormed(const Expr& expr);
 
 /*! \brief Get all bound variables from expression expr.
  *
@@ -117,7 +117,7 @@ bool WellFormed(const Expr& expr);
  *
  * \return List of bound vars, in the PostDFS order in the expression.
  */
-tvm::Array<Var> BoundVars(const Expr& expr);
+TVM_DLL tvm::Array<Var> BoundVars(const Expr& expr);
 
 /*! \brief Get free type parameters from expression expr.
  *
@@ -128,7 +128,7 @@ tvm::Array<Var> BoundVars(const Expr& expr);
  *
  * \return List of free vars, in the PostDFS order in the expression.
  */
-tvm::Array<Var> FreeVars(const Expr& expr);
+TVM_DLL tvm::Array<Var> FreeVars(const Expr& expr);
 
 /*! \brief Get all variables from expression expr.
  *
@@ -136,7 +136,7 @@ tvm::Array<Var> FreeVars(const Expr& expr);
  *
  * \return List of all vars, in the PostDFS order in the expression.
  */
-tvm::Array<Var> AllVars(const Expr& expr);
+TVM_DLL tvm::Array<Var> AllVars(const Expr& expr);
 
 /*! \brief Get free TypeVars from expression expr.
  *
@@ -147,7 +147,7 @@ tvm::Array<Var> AllVars(const Expr& expr);
  *
  * \return List of free vars, in the PostDFS order visited by expr.
  */
-tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
+TVM_DLL tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
 
 /*! \brief Get free TypeVars from type t.
  *
@@ -158,7 +158,7 @@ tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
  *
  * \return List of free type vars, in the PostDFS order visited by type.
  */
-tvm::Array<TypeVar> FreeTypeVars(const Type& t);
+TVM_DLL tvm::Array<TypeVar> FreeTypeVars(const Type& t);
 
 /*! \brief Get all bound type variables from expression expr.
  *
@@ -169,7 +169,7 @@ tvm::Array<TypeVar> FreeTypeVars(const Type& t);
  *
  * \return List of bound type vars, in the PostDFS order in the expression.
  */
-tvm::Array<TypeVar> BoundTypeVars(const Expr& expr);
+TVM_DLL tvm::Array<TypeVar> BoundTypeVars(const Expr& expr);
 
 /*! \brief Get all bound type variables from type t.
  *
@@ -180,7 +180,7 @@ tvm::Array<TypeVar> BoundTypeVars(const Expr& expr);
  *
  * \return List of bound type vars, in the PostDFS order visited by type.
  */
-tvm::Array<TypeVar> BoundTypeVars(const Type& t);
+TVM_DLL tvm::Array<TypeVar> BoundTypeVars(const Type& t);
 
 /*! \brief Get all type variables in expression expr.
  *
@@ -188,7 +188,7 @@ tvm::Array<TypeVar> BoundTypeVars(const Type& t);
  *
  * \return List of type vars, in the PostDFS order in the expression.
  */
-tvm::Array<TypeVar> AllTypeVars(const Expr& expr);
+TVM_DLL tvm::Array<TypeVar> AllTypeVars(const Expr& expr);
 
 /*! \brief Get all type variables in type t.
  *
@@ -196,7 +196,7 @@ tvm::Array<TypeVar> AllTypeVars(const Expr& expr);
  *
  * \return List of type vars, in the PostDFS order visited by type.
  */
-tvm::Array<TypeVar> AllTypeVars(const Type& t);
+TVM_DLL tvm::Array<TypeVar> AllTypeVars(const Type& t);
 
 /*! \brief Remove expressions which does not effect the program result.
  *
@@ -211,14 +211,14 @@ tvm::Array<TypeVar> AllTypeVars(const Type& t);
  *
  * \return the optimized expression.
  */
-Expr DeadCodeElimination(const Expr& e);
+TVM_DLL Expr DeadCodeElimination(const Expr& e);
 
 /*!
  * \brief Fold constant expressions.
  * \param expr the expression to be optimized.
  * \return The optimized expression.
  */
-Expr FoldConstant(const Expr& expr);
+TVM_DLL Expr FoldConstant(const Expr& expr);
 
 /*!
  * \brief Fuse operations into expr into seperate functions.
@@ -226,7 +226,7 @@ Expr FoldConstant(const Expr& expr);
  * \param fuse_opt_level Optimization level.
  * \return The optimized expression.
  */
-Expr FuseOps(const Expr& expr, int fuse_opt_level);
+TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level);
 
 /*!
  * \brief Apply rewrite rules to rewrite the expr in post DFS order.
@@ -238,7 +238,7 @@ Expr FuseOps(const Expr& expr, int fuse_opt_level);
  *                           an Expr consumed by multiple callers.
  * \return The rewritten expression.
  */
-Expr ForwardRewrite(const Expr& expr,
+TVM_DLL Expr ForwardRewrite(const Expr& expr,
                     const std::string& rewrite_map_attr_name,
                     std::function<NodeRef(const Call&)> fcontext = nullptr,
                     std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
@@ -252,7 +252,7 @@ Expr ForwardRewrite(const Expr& expr,
  *                           an Expr consumed by multiple callers.
  * \return The rewritten expression.
  */
-Expr ForwardRewrite(const Expr& expr,
+TVM_DLL Expr ForwardRewrite(const Expr& expr,
                     const FForwardRewrite& rewrite_func,
                     std::function<NodeRef(const Call&)> fcontext = nullptr,
                     std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
@@ -264,14 +264,14 @@ Expr ForwardRewrite(const Expr& expr,
  *                        operators without annotation.
  * \return The updated program.
  */
-Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device);
+TVM_DLL Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device);
 
 /*!
  * \brief Collect the device mapping information of each expression.
  * \param expr The expression.
  * \return The device mapping.
  */
-Map<Expr, Integer> CollectDeviceInfo(const Expr& expr);
+TVM_DLL Map<Expr, Integer> CollectDeviceInfo(const Expr& expr);
 
 /*! \brief A hashing structure in the style of std::hash. */
 struct StructuralHash {
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 75e936d8f502..b493cf6dc8da 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -38,7 +38,7 @@
 #define TVM_DLL __declspec(dllimport)
 #endif
 #else
-#define TVM_DLL
+#define TVM_DLL __attribute__((visibility("default")))
 #endif
 #endif
 
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 2a5ea83a4d2d..621d00067a03 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -37,10 +37,10 @@ constexpr int kTempAllocaAlignment = 64;
 constexpr int kMaxStackAlloca = 1024;
 
 /*!
- * \brief TVM Runtime Device API, abstracts the device
+ *  \brief TVM Runtime Device API, abstracts the device
  *  specific interface for memory management.
  */
-class DeviceAPI {
+class TVM_DLL DeviceAPI {
  public:
   /*! \brief virtual destructor */
   virtual ~DeviceAPI() {}
@@ -103,7 +103,7 @@ class DeviceAPI {
    *
    * \param ctx The context of allocation.
    */
-  TVM_DLL virtual TVMStreamHandle CreateStream(TVMContext ctx);
+  virtual TVMStreamHandle CreateStream(TVMContext ctx);
 
   /*!
    * \brief Free a stream of execution
@@ -111,7 +111,7 @@ class DeviceAPI {
    * \param ctx The context of the stream
    * \param stream The pointer to be freed.
    */
-  TVM_DLL virtual void FreeStream(TVMContext ctx, TVMStreamHandle stream);
+  virtual void FreeStream(TVMContext ctx, TVMStreamHandle stream);
 
   /*!
    * \brief Synchronize the stream
@@ -137,7 +137,7 @@ class DeviceAPI {
    * \param event_src The source stream to synchronize.
    * \param event_dst The destination stream to synchronize.
    */
-  TVM_DLL virtual void SyncStreamFromTo(TVMContext ctx,
+  virtual void SyncStreamFromTo(TVMContext ctx,
                                         TVMStreamHandle event_src,
                                         TVMStreamHandle event_dst);
   /*!
@@ -156,7 +156,7 @@ class DeviceAPI {
    * \param type_hint The type of elements. Only needed by certain backends such
    * as OpenGL, as nbytes is sufficient for most backends.
    */
-  TVM_DLL virtual void* AllocWorkspace(TVMContext ctx,
+  virtual void* AllocWorkspace(TVMContext ctx,
                                        size_t nbytes,
                                        TVMType type_hint = {});
   /*!
@@ -165,7 +165,7 @@ class DeviceAPI {
    * \param ctx The context of allocation.
    * \param ptr The pointer to be freed.
    */
-  TVM_DLL virtual void FreeWorkspace(TVMContext ctx, void* ptr);
+  virtual void FreeWorkspace(TVMContext ctx, void* ptr);
 
   /*!
    * \brief Get device API base don context.
diff --git a/nnvm/include/nnvm/c_api.h b/nnvm/include/nnvm/c_api.h
index daf9b564f3fa..1010e3c07227 100644
--- a/nnvm/include/nnvm/c_api.h
+++ b/nnvm/include/nnvm/c_api.h
@@ -16,7 +16,7 @@
 #define NNVM_DLL __declspec(dllimport)
 #endif
 #else
-#define NNVM_DLL
+#define NNVM_DLL __attribute__((visibility("default")))
 #endif
 
 /*! \brief manually define unsigned int */
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index 734180c53759..81f1cc6989f3 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -145,7 +145,7 @@ class InterpreterStateNode : public Node {
     v->Visit("stack", &stack);
   }
 
-  TVM_DLL static InterpreterState make(Expr current_expr, Stack stack);
+  static InterpreterState make(Expr current_expr, Stack stack);
 
   static constexpr const char* _type_key = "relay.InterpreterState";
   TVM_DECLARE_NODE_TYPE_INFO(InterpreterStateNode, Node);
diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
index c920c9571f38..eca4814b6114 100644
--- a/src/runtime/builtin_fp16.cc
+++ b/src/runtime/builtin_fp16.cc
@@ -11,11 +11,11 @@ extern "C" {
 // disable under msvc
 #ifndef _MSC_VER
 
-TVM_WEAK uint16_t __gnu_f2h_ieee(float a) {
+TVM_DLL TVM_WEAK uint16_t __gnu_f2h_ieee(float a) {
   return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
 }
 
-TVM_WEAK float __gnu_h2f_ieee(uint16_t a) {
+TVM_DLL TVM_WEAK float __gnu_h2f_ieee(uint16_t a) {
   return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
 }
 
diff --git a/src/runtime/workspace_pool.h b/src/runtime/workspace_pool.h
index 62364211c10c..eba2fd2117ee 100644
--- a/src/runtime/workspace_pool.h
+++ b/src/runtime/workspace_pool.h
@@ -22,7 +22,7 @@ namespace runtime {
  *  - The release order is usually in reverse order of allocate
  *  - Repeative pattern of same allocations over different runs.
  */
-class WorkspacePool {
+class TVM_DLL WorkspacePool {
  public:
   /*!
    * \brief Create pool with specific device type and device.
diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h
index e58d45486282..5af915696134 100644
--- a/vta/include/vta/runtime.h
+++ b/vta/include/vta/runtime.h
@@ -11,6 +11,7 @@
 extern "C" {
 #endif
 
+#include <tvm/runtime/c_runtime_api.h>
 #include "driver.h"
 
 #define VTA_MEMCPY_H2D 1
@@ -28,13 +29,13 @@ extern "C" {
  * \param size Buffer size.
  * \return A pointer to the allocated buffer.
  */
-void* VTABufferAlloc(size_t size);
+TVM_DLL void* VTABufferAlloc(size_t size);
 
 /*!
  * \brief Free data buffer.
  * \param buffer The data buffer to be freed.
  */
-void VTABufferFree(void* buffer);
+TVM_DLL void VTABufferFree(void* buffer);
 
 /*!
  * \brief Copy data buffer from one location to another.
@@ -45,24 +46,24 @@ void VTABufferFree(void* buffer);
  * \param size Size of copy.
  * \param kind_mask The memory copy kind.
  */
-void VTABufferCopy(const void* from,
-                   size_t from_offset,
-                   void* to,
-                   size_t to_offset,
-                   size_t size,
-                   int kind_mask);
+TVM_DLL void VTABufferCopy(const void* from,
+                           size_t from_offset,
+                           void* to,
+                           size_t to_offset,
+                           size_t size,
+                           int kind_mask);
 
 /*! \brief VTA command handle */
 typedef void* VTACommandHandle;
 
 /*! \brief Shutdown hook of VTA to cleanup resources */
-void VTARuntimeShutdown();
+TVM_DLL void VTARuntimeShutdown();
 
 /*!
  * \brief Get thread local command handle.
  * \return A thread local command handle.
  */
-VTACommandHandle VTATLSCommandHandle();
+TVM_DLL VTACommandHandle VTATLSCommandHandle();
 
 /*!
  * \brief Get the buffer access pointer on CPU.
@@ -70,7 +71,7 @@ VTACommandHandle VTATLSCommandHandle();
  * \param buffer The data buffer.
  * \return The pointer that can be accessed by the CPU.
  */
-void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
+TVM_DLL void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
 
 /*!
  * \brief Perform a write barrier to make a memory region visible to the CPU.
@@ -80,11 +81,11 @@ void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
  * \param start The start of the region (in elements).
  * \param extent The end of the region (in elements).
  */
-void VTAWriteBarrier(VTACommandHandle cmd,
-                     void* buffer,
-                     uint32_t elem_bits,
-                     uint32_t start,
-                     uint32_t extent);
+TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd,
+                             void* buffer,
+                             uint32_t elem_bits,
+                             uint32_t start,
+                             uint32_t extent);
 /*!
  * \brief Perform a read barrier to a memory region visible to VTA.
  * \param cmd The VTA command handle.
@@ -93,18 +94,18 @@ void VTAWriteBarrier(VTACommandHandle cmd,
  * \param start The start of the region (in elements).
  * \param extent The end of the region (in elements).
  */
-void VTAReadBarrier(VTACommandHandle cmd,
-                    void* buffer,
-                    uint32_t elem_bits,
-                    uint32_t start,
-                    uint32_t extent);
+TVM_DLL void VTAReadBarrier(VTACommandHandle cmd,
+                            void* buffer,
+                            uint32_t elem_bits,
+                            uint32_t start,
+                            uint32_t extent);
 
 /*!
  * \brief Set debug mode on the command handle.
  * \param cmd The VTA command handle.
  * \param debug_flag The debug flag.
  */
-void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
+TVM_DLL void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
 
 /*!
  * \brief Perform a 2D data load from DRAM.
@@ -122,18 +123,18 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
  * \param dst_sram_index Destination SRAM index.
  * \param dst_memory_type Destination memory type.
  */
-void VTALoadBuffer2D(VTACommandHandle cmd,
-                     void* src_dram_addr,
-                     uint32_t src_elem_offset,
-                     uint32_t x_size,
-                     uint32_t y_size,
-                     uint32_t x_stride,
-                     uint32_t x_pad_before,
-                     uint32_t y_pad_before,
-                     uint32_t x_pad_after,
-                     uint32_t y_pad_after,
-                     uint32_t dst_sram_index,
-                     uint32_t dst_memory_type);
+TVM_DLL void VTALoadBuffer2D(VTACommandHandle cmd,
+                             void* src_dram_addr,
+                             uint32_t src_elem_offset,
+                             uint32_t x_size,
+                             uint32_t y_size,
+                             uint32_t x_stride,
+                             uint32_t x_pad_before,
+                             uint32_t y_pad_before,
+                             uint32_t x_pad_after,
+                             uint32_t y_pad_after,
+                             uint32_t dst_sram_index,
+                             uint32_t dst_memory_type);
 
 /*!
  * \brief Perform a 2D data store into DRAM
@@ -147,14 +148,14 @@ void VTALoadBuffer2D(VTACommandHandle cmd,
  * \param y_size The number of rows.
  * \param x_stride The x axis stride.
  */
-void VTAStoreBuffer2D(VTACommandHandle cmd,
-                      uint32_t src_sram_index,
-                      uint32_t src_memory_type,
-                      void* dst_dram_addr,
-                      uint32_t dst_elem_offset,
-                      uint32_t x_size,
-                      uint32_t y_size,
-                      uint32_t x_stride);
+TVM_DLL void VTAStoreBuffer2D(VTACommandHandle cmd,
+                              uint32_t src_sram_index,
+                              uint32_t src_memory_type,
+                              void* dst_dram_addr,
+                              uint32_t dst_elem_offset,
+                              uint32_t x_size,
+                              uint32_t y_size,
+                              uint32_t x_stride);
 
 /*!
  * \brief Push uop into kernel buffer.
@@ -187,14 +188,14 @@ void VTAStoreBuffer2D(VTACommandHandle cmd,
  * \param use_imm Use immediate in ALU mode if set to true.
  * \param imm_val Immediate value in ALU mode.
  */
-void VTAUopPush(uint32_t mode,
-                uint32_t reset_out,
-                uint32_t dst_index,
-                uint32_t src_index,
-                uint32_t wgt_index,
-                uint32_t opcode,
-                uint32_t use_imm,
-                int32_t imm_val);
+TVM_DLL void VTAUopPush(uint32_t mode,
+                        uint32_t reset_out,
+                        uint32_t dst_index,
+                        uint32_t src_index,
+                        uint32_t wgt_index,
+                        uint32_t opcode,
+                        uint32_t use_imm,
+                        int32_t imm_val);
 
 /*!
  * \brief Mark start of a micro op loop.
@@ -203,15 +204,15 @@ void VTAUopPush(uint32_t mode,
  * \param src_factor The input factor.
  * \param wgt_factor The weight factor.
  */
-void VTAUopLoopBegin(uint32_t extent,
-                     uint32_t dst_factor,
-                     uint32_t src_factor,
-                     uint32_t wgt_factor);
+TVM_DLL void VTAUopLoopBegin(uint32_t extent,
+                             uint32_t dst_factor,
+                             uint32_t src_factor,
+                             uint32_t wgt_factor);
 
 /*!
  * \brief Mark end of a micro op loop.
  */
-void VTAUopLoopEnd();
+TVM_DLL void VTAUopLoopEnd();
 
 /*!
  * \brief Push GEMM uop kernel into the command handle.
@@ -221,10 +222,10 @@ void VTAUopLoopEnd();
  * \param nbytes Number of bytes to in the closure arguments.
  * \return 0 if success.
  */
-int VTAPushGEMMOp(void** uop_handle,
-                  int (*finit)(void*),
-                  void* signature,
-                  int nbytes);
+TVM_DLL int VTAPushGEMMOp(void** uop_handle,
+                          int (*finit)(void*),
+                          void* signature,
+                          int nbytes);
 
 /*!
  * \brief Push ALU uop kernel into the command handle.
@@ -234,10 +235,10 @@ int VTAPushGEMMOp(void** uop_handle,
  * \param nbytes Number of bytes to in the closure arguments.
  * \return 0 if success.
  */
-int VTAPushALUOp(void** uop_handle,
-                 int (*finit)(void*),
-                 void* signature,
-                 int nbytes);
+TVM_DLL int VTAPushALUOp(void** uop_handle,
+                         int (*finit)(void*),
+                         void* signature,
+                         int nbytes);
 
 /*!
  * \brief Push dependence token.
@@ -246,7 +247,7 @@ int VTAPushALUOp(void** uop_handle,
  * \param to_qid The destination queue.
  * \return 0 if success.
  */
-int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
+TVM_DLL int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
 
 /*!
  * \brief Pop dependence signal.
@@ -255,7 +256,7 @@ int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
  * \param to_qid The destination queue.
  * \return 0 if success.
  */
-int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
+TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
 
 /*!
  * \brief Synchronize the command handle.
@@ -266,7 +267,7 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
  * \param wait_cycles The limit of poll cycles.
  *
  */
-void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
+TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
 
 #ifdef __cplusplus
 }
diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc
index ffa0096e1713..88d400713a12 100644
--- a/vta/src/runtime.cc
+++ b/vta/src/runtime.cc
@@ -10,6 +10,7 @@
 #include <vta/hw_spec.h>
 #include <vta/runtime.h>
 #include <dmlc/logging.h>
+#include <tvm/runtime/c_runtime_api.h>
 
 #include <cassert>
 #include <cstring>