From 1d0fc6cbb8c65ae7acb49462f2c8c32d6b717984 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Fri, 11 Aug 2017 03:25:06 -0700 Subject: [PATCH 1/2] [WIP] C++ topi contributions Summary: This diff implements C++ topi contributions for: - relu with parametrix threshold - pad with generic padBefore / padAfter specification - matmult with transposes - conv2d_nchw, conv2d_hwcn with runtime constant padding and strides - depthwise_conv2d_nchw with runtime constant padding and strides - group_conv2d_ngchw with runtime constant padding and strides - broadcast_to a broadcastable shape - broadcast_bop where bop is an usual binary op (+ - * / %) Convolution padding is implemented using the pad operation. To avoid extra memory consumption, it is generally recommended to inline the padding with the autoinliner. Unfortunately in its current form the elemwise checks are too restrictive to allow inlining. So this diff also proposes an extension to LHS injective (i.e. no reduction axis in the current IR design) Test Plan: Tested in C++ testsuite in a separate repository, I am looking for suggestions to quickly spin up some tests for tvm. Reviewers: tqchen Subscribers: Tasks: Tags: Blame Revision: --- include/tvm/schedule_pass.h | 8 +- src/schedule/auto_inline_elem_wise.cc | 14 +- topi/include/topi/broadcast.h | 67 ++++++++ topi/include/topi/detail/broadcast.h | 103 ++++++++++++ topi/include/topi/ewise.h | 13 +- topi/include/topi/nn.h | 218 ++++++++++++++++++++++++++ 6 files changed, 414 insertions(+), 9 deletions(-) create mode 100644 topi/include/topi/broadcast.h create mode 100644 topi/include/topi/detail/broadcast.h create mode 100644 topi/include/topi/nn.h diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h index b3f64db1eee9..992e8732adc8 100644 --- a/include/tvm/schedule_pass.h +++ b/include/tvm/schedule_pass.h @@ -35,10 +35,16 @@ Stmt ScheduleOps(Schedule s, Map dom_map); /*! * \brief To automatically inline the element-wise operations. 
+ * By default only pure element-wise operations are inlined + * (no scaling, shifting of transposition on thr RHS accesses). + * One can also relax that check to permit inlining operations that only + * exhibit injective writes (i.e. no reduction or sequential loops). Note + * that in this case, guarantees about contiguity, transpose, stride, + * alignemnt and memory footprint in general do not hold. * * \param sch The schedule to be inlined. */ -void AutoInlineElemWise(Schedule sch); +void AutoInlineElemWise(Schedule sch, bool relaxElemWiseCondition = false); } // namespace schedule } // namespace tvm diff --git a/src/schedule/auto_inline_elem_wise.cc b/src/schedule/auto_inline_elem_wise.cc index 9fd073c0ac7a..14f2a23cb610 100644 --- a/src/schedule/auto_inline_elem_wise.cc +++ b/src/schedule/auto_inline_elem_wise.cc @@ -52,9 +52,19 @@ bool IsElemWise(const Operation& op) { return false; } -void AutoInlineElemWise(Schedule sch) { +bool IsInjective(const Operation& op) { + if (const ComputeOpNode* compute = op.as()) { + return compute->reduce_axis.size() == 0; + } + return false; +} + +void AutoInlineElemWise(Schedule sch, bool relaxElemWiseCondition) { for (Stage s : sch->stages) { - if (!s.is_scheduled() && IsElemWise(s->op) && !s->is_output) { + if (!s.is_scheduled() && + (IsElemWise(s->op) || + (relaxElemWiseCondition && IsInjective(s->op))) && + !s->is_output) { s.compute_inline(); } } diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h new file mode 100644 index 000000000000..7bd5cbeeae46 --- /dev/null +++ b/topi/include/topi/broadcast.h @@ -0,0 +1,67 @@ +#ifndef TOPI_BROADCAST_H +#define TOPI_BROADCAST_H + +#include + +namespace topi { + +inline tvm::Tensor broadcast_to(const tvm::Array& outputShape, + const tvm::Tensor& I) { + CHECK_GE(outputShape.size(), I->shape.size()) << + "Not a broadcast, output dimensionality smaller than input.\noutput: " << + outputShape << "\nvs\ninput: " << I; + auto bh = detail::broadcastShape(outputShape, 
I->shape); + CHECK_EQ(outputShape.size(), bh.commonShape.size()); + for (int i = 0; i < outputShape.size(); ++i) { + CHECK(tvm::ir::Equal(outputShape[i], bh.commonShape[i])); + } + auto l = [&](tvm::Array ovars) { + return I(detail::inputShapeFromBroadcast(ovars, I, bh.vars2, bh.allVars)); + }; + return tvm::compute( + tvm::Array(bh.commonShape.begin(), bh.commonShape.end()), l); +} + +inline tvm::Tensor broadcast_add( + const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { + return a + b; + }; + return detail::withBroadcast(l, A, B); +} + +inline tvm::Tensor broadcast_sub( + const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { + return a - b; + }; + return detail::withBroadcast(l, A, B); +} + +inline tvm::Tensor broadcast_mul( + const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { + return a * b; + }; + return detail::withBroadcast(l, A, B); +} + +inline tvm::Tensor broadcast_div( + const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { + return a / b; + }; + return detail::withBroadcast(l, A, B); +} + +inline tvm::Tensor broadcast_mod( + const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { + return a % b; + }; + return detail::withBroadcast(l, A, B); +} + +} // ns topi + +#endif // TOPI_BROADCAST_H diff --git a/topi/include/topi/detail/broadcast.h b/topi/include/topi/detail/broadcast.h new file mode 100644 index 000000000000..bd2cb7072c51 --- /dev/null +++ b/topi/include/topi/detail/broadcast.h @@ -0,0 +1,103 @@ +#ifndef TOPI_DETAIL_BROADCAST_H +#define TOPI_DETAIL_BROADCAST_H + +#include +#include + +namespace topi { namespace detail { + +struct BroadcastHelper { + std::deque commonShape; + std::deque allVars; + std::deque vars1; + std::deque vars2; +}; + +inline BroadcastHelper broadcastShape(const tvm::Array& shape1, + const tvm::Array& shape2) { + BroadcastHelper bh; + int s1Size = 
shape1.size(); + int s2Size = shape2.size(); + tvm::Expr one(1); + int i; + for (i = 1; i <= std::min(s1Size, s2Size); ++i) { + bh.allVars.push_front(tvm::Var()); + if (tvm::ir::Equal(shape1[s1Size - i], shape2[s2Size - i])) { + bh.commonShape.push_front(shape1[s1Size - i]); + bh.vars1.push_front(bh.allVars[0]); + bh.vars2.push_front(bh.allVars[0]); + } else if (tvm::ir::Equal(one, shape1[s1Size - i])) { + CHECK(! tvm::ir::Equal(one, shape2[s2Size - i])); + bh.commonShape.push_front(shape2[s2Size - i]); + bh.vars2.push_front(bh.allVars[0]); + } else if (tvm::ir::Equal(one, shape2[s2Size - i])) { + bh.commonShape.push_front(shape1[s1Size - i]); + bh.vars1.push_front(bh.allVars[0]); + } else { + CHECK(false) << + "Incompatible broadcast dims: " << + shape1[s1Size - i] << " and " << shape2[s2Size - i] << " in: " << + tvm::Array(shape1.begin(), shape1.end()) << " and " << + tvm::Array(shape2.begin(), shape2.end()); + } + } + // Remaining dimensions whether on shape1 or shape2 can always be completed + auto maxSize = std::max(s1Size, s2Size); + auto& shape = (s1Size > s2Size) ? shape1 : shape2; + auto& vars = (s1Size > s2Size) ? bh.vars1 : bh.vars2; + for (i = i; i <= maxSize; ++i) { + bh.allVars.push_front(tvm::Var()); + bh.commonShape.push_front(shape[maxSize - i]); + vars.push_front(bh.allVars[0]); + } + return bh; +} + +inline tvm::Array inputShapeFromBroadcast( + const tvm::Array& ovars, + const tvm::Tensor& T, + const std::deque& myVars, + const std::deque& allVars) { + tvm::Array ivars; + CHECK_EQ(ovars.size(), allVars.size()); + // N^2, could use a map but NBD.. + int expectedDims = T->shape.size(); + for (int i = 0; i < ovars.size(); ++i) { + bool found = false; + for (int j = 0; j < myVars.size(); ++j) { + if (tvm::ir::Equal(allVars[i], myVars[j])) { + ivars.push_back(ovars[i]); + found = true; + break; + } + } + // Only inject 0 here if we have not yet reached the dimension of I + // (i.e. 
this must be a 1) + if (!found && (ovars.size() - i) <= expectedDims) { + ivars.push_back(tvm::Expr(0)); + } + } + CHECK(expectedDims == ivars.size()); + return ivars; +} + +typedef std::function BinaryExpr; + +inline tvm::Tensor withBroadcast(BinaryExpr op, + const tvm::Tensor& A, + const tvm::Tensor& B) { + auto bh = broadcastShape(A->shape, B->shape); + auto l = [&](tvm::Array ovars) { + return op( + A(inputShapeFromBroadcast(ovars, A, bh.vars1, bh.allVars)), + B(inputShapeFromBroadcast(ovars, B, bh.vars2, bh.allVars)) + ); + }; + return tvm::compute( + tvm::Array(bh.commonShape.begin(), bh.commonShape.end()), l); +} + +}} // ns topi::detail + + +#endif // TOPI_DETAIL_BROADCAST_H diff --git a/topi/include/topi/ewise.h b/topi/include/topi/ewise.h index 2909e726fe07..8de4d1f4f320 100644 --- a/topi/include/topi/ewise.h +++ b/topi/include/topi/ewise.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2017 by Contributors - * \file topi.h + * \file ewise.h * \brief Elementwise op constructions */ #ifndef TOPI_EWISE_H_ @@ -12,16 +12,17 @@ namespace topi { using namespace tvm; // Unary intrinsic operators -#define TOPI_DECLARE_UNARY_OP(OpName) \ - inline Tensor OpName(const Tensor& x) { \ - return compute(x->shape, [&](const Array& i) { \ - return ::tvm::OpName(x(i)); \ - }); \ +#define TOPI_DECLARE_UNARY_OP(OpName) \ + inline Tensor OpName(const Tensor& x) { \ + return compute(x->shape, [&](const Array& i) { \ + return ::tvm::OpName(x(i)); \ + }, "tensor", "ewise"); \ } TOPI_DECLARE_UNARY_OP(exp); TOPI_DECLARE_UNARY_OP(tanh); TOPI_DECLARE_UNARY_OP(sigmoid); TOPI_DECLARE_UNARY_OP(sqrt); + } // namespace topi #endif // TOPI_EWISE_H_ diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h new file mode 100644 index 000000000000..54e816dbd56b --- /dev/null +++ b/topi/include/topi/nn.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2017 by Contributors + * \file topi.h + * \brief Elementwise op constructions + */ +#ifndef TOPI_NN_H_ +#define TOPI_NN_H_ + +#include +#include +#include + 
+namespace topi { + +template +tvm::Expr map(const tvm::Array& exprs, T op) { + CHECK_GE(exprs.size(), 1); + tvm::Expr res = exprs[0]; + for (int i = 1; i < exprs.size(); ++i) { + res = op(res, exprs[i]); + } + return res; +} + +template +inline tvm::Tensor relu(const tvm::Tensor& x, T threshold) { + return tvm::compute(x->shape, [&](const tvm::Array& i) { + return tvm::max(x(i), threshold); + }, "tensor", "ewise"); +} + +inline tvm::Tensor pad( + const tvm::Tensor& t, + const tvm::Array& padBefore, + tvm::Array padAfter = tvm::Array()) { + if (padAfter.size() < padBefore.size()) { + for(int i = padAfter.size(); i < padBefore.size(); ++i) { + padAfter.push_back(padBefore[i]); + } + } + CHECK_GE(padBefore.size(), 1); + CHECK_EQ(padBefore.size(), padAfter.size()); + tvm::Array outputShape; + for (int i = 0; i < t->shape.size(); ++i) { + if (i >= padBefore.size()) { + outputShape.push_back(t->shape[i]); + } else { + outputShape.push_back( + tvm::ir::Simplify(t->shape[i] + padBefore[i] + padAfter[i])); + } + } + auto l = [&](tvm::Array ovars) { + tvm::Array indices; + tvm::Array sel; + for (int i = 0; i < t->shape.size(); ++i) { + if (i >= padBefore.size()) { + indices.push_back(ovars[i]); + continue; + } + if (!tvm::ir::Equal(padBefore[i], 0)) { + sel.push_back(ovars[i] >= padBefore[i]); + indices.push_back(ovars[i] - padBefore[i]); + } else { + indices.push_back(ovars[i]); + } + if (!tvm::ir::Equal(padAfter[i], 0)) { + sel.push_back(tvm::ir::Simplify(ovars[i] < padBefore[i] + t->shape[i])); + } + } + return tvm::select(map(sel, tvm::ir::And::make), t(indices), 0); + }; + return tvm::compute(outputShape, l, "tensor", "ewise"); +} + +// Returns a compute that calculates a row-major matrix multiplication: +// A(i, k) * B(k, j), if transA == transB +// the usual transposed combinations, otherwise +inline tvm::Tensor matmult(const tvm::Tensor& A, + const tvm::Tensor& B, + bool transA = false, + bool transB = false) { + tvm::Array outputShape{ + A->shape[transA ? 
1 : 0], + B->shape[transB ? 0 : 1] + }; + auto k = tvm::reduce_axis(tvm::Range{0, A->shape[transA ? 0 : 1]}, "k"); + auto l = [&](tvm::Var i, tvm::Var j) { + return tvm::sum( + (transA ? A[k][i] : A[i][k]) * (transB ? B[j][k] : B[k][j]), + {k}); + }; + return tvm::compute(outputShape, l); +} + +inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, + const tvm::Tensor& W, + int padH = 0, + int padW = 0, + int strideH = 1, + int strideW = 1) { + CHECK_EQ(4, I->shape.size()); + CHECK_EQ(4, W->shape.size()); + auto pH = I->shape[2]; + auto pW = I->shape[3]; + tvm::Array outputShape{ + I->shape[0], // B + W->shape[1], // O + (I->shape[2] - W->shape[2] + 2 * padH) / strideH + 1, // H + (I->shape[3] - W->shape[3] + 2 * padW) / strideW + 1 // W + }; + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i"); + auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh"); + auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw"); + auto T = (padH == 0 && padW == 0) ? + I : pad(I, {tvm::Expr(0), tvm::Expr(0), padH, padW}); + auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) { + return tvm::sum( + T(b, i, strideH * h + kh, strideW * w + kw) * W(i, o, kh, kw), + {i, kh, kw} + ); + }; + return tvm::compute(outputShape, l); +} + +inline tvm::Tensor conv2d_hwcn(const tvm::Tensor& I, + const tvm::Tensor& W, + int padH = 0, + int padW = 0, + int strideH = 1, + int strideW = 1) { + CHECK_EQ(4, I->shape.size()); + CHECK_EQ(4, W->shape.size()); + auto pH = I->shape[2]; + auto pW = I->shape[3]; + tvm::Array outputShape{ + (I->shape[2] - W->shape[2] + 2 * padH) / strideH + 1, // H + (I->shape[3] - W->shape[3] + 2 * padW) / strideW + 1, // W + I->shape[2], // B + W->shape[3] // O + }; + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[3]}, "i"); + auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[0]}, "kh"); + auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[1]}, "kw"); + auto T = (padH == 0 && padW == 0) ? 
I : pad(I, {padH, padW}); + auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) { + return tvm::sum( + T(strideH * h + kh, strideW * w + kw, i, b) * W(kh, kw, i, o), + {i, kh, kw} + ); + }; + return tvm::compute(outputShape, l); +} + +inline tvm::Tensor depthwise_conv2d_nchw(const tvm::Tensor& I, + const tvm::Tensor& W, + int padH = 0, + int padW = 0, + int strideH = 1, + int strideW = 1) { + CHECK_EQ(4, I->shape.size()); + CHECK_EQ(4, W->shape.size()); + auto pH = I->shape[2]; + auto pW = I->shape[3]; + auto pCM = W->shape[1]; // channel_multiplier + tvm::Array outputShape{ + I->shape[0], // B + W->shape[1], // O + (I->shape[2] - W->shape[2] + 2 * padH) / strideH + 1, // H + (I->shape[3] - W->shape[3] + 2 * padW) / strideW + 1 // W + }; + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i"); + auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh"); + auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw"); + auto T = (padH == 0 && padW == 0) ? + I : pad(I, {tvm::Expr(0), tvm::Expr(0), padH, padW}); + auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) { + return tvm::sum( + T(b, i / pCM, strideH * h + kh, strideW * w + kw) * W(i / pCM, o % pCM, kh, kw), + {i, kh, kw} + ); + }; + return tvm::compute(outputShape, l); +} + +inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I, + const tvm::Tensor& W, + int padH = 0, + int padW = 0, + int strideH = 1, + int strideW = 1) { + CHECK_EQ(5, I->shape.size()); + CHECK_EQ(5, W->shape.size()); + auto pH = I->shape[2]; + auto pW = I->shape[3]; + tvm::Array outputShape{ + I->shape[0], // B + I->shape[1], // G + W->shape[2], // O + (I->shape[3] - W->shape[3] + 2 * padH) / strideH + 1, // H + (I->shape[4] - W->shape[4] + 2 * padW) / strideW + 1 // W + }; + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i"); + auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kh"); + auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[4]}, "kw"); + + auto T = (padH == 0 && padW == 0) ? 
+ I : pad(I, {tvm::Expr(0), tvm::Expr(0), tvm::Expr(0), padH, padW}); + auto l = [&](tvm::Var b, tvm::Var g, tvm::Var o, tvm::Var h, tvm::Var w) { + return tvm::sum( + I(b, g, i, strideH * h + kh, strideW * w + kw) * W(g, i, o, kh, kw), + {i, kh, kw} + ); + }; + return tvm::compute(outputShape, l); +} + +} // namespace topi +#endif // TOPI_NN_H_ From d087b4287730d077e87a65a12d082da6e2fbdb9b Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Sat, 12 Aug 2017 14:27:22 -0700 Subject: [PATCH 2/2] Review + Lint + GSG C++ --- include/tvm/schedule_pass.h | 14 +- src/schedule/auto_inline_elem_wise.cc | 33 +++- topi/include/topi/broadcast.h | 80 ++++----- topi/include/topi/detail/broadcast.h | 122 +++++++------- topi/include/topi/nn.h | 223 ++++++++++++-------------- 5 files changed, 241 insertions(+), 231 deletions(-) diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h index 992e8732adc8..189b999a253d 100644 --- a/include/tvm/schedule_pass.h +++ b/include/tvm/schedule_pass.h @@ -35,16 +35,20 @@ Stmt ScheduleOps(Schedule s, Map dom_map); /*! * \brief To automatically inline the element-wise operations. - * By default only pure element-wise operations are inlined - * (no scaling, shifting of transposition on thr RHS accesses). - * One can also relax that check to permit inlining operations that only - * exhibit injective writes (i.e. no reduction or sequential loops). Note + * + * \param sch The schedule to be inlined. + */ +void AutoInlineElemWise(Schedule sch); + +/*! + * \brief To automatically inline operations with injective writes + * (i.e. writes without reduction or sequential loops). Note * that in this case, guarantees about contiguity, transpose, stride, * alignemnt and memory footprint in general do not hold. * * \param sch The schedule to be inlined. 
*/ -void AutoInlineElemWise(Schedule sch, bool relaxElemWiseCondition = false); +void AutoInlineInjective(Schedule sch); } // namespace schedule } // namespace tvm diff --git a/src/schedule/auto_inline_elem_wise.cc b/src/schedule/auto_inline_elem_wise.cc index 14f2a23cb610..1dc1ebbd9959 100644 --- a/src/schedule/auto_inline_elem_wise.cc +++ b/src/schedule/auto_inline_elem_wise.cc @@ -52,6 +52,32 @@ bool IsElemWise(const Operation& op) { return false; } +void AutoInlineElemWise(Schedule sch) { + for (Stage s : sch->stages) { + if (!s.is_scheduled() && IsElemWise(s->op) && !s->is_output) { + s.compute_inline(); + } + } +} + +bool IsBroadcast(const Operation& op) { + if (const ComputeOpNode* compute = op.as()) { + if (compute->reduce_axis.size()) { + return false; + } + // TODO(nicolasvasilache): Implement Me + } + return false; +} + +void AutoInlineBroadcast(Schedule sch) { + for (Stage s : sch->stages) { + if (!s.is_scheduled() && IsBroadcast(s->op) && !s->is_output) { + s.compute_inline(); + } + } +} + bool IsInjective(const Operation& op) { if (const ComputeOpNode* compute = op.as()) { return compute->reduce_axis.size() == 0; @@ -59,12 +85,9 @@ bool IsInjective(const Operation& op) { return false; } -void AutoInlineElemWise(Schedule sch, bool relaxElemWiseCondition) { +void AutoInlineInjective(Schedule sch) { for (Stage s : sch->stages) { - if (!s.is_scheduled() && - (IsElemWise(s->op) || - (relaxElemWiseCondition && IsInjective(s->op))) && - !s->is_output) { + if (!s.is_scheduled() && IsInjective(s->op) && !s->is_output) { s.compute_inline(); } } diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h index 7bd5cbeeae46..0a9f885c1b12 100644 --- a/topi/include/topi/broadcast.h +++ b/topi/include/topi/broadcast.h @@ -1,67 +1,57 @@ -#ifndef TOPI_BROADCAST_H -#define TOPI_BROADCAST_H +/* + * Copyright (c) 2017 by Contributors + * \brief Broadcast op constructions + * \file broadcast.h + */ +#ifndef TOPI_BROADCAST_H_ +#define TOPI_BROADCAST_H_ 
#include namespace topi { -inline tvm::Tensor broadcast_to(const tvm::Array& outputShape, - const tvm::Tensor& I) { - CHECK_GE(outputShape.size(), I->shape.size()) << - "Not a broadcast, output dimensionality smaller than input.\noutput: " << - outputShape << "\nvs\ninput: " << I; - auto bh = detail::broadcastShape(outputShape, I->shape); - CHECK_EQ(outputShape.size(), bh.commonShape.size()); - for (int i = 0; i < outputShape.size(); ++i) { - CHECK(tvm::ir::Equal(outputShape[i], bh.commonShape[i])); +inline tvm::Tensor broadcast_to(const tvm::Tensor& I, + const tvm::Array& output_shape) { + CHECK_GE(output_shape.size(), I->shape.size()) + << "Not a broadcast, output dimensionality smaller than input.\noutput: " + << output_shape << "\nvs\ninput: " << I; + auto bh = detail::BroadcastShape(output_shape, I->shape); + CHECK_EQ(output_shape.size(), bh.common_shape.size()); + for (int i = 0; i < output_shape.size(); ++i) { + CHECK(tvm::ir::Equal(output_shape[i], bh.common_shape[i])); } auto l = [&](tvm::Array ovars) { - return I(detail::inputShapeFromBroadcast(ovars, I, bh.vars2, bh.allVars)); + return I(detail::InputIndexFromBroadcast(ovars, I, bh.vars2, bh.all_vars)); }; return tvm::compute( - tvm::Array(bh.commonShape.begin(), bh.commonShape.end()), l); + tvm::Array(bh.common_shape.begin(), bh.common_shape.end()), l); } -inline tvm::Tensor broadcast_add( - const tvm::Tensor& A, const tvm::Tensor& B) { - auto l = [&](tvm::Expr a, tvm::Expr b) { - return a + b; - }; - return detail::withBroadcast(l, A, B); +inline tvm::Tensor broadcast_add(const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return a + b; }; + return detail::WithBroadcast(l, A, B); } -inline tvm::Tensor broadcast_sub( - const tvm::Tensor& A, const tvm::Tensor& B) { - auto l = [&](tvm::Expr a, tvm::Expr b) { - return a - b; - }; - return detail::withBroadcast(l, A, B); +inline tvm::Tensor broadcast_sub(const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = 
[&](tvm::Expr a, tvm::Expr b) { return a - b; }; + return detail::WithBroadcast(l, A, B); } -inline tvm::Tensor broadcast_mul( - const tvm::Tensor& A, const tvm::Tensor& B) { - auto l = [&](tvm::Expr a, tvm::Expr b) { - return a * b; - }; - return detail::withBroadcast(l, A, B); +inline tvm::Tensor broadcast_mul(const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return a * b; }; + return detail::WithBroadcast(l, A, B); } -inline tvm::Tensor broadcast_div( - const tvm::Tensor& A, const tvm::Tensor& B) { - auto l = [&](tvm::Expr a, tvm::Expr b) { - return a / b; - }; - return detail::withBroadcast(l, A, B); +inline tvm::Tensor broadcast_div(const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return a / b; }; + return detail::WithBroadcast(l, A, B); } -inline tvm::Tensor broadcast_mod( - const tvm::Tensor& A, const tvm::Tensor& B) { - auto l = [&](tvm::Expr a, tvm::Expr b) { - return a % b; - }; - return detail::withBroadcast(l, A, B); +inline tvm::Tensor broadcast_mod(const tvm::Tensor& A, const tvm::Tensor& B) { + auto l = [&](tvm::Expr a, tvm::Expr b) { return a % b; }; + return detail::WithBroadcast(l, A, B); } -} // ns topi +} // namespace topi -#endif // TOPI_BROADCAST_H +#endif // TOPI_BROADCAST_H_ diff --git a/topi/include/topi/detail/broadcast.h b/topi/include/topi/detail/broadcast.h index bd2cb7072c51..a1b760ca21ac 100644 --- a/topi/include/topi/detail/broadcast.h +++ b/topi/include/topi/detail/broadcast.h @@ -1,71 +1,78 @@ -#ifndef TOPI_DETAIL_BROADCAST_H -#define TOPI_DETAIL_BROADCAST_H +/* + * Copyright (c) 2017 by Contributors + * \brief Detail broadcast. 
+ * \file broadcast.h + */ +#ifndef TOPI_DETAIL_BROADCAST_H_ +#define TOPI_DETAIL_BROADCAST_H_ -#include -#include +#include +#include -namespace topi { namespace detail { +#include "tvm/ir_pass.h" +#include "tvm/tvm.h" + +namespace topi { +namespace detail { struct BroadcastHelper { - std::deque commonShape; - std::deque allVars; + std::deque common_shape; + std::deque all_vars; std::deque vars1; std::deque vars2; }; -inline BroadcastHelper broadcastShape(const tvm::Array& shape1, +inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, const tvm::Array& shape2) { BroadcastHelper bh; - int s1Size = shape1.size(); - int s2Size = shape2.size(); + int s1_size = shape1.size(); + int s2_size = shape2.size(); tvm::Expr one(1); int i; - for (i = 1; i <= std::min(s1Size, s2Size); ++i) { - bh.allVars.push_front(tvm::Var()); - if (tvm::ir::Equal(shape1[s1Size - i], shape2[s2Size - i])) { - bh.commonShape.push_front(shape1[s1Size - i]); - bh.vars1.push_front(bh.allVars[0]); - bh.vars2.push_front(bh.allVars[0]); - } else if (tvm::ir::Equal(one, shape1[s1Size - i])) { - CHECK(! 
tvm::ir::Equal(one, shape2[s2Size - i])); - bh.commonShape.push_front(shape2[s2Size - i]); - bh.vars2.push_front(bh.allVars[0]); - } else if (tvm::ir::Equal(one, shape2[s2Size - i])) { - bh.commonShape.push_front(shape1[s1Size - i]); - bh.vars1.push_front(bh.allVars[0]); + for (i = 1; i <= std::min(s1_size, s2_size); ++i) { + bh.all_vars.push_front(tvm::Var()); + if (tvm::ir::Equal(shape1[s1_size - i], shape2[s2_size - i])) { + bh.common_shape.push_front(shape1[s1_size - i]); + bh.vars1.push_front(bh.all_vars[0]); + bh.vars2.push_front(bh.all_vars[0]); + } else if (tvm::ir::Equal(one, shape1[s1_size - i])) { + CHECK(!tvm::ir::Equal(one, shape2[s2_size - i])); + bh.common_shape.push_front(shape2[s2_size - i]); + bh.vars2.push_front(bh.all_vars[0]); + } else if (tvm::ir::Equal(one, shape2[s2_size - i])) { + bh.common_shape.push_front(shape1[s1_size - i]); + bh.vars1.push_front(bh.all_vars[0]); } else { - CHECK(false) << - "Incompatible broadcast dims: " << - shape1[s1Size - i] << " and " << shape2[s2Size - i] << " in: " << - tvm::Array(shape1.begin(), shape1.end()) << " and " << - tvm::Array(shape2.begin(), shape2.end()); + CHECK(false) << "Incompatible broadcast dims: " << shape1[s1_size - i] + << " and " << shape2[s2_size - i] << " in: " + << tvm::Array(shape1.begin(), shape1.end()) + << " and " + << tvm::Array(shape2.begin(), shape2.end()); } } // Remaining dimensions whether on shape1 or shape2 can always be completed - auto maxSize = std::max(s1Size, s2Size); - auto& shape = (s1Size > s2Size) ? shape1 : shape2; - auto& vars = (s1Size > s2Size) ? bh.vars1 : bh.vars2; - for (i = i; i <= maxSize; ++i) { - bh.allVars.push_front(tvm::Var()); - bh.commonShape.push_front(shape[maxSize - i]); - vars.push_front(bh.allVars[0]); + auto max_size = std::max(s1_size, s2_size); + auto& shape = (s1_size > s2_size) ? shape1 : shape2; + auto& vars = (s1_size > s2_size) ? 
bh.vars1 : bh.vars2; + for (i = i; i <= max_size; ++i) { + bh.all_vars.push_front(tvm::Var()); + bh.common_shape.push_front(shape[max_size - i]); + vars.push_front(bh.all_vars[0]); } return bh; } -inline tvm::Array inputShapeFromBroadcast( - const tvm::Array& ovars, - const tvm::Tensor& T, - const std::deque& myVars, - const std::deque& allVars) { +inline tvm::Array InputIndexFromBroadcast( + const tvm::Array& ovars, const tvm::Tensor& T, + const std::deque& my_vars, const std::deque& all_vars) { tvm::Array ivars; - CHECK_EQ(ovars.size(), allVars.size()); + CHECK_EQ(ovars.size(), all_vars.size()); // N^2, could use a map but NBD.. - int expectedDims = T->shape.size(); + int expected_dims = T->shape.size(); for (int i = 0; i < ovars.size(); ++i) { bool found = false; - for (int j = 0; j < myVars.size(); ++j) { - if (tvm::ir::Equal(allVars[i], myVars[j])) { + for (int j = 0; j < my_vars.size(); ++j) { + if (all_vars[i].same_as(my_vars[j])) { ivars.push_back(ovars[i]); found = true; break; @@ -73,31 +80,28 @@ inline tvm::Array inputShapeFromBroadcast( } // Only inject 0 here if we have not yet reached the dimension of I // (i.e. 
this must be a 1) - if (!found && (ovars.size() - i) <= expectedDims) { - ivars.push_back(tvm::Expr(0)); + if (!found && (ovars.size() - i) <= expected_dims) { + ivars.push_back(tvm::make_zero(ovars[i].type())); } } - CHECK(expectedDims == ivars.size()); + CHECK(expected_dims == ivars.size()); return ivars; } -typedef std::function BinaryExpr; -inline tvm::Tensor withBroadcast(BinaryExpr op, - const tvm::Tensor& A, +template +inline tvm::Tensor WithBroadcast(FBinaryExpr op, const tvm::Tensor& A, const tvm::Tensor& B) { - auto bh = broadcastShape(A->shape, B->shape); + auto bh = BroadcastShape(A->shape, B->shape); auto l = [&](tvm::Array ovars) { - return op( - A(inputShapeFromBroadcast(ovars, A, bh.vars1, bh.allVars)), - B(inputShapeFromBroadcast(ovars, B, bh.vars2, bh.allVars)) - ); + return op(A(InputIndexFromBroadcast(ovars, A, bh.vars1, bh.all_vars)), + B(InputIndexFromBroadcast(ovars, B, bh.vars2, bh.all_vars))); }; return tvm::compute( - tvm::Array(bh.commonShape.begin(), bh.commonShape.end()), l); + tvm::Array(bh.common_shape.begin(), bh.common_shape.end()), l); } -}} // ns topi::detail - +} // namespace detail +} // namespace topi -#endif // TOPI_DETAIL_BROADCAST_H +#endif // TOPI_DETAIL_BROADCAST_H_ diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index 54e816dbd56b..dfd51e82f4ab 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -1,19 +1,22 @@ /* * Copyright (c) 2017 by Contributors - * \file topi.h - * \brief Elementwise op constructions + * \brief NN op constructions + * \file nn.h */ #ifndef TOPI_NN_H_ #define TOPI_NN_H_ -#include -#include -#include +#include + +#include "tvm/ir.h" +#include "tvm/ir_pass.h" +#include "tvm/tvm.h" namespace topi { +namespace detail { template -tvm::Expr map(const tvm::Array& exprs, T op) { +tvm::Expr Map(const tvm::Array& exprs, T op) { CHECK_GE(exprs.size(), 1); tvm::Expr res = exprs[0]; for (int i = 1; i < exprs.size(); ++i) { @@ -22,196 +25,182 @@ tvm::Expr map(const tvm::Array& exprs, T 
op) { return res; } +} // namespace detail + template -inline tvm::Tensor relu(const tvm::Tensor& x, T threshold) { - return tvm::compute(x->shape, [&](const tvm::Array& i) { - return tvm::max(x(i), threshold); - }, "tensor", "ewise"); +inline tvm::Tensor relu(const tvm::Tensor& x, T threshold = static_cast(0)) { + return tvm::compute( + x->shape, + [&](const tvm::Array& i) { return tvm::max(x(i), threshold); }, + "tensor", "ewise"); } inline tvm::Tensor pad( - const tvm::Tensor& t, - const tvm::Array& padBefore, - tvm::Array padAfter = tvm::Array()) { - if (padAfter.size() < padBefore.size()) { - for(int i = padAfter.size(); i < padBefore.size(); ++i) { - padAfter.push_back(padBefore[i]); + const tvm::Tensor& t, const tvm::Array& pad_before, + tvm::Array pad_after = tvm::Array()) { + if (pad_after.size() < pad_before.size()) { + for (int i = pad_after.size(); i < pad_before.size(); ++i) { + pad_after.push_back(pad_before[i]); } } - CHECK_GE(padBefore.size(), 1); - CHECK_EQ(padBefore.size(), padAfter.size()); - tvm::Array outputShape; + CHECK_GE(pad_before.size(), 1); + CHECK_EQ(pad_before.size(), pad_after.size()); + tvm::Array output_shape; for (int i = 0; i < t->shape.size(); ++i) { - if (i >= padBefore.size()) { - outputShape.push_back(t->shape[i]); + if (i >= pad_before.size()) { + output_shape.push_back(t->shape[i]); } else { - outputShape.push_back( - tvm::ir::Simplify(t->shape[i] + padBefore[i] + padAfter[i])); + output_shape.push_back( + tvm::ir::Simplify(t->shape[i] + pad_before[i] + pad_after[i])); } } auto l = [&](tvm::Array ovars) { tvm::Array indices; tvm::Array sel; for (int i = 0; i < t->shape.size(); ++i) { - if (i >= padBefore.size()) { + if (i >= pad_before.size()) { indices.push_back(ovars[i]); continue; } - if (!tvm::ir::Equal(padBefore[i], 0)) { - sel.push_back(ovars[i] >= padBefore[i]); - indices.push_back(ovars[i] - padBefore[i]); + if (!tvm::ir::Equal(pad_before[i], 0)) { + sel.push_back(ovars[i] >= pad_before[i]); + 
indices.push_back(ovars[i] - pad_before[i]); } else { indices.push_back(ovars[i]); } - if (!tvm::ir::Equal(padAfter[i], 0)) { - sel.push_back(tvm::ir::Simplify(ovars[i] < padBefore[i] + t->shape[i])); + if (!tvm::ir::Equal(pad_after[i], 0)) { + sel.push_back(tvm::ir::Simplify(ovars[i] < pad_before[i] + t->shape[i])); } } - return tvm::select(map(sel, tvm::ir::And::make), t(indices), 0); + return tvm::select(detail::Map(sel, tvm::ir::And::make), t(indices), 0); }; - return tvm::compute(outputShape, l, "tensor", "ewise"); + return tvm::compute(output_shape, l, "tensor", "ewise"); } // Returns a compute that calculates a row-major matrix multiplication: -// A(i, k) * B(k, j), if transA == transB +// A(i, k) * B(k, j), if trans_a == trans_b // the usual transposed combinations, otherwise -inline tvm::Tensor matmult(const tvm::Tensor& A, - const tvm::Tensor& B, - bool transA = false, - bool transB = false) { - tvm::Array outputShape{ - A->shape[transA ? 1 : 0], - B->shape[transB ? 0 : 1] - }; - auto k = tvm::reduce_axis(tvm::Range{0, A->shape[transA ? 0 : 1]}, "k"); +inline tvm::Tensor matmult(const tvm::Tensor& A, const tvm::Tensor& B, + bool trans_a = false, bool trans_b = false) { + tvm::Array output_shape{A->shape[trans_a ? 1 : 0], + B->shape[trans_b ? 0 : 1]}; + auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k"); auto l = [&](tvm::Var i, tvm::Var j) { - return tvm::sum( - (transA ? A[k][i] : A[i][k]) * (transB ? B[j][k] : B[k][j]), - {k}); + return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? 
B[j][k] : B[k][j]), + {k}); }; - return tvm::compute(outputShape, l); + return tvm::compute(output_shape, l); } -inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, - const tvm::Tensor& W, - int padH = 0, - int padW = 0, - int strideH = 1, - int strideW = 1) { +inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, const tvm::Tensor& W, + int pad_h = 0, int pad_w = 0, int stride_h = 1, + int stride_w = 1) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; - tvm::Array outputShape{ - I->shape[0], // B - W->shape[1], // O - (I->shape[2] - W->shape[2] + 2 * padH) / strideH + 1, // H - (I->shape[3] - W->shape[3] + 2 * padW) / strideW + 1 // W + tvm::Array output_shape{ + I->shape[0], // B + W->shape[1], // O + (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1, // H + (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1 // W }; - auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i"); + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i"); auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh"); auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw"); - auto T = (padH == 0 && padW == 0) ? - I : pad(I, {tvm::Expr(0), tvm::Expr(0), padH, padW}); + auto T = (pad_h == 0 && pad_w == 0) + ? 
I + : pad(I, {tvm::Expr(0), tvm::Expr(0), pad_h, pad_w}); auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) { return tvm::sum( - T(b, i, strideH * h + kh, strideW * w + kw) * W(i, o, kh, kw), - {i, kh, kw} - ); + T(b, i, stride_h * h + kh, stride_w * w + kw) * W(i, o, kh, kw), + {i, kh, kw}); }; - return tvm::compute(outputShape, l); + return tvm::compute(output_shape, l); } -inline tvm::Tensor conv2d_hwcn(const tvm::Tensor& I, - const tvm::Tensor& W, - int padH = 0, - int padW = 0, - int strideH = 1, - int strideW = 1) { +inline tvm::Tensor conv2d_hwcn(const tvm::Tensor& I, const tvm::Tensor& W, + int pad_h = 0, int pad_w = 0, int stride_h = 1, + int stride_w = 1) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; - tvm::Array outputShape{ - (I->shape[2] - W->shape[2] + 2 * padH) / strideH + 1, // H - (I->shape[3] - W->shape[3] + 2 * padW) / strideW + 1, // W - I->shape[2], // B - W->shape[3] // O + tvm::Array output_shape{ + (I->shape[0] - W->shape[0] + 2 * pad_h) / stride_h + 1, // H: HWCN dim 0; filter height is W->shape[0], matching kh below + (I->shape[1] - W->shape[1] + 2 * pad_w) / stride_w + 1, // W: HWCN dim 1; filter width is W->shape[1], matching kw below + I->shape[3], // B: batch is dim 3 of HWCN + W->shape[3] // O }; - auto i = tvm::reduce_axis(tvm::Range{0, I->shape[3]}, "i"); + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i"); auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[0]}, "kh"); auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[1]}, "kw"); - auto T = (padH == 0 && padW == 0) ? I : pad(I, {padH, padW}); + auto T = (pad_h == 0 && pad_w == 0) ?
I : pad(I, {pad_h, pad_w}); auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) { return tvm::sum( - T(strideH * h + kh, strideW * w + kw, i, b) * W(kh, kw, i, o), - {i, kh, kw} - ); + T(stride_h * h + kh, stride_w * w + kw, i, b) * W(kh, kw, i, o), // NOTE(review): tvm::compute binds lambda vars positionally, so (b,o,h,w) maps to output dims (H,W,B,O) here -- the variable order looks scrambled, verify + {i, kh, kw}); }; - return tvm::compute(outputShape, l); + return tvm::compute(output_shape, l); } inline tvm::Tensor depthwise_conv2d_nchw(const tvm::Tensor& I, - const tvm::Tensor& W, - int padH = 0, - int padW = 0, - int strideH = 1, - int strideW = 1) { + const tvm::Tensor& W, int pad_h = 0, + int pad_w = 0, int stride_h = 1, + int stride_w = 1) { CHECK_EQ(4, I->shape.size()); CHECK_EQ(4, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; - auto pCM = W->shape[1]; // channel_multiplier - tvm::Array outputShape{ - I->shape[0], // B - W->shape[1], // O - (I->shape[2] - W->shape[2] + 2 * padH) / strideH + 1, // H - (I->shape[3] - W->shape[3] + 2 * padW) / strideW + 1 // W + auto pCM = W->shape[1]; // channel_multiplier + tvm::Array output_shape{ + I->shape[0], // B + W->shape[1], // O + (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1, // H + (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1 // W }; - auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i"); + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i"); auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh"); auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw"); - auto T = (padH == 0 && padW == 0) ? - I : pad(I, {tvm::Expr(0), tvm::Expr(0), padH, padW}); + auto T = (pad_h == 0 && pad_w == 0) + ?
I + : pad(I, {tvm::Expr(0), tvm::Expr(0), pad_h, pad_w}); auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) { - return tvm::sum( - T(b, i / pCM, strideH * h + kh, strideW * w + kw) * W(i / pCM, o % pCM, kh, kw), - {i, kh, kw} - ); + return tvm::sum(T(b, i / pCM, stride_h * h + kh, stride_w * w + kw) * + W(i / pCM, o % pCM, kh, kw), + {i, kh, kw}); }; - return tvm::compute(outputShape, l); + return tvm::compute(output_shape, l); } inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I, - const tvm::Tensor& W, - int padH = 0, - int padW = 0, - int strideH = 1, - int strideW = 1) { + const tvm::Tensor& W, int pad_h = 0, + int pad_w = 0, int stride_h = 1, + int stride_w = 1) { CHECK_EQ(5, I->shape.size()); CHECK_EQ(5, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; - tvm::Array outputShape{ - I->shape[0], // B - I->shape[1], // G - W->shape[2], // O - (I->shape[3] - W->shape[3] + 2 * padH) / strideH + 1, // H - (I->shape[4] - W->shape[4] + 2 * padW) / strideW + 1 // W + tvm::Array output_shape{ + I->shape[0], // B + I->shape[1], // G + W->shape[2], // O + (I->shape[3] - W->shape[3] + 2 * pad_h) / stride_h + 1, // H + (I->shape[4] - W->shape[4] + 2 * pad_w) / stride_w + 1 // W }; - auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i"); + auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i"); auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kh"); auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[4]}, "kw"); - auto T = (padH == 0 && padW == 0) ? - I : pad(I, {tvm::Expr(0), tvm::Expr(0), tvm::Expr(0), padH, padW}); + auto T = (pad_h == 0 && pad_w == 0) + ? 
I + : pad(I, {tvm::Expr(0), tvm::Expr(0), tvm::Expr(0), pad_h, pad_w}); auto l = [&](tvm::Var b, tvm::Var g, tvm::Var o, tvm::Var h, tvm::Var w) { return tvm::sum( - I(b, g, i, strideH * h + kh, strideW * w + kw) * W(g, i, o, kh, kw), - {i, kh, kw} - ); + T(b, g, i, stride_h * h + kh, stride_w * w + kw) * W(g, i, o, kh, kw), + {i, kh, kw}); }; - return tvm::compute(outputShape, l); + return tvm::compute(output_shape, l); } } // namespace topi