diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h
index b3f64db1eee9..189b999a253d 100644
--- a/include/tvm/schedule_pass.h
+++ b/include/tvm/schedule_pass.h
@@ -40,6 +40,16 @@ Stmt ScheduleOps(Schedule s, Map<IterVar, Range> dom_map);
  */
 void AutoInlineElemWise(Schedule sch);
 
+/*!
+ * \brief Automatically inline operations with injective writes, i.e.
+ *   compute operations without a reduction axis. Stages that are already
+ *   scheduled or that are outputs are left untouched. Note that after
+ *   inlining, guarantees about contiguity, transpose, stride, alignment
+ *   and memory footprint in general do not hold.
+ * \param sch The schedule to be inlined.
+ */
+void AutoInlineInjective(Schedule sch);
+
 }  // namespace schedule
 }  // namespace tvm
 #endif  // TVM_SCHEDULE_PASS_H_
diff --git a/src/schedule/auto_inline_elem_wise.cc b/src/schedule/auto_inline_elem_wise.cc
index 9fd073c0ac7a..1dc1ebbd9959 100644
--- a/src/schedule/auto_inline_elem_wise.cc
+++ b/src/schedule/auto_inline_elem_wise.cc
@@ -60,5 +60,38 @@ void AutoInlineElemWise(Schedule sch) {
   }
 }
 
+bool IsBroadcast(const Operation& op) {
+  const ComputeOpNode* node = op.as<ComputeOpNode>();
+  if (node == nullptr || node->reduce_axis.size() != 0) {
+    return false;  // not a compute op, or it reduces: never a broadcast
+  }
+  // TODO(nicolasvasilache): Implement Me
+  // Until real detection exists, conservatively report "not a broadcast".
+  return false;
+}
+
+void AutoInlineBroadcast(Schedule sch) {
+  for (Stage stage : sch->stages) {
+    // Skip stages that are already scheduled, not broadcasts, or outputs.
+    if (stage.is_scheduled() || !IsBroadcast(stage->op) || stage->is_output) continue;
+    stage.compute_inline();
+  }
+}
+
+bool IsInjective(const Operation& op) {
+  // An op counts as injective when it is a compute op with no reduction axis;
+  // every other kind of operation is rejected.
+  const ComputeOpNode* node = op.as<ComputeOpNode>();
+  return node != nullptr && node->reduce_axis.size() == 0;
+}
+
+void AutoInlineInjective(Schedule sch) {
+  for (Stage stage : sch->stages) {
+    // Skip stages that are already scheduled, not injective, or outputs.
+    if (stage.is_scheduled() || !IsInjective(stage->op) || stage->is_output) continue;
+    stage.compute_inline();
+  }
+}
+
 }  // namespace schedule
 }  // namespace tvm
diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h
new file mode 100644
index 000000000000..0a9f885c1b12
--- /dev/null
+++ b/topi/include/topi/broadcast.h
@@ -0,0 +1,57 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \brief Broadcast op constructions
+ * \file broadcast.h
+ */
+#ifndef TOPI_BROADCAST_H_
+#define TOPI_BROADCAST_H_
+
+#include <topi/detail/broadcast.h>
+
+namespace topi {
+
+inline tvm::Tensor broadcast_to(const tvm::Tensor& I,
+                                const tvm::Array<tvm::Expr>& output_shape) {
+  // Replicates I along its size-1 / missing leading dimensions to output_shape.
+  CHECK_GE(output_shape.size(), I->shape.size())
+      << "Not a broadcast, output dimensionality smaller than input.\noutput: "
+      << output_shape << "\nvs\ninput: " << I;
+  auto bh = detail::BroadcastShape(output_shape, I->shape);
+  CHECK_EQ(output_shape.size(), bh.common_shape.size());
+  for (size_t i = 0; i < output_shape.size(); ++i) {  // result must be output_shape
+    CHECK(tvm::ir::Equal(output_shape[i], bh.common_shape[i]));
+  }
+  auto l = [&](tvm::Array<tvm::Var> ovars) {
+    return I(detail::InputIndexFromBroadcast(ovars, I, bh.vars2, bh.all_vars));
+  };
+  return tvm::compute(tvm::Array<tvm::Expr>(bh.common_shape.begin(), bh.common_shape.end()), l);
+}
+
+inline tvm::Tensor broadcast_add(const tvm::Tensor& A, const tvm::Tensor& B) {
+  // A + B, with the two operand shapes broadcast against each other.
+  return detail::WithBroadcast([](tvm::Expr a, tvm::Expr b) { return a + b; }, A, B);
+}
+
+inline tvm::Tensor broadcast_sub(const tvm::Tensor& A, const tvm::Tensor& B) {
+  // A - B, with the two operand shapes broadcast against each other.
+  return detail::WithBroadcast([](tvm::Expr a, tvm::Expr b) { return a - b; }, A, B);
+}
+
+inline tvm::Tensor broadcast_mul(const tvm::Tensor& A, const tvm::Tensor& B) {
+  // A * B, with the two operand shapes broadcast against each other.
+  return detail::WithBroadcast([](tvm::Expr a, tvm::Expr b) { return a * b; }, A, B);
+}
+
+inline tvm::Tensor broadcast_div(const tvm::Tensor& A, const tvm::Tensor& B) {
+  // A / B, with the two operand shapes broadcast against each other.
+  return detail::WithBroadcast([](tvm::Expr a, tvm::Expr b) { return a / b; }, A, B);
+}
+
+inline tvm::Tensor broadcast_mod(const tvm::Tensor& A, const tvm::Tensor& B) {
+  // A % B, with the two operand shapes broadcast against each other.
+  return detail::WithBroadcast([](tvm::Expr a, tvm::Expr b) { return a % b; }, A, B);
+}
+
+}  // namespace topi
+
+#endif  // TOPI_BROADCAST_H_
diff --git a/topi/include/topi/detail/broadcast.h b/topi/include/topi/detail/broadcast.h
new file mode 100644
index 000000000000..a1b760ca21ac
--- /dev/null
+++ b/topi/include/topi/detail/broadcast.h
@@ -0,0 +1,107 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \brief Detail broadcast.
+ * \file broadcast.h
+ */
+#ifndef TOPI_DETAIL_BROADCAST_H_
+#define TOPI_DETAIL_BROADCAST_H_
+
+#include <algorithm>
+#include <deque>
+
+#include "tvm/ir_pass.h"
+#include "tvm/tvm.h"
+
+namespace topi {
+namespace detail {
+
+struct BroadcastHelper {  // result of BroadcastShape(shape1, shape2)
+  std::deque<tvm::Expr> common_shape;  // broadcast shape, outermost dimension first
+  std::deque<tvm::Var> all_vars;       // one fresh Var per entry of common_shape
+  std::deque<tvm::Var> vars1;          // members of all_vars that index into shape1
+  std::deque<tvm::Var> vars2;          // members of all_vars that index into shape2
+};
+
+inline BroadcastHelper BroadcastShape(const tvm::Array<tvm::Expr>& shape1,
+                                      const tvm::Array<tvm::Expr>& shape2) {
+  // Aligns both shapes on their innermost dimension and walks outwards,
+  // creating one fresh Var per output dimension. Fails on incompatible dims.
+  BroadcastHelper bh;
+  int s1_size = shape1.size();
+  int s2_size = shape2.size();
+  tvm::Expr one(1);
+  int i;
+  for (i = 1; i <= std::min(s1_size, s2_size); ++i) {
+    bh.all_vars.push_front(tvm::Var());
+    if (tvm::ir::Equal(shape1[s1_size - i], shape2[s2_size - i])) {
+      bh.common_shape.push_front(shape1[s1_size - i]);
+      bh.vars1.push_front(bh.all_vars[0]);
+      bh.vars2.push_front(bh.all_vars[0]);
+    } else if (tvm::ir::Equal(one, shape1[s1_size - i])) {
+      CHECK(!tvm::ir::Equal(one, shape2[s2_size - i]));
+      bh.common_shape.push_front(shape2[s2_size - i]);
+      bh.vars2.push_front(bh.all_vars[0]);
+    } else if (tvm::ir::Equal(one, shape2[s2_size - i])) {
+      bh.common_shape.push_front(shape1[s1_size - i]);
+      bh.vars1.push_front(bh.all_vars[0]);
+    } else {
+      CHECK(false) << "Incompatible broadcast dims: " << shape1[s1_size - i]
+                   << " and " << shape2[s2_size - i] << " in: " << shape1
+                   << " and " << shape2;
+    }
+  }
+  // Remaining dimensions whether on shape1 or shape2 can always be completed
+  auto max_size = std::max(s1_size, s2_size);
+  auto& shape = (s1_size > s2_size) ? shape1 : shape2;
+  auto& vars = (s1_size > s2_size) ? bh.vars1 : bh.vars2;
+  for (; i <= max_size; ++i) {  // i carries on from where the loop above stopped
+    bh.all_vars.push_front(tvm::Var());
+    bh.common_shape.push_front(shape[max_size - i]);
+    vars.push_front(bh.all_vars[0]);
+  }
+  return bh;
+}
+
+inline tvm::Array<tvm::Expr> InputIndexFromBroadcast(
+    const tvm::Array<tvm::Var>& ovars, const tvm::Tensor& T,
+    const std::deque<tvm::Var>& my_vars, const std::deque<tvm::Var>& all_vars) {
+  tvm::Array<tvm::Expr> ivars;
+  CHECK_EQ(ovars.size(), all_vars.size());
+  // Projects the output index ovars onto T, keeping the vars listed in my_vars.
+  const size_t expected_dims = T->shape.size();
+  for (size_t i = 0; i < ovars.size(); ++i) {
+    bool found = false;
+    for (size_t j = 0; j < my_vars.size(); ++j) {  // N^2, could use a map but NBD
+      if (all_vars[i].same_as(my_vars[j])) {
+        ivars.push_back(ovars[i]);
+        found = true;
+        break;
+      }
+    }
+    // Only inject 0 within the trailing expected_dims dimensions (there T's
+    // extent must be 1); extra leading output dimensions add no index to T.
+    if (!found && (ovars.size() - i) <= expected_dims) {
+      ivars.push_back(tvm::make_zero(ovars[i].type()));
+    }
+  }
+  CHECK(expected_dims == ivars.size());
+  return ivars;
+}
+
+
+template <typename FBinaryExpr>
+inline tvm::Tensor WithBroadcast(FBinaryExpr op, const tvm::Tensor& A,
+                                 const tvm::Tensor& B) {
+  const BroadcastHelper info = BroadcastShape(A->shape, B->shape);
+  const tvm::Array<tvm::Expr> out_shape(info.common_shape.begin(),
+                                        info.common_shape.end());
+  return tvm::compute(out_shape, [&](tvm::Array<tvm::Var> ovars) {
+    return op(A(InputIndexFromBroadcast(ovars, A, info.vars1, info.all_vars)),
+              B(InputIndexFromBroadcast(ovars, B, info.vars2, info.all_vars)));
+  });
+}
+
+}  // namespace detail
+}  // namespace topi
+
+#endif  // TOPI_DETAIL_BROADCAST_H_
diff --git a/topi/include/topi/ewise.h b/topi/include/topi/ewise.h
index 2909e726fe07..8de4d1f4f320 100644
--- a/topi/include/topi/ewise.h
+++ b/topi/include/topi/ewise.h
@@ -1,6 +1,6 @@
 /*!
  *  Copyright (c) 2017 by Contributors
- * \file topi.h
+ * \file ewise.h
  * \brief Elementwise op constructions
  */
 #ifndef TOPI_EWISE_H_
@@ -12,16 +12,17 @@ namespace topi {
 using namespace tvm;
 
 // Unary intrinsic operators
-#define TOPI_DECLARE_UNARY_OP(OpName)                                   \
-  inline Tensor OpName(const Tensor& x) {                               \
-    return compute(x->shape, [&](const Array<Var>& i) {                 \
-        return ::tvm::OpName(x(i));                                     \
-      });                                                               \
+#define TOPI_DECLARE_UNARY_OP(OpName) /* declares an elementwise op */ \
+  inline Tensor OpName(const Tensor& x) {                              \
+    return compute(x->shape, [&](const Array<Var>& i) {                \
+        return ::tvm::OpName(x(i)); /* out[i] = OpName(x[i]) */        \
+      }, "tensor", "ewise");                                           \
   }
 
 TOPI_DECLARE_UNARY_OP(exp);
 TOPI_DECLARE_UNARY_OP(tanh);
 TOPI_DECLARE_UNARY_OP(sigmoid);
 TOPI_DECLARE_UNARY_OP(sqrt);
+
 }  // namespace topi
 #endif  // TOPI_EWISE_H_
diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
new file mode 100644
index 000000000000..dfd51e82f4ab
--- /dev/null
+++ b/topi/include/topi/nn.h
@@ -0,0 +1,207 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \brief NN op constructions
+ * \file nn.h
+ */
+#ifndef TOPI_NN_H_
+#define TOPI_NN_H_
+
+#include <algorithm>
+
+#include "tvm/ir.h"
+#include "tvm/ir_pass.h"
+#include "tvm/tvm.h"
+
+namespace topi {
+namespace detail {
+
+template <typename T>
+tvm::Expr Map(const tvm::Array<tvm::Expr>& exprs, T op) {  // left fold with op
+  CHECK_GE(exprs.size(), 1);  // exprs[0] seeds the fold, so it must exist
+  tvm::Expr res = exprs[0];
+  for (size_t i = 1; i < exprs.size(); ++i) {
+    res = op(res, exprs[i]);
+  }
+  return res;
+}
+
+}  // namespace detail
+
+template <typename T>
+inline tvm::Tensor relu(const tvm::Tensor& x, T threshold = static_cast<T>(0)) {
+  // max(x, threshold) per element. T is not deducible from the default
+  // argument, so callers relying on it must spell relu<T>(x) explicitly.
+  auto clamp = [&](const tvm::Array<tvm::Var>& idx) { return tvm::max(x(idx), threshold); };
+  return tvm::compute(x->shape, clamp, "tensor", "ewise");
+}
+
+inline tvm::Tensor pad(
+    const tvm::Tensor& t, const tvm::Array<tvm::Expr>& pad_before,
+    tvm::Array<tvm::Expr> pad_after = tvm::Array<tvm::Expr>()) {
+  // Zero-pads the leading pad_before.size() dimensions of t; later dimensions
+  // are copied through. Missing pad_after entries default to pad_before.
+  for (size_t i = pad_after.size(); i < pad_before.size(); ++i) {
+    pad_after.push_back(pad_before[i]);
+  }
+  CHECK_GE(pad_before.size(), 1);
+  CHECK_EQ(pad_before.size(), pad_after.size());
+  tvm::Array<tvm::Expr> output_shape;
+  for (size_t i = 0; i < t->shape.size(); ++i) {
+    if (i >= pad_before.size()) {
+      output_shape.push_back(t->shape[i]);
+    } else {
+      output_shape.push_back(
+          tvm::ir::Simplify(t->shape[i] + pad_before[i] + pad_after[i]));
+    }
+  }
+  auto l = [&](tvm::Array<tvm::Var> ovars) -> tvm::Expr {
+    tvm::Array<tvm::Expr> indices;
+    tvm::Array<tvm::Expr> sel;  // conditions under which the point lies inside t
+    for (size_t i = 0; i < t->shape.size(); ++i) {
+      if (i >= pad_before.size() || tvm::ir::Equal(pad_before[i], 0)) {
+        indices.push_back(ovars[i]);
+      } else {
+        sel.push_back(ovars[i] >= pad_before[i]);
+        indices.push_back(ovars[i] - pad_before[i]);
+      }
+      // An upper bound is only needed when something was appended after t.
+      if (i < pad_before.size() && !tvm::ir::Equal(pad_after[i], 0)) {
+        sel.push_back(tvm::ir::Simplify(ovars[i] < pad_before[i] + t->shape[i]));
+      }
+    }
+    // With every pad equal to zero there is nothing to mask; folding an empty
+    // condition list would trip the CHECK in detail::Map, so copy t through.
+    if (sel.size() == 0) return t(indices);
+    return tvm::select(detail::Map(sel, tvm::ir::And::make), t(indices), 0);
+  };
+  return tvm::compute(output_shape, l, "tensor", "ewise");
+}
+
+// Row-major matrix product: out(i, j) = sum_k op(A)(i, k) * op(B)(k, j),
+// where op(X) is X transposed iff the matching trans_a / trans_b flag is set.
+inline tvm::Tensor matmult(const tvm::Tensor& A, const tvm::Tensor& B,
+                           bool trans_a = false, bool trans_b = false) {
+  CHECK_EQ(2, A->shape.size());
+  CHECK_EQ(2, B->shape.size());
+  tvm::Array<tvm::Expr> output_shape{A->shape[trans_a ? 1 : 0],
+                                     B->shape[trans_b ? 0 : 1]};
+  auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k");
+  auto l = [&](tvm::Var i, tvm::Var j) {
+    return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? B[j][k] : B[k][j]), {k});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, const tvm::Tensor& W,
+                               int pad_h = 0, int pad_w = 0, int stride_h = 1,
+                               int stride_w = 1) {
+  CHECK_EQ(4, I->shape.size());
+  CHECK_EQ(4, W->shape.size());
+  // I is indexed as [B, C, H, W] and W as W(c, o, kh, kw), i.e. [C, O, KH, KW];
+  // the result is laid out as [B, O, H', W']. Only H and W of I are padded.
+  tvm::Array<tvm::Expr> output_shape{
+      I->shape[0],                                             // B
+      W->shape[1],                                             // O
+      (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw");
+  auto T = (pad_h == 0 && pad_w == 0)
+               ? I
+               : pad(I, {tvm::Expr(0), tvm::Expr(0), pad_h, pad_w});
+  auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) {
+    return tvm::sum(
+        T(b, i, stride_h * h + kh, stride_w * w + kw) * W(i, o, kh, kw),
+        {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+inline tvm::Tensor conv2d_hwcn(const tvm::Tensor& I, const tvm::Tensor& W,
+                               int pad_h = 0, int pad_w = 0, int stride_h = 1,
+                               int stride_w = 1) {
+  CHECK_EQ(4, I->shape.size());
+  CHECK_EQ(4, W->shape.size());
+  // I is indexed as [H, W, C, N] and W as [KH, KW, C, O]; the result is
+  // laid out as [H', W', N, O]. The first two dimensions of I are padded.
+  tvm::Array<tvm::Expr> output_shape{
+      (I->shape[0] - W->shape[0] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[1] - W->shape[1] + 2 * pad_w) / stride_w + 1,  // W
+      I->shape[3],                                             // B
+      W->shape[3]                                              // O
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[0]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[1]}, "kw");
+  auto T = (pad_h == 0 && pad_w == 0) ? I : pad(I, {pad_h, pad_w});
+  auto l = [&](tvm::Var h, tvm::Var w, tvm::Var b, tvm::Var o) {
+    return tvm::sum(
+        T(stride_h * h + kh, stride_w * w + kw, i, b) * W(kh, kw, i, o),
+        {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+inline tvm::Tensor depthwise_conv2d_nchw(const tvm::Tensor& I,
+                                         const tvm::Tensor& W, int pad_h = 0,
+                                         int pad_w = 0, int stride_h = 1,
+                                         int stride_w = 1) {
+  CHECK_EQ(4, I->shape.size());
+  CHECK_EQ(4, W->shape.size());
+  // NOTE(review): the output channel extent is W->shape[1] (the multiplier) and
+  // the sum runs over every i in I->shape[1] - confirm this is the intent.
+  auto pCM = W->shape[1];  // channel_multiplier
+  tvm::Array<tvm::Expr> output_shape{
+      I->shape[0],                                             // B
+      W->shape[1],                                             // O
+      (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[1]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[2]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kw");
+  auto T = (pad_h == 0 && pad_w == 0)
+               ? I
+               : pad(I, {tvm::Expr(0), tvm::Expr(0), pad_h, pad_w});
+  auto l = [&](tvm::Var b, tvm::Var o, tvm::Var h, tvm::Var w) {
+    return tvm::sum(T(b, i / pCM, stride_h * h + kh, stride_w * w + kw) *
+                        W(i / pCM, o % pCM, kh, kw),
+                    {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I,
+                                      const tvm::Tensor& W, int pad_h = 0,
+                                      int pad_w = 0, int stride_h = 1,
+                                      int stride_w = 1) {
+  CHECK_EQ(5, I->shape.size());
+  CHECK_EQ(5, W->shape.size());
+  // I is indexed as [B, G, C, H, W] and W as [G, C, O, KH, KW]; the result
+  // is laid out as [B, G, O, H', W']. Only H and W of I are padded.
+  tvm::Array<tvm::Expr> output_shape{
+      I->shape[0],                                             // B
+      I->shape[1],                                             // G
+      W->shape[2],                                             // O
+      (I->shape[3] - W->shape[3] + 2 * pad_h) / stride_h + 1,  // H
+      (I->shape[4] - W->shape[4] + 2 * pad_w) / stride_w + 1   // W
+  };
+  auto i = tvm::reduce_axis(tvm::Range{0, I->shape[2]}, "i");
+  auto kh = tvm::reduce_axis(tvm::Range{0, W->shape[3]}, "kh");
+  auto kw = tvm::reduce_axis(tvm::Range{0, W->shape[4]}, "kw");
+
+  auto T = (pad_h == 0 && pad_w == 0)
+               ? I
+               : pad(I, {tvm::Expr(0), tvm::Expr(0), tvm::Expr(0), pad_h, pad_w});
+  auto l = [&](tvm::Var b, tvm::Var g, tvm::Var o, tvm::Var h, tvm::Var w) {
+    return tvm::sum(
+        T(b, g, i, stride_h * h + kh, stride_w * w + kw) * W(g, i, o, kh, kw),
+        {i, kh, kw});
+  };
+  return tvm::compute(output_shape, l);
+}
+
+}  // namespace topi
+#endif  // TOPI_NN_H_