From 42e8d221f8d02d02d54d50ceb8d6e1f56eabb559 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Fri, 28 Jul 2017 22:19:54 +0000
Subject: [PATCH 01/14] __init__ updated

---
 python/tvm/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 367efe7f911d..bf3d0ace75dd 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -27,5 +27,5 @@
 from .node import register_node
 from .ndarray import register_extension
 from .schedule import create_schedule
-from .build import build, lower, build_config
+from .build import build, lower, build_config, BuildConfig
 from .tag import tag_scope

From 01a6b49c900d510e8cc5b55139b4417cf8a1a4e9 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Fri, 28 Jul 2017 23:33:57 +0000
Subject: [PATCH 02/14] pull request updated

---
 python/tvm/__init__.py |   2 +-
 python/tvm/build.py    | 336 -----------------------------------------
 2 files changed, 1 insertion(+), 337 deletions(-)
 delete mode 100644 python/tvm/build.py

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index bf3d0ace75dd..367efe7f911d 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -27,5 +27,5 @@
 from .node import register_node
 from .ndarray import register_extension
 from .schedule import create_schedule
-from .build import build, lower, build_config, BuildConfig
+from .build import build, lower, build_config
 from .tag import tag_scope
diff --git a/python/tvm/build.py b/python/tvm/build.py
deleted file mode 100644
index 900e10553c53..000000000000
--- a/python/tvm/build.py
+++ /dev/null
@@ -1,336 +0,0 @@
-"""The build utils in python.
-
-This module provides the functions to transform schedule to
-LoweredFunc and compiled Module.
-"""
-from __future__ import absolute_import as _abs
-from . import api
-from . import tensor
-from . import schedule
-from . import expr
-from . import ir_pass
-from . import container
-from . import module
-from . import codegen
-from . import ndarray
-
-class BuildConfig(object):
-    """Configuration scope to set a build config option.
-
-    Parameters
-    ----------
-    kwargs
-        Keyword arguments of configurations to set.
-    """
-    current = None
-    defaults = {
-        'auto_unroll_max_step': 0,
-        'auto_unroll_min_depth': 1,
-        'unroll_explicit': True,
-        'detect_global_barrier': False,
-        'offset_factor': 0,
-        'data_alignment': -1,
-        'restricted_func': True
-    }
-    def __init__(self, **kwargs):
-        self._old_scope = None
-        for k, _ in kwargs.items():
-            if k not in BuildConfig.defaults:
-                raise ValueError(
-                    "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys()))
-        self._attr = kwargs
-
-    def __getattr__(self, name):
-        if name not in self._attr:
-            return BuildConfig.defaults[name]
-        return self._attr[name]
-
-    def __enter__(self):
-        # pylint: disable=protected-access
-        self._old_scope = BuildConfig.current
-        attr = BuildConfig.current._attr.copy()
-        attr.update(self._attr)
-        self._attr = attr
-        BuildConfig.current = self
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        assert self._old_scope
-        BuildConfig.current = self._old_scope
-
-BuildConfig.current = BuildConfig()
-
-def build_config(**kwargs):
-    """Configure the build behavior by setting config variables.
-
-    Parameters
-    ----------
-    auto_unroll_max_step: int, default=0
-        Threshold of loop extent to be automatically unrolled.
-
-    auto_unroll_min_depth: int, default=1
-        The minimum loop nest level before the loop can be automatically unrolled.
-
-    unroll_explicit: bool, default=True
-        Whether explicitly unroll the loop, if set false, the unroll hint will
-        be passed to the CodeGen phase, which may generate pragma unroll hint.
-        Set this to be true if CodeGen support unroll pragma and
-        when we want to be more readable.
-
-    detect_global_barrier: bool, default=True
-        Whether detect global barrier.
-
-    data_alignment: int, optional
-        The alignment of data pointer in bytes.
-        If -1 is passed, the alignment will be set to TVM's internal default.
-
-    offset_factor: int, default=0
-        The factor used in default buffer declaration.
-        If specified as 0, offset field is not used.
-
-    restricted_func: bool, default=True
-        Whether build restricted function.
-        That is each buffer argument to the function are guaranteed
-        not to overlap. This enables more optimization.
-        Corresponds to restricted keyword in C99
-
-    Returns
-    -------
-    config: BuildConfig
-        The build configuration
-    """
-    return BuildConfig(**kwargs)
-
-
-def get_binds(args, binds=None):
-    """Internal function to get binds and arg_list given arguments.
-
-    Parameters
-    ----------
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-
-    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-
-    Returns
-    -------
-    binds: dict
-        The bind specification
-
-    arg_list: list
-        The list of symbolic buffers of arguments.
-    """
-    binds = {} if binds is None else binds.copy()
-    cfg = BuildConfig.current
-    arg_list = []
-    for x in args:
-        if isinstance(x, tensor.Tensor):
-            if x not in binds:
-                buf = api.decl_buffer(x.shape,
-                                      dtype=x.dtype,
-                                      name=x.name,
-                                      data_alignment=cfg.data_alignment,
-                                      offset_factor=cfg.offset_factor)
-                binds[x] = buf
-                arg_list.append(buf)
-            else:
-                arg_list.append(binds[x])
-        elif isinstance(x, schedule.Buffer):
-            arg_list.append(x)
-        elif isinstance(x, expr.Var):
-            arg_list.append(x)
-        else:
-            raise ValueError("args must be Tensor, Buffer or Var")
-    return binds, arg_list
-
-
-def lower(sch,
-          args,
-          name="default_function",
-          binds=None,
-          simple_mode=False):
-    """Lowering step before build into target.
-
-    Parameters
-    ----------
-    sch : tvm.Schedule
-        The schedule to be builded
-
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-
-    name : str, optional
-        The name of result function.
-
-    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-
-    simple_mode : bool, optional
-        Whether only output simple and compact statement, this will skip
-        LoopPartition, api wrapper generation and Unrolling.
-
-    Returns
-    -------
-    f : LoweredFunc or Stmt
-       The result function, if with_api_wrapper=False
-       Then the Stmt before make api is returned.
-    """
-    binds, arg_list = get_binds(args, binds)
-    # normalize schedule first
-    sch = sch.normalize()
-    bounds = schedule.InferBound(sch)
-    stmt = schedule.ScheduleOps(sch, bounds)
-    stmt = ir_pass.InjectPrefetch(stmt)
-    stmt = ir_pass.StorageFlatten(stmt, binds, 64)
-    stmt = ir_pass.CanonicalSimplify(stmt)
-    if not simple_mode:
-        stmt = ir_pass.LoopPartition(stmt)
-    stmt = ir_pass.VectorizeLoop(stmt)
-    stmt = ir_pass.InjectVirtualThread(stmt)
-    stmt = ir_pass.StorageRewrite(stmt)
-    cfg = BuildConfig.current
-    stmt = ir_pass.UnrollLoop(
-        stmt,
-        cfg.auto_unroll_max_step,
-        cfg.auto_unroll_min_depth,
-        cfg.unroll_explicit)
-    stmt = ir_pass.Simplify(stmt)
-    if simple_mode:
-        return stmt
-    return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
-
-
-def build(sch,
-          args=None,
-          target="llvm",
-          target_host=None,
-          name="default_function",
-          binds=None):
-    """Build a function with arguments as signiture.
-
-    Parameters
-    ----------
-    sch : tvm.Schedule, or LoweredFunc
-        The schedule to be builded
-
-    args : list of Buffer or Tensor or Var, optional
-        The argument lists to the function.
-
-    target : str, optional
-        The target and option of the compilation.
-        When the target is llvm, you can set options like:
-
-          - **-mtriple=<target triple>** or **-target**
-
-            Specify the target triple, which is useful for cross
-            compilation.
-
-          - **-mcpu=<cpuname>**
-
-            Specify a specific chip in the current architecture to
-            generate code for. By default this is infered from the
-            target triple and autodetected to the current architecture.
-
-          - **-mattr=a1,+a2,-a3,...**
-
-            Override or control specific attributes of the target,
-            such as whether SIMD operations are enabled or not. The
-            default set of attributes is set by the current CPU.
-
-          - **-system-lib**
-
-            Build TVM system library module. System lib is a global module that contains
-            self registered functions in program startup. User can get the module using
-            :any:`tvm.module.system_lib`.
-            It is useful in environments where dynamic loading api like dlopen is banned.
-            The system lib will be available as long as the result code is linked by the program.
-
-    target_host : str, optional
-        Host compilation target, if target is device.
-        When TVM compiles device specific program such as CUDA,
-        we also need host(CPU) side code to interact with the driver
-        setup the dimensions and parameters correctly.
-        target_host is used to specify the host side codegen target.
-        By default, llvm is used if it is enabled,
-        otherwise a stackvm intepreter is used.
-
-    name : str, optional
-        The name of result function.
-
-    binds : dict, optional
-        Dictionary that maps the binding of symbolic buffer to Tensor.
-        By default, a new buffer is created for each tensor in the argument.
-
-    Returns
-    -------
-    f : Function, or pair of functions
-       The result function.
-    """
-    if isinstance(sch, schedule.Schedule):
-        if args is None:
-            raise ValueError("args must be given for build from schedule")
-        flist = lower(sch, args,
-                      name=name,
-                      binds=binds)
-        if isinstance(flist, container.LoweredFunc):
-            flist = [flist]
-    elif isinstance(sch, container.LoweredFunc):
-        if args:
-            raise ValueError("args must be done when build from LoweredFunc")
-        flist = [sch]
-    elif isinstance(sch, (list, tuple, container.Array)):
-        flist = sch
-    else:
-        raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
-    fname_set = set()
-    for x in flist:
-        if not isinstance(x, container.LoweredFunc):
-            raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
-        if x.name in fname_set:
-            raise ValueError("Duplicate function name %s" % x.name)
-
-    fhost = []
-    fdevice = []
-    for func in flist:
-        if func.func_type == container.LoweredFunc.MixedFunc:
-            if BuildConfig.current.detect_global_barrier:
-                func = ir_pass.StorageSync(func, "global")
-            func = ir_pass.StorageSync(func, "shared")
-            warp_size = 32 if target == "cuda" else 1
-            func = ir_pass.LowerThreadAllreduce(func, warp_size)
-            fsplits = [s for s in ir_pass.SplitHostDevice(func)]
-            fhost.append(fsplits[0])
-            for x in fsplits[1:]:
-                fdevice.append(x)
-        elif func.func_type == container.LoweredFunc.HostFunc:
-            fhost.append(func)
-        elif func.func_type == container.LoweredFunc.DeviceFunc:
-            fdevice.append(func)
-        else:
-            raise ValueError("unknown function type %d" % func.func_type)
-
-    if not target.startswith("llvm") and target != "stackvm" and not fdevice:
-        raise ValueError(
-            "Specified target %s, but cannot find device code, did you do bind?" % target)
-
-    device = "cpu" if target.startswith("llvm") or target == "stackvm" else target
-    device_type = ndarray.context(device, 0).device_type
-    fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost]
-    fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost]
-    fhost = [ir_pass.CombineContextCall(x) for x in fhost]
-
-    if fdevice:
-        if not target_host:
-            target_host = "llvm" if module.enabled("llvm") else "stackvm"
-        mhost = codegen.build_module(fhost, target_host)
-        if target:
-            mdev = codegen.build_module(fdevice, target)
-            mhost.import_module(mdev)
-        return mhost
-    else:
-        return codegen.build_module(fhost, target)

From 077ec59681fcba89d1c11df50cde235b45f0377f Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Fri, 28 Jul 2017 23:40:38 +0000
Subject: [PATCH 03/14] build_module added

---
 python/tvm/build_module.py | 336 +++++++++++++++++++++++++++++++++++++
 1 file changed, 336 insertions(+)
 create mode 100644 python/tvm/build_module.py

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
new file mode 100644
index 000000000000..900e10553c53
--- /dev/null
+++ b/python/tvm/build_module.py
@@ -0,0 +1,336 @@
+"""The build utils in python.
+
+This module provides the functions to transform schedule to
+LoweredFunc and compiled Module.
+"""
+from __future__ import absolute_import as _abs
+from . import api
+from . import tensor
+from . import schedule
+from . import expr
+from . import ir_pass
+from . import container
+from . import module
+from . import codegen
+from . import ndarray
+
+class BuildConfig(object):
+    """Configuration scope to set a build config option.
+
+    Parameters
+    ----------
+    kwargs
+        Keyword arguments of configurations to set.
+    """
+    current = None
+    defaults = {
+        'auto_unroll_max_step': 0,
+        'auto_unroll_min_depth': 1,
+        'unroll_explicit': True,
+        'detect_global_barrier': False,
+        'offset_factor': 0,
+        'data_alignment': -1,
+        'restricted_func': True
+    }
+    def __init__(self, **kwargs):
+        self._old_scope = None
+        for k, _ in kwargs.items():
+            if k not in BuildConfig.defaults:
+                raise ValueError(
+                    "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys()))
+        self._attr = kwargs
+
+    def __getattr__(self, name):
+        if name not in self._attr:
+            return BuildConfig.defaults[name]
+        return self._attr[name]
+
+    def __enter__(self):
+        # pylint: disable=protected-access
+        self._old_scope = BuildConfig.current
+        attr = BuildConfig.current._attr.copy()
+        attr.update(self._attr)
+        self._attr = attr
+        BuildConfig.current = self
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        assert self._old_scope
+        BuildConfig.current = self._old_scope
+
+BuildConfig.current = BuildConfig()
+
+def build_config(**kwargs):
+    """Configure the build behavior by setting config variables.
+
+    Parameters
+    ----------
+    auto_unroll_max_step: int, default=0
+        Threshold of loop extent to be automatically unrolled.
+
+    auto_unroll_min_depth: int, default=1
+        The minimum loop nest level before the loop can be automatically unrolled.
+
+    unroll_explicit: bool, default=True
+        Whether explicitly unroll the loop, if set false, the unroll hint will
+        be passed to the CodeGen phase, which may generate pragma unroll hint.
+        Set this to be true if CodeGen support unroll pragma and
+        when we want to be more readable.
+
+    detect_global_barrier: bool, default=True
+        Whether detect global barrier.
+
+    data_alignment: int, optional
+        The alignment of data pointer in bytes.
+        If -1 is passed, the alignment will be set to TVM's internal default.
+
+    offset_factor: int, default=0
+        The factor used in default buffer declaration.
+        If specified as 0, offset field is not used.
+
+    restricted_func: bool, default=True
+        Whether build restricted function.
+        That is each buffer argument to the function are guaranteed
+        not to overlap. This enables more optimization.
+        Corresponds to restricted keyword in C99
+
+    Returns
+    -------
+    config: BuildConfig
+        The build configuration
+    """
+    return BuildConfig(**kwargs)
+
+
+def get_binds(args, binds=None):
+    """Internal function to get binds and arg_list given arguments.
+
+    Parameters
+    ----------
+    args : list of Buffer or Tensor or Var
+        The argument lists to the function.
+
+    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
+        Dictionary that maps the Tensor to Buffer which specified the data layout
+        requirement of the function. By default, a new compact buffer is created
+        for each tensor in the argument.
+
+    Returns
+    -------
+    binds: dict
+        The bind specification
+
+    arg_list: list
+        The list of symbolic buffers of arguments.
+    """
+    binds = {} if binds is None else binds.copy()
+    cfg = BuildConfig.current
+    arg_list = []
+    for x in args:
+        if isinstance(x, tensor.Tensor):
+            if x not in binds:
+                buf = api.decl_buffer(x.shape,
+                                      dtype=x.dtype,
+                                      name=x.name,
+                                      data_alignment=cfg.data_alignment,
+                                      offset_factor=cfg.offset_factor)
+                binds[x] = buf
+                arg_list.append(buf)
+            else:
+                arg_list.append(binds[x])
+        elif isinstance(x, schedule.Buffer):
+            arg_list.append(x)
+        elif isinstance(x, expr.Var):
+            arg_list.append(x)
+        else:
+            raise ValueError("args must be Tensor, Buffer or Var")
+    return binds, arg_list
+
+
+def lower(sch,
+          args,
+          name="default_function",
+          binds=None,
+          simple_mode=False):
+    """Lowering step before build into target.
+
+    Parameters
+    ----------
+    sch : tvm.Schedule
+        The schedule to be builded
+
+    args : list of Buffer or Tensor or Var
+        The argument lists to the function.
+
+    name : str, optional
+        The name of result function.
+
+    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
+        Dictionary that maps the Tensor to Buffer which specified the data layout
+        requirement of the function. By default, a new compact buffer is created
+        for each tensor in the argument.
+
+    simple_mode : bool, optional
+        Whether only output simple and compact statement, this will skip
+        LoopPartition, api wrapper generation and Unrolling.
+
+    Returns
+    -------
+    f : LoweredFunc or Stmt
+       The result function, if with_api_wrapper=False
+       Then the Stmt before make api is returned.
+    """
+    binds, arg_list = get_binds(args, binds)
+    # normalize schedule first
+    sch = sch.normalize()
+    bounds = schedule.InferBound(sch)
+    stmt = schedule.ScheduleOps(sch, bounds)
+    stmt = ir_pass.InjectPrefetch(stmt)
+    stmt = ir_pass.StorageFlatten(stmt, binds, 64)
+    stmt = ir_pass.CanonicalSimplify(stmt)
+    if not simple_mode:
+        stmt = ir_pass.LoopPartition(stmt)
+    stmt = ir_pass.VectorizeLoop(stmt)
+    stmt = ir_pass.InjectVirtualThread(stmt)
+    stmt = ir_pass.StorageRewrite(stmt)
+    cfg = BuildConfig.current
+    stmt = ir_pass.UnrollLoop(
+        stmt,
+        cfg.auto_unroll_max_step,
+        cfg.auto_unroll_min_depth,
+        cfg.unroll_explicit)
+    stmt = ir_pass.Simplify(stmt)
+    if simple_mode:
+        return stmt
+    return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
+
+
+def build(sch,
+          args=None,
+          target="llvm",
+          target_host=None,
+          name="default_function",
+          binds=None):
+    """Build a function with arguments as signiture.
+
+    Parameters
+    ----------
+    sch : tvm.Schedule, or LoweredFunc
+        The schedule to be builded
+
+    args : list of Buffer or Tensor or Var, optional
+        The argument lists to the function.
+
+    target : str, optional
+        The target and option of the compilation.
+        When the target is llvm, you can set options like:
+
+          - **-mtriple=<target triple>** or **-target**
+
+            Specify the target triple, which is useful for cross
+            compilation.
+
+          - **-mcpu=<cpuname>**
+
+            Specify a specific chip in the current architecture to
+            generate code for. By default this is infered from the
+            target triple and autodetected to the current architecture.
+
+          - **-mattr=a1,+a2,-a3,...**
+
+            Override or control specific attributes of the target,
+            such as whether SIMD operations are enabled or not. The
+            default set of attributes is set by the current CPU.
+
+          - **-system-lib**
+
+            Build TVM system library module. System lib is a global module that contains
+            self registered functions in program startup. User can get the module using
+            :any:`tvm.module.system_lib`.
+            It is useful in environments where dynamic loading api like dlopen is banned.
+            The system lib will be available as long as the result code is linked by the program.
+
+    target_host : str, optional
+        Host compilation target, if target is device.
+        When TVM compiles device specific program such as CUDA,
+        we also need host(CPU) side code to interact with the driver
+        setup the dimensions and parameters correctly.
+        target_host is used to specify the host side codegen target.
+        By default, llvm is used if it is enabled,
+        otherwise a stackvm intepreter is used.
+
+    name : str, optional
+        The name of result function.
+
+    binds : dict, optional
+        Dictionary that maps the binding of symbolic buffer to Tensor.
+        By default, a new buffer is created for each tensor in the argument.
+
+    Returns
+    -------
+    f : Function, or pair of functions
+       The result function.
+    """
+    if isinstance(sch, schedule.Schedule):
+        if args is None:
+            raise ValueError("args must be given for build from schedule")
+        flist = lower(sch, args,
+                      name=name,
+                      binds=binds)
+        if isinstance(flist, container.LoweredFunc):
+            flist = [flist]
+    elif isinstance(sch, container.LoweredFunc):
+        if args:
+            raise ValueError("args must be done when build from LoweredFunc")
+        flist = [sch]
+    elif isinstance(sch, (list, tuple, container.Array)):
+        flist = sch
+    else:
+        raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
+    fname_set = set()
+    for x in flist:
+        if not isinstance(x, container.LoweredFunc):
+            raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
+        if x.name in fname_set:
+            raise ValueError("Duplicate function name %s" % x.name)
+
+    fhost = []
+    fdevice = []
+    for func in flist:
+        if func.func_type == container.LoweredFunc.MixedFunc:
+            if BuildConfig.current.detect_global_barrier:
+                func = ir_pass.StorageSync(func, "global")
+            func = ir_pass.StorageSync(func, "shared")
+            warp_size = 32 if target == "cuda" else 1
+            func = ir_pass.LowerThreadAllreduce(func, warp_size)
+            fsplits = [s for s in ir_pass.SplitHostDevice(func)]
+            fhost.append(fsplits[0])
+            for x in fsplits[1:]:
+                fdevice.append(x)
+        elif func.func_type == container.LoweredFunc.HostFunc:
+            fhost.append(func)
+        elif func.func_type == container.LoweredFunc.DeviceFunc:
+            fdevice.append(func)
+        else:
+            raise ValueError("unknown function type %d" % func.func_type)
+
+    if not target.startswith("llvm") and target != "stackvm" and not fdevice:
+        raise ValueError(
+            "Specified target %s, but cannot find device code, did you do bind?" % target)
+
+    device = "cpu" if target.startswith("llvm") or target == "stackvm" else target
+    device_type = ndarray.context(device, 0).device_type
+    fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost]
+    fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost]
+    fhost = [ir_pass.CombineContextCall(x) for x in fhost]
+
+    if fdevice:
+        if not target_host:
+            target_host = "llvm" if module.enabled("llvm") else "stackvm"
+        mhost = codegen.build_module(fhost, target_host)
+        if target:
+            mdev = codegen.build_module(fdevice, target)
+            mhost.import_module(mdev)
+        return mhost
+    else:
+        return codegen.build_module(fhost, target)

From 5daeac2e818ea8ada7603f595878b0a8228af4f0 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Fri, 28 Jul 2017 23:45:08 +0000
Subject: [PATCH 04/14] typo fixed

---
 python/tvm/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 367efe7f911d..53d1dd60ee78 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -27,5 +27,5 @@
 from .node import register_node
 from .ndarray import register_extension
 from .schedule import create_schedule
-from .build import build, lower, build_config
+from .build_module import build, lower, build_config
 from .tag import tag_scope

From 8856fb5800a265763547b0424ea4b203d5e32193 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sat, 29 Jul 2017 01:53:31 +0000
Subject: [PATCH 05/14] another typo fixed

---
 python/tvm/tensor_intrin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/tensor_intrin.py b/python/tvm/tensor_intrin.py
index e2f10b0dce28..e166f3706a7b 100644
--- a/python/tvm/tensor_intrin.py
+++ b/python/tvm/tensor_intrin.py
@@ -6,7 +6,7 @@
 from . import stmt as _stmt
 from . import make as _make
 from . import tensor as _tensor
-from .build import BuildConfig
+from .build_module import BuildConfig
 from ._ffi.node import NodeBase, register_node
 
 @register_node

From bcf6d734f5595eba08d99a9e2e3777d8ec22551a Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sat, 12 Aug 2017 20:33:30 +0000
Subject: [PATCH 06/14] conv2d gpu scheduler for two layouts moved to tvm

---
 topi/python/topi/cuda/__init__.py        |   3 +-
 topi/python/topi/cuda/conv2d_hwcn.py     | 122 ++++++++++++++++++++
 topi/python/topi/cuda/conv2d_hwcn_map.py | 121 --------------------
 topi/python/topi/cuda/conv2d_nchw.py     | 137 +++++++++++++++++++++++
 4 files changed, 261 insertions(+), 122 deletions(-)
 create mode 100644 topi/python/topi/cuda/conv2d_hwcn.py
 delete mode 100644 topi/python/topi/cuda/conv2d_hwcn_map.py
 create mode 100644 topi/python/topi/cuda/conv2d_nchw.py

diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 6456441d6038..449cd4c5b161 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -2,5 +2,6 @@
 """CUDA specific declaration and schedules."""
 from __future__ import absolute_import as _abs
 
-from .conv2d_hwcn_map import schedule_conv2d_hwcn_map
+from . import conv2d_nchw
+from . import conv2d_hwcn
 from .depthwise_conv2d_map import schedule_depthwise_conv2d_map
diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py
new file mode 100644
index 000000000000..0a246a84e118
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_hwcn.py
@@ -0,0 +1,122 @@
+# pylint: disable=invalid-name
+"""Schedule for conv2d_hwcn with auto fusion"""
+import tvm
+
+
+
+@tvm.register_func("topi.schedule.cuda.conv2d_hwcn")
+def schedule_conv2d_hwcn(outs, target):
+    """Schedule for conv2d_nchw.
+
+    Parameters
+    ----------
+    outs: tvm.Array<tvm::Tensor>
+        The computation graph description of conv2d_nchw in the format
+        of a list of tensors.
+
+    traget: str
+        Compilation target ('cuda' for gpu)
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d_nchw.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+    def schedule(Apad, W, B):
+
+        sch[Apad].compute_inline()
+        AA = sch.cache_read(Apad, "shared", [B])
+        WW = sch.cache_read(W, "shared", [B])
+        AL = sch.cache_read(AA, "local", [B])
+        WL = sch.cache_read(WW, "local", [B])
+
+        if op in sch.outputs:
+            Out = op.output(0)
+            BL = sch.cache_write(Out, "local")
+        else:
+            Out = sch.outputs[0].output(0)
+            sch[B].set_scope("local")
+            BL = B
+
+        tile = 8
+        num_thread = 8
+        block_factor = tile * num_thread
+        step = 8
+        vthread = 2
+
+        block_x = tvm.thread_axis("blockIdx.x")
+        block_y = tvm.thread_axis("blockIdx.y")
+        block_z = tvm.thread_axis("blockIdx.z")
+        thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
+        thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
+        thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
+        thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
+
+        hi, wi, fi, ni = sch[Out].op.axis
+        bz = sch[Out].fuse(hi, wi)
+        by, fi = sch[Out].split(fi, factor=block_factor)
+        bx, ni = sch[Out].split(ni, factor=block_factor)
+        tyz, fi = sch[Out].split(fi, nparts=vthread)
+        txz, ni = sch[Out].split(ni, nparts=vthread)
+        ty, fi = sch[Out].split(fi, nparts=num_thread)
+        tx, ni = sch[Out].split(ni, nparts=num_thread)
+        sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)
+        sch[Out].bind(bz, block_z)
+        sch[Out].bind(by, block_y)
+        sch[Out].bind(bx, block_x)
+        sch[Out].bind(tyz, thread_yz)
+        sch[Out].bind(txz, thread_xz)
+        sch[Out].bind(ty, thread_y)
+        sch[Out].bind(tx, thread_x)
+
+        # Schedule BL local write
+        sch[BL].compute_at(sch[Out], tx)
+        yi, xi, fi, ni = sch[BL].op.axis
+        ry, rx, rc = sch[BL].op.reduce_axis
+        rco, rci = sch[BL].split(rc, factor=step)
+        sch[BL].reorder(rco, ry, rx, rci, fi, ni)
+        fuse_index = sch[BL].fuse(ry, rx)
+        fuse_index = sch[BL].fuse(fuse_index, rco)
+        rx = fuse_index
+
+        sch[AA].compute_at(sch[BL], rx)
+        sch[WW].compute_at(sch[BL], rx)
+        sch[AL].compute_at(sch[BL], rci)
+        sch[WL].compute_at(sch[BL], rci)
+        # Schedule for A's shared memory load
+        yi, xi, ci, ni = sch[AA].op.axis
+        ty, ci = sch[AA].split(ci, nparts=num_thread)
+        tx, ni = sch[AA].split(ni, nparts=num_thread)
+        _, ni = sch[AA].split(ni, factor=4)
+        sch[AA].reorder(ty, tx, yi, xi, ci, ni)
+        sch[AA].bind(ty, thread_y)
+        sch[AA].bind(tx, thread_x)
+        sch[AA].vectorize(ni)
+        # Schedule for W's shared memory load
+        yi, xi, ci, fi = sch[WW].op.axis
+        ty, ci = sch[WW].split(ci, nparts=num_thread)
+        tx, fi = sch[WW].split(fi, nparts=num_thread)
+        _, fi = sch[WW].split(fi, factor=4)
+        sch[WW].reorder(ty, tx, yi, xi, ci, fi)
+        sch[WW].bind(ty, thread_y)
+        sch[WW].bind(tx, thread_x)
+        sch[WW].vectorize(fi)
+
+        def traverse(operator):
+            if operator.tag == 'ewise' or operator.tag == 'scale_shift':
+                if operator not in sch.outputs:
+                    sch[operator].compute_inline()
+                for tensor in operator.input_tensors:
+                    if tensor.op.input_tensors:
+                        traverse(tensor.op)
+            elif operator.tag == 'conv2d_hwcn':
+                Apad = op.input_tensors[0]
+                W = op.input_tensors[1]
+                B = op.output(0)
+                schedule(Apad, W, B)
+            else:
+                raise RuntimeError("Unsupported operator: %s" % operator.tag)
+
+    traverse(outs[0].op)
+    return sch
diff --git a/topi/python/topi/cuda/conv2d_hwcn_map.py b/topi/python/topi/cuda/conv2d_hwcn_map.py
deleted file mode 100644
index 7b932523b720..000000000000
--- a/topi/python/topi/cuda/conv2d_hwcn_map.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# pylint: disable=invalid-name
-"""Schedule for conv2d_hwcn with auto fusion"""
-import tvm
-
-
-def _schedule_conv2d_hwcn(op, sch):
-    assert len(op.input_tensors) == 2
-    Apad = op.input_tensors[0]
-    W = op.input_tensors[1]
-    B = op.output(0)
-
-    sch[Apad].compute_inline()
-    AA = sch.cache_read(Apad, "shared", [B])
-    WW = sch.cache_read(W, "shared", [B])
-    AL = sch.cache_read(AA, "local", [B])
-    WL = sch.cache_read(WW, "local", [B])
-
-    if op in sch.outputs:
-        Out = op.output(0)
-        BL = sch.cache_write(Out, "local")
-    else:
-        Out = sch.outputs[0].output(0)
-        sch[B].set_scope("local")
-        BL = B
-
-    tile = 8
-    num_thread = 8
-    block_factor = tile * num_thread
-    step = 8
-    vthread = 2
-
-    block_x = tvm.thread_axis("blockIdx.x")
-    block_y = tvm.thread_axis("blockIdx.y")
-    block_z = tvm.thread_axis("blockIdx.z")
-    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
-    thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
-    thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
-    thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
-
-    hi, wi, fi, ni = sch[Out].op.axis
-    bz = sch[Out].fuse(hi, wi)
-    by, fi = sch[Out].split(fi, factor=block_factor)
-    bx, ni = sch[Out].split(ni, factor=block_factor)
-    tyz, fi = sch[Out].split(fi, nparts=vthread)
-    txz, ni = sch[Out].split(ni, nparts=vthread)
-    ty, fi = sch[Out].split(fi, nparts=num_thread)
-    tx, ni = sch[Out].split(ni, nparts=num_thread)
-    sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)
-    sch[Out].bind(bz, block_z)
-    sch[Out].bind(by, block_y)
-    sch[Out].bind(bx, block_x)
-    sch[Out].bind(tyz, thread_yz)
-    sch[Out].bind(txz, thread_xz)
-    sch[Out].bind(ty, thread_y)
-    sch[Out].bind(tx, thread_x)
-
-    # Schedule BL local write
-    sch[BL].compute_at(sch[Out], tx)
-    yi, xi, fi, ni = sch[BL].op.axis
-    ry, rx, rc = sch[BL].op.reduce_axis
-    rco, rci = sch[BL].split(rc, factor=step)
-    sch[BL].reorder(rco, ry, rx, rci, fi, ni)
-    fuse_index = sch[BL].fuse(ry, rx)
-    fuse_index = sch[BL].fuse(fuse_index, rco)
-    rx = fuse_index
-
-    sch[AA].compute_at(sch[BL], rx)
-    sch[WW].compute_at(sch[BL], rx)
-    sch[AL].compute_at(sch[BL], rci)
-    sch[WL].compute_at(sch[BL], rci)
-    # Schedule for A's shared memory load
-    yi, xi, ci, ni = sch[AA].op.axis
-    ty, ci = sch[AA].split(ci, nparts=num_thread)
-    tx, ni = sch[AA].split(ni, nparts=num_thread)
-    _, ni = sch[AA].split(ni, factor=4)
-    sch[AA].reorder(ty, tx, yi, xi, ci, ni)
-    sch[AA].bind(ty, thread_y)
-    sch[AA].bind(tx, thread_x)
-    sch[AA].vectorize(ni)
-    # Schedule for W's shared memory load
-    yi, xi, ci, fi = sch[WW].op.axis
-    ty, ci = sch[WW].split(ci, nparts=num_thread)
-    tx, fi = sch[WW].split(fi, nparts=num_thread)
-    _, fi = sch[WW].split(fi, factor=4)
-    sch[WW].reorder(ty, tx, yi, xi, ci, fi)
-    sch[WW].bind(ty, thread_y)
-    sch[WW].bind(tx, thread_x)
-    sch[WW].vectorize(fi)
-
-    return sch
-
-
-def schedule_conv2d_hwcn_map(op):
-    """Schedule for conv2d_hwcn map ops.
-
-    Parameters
-    ----------
-    op: tvm.tensor.Operation
-        The symbolic description of the operation, should be conv2d_hwcn or
-        conv2d_hwcn followed by a sequence of one-to-one-mapping operators.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    def traverse(operator):
-        if operator.tag == 'ewise' or operator.tag == 'scale_shift':
-            if operator not in sch.outputs:
-                sch[operator].compute_inline()
-            for tensor in operator.input_tensors:
-                if tensor.op.input_tensors:
-                    traverse(tensor.op)
-        elif operator.tag == 'conv2d_hwcn':
-            _schedule_conv2d_hwcn(operator, sch)
-        else:
-            raise RuntimeError("Unsupported operator: %s" % operator.tag)
-
-    sch = tvm.create_schedule(op)
-    traverse(op)
-    return sch
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
new file mode 100644
index 000000000000..ba570fd87143
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -0,0 +1,137 @@
+# pylint: disable=invalid-name
+"""Schedule for conv2d_nchw with auto fusion, optimized for batch_size(n)=1."""
+import tvm
+import math
+
+
+@tvm.register_func("topi.schedule.cuda.conv2d_nchw")
+def schedule_conv2d_nchw(outs, target):
+    """Schedule for conv2d_nchw.
+
+    Parameters
+    ----------
+    outs: tvm.Array<tvm::Tensor>
+        The computation graph description of conv2d_nchw in the format 
+        of a list of tensors. 
+
+    traget: str
+        Compilation target ('cuda' for gpu)
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d_nchw.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+    def schedule(temp, Filter, Output):
+        out_height = tvm.ir_pass.Simplify(Output.shape[2]).value
+        out_width = tvm.ir_pass.Simplify(Output.shape[3]).value
+        channel_multiplier = tvm.ir_pass.Simplify(Filter.shape[1]).value
+
+        block_h = out_width
+        block_w = tvm.ir_pass.Simplify(temp.shape[1]).value
+        if block_h % 48 == 0:
+            block_h = 48
+        elif block_h % 32 == 0:
+            block_h = 32
+        if block_w % 48 == 0:
+            block_w = 48
+        elif block_w % 32 == 0:
+            block_w = 32
+
+        s[temp].compute_inline()
+        
+        temp_S   = s.cache_read(temp, "shared", [Output])
+        Filter_S = s.cache_read(Filter, "shared", [Output])
+        
+        if outs[0].op in s.outputs:
+            Out = Output
+            Out_L = s.cache_write(Out, "local")
+        else:
+            Out = outs[0].op.output(0)
+            s[Output].set_scope("local")
+            Out_L = Output
+
+        # sheduler params
+        tile = 8
+        num_thread = 8
+        step = 16
+        vthread = 2
+        out_filter = tvm.ir_pass.Simplify(Filter.shape[0]).value
+        in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value
+        opart2 = out_filter/8
+        ofactor=out_filter
+        wfactor=block_h
+        ifactor=in_filter/4
+        sfactor=max(1, ofactor/(opart2*2))
+        spart = int(math.ceil(wfactor/vthread))
+
+        block_x = tvm.thread_axis("blockIdx.x")
+        block_y = tvm.thread_axis("blockIdx.y")
+        block_z = tvm.thread_axis("blockIdx.z")
+        thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
+        thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
+        thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
+        thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
+
+        i, oc, h, w = s[Out].op.axis
+        ooc, ioc = s[Out].split(oc, factor=ofactor)
+        ow, iw = s[Out].split(w, factor=wfactor)
+        ow = s[Out].fuse(ow, h)
+        oioc, iioc = s[Out].split(ioc, nparts = vthread)
+        oiw, iiw = s[Out].split(iw, nparts=vthread)
+        oiioc, iiioc = s[Out].split(iioc, nparts = opart2)
+        s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc)
+        s[Out].bind(iiioc, thread_x)
+        s[Out].bind(iiw, thread_y)
+        s[Out].bind(oiioc, thread_xz)
+        s[Out].bind(oiw, thread_yz)
+        s[Out].bind(oioc, block_x)
+        s[Out].bind(ow, block_y)
+        s[Out].bind(ooc, block_z)
+
+        s[Out_L].compute_at(s[Out], iiioc)
+
+        # schedule Out_L local write
+        i, oc, h, w = s[Out_L].op.axis
+        ic, dh, dw = s[Out_L].op.reduce_axis
+        oic, iic = s[Out_L].split(ic, factor=ifactor)
+        s[Out_L].reorder(oic, dh, dw, iic, h, w)
+        fuse_index = s[Out_L].fuse(dw, dh)
+        fuse_index = s[Out_L].fuse(fuse_index, oic)
+        dw = fuse_index
+
+        s[temp_S].compute_at(s[Out_L], dw)
+        s[Filter_S].compute_at(s[Out_L], dw)
+        
+        #schedule temp_S shared mem load
+        i, ic, h, w = s[temp_S].op.axis
+        oic, iic = s[temp_S].split(ic, factor=sfactor)
+        ow, iw = s[temp_S].split(w, factor=spart)
+        s[temp_S].bind(iic, thread_x)
+        s[temp_S].bind(iw, thread_y)
+        
+        #schedule Filter_S shared mem load
+        i, oc, h, w = s[Filter_S].op.axis
+        ooc, ioc = s[Filter_S].split(oc, factor=sfactor)
+        oi, ii = s[Filter_S].split(i, factor=spart)
+        s[Filter_S].bind(ioc, thread_x)
+        s[Filter_S].bind(ii, thread_y)
+        
+    def traverse(OP):
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if 'ewise' in OP.tag or 'bcast' in OP.tag:
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if str(tensor.op.input_tensors) != str([]):
+                    traverse(tensor.op)
+        # schedule conv2d
+        if 'conv2d_nchw' in OP.tag:
+            temp = OP.input_tensors[0]
+            Filter = OP.input_tensors[1]
+            Output = OP.output(0)
+            schedule(temp, Filter, Output)
+
+    traverse(outs[0].op)
+    return s

From f8f93f3ec898aef9d8383df2f31785da0202930b Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sun, 13 Aug 2017 00:46:06 +0000
Subject: [PATCH 07/14] changes made according to CR

---
 topi/python/topi/cuda/conv2d_hwcn.py | 14 +++++---------
 topi/python/topi/cuda/conv2d_nchw.py | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py
index 0a246a84e118..7acdaa4303bc 100644
--- a/topi/python/topi/cuda/conv2d_hwcn.py
+++ b/topi/python/topi/cuda/conv2d_hwcn.py
@@ -3,24 +3,20 @@
 import tvm
 
 
-
 @tvm.register_func("topi.schedule.cuda.conv2d_hwcn")
-def schedule_conv2d_hwcn(outs, target):
-    """Schedule for conv2d_nchw.
+def schedule_conv2d_hwcn(outs):
+    """Schedule for conv2d_hwcn.
 
     Parameters
     ----------
-    outs: tvm.Array<tvm::Tensor>
-        The computation graph description of conv2d_nchw in the format
+    outs: Array<Tensor>
+        The computation graph description of conv2d_hwcn in the format
         of a list of tensors.
 
-    traget: str
-        Compilation target ('cuda' for gpu)
-
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_nchw.
+        The computation schedule for conv2d_hwcn.
     """
     s = tvm.create_schedule([x.op for x in outs])
     def schedule(Apad, W, B):
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index ba570fd87143..392439cd205c 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -1,28 +1,32 @@
 # pylint: disable=invalid-name
-"""Schedule for conv2d_nchw with auto fusion, optimized for batch_size(n)=1."""
+"""Schedule for conv2d_nchw with auto fusion"""
 import tvm
 import math
 
 
 @tvm.register_func("topi.schedule.cuda.conv2d_nchw")
-def schedule_conv2d_nchw(outs, target):
+def schedule_conv2d_nchw(outs):
     """Schedule for conv2d_nchw.
 
     Parameters
     ----------
-    outs: tvm.Array<tvm::Tensor>
+    outs: Array<Tensor>
         The computation graph description of conv2d_nchw in the format 
         of a list of tensors. 
 
-    traget: str
-        Compilation target ('cuda' for gpu)
-
     Returns
     -------
     s: Schedule
         The computation schedule for conv2d_nchw.
     """
-    s = tvm.create_schedule([x.op for x in outs])
+    def schedule_conv2d_small_batch(outs):
+        batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value
+        if(batch_size > 1):
+            raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
+        s = tvm.create_schedule([x.op for x in outs])
+        return s
+
+    s = schedule_conv2d_small_batch(outs)
     def schedule(temp, Filter, Output):
         out_height = tvm.ir_pass.Simplify(Output.shape[2]).value
         out_width = tvm.ir_pass.Simplify(Output.shape[3]).value

From 60ba7cb5b86a7debffdf919ea0ec455e676397b7 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sun, 13 Aug 2017 06:00:36 +0000
Subject: [PATCH 08/14] conv2d_nchw formating updated, conv2d_hwcn tests
 updated

---
 topi/python/topi/cuda/__init__.py             |  4 +-
 topi/python/topi/cuda/conv2d_hwcn.py          | 41 +++++++-------
 topi/python/topi/cuda/conv2d_nchw.py          | 56 ++++++++-----------
 topi/recipe/conv/test_conv2d_hwcn_map.py      |  6 +-
 .../tests/python/test_topi_conv2d_hwcn_map.py |  4 +-
 5 files changed, 51 insertions(+), 60 deletions(-)

diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 449cd4c5b161..65753bc088f3 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -2,6 +2,6 @@
 """CUDA specific declaration and schedules."""
 from __future__ import absolute_import as _abs
 
-from . import conv2d_nchw
-from . import conv2d_hwcn
+from .conv2d_nchw import schedule_conv2d_nchw
+from .conv2d_hwcn import schedule_conv2d_hwcn
 from .depthwise_conv2d_map import schedule_depthwise_conv2d_map
diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py
index 7acdaa4303bc..fe65c2d29002 100644
--- a/topi/python/topi/cuda/conv2d_hwcn.py
+++ b/topi/python/topi/cuda/conv2d_hwcn.py
@@ -3,22 +3,21 @@
 import tvm
 
 
-@tvm.register_func("topi.schedule.cuda.conv2d_hwcn")
 def schedule_conv2d_hwcn(outs):
-    """Schedule for conv2d_hwcn.
+    """Schedule for conv2d_hwcn and any element-wise operations.
 
     Parameters
     ----------
-    outs: Array<Tensor>
+    outs: Array of Tensor
         The computation graph description of conv2d_hwcn in the format
-        of a list of tensors.
+        of an array of tensors.
 
     Returns
     -------
     s: Schedule
         The computation schedule for conv2d_hwcn.
     """
-    s = tvm.create_schedule([x.op for x in outs])
+    sch = tvm.create_schedule([x.op for x in outs])
     def schedule(Apad, W, B):
 
         sch[Apad].compute_inline()
@@ -27,8 +26,8 @@ def schedule(Apad, W, B):
         AL = sch.cache_read(AA, "local", [B])
         WL = sch.cache_read(WW, "local", [B])
 
-        if op in sch.outputs:
-            Out = op.output(0)
+        if outs[0].op in sch.outputs:
+            Out = B
             BL = sch.cache_write(Out, "local")
         else:
             Out = sch.outputs[0].output(0)
@@ -99,20 +98,20 @@ def schedule(Apad, W, B):
         sch[WW].bind(tx, thread_x)
         sch[WW].vectorize(fi)
 
-        def traverse(operator):
-            if operator.tag == 'ewise' or operator.tag == 'scale_shift':
-                if operator not in sch.outputs:
-                    sch[operator].compute_inline()
-                for tensor in operator.input_tensors:
-                    if tensor.op.input_tensors:
-                        traverse(tensor.op)
-            elif operator.tag == 'conv2d_hwcn':
-                Apad = op.input_tensors[0]
-                W = op.input_tensors[1]
-                B = op.output(0)
-                schedule(Apad, W, B)
-            else:
-                raise RuntimeError("Unsupported operator: %s" % operator.tag)
+    def traverse(operator):
+        if operator.tag == 'ewise' or operator.tag == 'scale_shift':
+            if operator not in sch.outputs:
+                sch[operator].compute_inline()
+            for tensor in operator.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        elif operator.tag == 'conv2d_hwcn':
+            Apad = operator.input_tensors[0]
+            W = operator.input_tensors[1]
+            B = operator.output(0)
+            schedule(Apad, W, B)
+        else:
+            raise RuntimeError("Unsupported operator: %s" % operator.tag)
 
     traverse(outs[0].op)
     return sch
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index 392439cd205c..a911ec51cc38 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -1,18 +1,16 @@
 # pylint: disable=invalid-name
 """Schedule for conv2d_nchw with auto fusion"""
 import tvm
-import math
 
 
-@tvm.register_func("topi.schedule.cuda.conv2d_nchw")
 def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw.
+    """Schedule for conv2d_nchw and any element-wise operations.
 
     Parameters
     ----------
-    outs: Array<Tensor>
-        The computation graph description of conv2d_nchw in the format 
-        of a list of tensors. 
+    outs: Array of Tensor
+        The computation graph description of conv2d_nchw
+        in the format of an array of tensors.
 
     Returns
     -------
@@ -21,18 +19,14 @@ def schedule_conv2d_nchw(outs):
     """
     def schedule_conv2d_small_batch(outs):
         batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value
-        if(batch_size > 1):
+        if batch_size > 1:
             raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
         s = tvm.create_schedule([x.op for x in outs])
         return s
 
     s = schedule_conv2d_small_batch(outs)
     def schedule(temp, Filter, Output):
-        out_height = tvm.ir_pass.Simplify(Output.shape[2]).value
-        out_width = tvm.ir_pass.Simplify(Output.shape[3]).value
-        channel_multiplier = tvm.ir_pass.Simplify(Filter.shape[1]).value
-
-        block_h = out_width
+        block_h = tvm.ir_pass.Simplify(Output.shape[3]).value
         block_w = tvm.ir_pass.Simplify(temp.shape[1]).value
         if block_h % 48 == 0:
             block_h = 48
@@ -44,10 +38,10 @@ def schedule(temp, Filter, Output):
             block_w = 32
 
         s[temp].compute_inline()
-        
-        temp_S   = s.cache_read(temp, "shared", [Output])
+
+        temp_S = s.cache_read(temp, "shared", [Output])
         Filter_S = s.cache_read(Filter, "shared", [Output])
-        
+ 
         if outs[0].op in s.outputs:
             Out = Output
             Out_L = s.cache_write(Out, "local")
@@ -57,18 +51,16 @@ def schedule(temp, Filter, Output):
             Out_L = Output
 
         # sheduler params
-        tile = 8
         num_thread = 8
-        step = 16
         vthread = 2
         out_filter = tvm.ir_pass.Simplify(Filter.shape[0]).value
         in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value
         opart2 = out_filter/8
-        ofactor=out_filter
-        wfactor=block_h
-        ifactor=in_filter/4
-        sfactor=max(1, ofactor/(opart2*2))
-        spart = int(math.ceil(wfactor/vthread))
+        ofactor = out_filter
+        wfactor = block_h
+        ifactor = in_filter/4
+        sfactor = max(1, ofactor/(opart2*2))
+        spart = int(float(wfactor)/vthread)
 
         block_x = tvm.thread_axis("blockIdx.x")
         block_y = tvm.thread_axis("blockIdx.y")
@@ -82,9 +74,9 @@ def schedule(temp, Filter, Output):
         ooc, ioc = s[Out].split(oc, factor=ofactor)
         ow, iw = s[Out].split(w, factor=wfactor)
         ow = s[Out].fuse(ow, h)
-        oioc, iioc = s[Out].split(ioc, nparts = vthread)
+        oioc, iioc = s[Out].split(ioc, nparts=vthread)
         oiw, iiw = s[Out].split(iw, nparts=vthread)
-        oiioc, iiioc = s[Out].split(iioc, nparts = opart2)
+        oiioc, iiioc = s[Out].split(iioc, nparts=opart2)
         s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc)
         s[Out].bind(iiioc, thread_x)
         s[Out].bind(iiw, thread_y)
@@ -107,28 +99,28 @@ def schedule(temp, Filter, Output):
 
         s[temp_S].compute_at(s[Out_L], dw)
         s[Filter_S].compute_at(s[Out_L], dw)
-        
+
         #schedule temp_S shared mem load
         i, ic, h, w = s[temp_S].op.axis
-        oic, iic = s[temp_S].split(ic, factor=sfactor)
-        ow, iw = s[temp_S].split(w, factor=spart)
+        _, iic = s[temp_S].split(ic, factor=sfactor)
+        _, iw = s[temp_S].split(w, factor=spart)
         s[temp_S].bind(iic, thread_x)
         s[temp_S].bind(iw, thread_y)
-        
+
         #schedule Filter_S shared mem load
         i, oc, h, w = s[Filter_S].op.axis
-        ooc, ioc = s[Filter_S].split(oc, factor=sfactor)
-        oi, ii = s[Filter_S].split(i, factor=spart)
+        _, ioc = s[Filter_S].split(oc, factor=sfactor)
+        _, ii = s[Filter_S].split(i, factor=spart)
         s[Filter_S].bind(ioc, thread_x)
         s[Filter_S].bind(ii, thread_y)
-        
+
     def traverse(OP):
         # inline all one-to-one-mapping operators except the last stage (output)
         if 'ewise' in OP.tag or 'bcast' in OP.tag:
             if OP not in s.outputs:
                 s[OP].compute_inline()
             for tensor in OP.input_tensors:
-                if str(tensor.op.input_tensors) != str([]):
+                if tensor.op.input_tensors:
                     traverse(tensor.op)
         # schedule conv2d
         if 'conv2d_nchw' in OP.tag:
diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py
index 553c93aa74a2..a6b9017a74eb 100644
--- a/topi/recipe/conv/test_conv2d_hwcn_map.py
+++ b/topi/recipe/conv/test_conv2d_hwcn_map.py
@@ -12,7 +12,7 @@
 
 @tvm.register_func
 def tvm_callback_cuda_compile(code):
-    ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
+    ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_37"])
     return ptx
 
 def write_code(code, fname):
@@ -43,8 +43,8 @@ def test_conv2d_hwcn_map():
     W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
     B = topi.nn.conv2d_hwcn(A, W, stride, padding)
     C = topi.nn.relu(B)
-    s1 = topi.cuda.schedule_conv2d_hwcn_map(B.op)
-    s2 = topi.cuda.schedule_conv2d_hwcn_map(C.op)
+    s1 = topi.cuda.schedule_conv2d_hwcn([B])
+    s2 = topi.cuda.schedule_conv2d_hwcn([C])
 
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
diff --git a/topi/tests/python/test_topi_conv2d_hwcn_map.py b/topi/tests/python/test_topi_conv2d_hwcn_map.py
index 993e5713cfe4..f6b56126e389 100644
--- a/topi/tests/python/test_topi_conv2d_hwcn_map.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn_map.py
@@ -13,8 +13,8 @@ def verify_conv2d_hwcn_map(batch, in_channel, in_size, num_filter, kernel, strid
     W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
     B = topi.nn.conv2d_hwcn(A, W, stride, padding)
     C = topi.nn.relu(B)
-    s1 = topi.cuda.schedule_conv2d_hwcn_map(B.op)
-    s2 = topi.cuda.schedule_conv2d_hwcn_map(C.op)
+    s1 = topi.cuda.schedule_conv2d_hwcn([B])
+    s2 = topi.cuda.schedule_conv2d_hwcn([C])
 
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)

From d64da8d6d462a4eaccfc39715c4f7296694140f9 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sun, 13 Aug 2017 06:26:43 +0000
Subject: [PATCH 09/14] lint error fixed

---
 topi/python/topi/cuda/conv2d_hwcn.py           | 5 +++--
 topi/python/topi/cuda/conv2d_nchw.py           | 7 +++++--
 topi/python/topi/testing/conv2d_hwcn_python.py | 2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py
index fe65c2d29002..50d71fefaee3 100644
--- a/topi/python/topi/cuda/conv2d_hwcn.py
+++ b/topi/python/topi/cuda/conv2d_hwcn.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name, too-many-locals, too-many-statements
 """Schedule for conv2d_hwcn with auto fusion"""
 import tvm
 
@@ -19,7 +19,7 @@ def schedule_conv2d_hwcn(outs):
     """
     sch = tvm.create_schedule([x.op for x in outs])
     def schedule(Apad, W, B):
-
+        """Schedule conv2d_hwcn"""
         sch[Apad].compute_inline()
         AA = sch.cache_read(Apad, "shared", [B])
         WW = sch.cache_read(W, "shared", [B])
@@ -99,6 +99,7 @@ def schedule(Apad, W, B):
         sch[WW].vectorize(fi)
 
     def traverse(operator):
+        """Traverse operators from computation graph"""
         if operator.tag == 'ewise' or operator.tag == 'scale_shift':
             if operator not in sch.outputs:
                 sch[operator].compute_inline()
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index a911ec51cc38..69fbc303a028 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements
 """Schedule for conv2d_nchw with auto fusion"""
 import tvm
 
@@ -18,6 +18,7 @@ def schedule_conv2d_nchw(outs):
         The computation schedule for conv2d_nchw.
     """
     def schedule_conv2d_small_batch(outs):
+        """Create schedule for tensors or return error if batch size is larager than 1"""
         batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value
         if batch_size > 1:
             raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
@@ -26,6 +27,7 @@ def schedule_conv2d_small_batch(outs):
 
     s = schedule_conv2d_small_batch(outs)
     def schedule(temp, Filter, Output):
+        """Schedule conv2d_nchw"""
         block_h = tvm.ir_pass.Simplify(Output.shape[3]).value
         block_w = tvm.ir_pass.Simplify(temp.shape[1]).value
         if block_h % 48 == 0:
@@ -41,7 +43,7 @@ def schedule(temp, Filter, Output):
 
         temp_S = s.cache_read(temp, "shared", [Output])
         Filter_S = s.cache_read(Filter, "shared", [Output])
- 
+
         if outs[0].op in s.outputs:
             Out = Output
             Out_L = s.cache_write(Out, "local")
@@ -115,6 +117,7 @@ def schedule(temp, Filter, Output):
         s[Filter_S].bind(ii, thread_y)
 
     def traverse(OP):
+        """Traverse operators from computation graph"""
         # inline all one-to-one-mapping operators except the last stage (output)
         if 'ewise' in OP.tag or 'bcast' in OP.tag:
             if OP not in s.outputs:
diff --git a/topi/python/topi/testing/conv2d_hwcn_python.py b/topi/python/topi/testing/conv2d_hwcn_python.py
index e240cfb722ae..c84efce5e777 100644
--- a/topi/python/topi/testing/conv2d_hwcn_python.py
+++ b/topi/python/topi/testing/conv2d_hwcn_python.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, line-too-long, unused-variable
+# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
 """Convolution in python"""
 import numpy as np
 import scipy.signal

From 8d5facc008432c1e022ad3648c25be5111a0021f Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sun, 13 Aug 2017 06:45:29 +0000
Subject: [PATCH 10/14] element wise operator schedule fusing fixed for conv2d

---
 topi/python/topi/cuda/conv2d_hwcn.py | 2 +-
 topi/python/topi/cuda/conv2d_nchw.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py
index 50d71fefaee3..675cc0adcbc2 100644
--- a/topi/python/topi/cuda/conv2d_hwcn.py
+++ b/topi/python/topi/cuda/conv2d_hwcn.py
@@ -26,7 +26,7 @@ def schedule(Apad, W, B):
         AL = sch.cache_read(AA, "local", [B])
         WL = sch.cache_read(WW, "local", [B])
 
-        if outs[0].op in sch.outputs:
+        if B.op in sch.outputs:
             Out = B
             BL = sch.cache_write(Out, "local")
         else:
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index 69fbc303a028..e3cf68d4cffc 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -44,7 +44,7 @@ def schedule(temp, Filter, Output):
         temp_S = s.cache_read(temp, "shared", [Output])
         Filter_S = s.cache_read(Filter, "shared", [Output])
 
-        if outs[0].op in s.outputs:
+        if Output.op in s.outputs:
             Out = Output
             Out_L = s.cache_write(Out, "local")
         else:

From 8a80a7e48d5791b35bad911d5b8350683f33c389 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Sun, 13 Aug 2017 23:53:36 +0000
Subject: [PATCH 11/14] conv2d_nchw topi test added, all resnet workloads now
 pass

---
 topi/python/topi/cuda/conv2d_nchw.py          | 49 +++++++-------
 topi/python/topi/nn/conv.py                   | 63 ++++++++++++++++++
 topi/python/topi/testing/__init__.py          |  1 +
 .../python/topi/testing/conv2d_nchw_python.py | 64 +++++++++++++++++++
 topi/tests/python/test_topi_conv2d_nchw.py    | 61 ++++++++++++++++++
 5 files changed, 213 insertions(+), 25 deletions(-)
 create mode 100644 topi/python/topi/testing/conv2d_nchw_python.py
 create mode 100644 topi/tests/python/test_topi_conv2d_nchw.py

diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index e3cf68d4cffc..c752fc1473cf 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -3,29 +3,10 @@
 import tvm
 
 
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw and any element-wise operations.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_nchw
-        in the format of an array of tensors.
+def schedule_conv2d_small_batch(outs):
+    """Create schedule for tensors or return error if batch size is larager than 1"""
+    s = tvm.create_schedule([x.op for x in outs])
 
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d_nchw.
-    """
-    def schedule_conv2d_small_batch(outs):
-        """Create schedule for tensors or return error if batch size is larager than 1"""
-        batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value
-        if batch_size > 1:
-            raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
-        s = tvm.create_schedule([x.op for x in outs])
-        return s
-
-    s = schedule_conv2d_small_batch(outs)
     def schedule(temp, Filter, Output):
         """Schedule conv2d_nchw"""
         block_h = tvm.ir_pass.Simplify(Output.shape[3]).value
@@ -55,15 +36,14 @@ def schedule(temp, Filter, Output):
         # sheduler params
         num_thread = 8
         vthread = 2
-        out_filter = tvm.ir_pass.Simplify(Filter.shape[0]).value
+        out_filter = min(64, tvm.ir_pass.Simplify(Filter.shape[0]).value)
         in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value
         opart2 = out_filter/8
         ofactor = out_filter
         wfactor = block_h
         ifactor = in_filter/4
         sfactor = max(1, ofactor/(opart2*2))
-        spart = int(float(wfactor)/vthread)
-
+        spart = (wfactor + vthread-1) // vthread
         block_x = tvm.thread_axis("blockIdx.x")
         block_y = tvm.thread_axis("blockIdx.y")
         block_z = tvm.thread_axis("blockIdx.z")
@@ -134,3 +114,22 @@ def traverse(OP):
 
     traverse(outs[0].op)
     return s
+
+def schedule_conv2d_nchw(outs):
+    """Schedule for conv2d_nchw and any element-wise operations.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of conv2d_nchw
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d_nchw.
+    """
+    batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value
+    if batch_size > 1:
+        raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
+    return  schedule_conv2d_small_batch(outs)
diff --git a/topi/python/topi/nn/conv.py b/topi/python/topi/nn/conv.py
index 4c233aa1568a..bd62a6ec01ff 100644
--- a/topi/python/topi/nn/conv.py
+++ b/topi/python/topi/nn/conv.py
@@ -6,6 +6,69 @@
 from .util import get_const_tuple
 
 
+@tvm.tag_scope(tag="conv2d_nchw")
+def conv2d_nchw(Input, Filter, stride, padding):
+    """Convolution operator in HWCN layout.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    Filter : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    assert isinstance(stride, int) or len(stride) == 2
+    assert isinstance(padding, int) or padding in ['VALID', 'SAME']
+    batch, in_channel, in_height, in_width = get_const_tuple(Input.shape)
+    num_filter, channel, kernel_h, kernel_w = get_const_tuple(Filter.shape)
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+    # compute the padding size
+    if isinstance(padding, int):
+        pad_h = pad_w = padding * 2
+    elif padding == 'VALID':
+        pad_h = 0
+        pad_w = 0
+    else: # 'SAME'
+        pad_h = kernel_h - 1
+        pad_w = kernel_w - 1
+    pad_top = int(np.ceil(float(pad_h) / 2))
+    pad_left = int(np.ceil(float(pad_w) / 2))
+    # compute the output shape
+    out_channel = num_filter
+    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
+    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
+    # compute graph
+    temp = tvm.compute(
+        (batch, in_channel, in_height + pad_h, in_width + pad_w),
+        lambda nn, cc, yy, xx: tvm.select(
+            tvm.all(yy >= pad_top, yy - pad_top < in_height,
+                    xx >= pad_left, xx - pad_left < in_width),
+            Input[nn, cc, yy - pad_top, xx - pad_left], tvm.const(0.)),
+            name='temp')
+    rc = tvm.reduce_axis((0, in_channel), name='rc')
+    ry = tvm.reduce_axis((0, kernel_h), name='ry')
+    rx = tvm.reduce_axis((0, kernel_w), name='rx')
+    return tvm.compute(
+        (batch, out_channel, out_height, out_width),
+        lambda nn, ff, yy, xx: tvm.sum(
+            temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx] * Filter[ff, rc, ry, rx],
+            axis=[rc, ry, rx]))
+    
 @tvm.tag_scope(tag="conv2d_hwcn")
 def conv2d_hwcn(Input, Filter, stride, padding):
     """Convolution operator in HWCN layout.
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index 28100658fc05..63bc8eb7215a 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -5,3 +5,4 @@
 from __future__ import absolute_import as _abs
 
 from .conv2d_hwcn_python import conv2d_hwcn_python
+from .conv2d_nchw_python import conv2d_nchw_python
diff --git a/topi/python/topi/testing/conv2d_nchw_python.py b/topi/python/topi/testing/conv2d_nchw_python.py
new file mode 100644
index 000000000000..169605faaf45
--- /dev/null
+++ b/topi/python/topi/testing/conv2d_nchw_python.py
@@ -0,0 +1,64 @@
+# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
+"""Convolution in python"""
+import numpy as np
+import scipy.signal
+
+
+def conv2d_nchw_python(a_np, w_np, stride, padding):
+    """Convolution operator in HWCN layout.
+
+    Parameters
+    ----------
+    a_np : numpy.ndarray
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    w_np : numpy.ndarray
+        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    Returns
+    -------
+    b_np : np.ndarray
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    batch, in_channel, in_height, in_width = a_np.shape
+    num_filter, _, kernel_h, kernel_w = w_np.shape
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+    if isinstance(padding, int):
+        pad_h = pad_w = padding * 2
+    elif padding == 'VALID':
+        pad_h = 0
+        pad_w = 0
+    else: # 'SAME'
+        pad_h = kernel_h - 1
+        pad_w = kernel_w - 1
+    pad_top = int(np.ceil(float(pad_h) / 2))
+    pad_bottom = pad_h - pad_top
+    pad_left = int(np.ceil(float(pad_w) / 2))
+    pad_right = pad_w - pad_left
+    # compute the output shape
+    out_channel = num_filter
+    out_height = (in_height - kernel_h + pad_h) // stride_h + 1
+    out_width = (in_width - kernel_w + pad_w) // stride_w + 1
+    b_np = np.zeros((batch, out_channel, out_height, out_width))
+    # computation
+    for n in range(batch):
+        for f in range(out_channel):
+            for c in range(in_channel):
+                if pad_h > 0:
+                    apad = np.zeros((in_height + pad_h, in_width + pad_w))
+                    apad[pad_top:-pad_bottom, pad_left:-pad_right] = a_np[n, c]
+                else:
+                    apad = a_np[n, c]
+                out = scipy.signal.convolve2d(
+                    apad, np.rot90(np.rot90(w_np[f, c])), mode='valid')
+                b_np[n, f] += out[::stride, ::stride]
+    return b_np
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
new file mode 100644
index 000000000000..6a7ed6cf8017
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -0,0 +1,61 @@
+"""Example code to do convolution."""
+import os
+import numpy as np
+import tvm
+import topi
+from topi.nn.util import get_const_tuple
+
+
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
+    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
+    B = topi.nn.conv2d_nchw(A, W, stride, padding)
+    C = topi.nn.relu(B)
+    s1 = topi.cuda.schedule_conv2d_nchw([B])
+    s2 = topi.cuda.schedule_conv2d_nchw([C])
+
+    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
+    w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
+    b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
+    c_np = np.maximum(b_np, 0)
+
+    def check_device(device):
+        if not tvm.module.enabled(device):
+            print("Skip because %s is not enabled" % device)
+            return
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        with tvm.build_config(auto_unroll_max_step=32,
+                              auto_unroll_min_depth=0,
+                              unroll_explicit=False):
+            func1 = tvm.build(s1, [A, W, B], device)
+            func2 = tvm.build(s2, [A, W, C], device)
+            func1(a, w, b)
+            func2(a, w, c)
+            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ['cuda', 'opencl', 'metal']:
+        check_device(device)
+
+
+def test_conv2d_nchw():
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
+    verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0)
+    verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1)
+    verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0)
+    verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
+    verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1)
+    verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0)
+    verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
+    verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1)
+    verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0)
+    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
+
+if __name__ == "__main__":
+    test_conv2d_nchw()

From 62cc3636701a1ce49804e11154ec2309330a977d Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Mon, 14 Aug 2017 01:39:56 +0000
Subject: [PATCH 12/14] conv compute lint error fixed

---
 topi/python/topi/nn/conv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/topi/python/topi/nn/conv.py b/topi/python/topi/nn/conv.py
index bd62a6ec01ff..37e6be1f53d5 100644
--- a/topi/python/topi/nn/conv.py
+++ b/topi/python/topi/nn/conv.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, line-too-long, unused-variable
+# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
 """Convolution operators"""
 from __future__ import absolute_import as _abs
 import tvm
@@ -59,7 +59,7 @@ def conv2d_nchw(Input, Filter, stride, padding):
             tvm.all(yy >= pad_top, yy - pad_top < in_height,
                     xx >= pad_left, xx - pad_left < in_width),
             Input[nn, cc, yy - pad_top, xx - pad_left], tvm.const(0.)),
-            name='temp')
+        name='temp')
     rc = tvm.reduce_axis((0, in_channel), name='rc')
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
@@ -68,7 +68,7 @@ def conv2d_nchw(Input, Filter, stride, padding):
         lambda nn, ff, yy, xx: tvm.sum(
             temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx] * Filter[ff, rc, ry, rx],
             axis=[rc, ry, rx]))
-    
+
 @tvm.tag_scope(tag="conv2d_hwcn")
 def conv2d_hwcn(Input, Filter, stride, padding):
     """Convolution operator in HWCN layout.

From 289eadd78f1cdb109f05816d5629d0d1e4631f7e Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Mon, 14 Aug 2017 03:01:41 +0000
Subject: [PATCH 13/14] fixed python 3 compatibility problem

---
 topi/python/topi/cuda/conv2d_nchw.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index c752fc1473cf..17e65a5d6332 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -38,11 +38,11 @@ def schedule(temp, Filter, Output):
         vthread = 2
         out_filter = min(64, tvm.ir_pass.Simplify(Filter.shape[0]).value)
         in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value
-        opart2 = out_filter/8
+        opart2 = out_filter//8
         ofactor = out_filter
         wfactor = block_h
-        ifactor = in_filter/4
-        sfactor = max(1, ofactor/(opart2*2))
+        ifactor = in_filter//4
+        sfactor = max(1, ofactor//(opart2*2))
         spart = (wfactor + vthread-1) // vthread
         block_x = tvm.thread_axis("blockIdx.x")
         block_y = tvm.thread_axis("blockIdx.y")

From 5f57a1e01052a0bc3a860d9acb10f835dd5729f3 Mon Sep 17 00:00:00 2001
From: Your Name <laurawly@gmail.com>
Date: Mon, 14 Aug 2017 03:42:17 +0000
Subject: [PATCH 14/14] conv2d tensor input support added, test typo fixed,
 ir_pass.Simplify changed to util.get_const_int

---
 topi/python/topi/cuda/conv2d_hwcn.py       |  1 +
 topi/python/topi/cuda/conv2d_nchw.py       | 12 +++++++-----
 topi/tests/python/test_topi_conv2d_nchw.py |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py
index 675cc0adcbc2..210660a230f5 100644
--- a/topi/python/topi/cuda/conv2d_hwcn.py
+++ b/topi/python/topi/cuda/conv2d_hwcn.py
@@ -17,6 +17,7 @@ def schedule_conv2d_hwcn(outs):
     s: Schedule
         The computation schedule for conv2d_hwcn.
     """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     sch = tvm.create_schedule([x.op for x in outs])
     def schedule(Apad, W, B):
         """Schedule conv2d_hwcn"""
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
index 17e65a5d6332..83abfde9107e 100644
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ b/topi/python/topi/cuda/conv2d_nchw.py
@@ -1,6 +1,7 @@
 # pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements
 """Schedule for conv2d_nchw with auto fusion"""
 import tvm
+from .. import util
 
 
 def schedule_conv2d_small_batch(outs):
@@ -9,8 +10,8 @@ def schedule_conv2d_small_batch(outs):
 
     def schedule(temp, Filter, Output):
         """Schedule conv2d_nchw"""
-        block_h = tvm.ir_pass.Simplify(Output.shape[3]).value
-        block_w = tvm.ir_pass.Simplify(temp.shape[1]).value
+        block_h = util.get_const_int(Output.shape[3])
+        block_w = util.get_const_int(temp.shape[1])
         if block_h % 48 == 0:
             block_h = 48
         elif block_h % 32 == 0:
@@ -36,8 +37,8 @@ def schedule(temp, Filter, Output):
         # sheduler params
         num_thread = 8
         vthread = 2
-        out_filter = min(64, tvm.ir_pass.Simplify(Filter.shape[0]).value)
-        in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value
+        out_filter = min(64, util.get_const_int(Filter.shape[0]))
+        in_filter = util.get_const_int(Filter.shape[1])
         opart2 = out_filter//8
         ofactor = out_filter
         wfactor = block_h
@@ -129,7 +130,8 @@ def schedule_conv2d_nchw(outs):
     s: Schedule
         The computation schedule for conv2d_nchw.
     """
-    batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    batch_size = util.get_const_int(outs[0].op.output(0).shape[0])
     if batch_size > 1:
         raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
     return  schedule_conv2d_small_batch(outs)
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index 6a7ed6cf8017..a40f10ce3d7e 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -3,7 +3,7 @@
 import numpy as np
 import tvm
 import topi
-from topi.nn.util import get_const_tuple
+from topi.util import get_const_tuple
 
 
 def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):