From 42e8d221f8d02d02d54d50ceb8d6e1f56eabb559 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 28 Jul 2017 22:19:54 +0000 Subject: [PATCH 01/14] __init__ updated --- python/tvm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 367efe7f911d..bf3d0ace75dd 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -27,5 +27,5 @@ from .node import register_node from .ndarray import register_extension from .schedule import create_schedule -from .build import build, lower, build_config +from .build import build, lower, build_config, BuildConfig from .tag import tag_scope From 01a6b49c900d510e8cc5b55139b4417cf8a1a4e9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 28 Jul 2017 23:33:57 +0000 Subject: [PATCH 02/14] pull request updated --- python/tvm/__init__.py | 2 +- python/tvm/build.py | 336 ----------------------------------------- 2 files changed, 1 insertion(+), 337 deletions(-) delete mode 100644 python/tvm/build.py diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index bf3d0ace75dd..367efe7f911d 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -27,5 +27,5 @@ from .node import register_node from .ndarray import register_extension from .schedule import create_schedule -from .build import build, lower, build_config, BuildConfig +from .build import build, lower, build_config from .tag import tag_scope diff --git a/python/tvm/build.py b/python/tvm/build.py deleted file mode 100644 index 900e10553c53..000000000000 --- a/python/tvm/build.py +++ /dev/null @@ -1,336 +0,0 @@ -"""The build utils in python. - -This module provides the functions to transform schedule to -LoweredFunc and compiled Module. -""" -from __future__ import absolute_import as _abs -from . import api -from . import tensor -from . import schedule -from . import expr -from . import ir_pass -from . import container -from . import module -from . import codegen -from . import ndarray - -class BuildConfig(object): - """Configuration scope to set a build config option. - - Parameters - ---------- - kwargs - Keyword arguments of configurations to set. - """ - current = None - defaults = { - 'auto_unroll_max_step': 0, - 'auto_unroll_min_depth': 1, - 'unroll_explicit': True, - 'detect_global_barrier': False, - 'offset_factor': 0, - 'data_alignment': -1, - 'restricted_func': True - } - def __init__(self, **kwargs): - self._old_scope = None - for k, _ in kwargs.items(): - if k not in BuildConfig.defaults: - raise ValueError( - "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys())) - self._attr = kwargs - - def __getattr__(self, name): - if name not in self._attr: - return BuildConfig.defaults[name] - return self._attr[name] - - def __enter__(self): - # pylint: disable=protected-access - self._old_scope = BuildConfig.current - attr = BuildConfig.current._attr.copy() - attr.update(self._attr) - self._attr = attr - BuildConfig.current = self - return self - - def __exit__(self, ptype, value, trace): - assert self._old_scope - BuildConfig.current = self._old_scope - -BuildConfig.current = BuildConfig() - -def build_config(**kwargs): - """Configure the build behavior by setting config variables. - - Parameters - ---------- - auto_unroll_max_step: int, default=0 - Threshold of loop extent to be automatically unrolled. - - auto_unroll_min_depth: int, default=1 - The minimum loop nest level before the loop can be automatically unrolled. - - unroll_explicit: bool, default=True - Whether explicitly unroll the loop, if set false, the unroll hint will - be passed to the CodeGen phase, which may generate pragma unroll hint. - Set this to be true if CodeGen support unroll pragma and - when we want to be more readable. - - detect_global_barrier: bool, default=True - Whether detect global barrier. - - data_alignment: int, optional - The alignment of data pointer in bytes. - If -1 is passed, the alignment will be set to TVM's internal default. - - offset_factor: int, default=0 - The factor used in default buffer declaration. - If specified as 0, offset field is not used. - - restricted_func: bool, default=True - Whether build restricted function. - That is each buffer argument to the function are guaranteed - not to overlap. This enables more optimization. - Corresponds to restricted keyword in C99 - - Returns - ------- - config: BuildConfig - The build configuration - """ - return BuildConfig(**kwargs) - - -def get_binds(args, binds=None): - """Internal function to get binds and arg_list given arguments. - - Parameters - ---------- - args : list of Buffer or Tensor or Var - The argument lists to the function. - - binds : dict of :any:`Tensor` to :any:`Buffer`, optional - Dictionary that maps the Tensor to Buffer which specified the data layout - requirement of the function. By default, a new compact buffer is created - for each tensor in the argument. - - Returns - ------- - binds: dict - The bind specification - - arg_list: list - The list of symbolic buffers of arguments. - """ - binds = {} if binds is None else binds.copy() - cfg = BuildConfig.current - arg_list = [] - for x in args: - if isinstance(x, tensor.Tensor): - if x not in binds: - buf = api.decl_buffer(x.shape, - dtype=x.dtype, - name=x.name, - data_alignment=cfg.data_alignment, - offset_factor=cfg.offset_factor) - binds[x] = buf - arg_list.append(buf) - else: - arg_list.append(binds[x]) - elif isinstance(x, schedule.Buffer): - arg_list.append(x) - elif isinstance(x, expr.Var): - arg_list.append(x) - else: - raise ValueError("args must be Tensor, Buffer or Var") - return binds, arg_list - - -def lower(sch, - args, - name="default_function", - binds=None, - simple_mode=False): - """Lowering step before build into target. - - Parameters - ---------- - sch : tvm.Schedule - The schedule to be builded - - args : list of Buffer or Tensor or Var - The argument lists to the function. - - name : str, optional - The name of result function. - - binds : dict of :any:`Tensor` to :any:`Buffer`, optional - Dictionary that maps the Tensor to Buffer which specified the data layout - requirement of the function. By default, a new compact buffer is created - for each tensor in the argument. - - simple_mode : bool, optional - Whether only output simple and compact statement, this will skip - LoopPartition, api wrapper generation and Unrolling. - - Returns - ------- - f : LoweredFunc or Stmt - The result function, if with_api_wrapper=False - Then the Stmt before make api is returned. - """ - binds, arg_list = get_binds(args, binds) - # normalize schedule first - sch = sch.normalize() - bounds = schedule.InferBound(sch) - stmt = schedule.ScheduleOps(sch, bounds) - stmt = ir_pass.InjectPrefetch(stmt) - stmt = ir_pass.StorageFlatten(stmt, binds, 64) - stmt = ir_pass.CanonicalSimplify(stmt) - if not simple_mode: - stmt = ir_pass.LoopPartition(stmt) - stmt = ir_pass.VectorizeLoop(stmt) - stmt = ir_pass.InjectVirtualThread(stmt) - stmt = ir_pass.StorageRewrite(stmt) - cfg = BuildConfig.current - stmt = ir_pass.UnrollLoop( - stmt, - cfg.auto_unroll_max_step, - cfg.auto_unroll_min_depth, - cfg.unroll_explicit) - stmt = ir_pass.Simplify(stmt) - if simple_mode: - return stmt - return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) - - -def build(sch, - args=None, - target="llvm", - target_host=None, - name="default_function", - binds=None): - """Build a function with arguments as signiture. - - Parameters - ---------- - sch : tvm.Schedule, or LoweredFunc - The schedule to be builded - - args : list of Buffer or Tensor or Var, optional - The argument lists to the function. - - target : str, optional - The target and option of the compilation. - When the target is llvm, you can set options like: - - - **-mtriple=** or **-target** - - Specify the target triple, which is useful for cross - compilation. - - - **-mcpu=** - - Specify a specific chip in the current architecture to - generate code for. By default this is infered from the - target triple and autodetected to the current architecture. - - - **-mattr=a1,+a2,-a3,...** - - Override or control specific attributes of the target, - such as whether SIMD operations are enabled or not. The - default set of attributes is set by the current CPU. - - - **-system-lib** - - Build TVM system library module. System lib is a global module that contains - self registered functions in program startup. User can get the module using - :any:`tvm.module.system_lib`. - It is useful in environments where dynamic loading api like dlopen is banned. - The system lib will be available as long as the result code is linked by the program. - - target_host : str, optional - Host compilation target, if target is device. - When TVM compiles device specific program such as CUDA, - we also need host(CPU) side code to interact with the driver - setup the dimensions and parameters correctly. - target_host is used to specify the host side codegen target. - By default, llvm is used if it is enabled, - otherwise a stackvm intepreter is used. - - name : str, optional - The name of result function. - - binds : dict, optional - Dictionary that maps the binding of symbolic buffer to Tensor. - By default, a new buffer is created for each tensor in the argument. - - Returns - ------- - f : Function, or pair of functions - The result function. - """ - if isinstance(sch, schedule.Schedule): - if args is None: - raise ValueError("args must be given for build from schedule") - flist = lower(sch, args, - name=name, - binds=binds) - if isinstance(flist, container.LoweredFunc): - flist = [flist] - elif isinstance(sch, container.LoweredFunc): - if args: - raise ValueError("args must be done when build from LoweredFunc") - flist = [sch] - elif isinstance(sch, (list, tuple, container.Array)): - flist = sch - else: - raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc") - fname_set = set() - for x in flist: - if not isinstance(x, container.LoweredFunc): - raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc") - if x.name in fname_set: - raise ValueError("Duplicate function name %s" % x.name) - - fhost = [] - fdevice = [] - for func in flist: - if func.func_type == container.LoweredFunc.MixedFunc: - if BuildConfig.current.detect_global_barrier: - func = ir_pass.StorageSync(func, "global") - func = ir_pass.StorageSync(func, "shared") - warp_size = 32 if target == "cuda" else 1 - func = ir_pass.LowerThreadAllreduce(func, warp_size) - fsplits = [s for s in ir_pass.SplitHostDevice(func)] - fhost.append(fsplits[0]) - for x in fsplits[1:]: - fdevice.append(x) - elif func.func_type == container.LoweredFunc.HostFunc: - fhost.append(func) - elif func.func_type == container.LoweredFunc.DeviceFunc: - fdevice.append(func) - else: - raise ValueError("unknown function type %d" % func.func_type) - - if not target.startswith("llvm") and target != "stackvm" and not fdevice: - raise ValueError( - "Specified target %s, but cannot find device code, did you do bind?" % target) - - device = "cpu" if target.startswith("llvm") or target == "stackvm" else target - device_type = ndarray.context(device, 0).device_type - fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] - fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost] - fhost = [ir_pass.CombineContextCall(x) for x in fhost] - - if fdevice: - if not target_host: - target_host = "llvm" if module.enabled("llvm") else "stackvm" - mhost = codegen.build_module(fhost, target_host) - if target: - mdev = codegen.build_module(fdevice, target) - mhost.import_module(mdev) - return mhost - else: - return codegen.build_module(fhost, target) From 077ec59681fcba89d1c11df50cde235b45f0377f Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 28 Jul 2017 23:40:38 +0000 Subject: [PATCH 03/14] build_module added --- python/tvm/build_module.py | 336 +++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 python/tvm/build_module.py diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py new file mode 100644 index 000000000000..900e10553c53 --- /dev/null +++ b/python/tvm/build_module.py @@ -0,0 +1,336 @@ +"""The build utils in python. + +This module provides the functions to transform schedule to +LoweredFunc and compiled Module. +""" +from __future__ import absolute_import as _abs +from . import api +from . import tensor +from . import schedule +from . import expr +from . import ir_pass +from . import container +from . import module +from . import codegen +from . import ndarray + +class BuildConfig(object): + """Configuration scope to set a build config option. + + Parameters + ---------- + kwargs + Keyword arguments of configurations to set. + """ + current = None + defaults = { + 'auto_unroll_max_step': 0, + 'auto_unroll_min_depth': 1, + 'unroll_explicit': True, + 'detect_global_barrier': False, + 'offset_factor': 0, + 'data_alignment': -1, + 'restricted_func': True + } + def __init__(self, **kwargs): + self._old_scope = None + for k, _ in kwargs.items(): + if k not in BuildConfig.defaults: + raise ValueError( + "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys())) + self._attr = kwargs + + def __getattr__(self, name): + if name not in self._attr: + return BuildConfig.defaults[name] + return self._attr[name] + + def __enter__(self): + # pylint: disable=protected-access + self._old_scope = BuildConfig.current + attr = BuildConfig.current._attr.copy() + attr.update(self._attr) + self._attr = attr + BuildConfig.current = self + return self + + def __exit__(self, ptype, value, trace): + assert self._old_scope + BuildConfig.current = self._old_scope + +BuildConfig.current = BuildConfig() + +def build_config(**kwargs): + """Configure the build behavior by setting config variables. + + Parameters + ---------- + auto_unroll_max_step: int, default=0 + Threshold of loop extent to be automatically unrolled. + + auto_unroll_min_depth: int, default=1 + The minimum loop nest level before the loop can be automatically unrolled. + + unroll_explicit: bool, default=True + Whether explicitly unroll the loop, if set false, the unroll hint will + be passed to the CodeGen phase, which may generate pragma unroll hint. + Set this to be true if CodeGen support unroll pragma and + when we want to be more readable. + + detect_global_barrier: bool, default=True + Whether detect global barrier. + + data_alignment: int, optional + The alignment of data pointer in bytes. + If -1 is passed, the alignment will be set to TVM's internal default. + + offset_factor: int, default=0 + The factor used in default buffer declaration. + If specified as 0, offset field is not used. + + restricted_func: bool, default=True + Whether build restricted function. + That is each buffer argument to the function are guaranteed + not to overlap. This enables more optimization. + Corresponds to restricted keyword in C99 + + Returns + ------- + config: BuildConfig + The build configuration + """ + return BuildConfig(**kwargs) + + +def get_binds(args, binds=None): + """Internal function to get binds and arg_list given arguments. + + Parameters + ---------- + args : list of Buffer or Tensor or Var + The argument lists to the function. + + binds : dict of :any:`Tensor` to :any:`Buffer`, optional + Dictionary that maps the Tensor to Buffer which specified the data layout + requirement of the function. By default, a new compact buffer is created + for each tensor in the argument. + + Returns + ------- + binds: dict + The bind specification + + arg_list: list + The list of symbolic buffers of arguments. + """ + binds = {} if binds is None else binds.copy() + cfg = BuildConfig.current + arg_list = [] + for x in args: + if isinstance(x, tensor.Tensor): + if x not in binds: + buf = api.decl_buffer(x.shape, + dtype=x.dtype, + name=x.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor) + binds[x] = buf + arg_list.append(buf) + else: + arg_list.append(binds[x]) + elif isinstance(x, schedule.Buffer): + arg_list.append(x) + elif isinstance(x, expr.Var): + arg_list.append(x) + else: + raise ValueError("args must be Tensor, Buffer or Var") + return binds, arg_list + + +def lower(sch, + args, + name="default_function", + binds=None, + simple_mode=False): + """Lowering step before build into target. + + Parameters + ---------- + sch : tvm.Schedule + The schedule to be builded + + args : list of Buffer or Tensor or Var + The argument lists to the function. + + name : str, optional + The name of result function. + + binds : dict of :any:`Tensor` to :any:`Buffer`, optional + Dictionary that maps the Tensor to Buffer which specified the data layout + requirement of the function. By default, a new compact buffer is created + for each tensor in the argument. + + simple_mode : bool, optional + Whether only output simple and compact statement, this will skip + LoopPartition, api wrapper generation and Unrolling. + + Returns + ------- + f : LoweredFunc or Stmt + The result function, if with_api_wrapper=False + Then the Stmt before make api is returned. + """ + binds, arg_list = get_binds(args, binds) + # normalize schedule first + sch = sch.normalize() + bounds = schedule.InferBound(sch) + stmt = schedule.ScheduleOps(sch, bounds) + stmt = ir_pass.InjectPrefetch(stmt) + stmt = ir_pass.StorageFlatten(stmt, binds, 64) + stmt = ir_pass.CanonicalSimplify(stmt) + if not simple_mode: + stmt = ir_pass.LoopPartition(stmt) + stmt = ir_pass.VectorizeLoop(stmt) + stmt = ir_pass.InjectVirtualThread(stmt) + stmt = ir_pass.StorageRewrite(stmt) + cfg = BuildConfig.current + stmt = ir_pass.UnrollLoop( + stmt, + cfg.auto_unroll_max_step, + cfg.auto_unroll_min_depth, + cfg.unroll_explicit) + stmt = ir_pass.Simplify(stmt) + if simple_mode: + return stmt + return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) + + +def build(sch, + args=None, + target="llvm", + target_host=None, + name="default_function", + binds=None): + """Build a function with arguments as signiture. + + Parameters + ---------- + sch : tvm.Schedule, or LoweredFunc + The schedule to be builded + + args : list of Buffer or Tensor or Var, optional + The argument lists to the function. + + target : str, optional + The target and option of the compilation. + When the target is llvm, you can set options like: + + - **-mtriple=** or **-target** + + Specify the target triple, which is useful for cross + compilation. + + - **-mcpu=** + + Specify a specific chip in the current architecture to + generate code for. By default this is infered from the + target triple and autodetected to the current architecture. + + - **-mattr=a1,+a2,-a3,...** + + Override or control specific attributes of the target, + such as whether SIMD operations are enabled or not. The + default set of attributes is set by the current CPU. + + - **-system-lib** + + Build TVM system library module. System lib is a global module that contains + self registered functions in program startup. User can get the module using + :any:`tvm.module.system_lib`. + It is useful in environments where dynamic loading api like dlopen is banned. + The system lib will be available as long as the result code is linked by the program. + + target_host : str, optional + Host compilation target, if target is device. + When TVM compiles device specific program such as CUDA, + we also need host(CPU) side code to interact with the driver + setup the dimensions and parameters correctly. + target_host is used to specify the host side codegen target. + By default, llvm is used if it is enabled, + otherwise a stackvm intepreter is used. + + name : str, optional + The name of result function. + + binds : dict, optional + Dictionary that maps the binding of symbolic buffer to Tensor. + By default, a new buffer is created for each tensor in the argument. + + Returns + ------- + f : Function, or pair of functions + The result function. + """ + if isinstance(sch, schedule.Schedule): + if args is None: + raise ValueError("args must be given for build from schedule") + flist = lower(sch, args, + name=name, + binds=binds) + if isinstance(flist, container.LoweredFunc): + flist = [flist] + elif isinstance(sch, container.LoweredFunc): + if args: + raise ValueError("args must be done when build from LoweredFunc") + flist = [sch] + elif isinstance(sch, (list, tuple, container.Array)): + flist = sch + else: + raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc") + fname_set = set() + for x in flist: + if not isinstance(x, container.LoweredFunc): + raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc") + if x.name in fname_set: + raise ValueError("Duplicate function name %s" % x.name) + + fhost = [] + fdevice = [] + for func in flist: + if func.func_type == container.LoweredFunc.MixedFunc: + if BuildConfig.current.detect_global_barrier: + func = ir_pass.StorageSync(func, "global") + func = ir_pass.StorageSync(func, "shared") + warp_size = 32 if target == "cuda" else 1 + func = ir_pass.LowerThreadAllreduce(func, warp_size) + fsplits = [s for s in ir_pass.SplitHostDevice(func)] + fhost.append(fsplits[0]) + for x in fsplits[1:]: + fdevice.append(x) + elif func.func_type == container.LoweredFunc.HostFunc: + fhost.append(func) + elif func.func_type == container.LoweredFunc.DeviceFunc: + fdevice.append(func) + else: + raise ValueError("unknown function type %d" % func.func_type) + + if not target.startswith("llvm") and target != "stackvm" and not fdevice: + raise ValueError( + "Specified target %s, but cannot find device code, did you do bind?" % target) + + device = "cpu" if target.startswith("llvm") or target == "stackvm" else target + device_type = ndarray.context(device, 0).device_type + fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] + fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost] + fhost = [ir_pass.CombineContextCall(x) for x in fhost] + + if fdevice: + if not target_host: + target_host = "llvm" if module.enabled("llvm") else "stackvm" + mhost = codegen.build_module(fhost, target_host) + if target: + mdev = codegen.build_module(fdevice, target) + mhost.import_module(mdev) + return mhost + else: + return codegen.build_module(fhost, target) From 5daeac2e818ea8ada7603f595878b0a8228af4f0 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 28 Jul 2017 23:45:08 +0000 Subject: [PATCH 04/14] typo fixed --- python/tvm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 367efe7f911d..53d1dd60ee78 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -27,5 +27,5 @@ from .node import register_node from .ndarray import register_extension from .schedule import create_schedule -from .build import build, lower, build_config +from .build_module import build, lower, build_config from .tag import tag_scope From 8856fb5800a265763547b0424ea4b203d5e32193 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 29 Jul 2017 01:53:31 +0000 Subject: [PATCH 05/14] another typo fixed --- python/tvm/tensor_intrin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tensor_intrin.py b/python/tvm/tensor_intrin.py index e2f10b0dce28..e166f3706a7b 100644 --- a/python/tvm/tensor_intrin.py +++ b/python/tvm/tensor_intrin.py @@ -6,7 +6,7 @@ from . import stmt as _stmt from . import make as _make from . import tensor as _tensor -from .build import BuildConfig +from .build_module import BuildConfig from ._ffi.node import NodeBase, register_node @register_node From bcf6d734f5595eba08d99a9e2e3777d8ec22551a Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 12 Aug 2017 20:33:30 +0000 Subject: [PATCH 06/14] conv2d gpu scheduler for two layouts moved to tvm --- topi/python/topi/cuda/__init__.py | 3 +- topi/python/topi/cuda/conv2d_hwcn.py | 122 ++++++++++++++++++++ topi/python/topi/cuda/conv2d_hwcn_map.py | 121 -------------------- topi/python/topi/cuda/conv2d_nchw.py | 137 +++++++++++++++++++++++ 4 files changed, 261 insertions(+), 122 deletions(-) create mode 100644 topi/python/topi/cuda/conv2d_hwcn.py delete mode 100644 topi/python/topi/cuda/conv2d_hwcn_map.py create mode 100644 topi/python/topi/cuda/conv2d_nchw.py diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 6456441d6038..449cd4c5b161 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -2,5 +2,6 @@ """CUDA specific declaration and schedules.""" from __future__ import absolute_import as _abs -from .conv2d_hwcn_map import schedule_conv2d_hwcn_map +from . import conv2d_nchw +from . import conv2d_hwcn from .depthwise_conv2d_map import schedule_depthwise_conv2d_map diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py new file mode 100644 index 000000000000..0a246a84e118 --- /dev/null +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -0,0 +1,122 @@ +# pylint: disable=invalid-name +"""Schedule for conv2d_hwcn with auto fusion""" +import tvm + + + +@tvm.register_func("topi.schedule.cuda.conv2d_hwcn") +def schedule_conv2d_hwcn(outs, target): + """Schedule for conv2d_nchw. + + Parameters + ---------- + outs: tvm.Array + The computation graph description of conv2d_nchw in the format + of a list of tensors. + + traget: str + Compilation target ('cuda' for gpu) + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + s = tvm.create_schedule([x.op for x in outs]) + def schedule(Apad, W, B): + + sch[Apad].compute_inline() + AA = sch.cache_read(Apad, "shared", [B]) + WW = sch.cache_read(W, "shared", [B]) + AL = sch.cache_read(AA, "local", [B]) + WL = sch.cache_read(WW, "local", [B]) + + if op in sch.outputs: + Out = op.output(0) + BL = sch.cache_write(Out, "local") + else: + Out = sch.outputs[0].output(0) + sch[B].set_scope("local") + BL = B + + tile = 8 + num_thread = 8 + block_factor = tile * num_thread + step = 8 + vthread = 2 + + block_x = tvm.thread_axis("blockIdx.x") + block_y = tvm.thread_axis("blockIdx.y") + block_z = tvm.thread_axis("blockIdx.z") + thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") + thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") + thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") + thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") + + hi, wi, fi, ni = sch[Out].op.axis + bz = sch[Out].fuse(hi, wi) + by, fi = sch[Out].split(fi, factor=block_factor) + bx, ni = sch[Out].split(ni, factor=block_factor) + tyz, fi = sch[Out].split(fi, nparts=vthread) + txz, ni = sch[Out].split(ni, nparts=vthread) + ty, fi = sch[Out].split(fi, nparts=num_thread) + tx, ni = sch[Out].split(ni, nparts=num_thread) + sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni) + sch[Out].bind(bz, block_z) + sch[Out].bind(by, block_y) + sch[Out].bind(bx, block_x) + sch[Out].bind(tyz, thread_yz) + sch[Out].bind(txz, thread_xz) + sch[Out].bind(ty, thread_y) + sch[Out].bind(tx, thread_x) + + # Schedule BL local write + sch[BL].compute_at(sch[Out], tx) + yi, xi, fi, ni = sch[BL].op.axis + ry, rx, rc = sch[BL].op.reduce_axis + rco, rci = sch[BL].split(rc, factor=step) + sch[BL].reorder(rco, ry, rx, rci, fi, ni) + fuse_index = sch[BL].fuse(ry, rx) + fuse_index = sch[BL].fuse(fuse_index, rco) + rx = fuse_index + + sch[AA].compute_at(sch[BL], rx) + sch[WW].compute_at(sch[BL], rx) + sch[AL].compute_at(sch[BL], rci) + sch[WL].compute_at(sch[BL], rci) + # Schedule for A's shared memory load + yi, xi, ci, ni = sch[AA].op.axis + ty, ci = sch[AA].split(ci, nparts=num_thread) + tx, ni = sch[AA].split(ni, nparts=num_thread) + _, ni = sch[AA].split(ni, factor=4) + sch[AA].reorder(ty, tx, yi, xi, ci, ni) + sch[AA].bind(ty, thread_y) + sch[AA].bind(tx, thread_x) + sch[AA].vectorize(ni) + # Schedule for W's shared memory load + yi, xi, ci, fi = sch[WW].op.axis + ty, ci = sch[WW].split(ci, nparts=num_thread) + tx, fi = sch[WW].split(fi, nparts=num_thread) + _, fi = sch[WW].split(fi, factor=4) + sch[WW].reorder(ty, tx, yi, xi, ci, fi) + sch[WW].bind(ty, thread_y) + sch[WW].bind(tx, thread_x) + sch[WW].vectorize(fi) + + def traverse(operator): + if operator.tag == 'ewise' or operator.tag == 'scale_shift': + if operator not in sch.outputs: + sch[operator].compute_inline() + for tensor in operator.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + elif operator.tag == 'conv2d_hwcn': + Apad = op.input_tensors[0] + W = op.input_tensors[1] + B = op.output(0) + schedule(Apad, W, B) + else: + raise RuntimeError("Unsupported operator: %s" % operator.tag) + + traverse(outs[0].op) + return sch diff --git a/topi/python/topi/cuda/conv2d_hwcn_map.py b/topi/python/topi/cuda/conv2d_hwcn_map.py deleted file mode 100644 index 7b932523b720..000000000000 --- a/topi/python/topi/cuda/conv2d_hwcn_map.py +++ /dev/null @@ -1,121 +0,0 @@ -# pylint: disable=invalid-name -"""Schedule for conv2d_hwcn with auto fusion""" -import tvm - - -def _schedule_conv2d_hwcn(op, sch): - assert len(op.input_tensors) == 2 - Apad = op.input_tensors[0] - W = op.input_tensors[1] - B = op.output(0) - - sch[Apad].compute_inline() - AA = sch.cache_read(Apad, "shared", [B]) - WW = sch.cache_read(W, "shared", [B]) - AL = sch.cache_read(AA, "local", [B]) - WL = sch.cache_read(WW, "local", [B]) - - if op in sch.outputs: - Out = op.output(0) - BL = sch.cache_write(Out, "local") - else: - Out = sch.outputs[0].output(0) - sch[B].set_scope("local") - BL = B - - tile = 8 - num_thread = 8 - block_factor = tile * num_thread - step = 8 - vthread = 2 - - block_x = tvm.thread_axis("blockIdx.x") - block_y = tvm.thread_axis("blockIdx.y") - block_z = tvm.thread_axis("blockIdx.z") - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") - thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") - thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") - thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") - - hi, wi, fi, ni = sch[Out].op.axis - bz = sch[Out].fuse(hi, wi) - by, fi = sch[Out].split(fi, factor=block_factor) - bx, ni = sch[Out].split(ni, factor=block_factor) - tyz, fi = sch[Out].split(fi, nparts=vthread) - txz, ni = sch[Out].split(ni, nparts=vthread) - ty, fi = sch[Out].split(fi, nparts=num_thread) - tx, ni = sch[Out].split(ni, nparts=num_thread) - sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni) - sch[Out].bind(bz, block_z) - sch[Out].bind(by, block_y) - sch[Out].bind(bx, block_x) - sch[Out].bind(tyz, thread_yz) - sch[Out].bind(txz, thread_xz) - sch[Out].bind(ty, thread_y) - sch[Out].bind(tx, thread_x) - - # Schedule BL local write - sch[BL].compute_at(sch[Out], tx) - yi, xi, fi, ni = sch[BL].op.axis - ry, rx, rc = sch[BL].op.reduce_axis - rco, rci = sch[BL].split(rc, factor=step) - sch[BL].reorder(rco, ry, rx, rci, fi, ni) - fuse_index = sch[BL].fuse(ry, rx) - fuse_index = sch[BL].fuse(fuse_index, rco) - rx = fuse_index - - sch[AA].compute_at(sch[BL], rx) - sch[WW].compute_at(sch[BL], rx) - sch[AL].compute_at(sch[BL], rci) - sch[WL].compute_at(sch[BL], rci) - # Schedule for A's shared memory load - yi, xi, ci, ni = sch[AA].op.axis - ty, ci = sch[AA].split(ci, nparts=num_thread) - tx, ni = sch[AA].split(ni, nparts=num_thread) - _, ni = sch[AA].split(ni, factor=4) - sch[AA].reorder(ty, tx, yi, xi, ci, ni) - sch[AA].bind(ty, thread_y) - sch[AA].bind(tx, thread_x) - sch[AA].vectorize(ni) - # Schedule for W's shared memory load - yi, xi, ci, fi = sch[WW].op.axis - ty, ci = sch[WW].split(ci, nparts=num_thread) - tx, fi = sch[WW].split(fi, nparts=num_thread) - _, fi = sch[WW].split(fi, factor=4) - sch[WW].reorder(ty, tx, yi, xi, ci, fi) - sch[WW].bind(ty, thread_y) - sch[WW].bind(tx, thread_x) - sch[WW].vectorize(fi) - - return sch - - -def schedule_conv2d_hwcn_map(op): - """Schedule for conv2d_hwcn map ops. - - Parameters - ---------- - op: tvm.tensor.Operation - The symbolic description of the operation, should be conv2d_hwcn or - conv2d_hwcn followed by a sequence of one-to-one-mapping operators. - - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - def traverse(operator): - if operator.tag == 'ewise' or operator.tag == 'scale_shift': - if operator not in sch.outputs: - sch[operator].compute_inline() - for tensor in operator.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - elif operator.tag == 'conv2d_hwcn': - _schedule_conv2d_hwcn(operator, sch) - else: - raise RuntimeError("Unsupported operator: %s" % operator.tag) - - sch = tvm.create_schedule(op) - traverse(op) - return sch diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py new file mode 100644 index 000000000000..ba570fd87143 --- /dev/null +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -0,0 +1,137 @@ +# pylint: disable=invalid-name +"""Schedule for conv2d_nchw with auto fusion, optimized for batch_size(n)=1.""" +import tvm +import math + + +@tvm.register_func("topi.schedule.cuda.conv2d_nchw") +def schedule_conv2d_nchw(outs, target): + """Schedule for conv2d_nchw. + + Parameters + ---------- + outs: tvm.Array + The computation graph description of conv2d_nchw in the format + of a list of tensors. + + traget: str + Compilation target ('cuda' for gpu) + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + s = tvm.create_schedule([x.op for x in outs]) + def schedule(temp, Filter, Output): + out_height = tvm.ir_pass.Simplify(Output.shape[2]).value + out_width = tvm.ir_pass.Simplify(Output.shape[3]).value + channel_multiplier = tvm.ir_pass.Simplify(Filter.shape[1]).value + + block_h = out_width + block_w = tvm.ir_pass.Simplify(temp.shape[1]).value + if block_h % 48 == 0: + block_h = 48 + elif block_h % 32 == 0: + block_h = 32 + if block_w % 48 == 0: + block_w = 48 + elif block_w % 32 == 0: + block_w = 32 + + s[temp].compute_inline() + + temp_S = s.cache_read(temp, "shared", [Output]) + Filter_S = s.cache_read(Filter, "shared", [Output]) + + if outs[0].op in s.outputs: + Out = Output + Out_L = s.cache_write(Out, "local") + else: + Out = outs[0].op.output(0) + s[Output].set_scope("local") + Out_L = Output + + # sheduler params + tile = 8 + num_thread = 8 + step = 16 + vthread = 2 + out_filter = tvm.ir_pass.Simplify(Filter.shape[0]).value + in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value + opart2 = out_filter/8 + ofactor=out_filter + wfactor=block_h + ifactor=in_filter/4 + sfactor=max(1, ofactor/(opart2*2)) + spart = int(math.ceil(wfactor/vthread)) + + block_x = tvm.thread_axis("blockIdx.x") + block_y = tvm.thread_axis("blockIdx.y") + block_z = tvm.thread_axis("blockIdx.z") + thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") + thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") + thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") + thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") + + i, oc, h, w = s[Out].op.axis + ooc, ioc = s[Out].split(oc, factor=ofactor) + ow, iw = s[Out].split(w, factor=wfactor) + ow = s[Out].fuse(ow, h) + oioc, iioc = s[Out].split(ioc, nparts = vthread) + oiw, iiw = s[Out].split(iw, nparts=vthread) + oiioc, iiioc = s[Out].split(iioc, nparts = opart2) + s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc) + s[Out].bind(iiioc, thread_x) + s[Out].bind(iiw, thread_y) + s[Out].bind(oiioc, thread_xz) + s[Out].bind(oiw, thread_yz) + s[Out].bind(oioc, block_x) + s[Out].bind(ow, block_y) + s[Out].bind(ooc, block_z) + + s[Out_L].compute_at(s[Out], iiioc) + + # schedule Out_L local write + i, oc, h, w = s[Out_L].op.axis + ic, dh, dw = s[Out_L].op.reduce_axis + oic, iic = s[Out_L].split(ic, factor=ifactor) + s[Out_L].reorder(oic, dh, dw, iic, h, w) + fuse_index = s[Out_L].fuse(dw, dh) + fuse_index = s[Out_L].fuse(fuse_index, oic) + dw = fuse_index + + s[temp_S].compute_at(s[Out_L], dw) + s[Filter_S].compute_at(s[Out_L], dw) + + #schedule temp_S shared mem load + i, ic, h, w = s[temp_S].op.axis + oic, iic = s[temp_S].split(ic, factor=sfactor) + ow, iw = s[temp_S].split(w, factor=spart) + s[temp_S].bind(iic, thread_x) + s[temp_S].bind(iw, thread_y) + + #schedule Filter_S shared mem load + i, oc, h, w = s[Filter_S].op.axis + ooc, ioc = s[Filter_S].split(oc, factor=sfactor) + oi, ii = s[Filter_S].split(i, factor=spart) + s[Filter_S].bind(ioc, thread_x) + s[Filter_S].bind(ii, thread_y) + + def traverse(OP): + # inline all one-to-one-mapping operators except the last stage (output) + if 'ewise' in OP.tag or 'bcast' in OP.tag: + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if str(tensor.op.input_tensors) != str([]): + traverse(tensor.op) + # schedule conv2d + if 'conv2d_nchw' in OP.tag: + temp = OP.input_tensors[0] + Filter = OP.input_tensors[1] + Output = OP.output(0) + schedule(temp, Filter, Output) + + traverse(outs[0].op) + return s From f8f93f3ec898aef9d8383df2f31785da0202930b Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 13 Aug 2017 00:46:06 +0000 Subject: [PATCH 07/14] changes made according to CR --- topi/python/topi/cuda/conv2d_hwcn.py | 14 +++++--------- topi/python/topi/cuda/conv2d_nchw.py | 18 +++++++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 0a246a84e118..7acdaa4303bc 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -3,24 +3,20 @@ import tvm - @tvm.register_func("topi.schedule.cuda.conv2d_hwcn") -def schedule_conv2d_hwcn(outs, target): - """Schedule for conv2d_nchw. +def schedule_conv2d_hwcn(outs): + """Schedule for conv2d_hwcn. Parameters ---------- - outs: tvm.Array - The computation graph description of conv2d_nchw in the format + outs: Array + The computation graph description of conv2d_hwcn in the format of a list of tensors. - traget: str - Compilation target ('cuda' for gpu) - Returns ------- s: Schedule - The computation schedule for conv2d_nchw. + The computation schedule for conv2d_hwcn. """ s = tvm.create_schedule([x.op for x in outs]) def schedule(Apad, W, B): diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index ba570fd87143..392439cd205c 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -1,28 +1,32 @@ # pylint: disable=invalid-name -"""Schedule for conv2d_nchw with auto fusion, optimized for batch_size(n)=1.""" +"""Schedule for conv2d_nchw with auto fusion""" import tvm import math @tvm.register_func("topi.schedule.cuda.conv2d_nchw") -def schedule_conv2d_nchw(outs, target): +def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw. Parameters ---------- - outs: tvm.Array + outs: Array The computation graph description of conv2d_nchw in the format of a list of tensors. - traget: str - Compilation target ('cuda' for gpu) - Returns ------- s: Schedule The computation schedule for conv2d_nchw. """ - s = tvm.create_schedule([x.op for x in outs]) + def schedule_conv2d_small_batch(outs): + batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value + if(batch_size > 1): + raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size) + s = tvm.create_schedule([x.op for x in outs]) + return s + + s = schedule_conv2d_small_batch(outs) def schedule(temp, Filter, Output): out_height = tvm.ir_pass.Simplify(Output.shape[2]).value out_width = tvm.ir_pass.Simplify(Output.shape[3]).value From 60ba7cb5b86a7debffdf919ea0ec455e676397b7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 13 Aug 2017 06:00:36 +0000 Subject: [PATCH 08/14] conv2d_nchw formating updated, conv2d_hwcn tests updated --- topi/python/topi/cuda/__init__.py | 4 +- topi/python/topi/cuda/conv2d_hwcn.py | 41 +++++++------- topi/python/topi/cuda/conv2d_nchw.py | 56 ++++++++----------- topi/recipe/conv/test_conv2d_hwcn_map.py | 6 +- .../tests/python/test_topi_conv2d_hwcn_map.py | 4 +- 5 files changed, 51 insertions(+), 60 deletions(-) diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 449cd4c5b161..65753bc088f3 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -2,6 +2,6 @@ """CUDA specific declaration and schedules.""" from __future__ import absolute_import as _abs -from . import conv2d_nchw -from . import conv2d_hwcn +from .conv2d_nchw import schedule_conv2d_nchw +from .conv2d_hwcn import schedule_conv2d_hwcn from .depthwise_conv2d_map import schedule_depthwise_conv2d_map diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 7acdaa4303bc..fe65c2d29002 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -3,22 +3,21 @@ import tvm -@tvm.register_func("topi.schedule.cuda.conv2d_hwcn") def schedule_conv2d_hwcn(outs): - """Schedule for conv2d_hwcn. + """Schedule for conv2d_hwcn and any element-wise operations. Parameters ---------- - outs: Array + outs: Array of Tensor The computation graph description of conv2d_hwcn in the format - of a list of tensors. + of an array of tensors. Returns ------- s: Schedule The computation schedule for conv2d_hwcn. """ - s = tvm.create_schedule([x.op for x in outs]) + sch = tvm.create_schedule([x.op for x in outs]) def schedule(Apad, W, B): sch[Apad].compute_inline() @@ -27,8 +26,8 @@ def schedule(Apad, W, B): AL = sch.cache_read(AA, "local", [B]) WL = sch.cache_read(WW, "local", [B]) - if op in sch.outputs: - Out = op.output(0) + if outs[0].op in sch.outputs: + Out = B BL = sch.cache_write(Out, "local") else: Out = sch.outputs[0].output(0) @@ -99,20 +98,20 @@ def schedule(Apad, W, B): sch[WW].bind(tx, thread_x) sch[WW].vectorize(fi) - def traverse(operator): - if operator.tag == 'ewise' or operator.tag == 'scale_shift': - if operator not in sch.outputs: - sch[operator].compute_inline() - for tensor in operator.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - elif operator.tag == 'conv2d_hwcn': - Apad = op.input_tensors[0] - W = op.input_tensors[1] - B = op.output(0) - schedule(Apad, W, B) - else: - raise RuntimeError("Unsupported operator: %s" % operator.tag) + def traverse(operator): + if operator.tag == 'ewise' or operator.tag == 'scale_shift': + if operator not in sch.outputs: + sch[operator].compute_inline() + for tensor in operator.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + elif operator.tag == 'conv2d_hwcn': + Apad = operator.input_tensors[0] + W = operator.input_tensors[1] + B = operator.output(0) + schedule(Apad, W, B) + else: + raise RuntimeError("Unsupported operator: %s" % operator.tag) traverse(outs[0].op) return sch diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index 392439cd205c..a911ec51cc38 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -1,18 +1,16 @@ # pylint: disable=invalid-name """Schedule for conv2d_nchw with auto fusion""" import tvm -import math -@tvm.register_func("topi.schedule.cuda.conv2d_nchw") def schedule_conv2d_nchw(outs): - """Schedule for conv2d_nchw. + """Schedule for conv2d_nchw and any element-wise operations. Parameters ---------- - outs: Array - The computation graph description of conv2d_nchw in the format - of a list of tensors. + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. Returns ------- @@ -21,18 +19,14 @@ def schedule_conv2d_nchw(outs): """ def schedule_conv2d_small_batch(outs): batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value - if(batch_size > 1): + if batch_size > 1: raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size) s = tvm.create_schedule([x.op for x in outs]) return s s = schedule_conv2d_small_batch(outs) def schedule(temp, Filter, Output): - out_height = tvm.ir_pass.Simplify(Output.shape[2]).value - out_width = tvm.ir_pass.Simplify(Output.shape[3]).value - channel_multiplier = tvm.ir_pass.Simplify(Filter.shape[1]).value - - block_h = out_width + block_h = tvm.ir_pass.Simplify(Output.shape[3]).value block_w = tvm.ir_pass.Simplify(temp.shape[1]).value if block_h % 48 == 0: block_h = 48 @@ -44,10 +38,10 @@ def schedule(temp, Filter, Output): block_w = 32 s[temp].compute_inline() - - temp_S = s.cache_read(temp, "shared", [Output]) + + temp_S = s.cache_read(temp, "shared", [Output]) Filter_S = s.cache_read(Filter, "shared", [Output]) - + if outs[0].op in s.outputs: Out = Output Out_L = s.cache_write(Out, "local") @@ -57,18 +51,16 @@ def schedule(temp, Filter, Output): Out_L = Output # sheduler params - tile = 8 num_thread = 8 - step = 16 vthread = 2 out_filter = tvm.ir_pass.Simplify(Filter.shape[0]).value in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value opart2 = out_filter/8 - ofactor=out_filter - wfactor=block_h - ifactor=in_filter/4 - sfactor=max(1, ofactor/(opart2*2)) - spart = int(math.ceil(wfactor/vthread)) + ofactor = out_filter + wfactor = block_h + ifactor = in_filter/4 + sfactor = max(1, ofactor/(opart2*2)) + spart = int(float(wfactor)/vthread) block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") @@ -82,9 +74,9 @@ def schedule(temp, Filter, Output): ooc, ioc = s[Out].split(oc, factor=ofactor) ow, iw = s[Out].split(w, factor=wfactor) ow = s[Out].fuse(ow, h) - oioc, iioc = s[Out].split(ioc, nparts = vthread) + oioc, iioc = s[Out].split(ioc, nparts=vthread) oiw, iiw = s[Out].split(iw, nparts=vthread) - oiioc, iiioc = s[Out].split(iioc, nparts = opart2) + oiioc, iiioc = s[Out].split(iioc, nparts=opart2) s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc) s[Out].bind(iiioc, thread_x) s[Out].bind(iiw, thread_y) @@ -107,28 +99,28 @@ def schedule(temp, Filter, Output): s[temp_S].compute_at(s[Out_L], dw) s[Filter_S].compute_at(s[Out_L], dw) - + #schedule temp_S shared mem load i, ic, h, w = s[temp_S].op.axis - oic, iic = s[temp_S].split(ic, factor=sfactor) - ow, iw = s[temp_S].split(w, factor=spart) + _, iic = s[temp_S].split(ic, factor=sfactor) + _, iw = s[temp_S].split(w, factor=spart) s[temp_S].bind(iic, thread_x) s[temp_S].bind(iw, thread_y) - + #schedule Filter_S shared mem load i, oc, h, w = s[Filter_S].op.axis - ooc, ioc = s[Filter_S].split(oc, factor=sfactor) - oi, ii = s[Filter_S].split(i, factor=spart) + _, ioc = s[Filter_S].split(oc, factor=sfactor) + _, ii = s[Filter_S].split(i, factor=spart) s[Filter_S].bind(ioc, thread_x) s[Filter_S].bind(ii, thread_y) - + def traverse(OP): # inline all one-to-one-mapping operators except the last stage (output) if 'ewise' in OP.tag or 'bcast' in OP.tag: if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if str(tensor.op.input_tensors) != str([]): + if tensor.op.input_tensors: traverse(tensor.op) # schedule conv2d if 'conv2d_nchw' in OP.tag: diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py index 553c93aa74a2..a6b9017a74eb 100644 --- a/topi/recipe/conv/test_conv2d_hwcn_map.py +++ b/topi/recipe/conv/test_conv2d_hwcn_map.py @@ -12,7 +12,7 @@ @tvm.register_func def tvm_callback_cuda_compile(code): - ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"]) + ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_37"]) return ptx def write_code(code, fname): @@ -43,8 +43,8 @@ def test_conv2d_hwcn_map(): W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') B = topi.nn.conv2d_hwcn(A, W, stride, padding) C = topi.nn.relu(B) - s1 = topi.cuda.schedule_conv2d_hwcn_map(B.op) - s2 = topi.cuda.schedule_conv2d_hwcn_map(C.op) + s1 = topi.cuda.schedule_conv2d_hwcn([B]) + s2 = topi.cuda.schedule_conv2d_hwcn([C]) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) diff --git a/topi/tests/python/test_topi_conv2d_hwcn_map.py b/topi/tests/python/test_topi_conv2d_hwcn_map.py index 993e5713cfe4..f6b56126e389 100644 --- a/topi/tests/python/test_topi_conv2d_hwcn_map.py +++ b/topi/tests/python/test_topi_conv2d_hwcn_map.py @@ -13,8 +13,8 @@ def verify_conv2d_hwcn_map(batch, in_channel, in_size, num_filter, kernel, strid W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') B = topi.nn.conv2d_hwcn(A, W, stride, padding) C = topi.nn.relu(B) - s1 = topi.cuda.schedule_conv2d_hwcn_map(B.op) - s2 = topi.cuda.schedule_conv2d_hwcn_map(C.op) + s1 = topi.cuda.schedule_conv2d_hwcn([B]) + s2 = topi.cuda.schedule_conv2d_hwcn([C]) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) From d64da8d6d462a4eaccfc39715c4f7296694140f9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 13 Aug 2017 06:26:43 +0000 Subject: [PATCH 09/14] lint error fixed --- topi/python/topi/cuda/conv2d_hwcn.py | 5 +++-- topi/python/topi/cuda/conv2d_nchw.py | 7 +++++-- topi/python/topi/testing/conv2d_hwcn_python.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index fe65c2d29002..50d71fefaee3 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name +# pylint: disable=invalid-name, too-many-locals, too-many-statements """Schedule for conv2d_hwcn with auto fusion""" import tvm @@ -19,7 +19,7 @@ def schedule_conv2d_hwcn(outs): """ sch = tvm.create_schedule([x.op for x in outs]) def schedule(Apad, W, B): - + """Schedule conv2d_hwcn""" sch[Apad].compute_inline() AA = sch.cache_read(Apad, "shared", [B]) WW = sch.cache_read(W, "shared", [B]) @@ -99,6 +99,7 @@ def schedule(Apad, W, B): sch[WW].vectorize(fi) def traverse(operator): + """Traverse operators from computation graph""" if operator.tag == 'ewise' or operator.tag == 'scale_shift': if operator not in sch.outputs: sch[operator].compute_inline() diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index a911ec51cc38..69fbc303a028 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements """Schedule for conv2d_nchw with auto fusion""" import tvm @@ -18,6 +18,7 @@ def schedule_conv2d_nchw(outs): The computation schedule for conv2d_nchw. """ def schedule_conv2d_small_batch(outs): + """Create schedule for tensors or return error if batch size is larager than 1""" batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value if batch_size > 1: raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size) @@ -26,6 +27,7 @@ def schedule_conv2d_small_batch(outs): s = schedule_conv2d_small_batch(outs) def schedule(temp, Filter, Output): + """Schedule conv2d_nchw""" block_h = tvm.ir_pass.Simplify(Output.shape[3]).value block_w = tvm.ir_pass.Simplify(temp.shape[1]).value if block_h % 48 == 0: @@ -41,7 +43,7 @@ def schedule(temp, Filter, Output): temp_S = s.cache_read(temp, "shared", [Output]) Filter_S = s.cache_read(Filter, "shared", [Output]) - + if outs[0].op in s.outputs: Out = Output Out_L = s.cache_write(Out, "local") @@ -115,6 +117,7 @@ def schedule(temp, Filter, Output): s[Filter_S].bind(ii, thread_y) def traverse(OP): + """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if 'ewise' in OP.tag or 'bcast' in OP.tag: if OP not in s.outputs: diff --git a/topi/python/topi/testing/conv2d_hwcn_python.py b/topi/python/topi/testing/conv2d_hwcn_python.py index e240cfb722ae..c84efce5e777 100644 --- a/topi/python/topi/testing/conv2d_hwcn_python.py +++ b/topi/python/topi/testing/conv2d_hwcn_python.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, line-too-long, unused-variable +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals """Convolution in python""" import numpy as np import scipy.signal From 8d5facc008432c1e022ad3648c25be5111a0021f Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 13 Aug 2017 06:45:29 +0000 Subject: [PATCH 10/14] element wise operator schedule fusing fixed for conv2d --- topi/python/topi/cuda/conv2d_hwcn.py | 2 +- topi/python/topi/cuda/conv2d_nchw.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 50d71fefaee3..675cc0adcbc2 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -26,7 +26,7 @@ def schedule(Apad, W, B): AL = sch.cache_read(AA, "local", [B]) WL = sch.cache_read(WW, "local", [B]) - if outs[0].op in sch.outputs: + if B.op in sch.outputs: Out = B BL = sch.cache_write(Out, "local") else: diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index 69fbc303a028..e3cf68d4cffc 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -44,7 +44,7 @@ def schedule(temp, Filter, Output): temp_S = s.cache_read(temp, "shared", [Output]) Filter_S = s.cache_read(Filter, "shared", [Output]) - if outs[0].op in s.outputs: + if Output.op in s.outputs: Out = Output Out_L = s.cache_write(Out, "local") else: From 8a80a7e48d5791b35bad911d5b8350683f33c389 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 13 Aug 2017 23:53:36 +0000 Subject: [PATCH 11/14] conv2d_nchw topi test added, all resnet workloads now pass --- topi/python/topi/cuda/conv2d_nchw.py | 49 +++++++------- topi/python/topi/nn/conv.py | 63 ++++++++++++++++++ topi/python/topi/testing/__init__.py | 1 + .../python/topi/testing/conv2d_nchw_python.py | 64 +++++++++++++++++++ topi/tests/python/test_topi_conv2d_nchw.py | 61 ++++++++++++++++++ 5 files changed, 213 insertions(+), 25 deletions(-) create mode 100644 topi/python/topi/testing/conv2d_nchw_python.py create mode 100644 topi/tests/python/test_topi_conv2d_nchw.py diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index e3cf68d4cffc..c752fc1473cf 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -3,29 +3,10 @@ import tvm -def schedule_conv2d_nchw(outs): - """Schedule for conv2d_nchw and any element-wise operations. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. +def schedule_conv2d_small_batch(outs): + """Create schedule for tensors or return error if batch size is larager than 1""" + s = tvm.create_schedule([x.op for x in outs]) - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. - """ - def schedule_conv2d_small_batch(outs): - """Create schedule for tensors or return error if batch size is larager than 1""" - batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value - if batch_size > 1: - raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size) - s = tvm.create_schedule([x.op for x in outs]) - return s - - s = schedule_conv2d_small_batch(outs) def schedule(temp, Filter, Output): """Schedule conv2d_nchw""" block_h = tvm.ir_pass.Simplify(Output.shape[3]).value @@ -55,15 +36,14 @@ def schedule(temp, Filter, Output): # sheduler params num_thread = 8 vthread = 2 - out_filter = tvm.ir_pass.Simplify(Filter.shape[0]).value + out_filter = min(64, tvm.ir_pass.Simplify(Filter.shape[0]).value) in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value opart2 = out_filter/8 ofactor = out_filter wfactor = block_h ifactor = in_filter/4 sfactor = max(1, ofactor/(opart2*2)) - spart = int(float(wfactor)/vthread) - + spart = (wfactor + vthread-1) // vthread block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") block_z = tvm.thread_axis("blockIdx.z") @@ -134,3 +114,22 @@ def traverse(OP): traverse(outs[0].op) return s + +def schedule_conv2d_nchw(outs): + """Schedule for conv2d_nchw and any element-wise operations. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value + if batch_size > 1: + raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size) + return schedule_conv2d_small_batch(outs) diff --git a/topi/python/topi/nn/conv.py b/topi/python/topi/nn/conv.py index 4c233aa1568a..bd62a6ec01ff 100644 --- a/topi/python/topi/nn/conv.py +++ b/topi/python/topi/nn/conv.py @@ -6,6 +6,69 @@ from .util import get_const_tuple +@tvm.tag_scope(tag="conv2d_nchw") +def conv2d_nchw(Input, Filter, stride, padding): + """Convolution operator in HWCN layout. + + Parameters + ---------- + Input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + Filter : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + Returns + ------- + Output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(padding, int) or padding in ['VALID', 'SAME'] + batch, in_channel, in_height, in_width = get_const_tuple(Input.shape) + num_filter, channel, kernel_h, kernel_w = get_const_tuple(Filter.shape) + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + # compute the padding size + if isinstance(padding, int): + pad_h = pad_w = padding * 2 + elif padding == 'VALID': + pad_h = 0 + pad_w = 0 + else: # 'SAME' + pad_h = kernel_h - 1 + pad_w = kernel_w - 1 + pad_top = int(np.ceil(float(pad_h) / 2)) + pad_left = int(np.ceil(float(pad_w) / 2)) + # compute the output shape + out_channel = num_filter + out_height = (in_height - kernel_h + pad_h) // stride_h + 1 + out_width = (in_width - kernel_w + pad_w) // stride_w + 1 + # compute graph + temp = tvm.compute( + (batch, in_channel, in_height + pad_h, in_width + pad_w), + lambda nn, cc, yy, xx: tvm.select( + tvm.all(yy >= pad_top, yy - pad_top < in_height, + xx >= pad_left, xx - pad_left < in_width), + Input[nn, cc, yy - pad_top, xx - pad_left], tvm.const(0.)), + name='temp') + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + return tvm.compute( + (batch, out_channel, out_height, out_width), + lambda nn, ff, yy, xx: tvm.sum( + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx] * Filter[ff, rc, ry, rx], + axis=[rc, ry, rx])) + @tvm.tag_scope(tag="conv2d_hwcn") def conv2d_hwcn(Input, Filter, stride, padding): """Convolution operator in HWCN layout. diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 28100658fc05..63bc8eb7215a 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -5,3 +5,4 @@ from __future__ import absolute_import as _abs from .conv2d_hwcn_python import conv2d_hwcn_python +from .conv2d_nchw_python import conv2d_nchw_python diff --git a/topi/python/topi/testing/conv2d_nchw_python.py b/topi/python/topi/testing/conv2d_nchw_python.py new file mode 100644 index 000000000000..169605faaf45 --- /dev/null +++ b/topi/python/topi/testing/conv2d_nchw_python.py @@ -0,0 +1,64 @@ +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Convolution in python""" +import numpy as np +import scipy.signal + + +def conv2d_nchw_python(a_np, w_np, stride, padding): + """Convolution operator in HWCN layout. + + Parameters + ---------- + a_np : numpy.ndarray + 4-D with shape [batch, in_channel, in_height, in_width] + + w_np : numpy.ndarray + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + Returns + ------- + b_np : np.ndarray + 4-D with shape [batch, out_channel, out_height, out_width] + """ + batch, in_channel, in_height, in_width = a_np.shape + num_filter, _, kernel_h, kernel_w = w_np.shape + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + if isinstance(padding, int): + pad_h = pad_w = padding * 2 + elif padding == 'VALID': + pad_h = 0 + pad_w = 0 + else: # 'SAME' + pad_h = kernel_h - 1 + pad_w = kernel_w - 1 + pad_top = int(np.ceil(float(pad_h) / 2)) + pad_bottom = pad_h - pad_top + pad_left = int(np.ceil(float(pad_w) / 2)) + pad_right = pad_w - pad_left + # compute the output shape + out_channel = num_filter + out_height = (in_height - kernel_h + pad_h) // stride_h + 1 + out_width = (in_width - kernel_w + pad_w) // stride_w + 1 + b_np = np.zeros((batch, out_channel, out_height, out_width)) + # computation + for n in range(batch): + for f in range(out_channel): + for c in range(in_channel): + if pad_h > 0: + apad = np.zeros((in_height + pad_h, in_width + pad_w)) + apad[pad_top:-pad_bottom, pad_left:-pad_right] = a_np[n, c] + else: + apad = a_np[n, c] + out = scipy.signal.convolve2d( + apad, np.rot90(np.rot90(w_np[f, c])), mode='valid') + b_np[n, f] += out[::stride, ::stride] + return b_np diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py new file mode 100644 index 000000000000..6a7ed6cf8017 --- /dev/null +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -0,0 +1,61 @@ +"""Example code to do convolution.""" +import os +import numpy as np +import tvm +import topi +from topi.nn.util import get_const_tuple + + +def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + B = topi.nn.conv2d_nchw(A, W, stride, padding) + C = topi.nn.relu(B) + s1 = topi.cuda.schedule_conv2d_nchw([B]) + s2 = topi.cuda.schedule_conv2d_nchw([C]) + + a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) + w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) + b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) + c_np = np.maximum(b_np, 0) + + def check_device(device): + if not tvm.module.enabled(device): + print("Skip because %s is not enabled" % device) + return + ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + with tvm.build_config(auto_unroll_max_step=32, + auto_unroll_min_depth=0, + unroll_explicit=False): + func1 = tvm.build(s1, [A, W, B], device) + func2 = tvm.build(s2, [A, W, C], device) + func1(a, w, b) + func2(a, w, c) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in ['cuda', 'opencl', 'metal']: + check_device(device) + + +def test_conv2d_nchw(): + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0) + verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1) + verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0) + verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1) + verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0) + verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1) + verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + +if __name__ == "__main__": + test_conv2d_nchw() From 62cc3636701a1ce49804e11154ec2309330a977d Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 14 Aug 2017 01:39:56 +0000 Subject: [PATCH 12/14] conv compute lint error fixed --- topi/python/topi/nn/conv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/topi/python/topi/nn/conv.py b/topi/python/topi/nn/conv.py index bd62a6ec01ff..37e6be1f53d5 100644 --- a/topi/python/topi/nn/conv.py +++ b/topi/python/topi/nn/conv.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, line-too-long, unused-variable +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals """Convolution operators""" from __future__ import absolute_import as _abs import tvm @@ -59,7 +59,7 @@ def conv2d_nchw(Input, Filter, stride, padding): tvm.all(yy >= pad_top, yy - pad_top < in_height, xx >= pad_left, xx - pad_left < in_width), Input[nn, cc, yy - pad_top, xx - pad_left], tvm.const(0.)), - name='temp') + name='temp') rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') @@ -68,7 +68,7 @@ def conv2d_nchw(Input, Filter, stride, padding): lambda nn, ff, yy, xx: tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx] * Filter[ff, rc, ry, rx], axis=[rc, ry, rx])) - + @tvm.tag_scope(tag="conv2d_hwcn") def conv2d_hwcn(Input, Filter, stride, padding): """Convolution operator in HWCN layout. From 289eadd78f1cdb109f05816d5629d0d1e4631f7e Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 14 Aug 2017 03:01:41 +0000 Subject: [PATCH 13/14] fixed python 3 compatibility problem --- topi/python/topi/cuda/conv2d_nchw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index c752fc1473cf..17e65a5d6332 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -38,11 +38,11 @@ def schedule(temp, Filter, Output): vthread = 2 out_filter = min(64, tvm.ir_pass.Simplify(Filter.shape[0]).value) in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value - opart2 = out_filter/8 + opart2 = out_filter//8 ofactor = out_filter wfactor = block_h - ifactor = in_filter/4 - sfactor = max(1, ofactor/(opart2*2)) + ifactor = in_filter//4 + sfactor = max(1, ofactor//(opart2*2)) spart = (wfactor + vthread-1) // vthread block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") From 5f57a1e01052a0bc3a860d9acb10f835dd5729f3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 14 Aug 2017 03:42:17 +0000 Subject: [PATCH 14/14] conv2d tensor input support added, test typo fixed, ir_pass.Simplify changed to util.get_const_int --- topi/python/topi/cuda/conv2d_hwcn.py | 1 + topi/python/topi/cuda/conv2d_nchw.py | 12 +++++++----- topi/tests/python/test_topi_conv2d_nchw.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 675cc0adcbc2..210660a230f5 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -17,6 +17,7 @@ def schedule_conv2d_hwcn(outs): s: Schedule The computation schedule for conv2d_hwcn. """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs sch = tvm.create_schedule([x.op for x in outs]) def schedule(Apad, W, B): """Schedule conv2d_hwcn""" diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index 17e65a5d6332..83abfde9107e 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -1,6 +1,7 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements """Schedule for conv2d_nchw with auto fusion""" import tvm +from .. import util def schedule_conv2d_small_batch(outs): @@ -9,8 +10,8 @@ def schedule_conv2d_small_batch(outs): def schedule(temp, Filter, Output): """Schedule conv2d_nchw""" - block_h = tvm.ir_pass.Simplify(Output.shape[3]).value - block_w = tvm.ir_pass.Simplify(temp.shape[1]).value + block_h = util.get_const_int(Output.shape[3]) + block_w = util.get_const_int(temp.shape[1]) if block_h % 48 == 0: block_h = 48 elif block_h % 32 == 0: @@ -36,8 +37,8 @@ def schedule(temp, Filter, Output): # sheduler params num_thread = 8 vthread = 2 - out_filter = min(64, tvm.ir_pass.Simplify(Filter.shape[0]).value) - in_filter = tvm.ir_pass.Simplify(Filter.shape[1]).value + out_filter = min(64, util.get_const_int(Filter.shape[0])) + in_filter = util.get_const_int(Filter.shape[1]) opart2 = out_filter//8 ofactor = out_filter wfactor = block_h @@ -129,7 +130,8 @@ def schedule_conv2d_nchw(outs): s: Schedule The computation schedule for conv2d_nchw. """ - batch_size = tvm.ir_pass.Simplify(outs[0].op.output(0).shape[0]).value + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + batch_size = util.get_const_int(outs[0].op.output(0).shape[0]) if batch_size > 1: raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size) return schedule_conv2d_small_batch(outs) diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 6a7ed6cf8017..a40f10ce3d7e 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -3,7 +3,7 @@ import numpy as np import tvm import topi -from topi.nn.util import get_const_tuple +from topi.util import get_const_tuple def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):