15 changes: 12 additions & 3 deletions nnvm/python/nnvm/top/nn.py
@@ -90,18 +90,25 @@ def compute_conv2d(attrs, inputs, _):
kernel_layout = attrs["kernel_layout"]
out_dtype = attrs["out_dtype"]
out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
assert layout == "NCHW" or layout == "NHWC"
assert layout in ["NCHW", "NHWC", "NCHW4c"]
(dilation_h, dilation_w) = dilation
if dilation_h < 1 or dilation_w < 1:
raise ValueError("dilation should be positive value")
elif layout == "NCHW4c" and (dilation_h > 1 or dilation_w > 1):
raise ValueError("not support dilate now")
elif dilation == (1, 1):
kernel = inputs[1]
elif layout == "NCHW":
kernel = topi.nn.dilate(inputs[1], [1, 1, dilation_h, dilation_w])
else: #layout == NHWC
kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])

if groups == 1:
if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
# pylint: disable=assignment-from-no-return
out = topi.nn.conv2d_NCHWc_int8_prepacked(inputs[0], kernel, strides, padding,
layout, out_dtype=out_dtype)
# pylint: enable=assignment-from-no-return
elif groups == 1:
out = topi.nn.conv2d(
inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
elif layout == "NCHW" and \
@@ -120,7 +127,7 @@ def compute_conv2d(attrs, inputs, _):

if attrs.get_bool("use_bias"):
bias = inputs[2]
expand_axis = 1 if layout == "NCHW" else 0
expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
out = topi.add(out, bias)
return out
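A hedged NumPy illustration (not part of the diff) of why expand_dims with axis=1 and two new axes covers both NCHW and the packed NCHW4c case. It assumes that for NCHW4c the bias arrives already packed as (C//4, 4); for NHWC, expanding at axis 0 gives (1, 1, C), which broadcasts over (N, H, W, C).

import numpy as np

out_nchw = np.zeros((1, 8, 4, 4), dtype=np.float32)           # (N, C, H, W)
bias = np.arange(8, dtype=np.float32)                          # (C,)
print((out_nchw + bias.reshape(8, 1, 1)).shape)                # (1, 8, 4, 4)

out_nchw4c = np.zeros((1, 2, 4, 4, 4), dtype=np.float32)       # (N, C//4, H, W, 4)
bias_packed = bias.reshape(2, 4)                               # assumed (C//4, 4) packing
print((out_nchw4c + bias_packed.reshape(2, 1, 1, 4)).shape)    # (1, 2, 4, 4, 4)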
@@ -136,6 +143,8 @@ def schedule_conv2d(attrs, outs, target):
with tvm.target.create(target):
if groups == 1 and layout == "NCHW":
return topi.generic.schedule_conv2d_nchw(outs)
elif groups == 1 and layout == "NCHW4c":
return topi.generic.schedule_conv2d_NCHWc_int8_prepacked(outs)
Member Author:
@merrymercy I can't check dtype of input here. Could you comment here?

Member:
Can we get dtype from outs[0].dtype?

Member Author:
@merrymercy outs[0].dtype is a user input, which can be arbitrary
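A hedged sketch of the alternative under discussion, not the PR's resolution: instead of trusting the user-supplied outs[0].dtype, the schedule could walk back from outs[0].op to the tagged conv compute op and read the dtype of its data input. The helper name _find_conv_input_dtype is hypothetical; the tag matches the op.tag check used in the CUDA schedule below, and input_tensors[0] being the packed data tensor is an assumption.

def _find_conv_input_dtype(out_tensor, tag="conv2d_NCHWc_int8"):
    """Walk producer ops depth-first until the tagged conv op is found."""
    stack, visited = [out_tensor.op], set()
    while stack:
        op = stack.pop()
        if op in visited:
            continue
        visited.add(op)
        if op.tag == tag:
            return op.input_tensors[0].dtype   # dtype of the actual conv input
        stack.extend(t.op for t in op.input_tensors)
    return None  # conv op not found; fall back to the layout check alone

If this returned 'int8', the NCHW4c branch could be taken; otherwise the dispatch would fall through to the generic schedules.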

elif groups == 1 and layout == "NHWC":
return topi.generic.schedule_conv2d_nhwc(outs)
elif groups == channels and layout == "NCHW":
1 change: 0 additions & 1 deletion nnvm/src/top/nn/convolution.cc
@@ -344,7 +344,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
.set_num_inputs(UseBiasNumInputs<Conv2DParam>)
.set_support_level(2);


NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
.describe(R"code(Weight transformation of winograd fast convolution algorithm.
Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
17 changes: 13 additions & 4 deletions topi/python/topi/cuda/conv2d.py
@@ -9,9 +9,10 @@

from .conv2d_direct import schedule_direct_cuda
from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8


@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd'])
@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8'])
def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
"""Conv2D operator for cuda backend.

@@ -21,10 +22,13 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
The config for this template

data : tvm.Tensor
4-D with shape [batch, in_channel, in_height, in_width]
4-D with shape [batch, in_channel, in_height, in_width] or
5-D with shape [batch, ic_chunk, in_height, in_width, ic_block]

kernel : tvm.Tensor
4-D with shape [num_filter, in_channel, filter_height, filter_width]
4-D with shape [num_filter, in_channel, filter_height, filter_width] or
6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
filter_width, num_filter_block, in_channel_block]

strides : int or a list/tuple of two ints
stride size, or [stride_height, stride_width]
@@ -98,6 +102,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):
if cfg.template_key == 'winograd':
return winograd_cuda(cfg, data, kernel, strides, padding, layout, out_dtype,
pre_computed=False)
if cfg.template_key == 'int8':
return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, layout, out_dtype,
pre_computed=False)

if layout == 'NCHW':
return nn.conv2d_nchw(data, kernel, strides, padding, out_dtype)
@@ -108,7 +115,7 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, layout='NCHW', out_dtype='float32'):


@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"],
["direct", 'winograd'])
["direct", 'winograd', "int8"])
def schedule_conv2d_nchw_cuda(cfg, outs):
"""TOPI schedule callback of conv2d for cuda gpu

@@ -138,6 +145,8 @@ def _callback(op):
schedule_direct_cuda(cfg, s, op.output(0))
if op.tag == 'conv2d_nchw_winograd':
schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
if op.tag == "conv2d_NCHWc_int8":
schedule_conv2d_NCHWc_int8(cfg, s, op.output(0), pre_computed=False)

traverse_inline(s, outs[0].op, _callback)
return s
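For reference, a hedged NumPy sketch (not part of the diff) of the packed layouts described in the conv2d_cuda docstring above: NCHW data becomes the 5-D NCHW4c form [batch, ic_chunk, h, w, ic_block] and an OIHW kernel becomes the 6-D [num_filter_chunk, in_channel_chunk, kh, kw, num_filter_block, in_channel_block] form. The function names are hypothetical; block size 4 and channel counts divisible by 4 are assumed, per the "4c" in the layout name.

import numpy as np

def pack_nchw_to_nchw4c(data, ic_block=4):
    n, c, h, w = data.shape
    return data.reshape(n, c // ic_block, ic_block, h, w).transpose(0, 1, 3, 4, 2)

def pack_oihw_to_oihw4o4i(kernel, oc_block=4, ic_block=4):
    o, i, kh, kw = kernel.shape
    return (kernel.reshape(o // oc_block, oc_block, i // ic_block, ic_block, kh, kw)
                  .transpose(0, 2, 4, 5, 1, 3))

data = np.random.randint(-128, 127, size=(1, 64, 56, 56), dtype=np.int8)
kernel = np.random.randint(-128, 127, size=(64, 64, 3, 3), dtype=np.int8)
print(pack_nchw_to_nchw4c(data).shape)       # (1, 16, 56, 56, 4)
print(pack_oihw_to_oihw4o4i(kernel).shape)   # (16, 16, 3, 3, 4, 4)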
5 changes: 5 additions & 0 deletions topi/python/topi/cuda/conv2d_direct.py
@@ -2,6 +2,7 @@
"""The templates for cuda conv2d operators"""
import tvm
from tvm import autotvm
from ..util import get_const_tuple

def schedule_direct_cuda(cfg, s, conv):
"""schedule optimized for batch size = 1"""
@@ -94,3 +95,7 @@ def schedule_direct_cuda(cfg, s, conv):
# unroll
s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

N, CO, OH, OW = get_const_tuple(output.shape)
_, KH, KW, CI = get_const_tuple(kernel.shape)
cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
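A worked instance of the FLOP count registered above, with hypothetical shapes: each output element costs CI*KH*KW multiply-accumulates, counted as 2 FLOPs each.

N, CO, OH, OW = 1, 64, 56, 56      # output: batch, out channels, spatial size
CI, KH, KW = 64, 3, 3              # reduction: in channels, kernel size
print(2 * N * OH * OW * CO * CI * KH * KW)   # 231211008, about 0.23 GFLOP per call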