From e7a870215cb4d2163061cb2c9c3e1f2f1be88376 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 23 Aug 2018 15:08:25 -0700 Subject: [PATCH 01/18] Int8 implementation for convolution operator on Intel Skylake --- src/codegen/llvm/codegen_llvm.cc | 6 + tests/python/unittest/test_conv_int8_intel.py | 140 ++++++++++++++++++ topi/python/topi/nn/conv2d.py | 2 +- topi/python/topi/x86/conv2d.py | 102 +++++++++++-- topi/python/topi/x86/conv2d_avx_1x1.py | 108 ++++++++++++++ topi/python/topi/x86/conv2d_avx_common.py | 121 +++++++++++++++ topi/python/topi/x86/int8Intrinsics.py | 95 ++++++++++++ 7 files changed, 564 insertions(+), 10 deletions(-) create mode 100644 tests/python/unittest/test_conv_int8_intel.py create mode 100644 topi/python/topi/x86/int8Intrinsics.py diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index ae576c981395..1a57fb1a34d4 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,6 +688,12 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); + } else if (op->is_intrinsic("broadcast16")){ + llvm::Value *v = MakeValue(op->args[0]); + return CreateBroadcast(v, 16); + } else if (op->is_intrinsic("bitcast")){ + llvm::Type * target = LLVMType(op->type); + return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py new file mode 100644 index 000000000000..62305e914a70 --- /dev/null +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -0,0 +1,140 @@ +import tvm +import topi +import numpy as np +from tvm.contrib import cc +from tvm.contrib import util +import timeit +from collections import namedtuple + +# All the workloads from Resnet except first layer +# Workload is ['height', 'width', 'in_filter', 'out_filter', +# 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + + +workloads = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + (56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + (56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + (56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + (28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + (28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + (28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + (14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + (14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + (14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + (7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + (56, 56, 64, 256, 1, 1, 0, 0, 1, 1), + (56, 56, 256, 64, 1, 1, 0, 0, 1, 1), + (56, 56, 256, 128, 1, 1, 0, 0, 2, 2), + (28, 28, 128, 512, 1, 1, 0, 0, 1, 1), + (56, 56, 256, 512, 1, 1, 0, 0, 2, 2), + (28, 28, 512, 128, 1, 1, 0, 0, 1, 1), + (28, 28, 512, 256, 1, 1, 0, 0, 2, 2), + (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), + (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), + (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), + (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), + (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), + (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), + (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1) + ] + + +target_name = 'llvm -mcpu=skylake-avx512' +avx2_len = 16 +ctx = tvm.context(target_name, 0); + +def getShape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, + hstride, wstride, outDtype): + ## Find shapes + dataShape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) + + if outDtype == 'int32': + if kh != 1: + kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) + else: + kernelShape = (out_filter/avx2_len, 
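The getShape helper above assumes TOPI's NCHW[x]c blocked layout: the channel axis is split into chunks of the vector width, and the chunk's lanes become the innermost axis, so one load of the innermost axis fills exactly one AVX-512 register. A minimal numpy sketch of that relayout; sizes are illustrative and not taken from the patch:

    import numpy as np

    N, C, H, W, c = 1, 64, 56, 56, 16          # c = vector lanes (avx2_len)
    nchw = np.arange(N * C * H * W, dtype=np.float32).reshape(N, C, H, W)
    # Split C into C//c chunks of c channels, then move the c lanes innermost
    nchwc = nchw.reshape(N, C // c, c, H, W).transpose(0, 1, 3, 4, 2)
    assert nchwc.shape == (1, 4, 56, 56, 16)   # == (1, in_filter/avx2_len, H, W, avx2_len)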
in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) + elif outDtype == 'float32': + if kh != 1: + kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len) + else: + kernelShape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw) + out_height = (im_height + 2 * hpad - kh) // hstride + 1 + out_width = (im_width + 2 * wpad - kw) // wstride + 1 + oShape = (1, out_filter/avx2_len, out_height, out_width, avx2_len) + return (dataShape, kernelShape, oShape) + + + +def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filter, + out_filter, kh, kw, hpad, wpad, hstride, wstride): + + (dataShape, kernelShape, oShape) = getShape(im_height, im_width, in_filter, + out_filter, kh, kw, hpad, wpad, + hstride, wstride, outDtype) + + # Create TVM placeholders + data = tvm.placeholder(dataShape, name='data', dtype=dataDtype); + kernel = tvm.placeholder(kernelShape, name='kernel', dtype=kernelDtype); + + # Create the numpy arrays to be used for executing conv models + if dataDtype == 'float32': + a = tvm.nd.array(np.random.rand(*dataShape).astype(dtype=dataDtype), ctx); + b = tvm.nd.array(np.random.rand(*kernelShape).astype(dtype=kernelDtype), ctx); + else: + a = tvm.nd.array(np.random.randint(100, size=dataShape).astype(dataDtype)); + b = tvm.nd.array(np.random.randint(100, size=kernelShape).astype(kernelDtype)); + #a = tvm.nd.array(np.ones(dataShape, dtype='uint8'), ctx); + #b = tvm.nd.array(np.zeros(kernelShape, dtype='int8'), ctx); + + # cOrig will be used for declaration ouptut + # cSch will be used for scheduled computation output + cOrig = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); + cSch = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); + + + with tvm.target.create(target_name): + conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter, + kernel_size=(kh, kw), stride=hstride, + padding=hpad, layout='NCHWc', + out_layout='NCHWc', out_dtype=outDtype); + out = topi.nn.relu(conv) + s = tvm.create_schedule(out.op); + func = tvm.build(s, [data, kernel, out], target=target_name, name='out') + func(a, b, cOrig) + #print(tvm.lower(s, [data, kernel], simple_mode=True)); + + # Generate and run the optimized schedule + sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter, + kernel_size=(kh,kw), + strides=hstride, + padding=hpad, + layout='NCHWc', + out_layout='NCHWc', + outs=[out]); + func = tvm.build(sconv, [data, kernel, out], target=target_name, name='conv') + func(a, b, cSch) + + # Functional check + if dataDtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) + else : assert(np.allclose(cOrig.asnumpy(), cSch.asnumpy())) + + evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) + #print(tvm.lower(sconv, [data, kernel], simple_mode=True)) + return evaluator(a, b, cSch).mean + +if __name__ == "__main__": + print "Workload, kernelSize, FP32_time, INT8_time, Speedup" + speedUps = [] + for i in range(0, len(workloads)): + # workloas[i] -> (im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride) + # Int8 + fpTime = run_inference('float32','float32','float32', *workloads[i]) + int8Time = run_inference('uint8', 'int8', 'int32', *workloads[i]) + kh = workloads[i][4] + kw = workloads[i][5] + print "Workload#" + str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time) + + speedUps.append(fpTime/int8Time) + print("Average speedup --> ", sum(speedUps)/float(len(speedUps))) + + diff --git 
a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index e0d2c403d4b4..1f3c1c1dd379 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -79,7 +79,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - assert data.dtype == kernel.dtype, \ + assert data.dtype == kernel.dtype or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ "Do not support inputs with different data types now. ' \ '{} vs. {}".format(data.dtype, kernel.dtype) return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 721c7c169d99..1634dde05c5f 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -48,6 +48,36 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required + + ## Following are for INT8 kernels + Workload('uint8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2), + Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + # workloads of resnet34_v1 on imagenet, no extra workload required + # workloads of resnet50_v1 on imagenet + Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1), ] fp32_vec_len = 8 @@ -90,6 +120,42 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required + + + # Following are for INT8 operations + # workloads of resnet18_v1 on imagenet + AVXConvCommonFwd(3, fp32_vec_len, 28, False), #TODO + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 
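Each INT8 Workload entry above is matched by list position against the schedule list added below; the lookup in _get_schedule_conv relies on the Workload namedtuple being hashable and on the two lists staying index-aligned. A condensed sketch of that mechanism, with field names taken from _get_workload and illustrative registry contents:

    from collections import namedtuple

    Workload = namedtuple('Workload',
                          ['in_dtype', 'out_dtype', 'height', 'width',
                           'in_filter', 'out_filter', 'hkernel', 'wkernel',
                           'hpad', 'wpad', 'hstride', 'wstride'])
    _WORKLOADS = [Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)]
    _SCHEDULES = ['schedule for workload 0']   # parallel list, same index
    wkl = Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)
    if wkl not in _WORKLOADS:
        raise ValueError("no schedule for such workload: {}".format(wkl))
    sch = _SCHEDULES[_WORKLOADS.index(wkl)]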
28), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, False), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, True), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 7), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True), + # workloads of resnet34_v1 on imagenet, no extra workload required + # workloads of resnet50_v1 on imagenet + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + # workloads of resnet101_v1 on imagenet, no extra workload required + # workloads of resnet152_v1 on imagenet, no extra workload required + # workloads of resnet18_v2 on imagenet, no extra workload required + # workloads of resnet34_v2 on imagenet, no extra workload required ] if wkl not in _WORKLOADS_AVX: @@ -169,11 +235,20 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc, AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc } + + # Use int8 schedules if the input data is of int8 dtype + if data.dtype == 'uint8': + _AVX_SCH_TO_DECL_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc_int8, + AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc_int8 + } + n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block kh, kw = kernel_size - wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=out_dtype), - tvm.placeholder((num_filter, ic, kh, kw), dtype=out_dtype), + wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), stride, padding, out_dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) @@ -289,10 +364,6 @@ def traverse(op): def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, layout, out_layout, outs): """Create schedule for tensors""" - _AVX_SCH_TO_SCH_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, - AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc - } s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] @@ -316,13 +387,26 @@ def traverse(op): if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] - + + _AVX_SCH_TO_SCH_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, + AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc + } + + # Use int8 schedules if the input data is of int8 dtype + if data.dtype == 'uint8': + _AVX_SCH_TO_SCH_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8, + AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8 + } + n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block - 
original_data = tvm.placeholder((n, ic, h, w), dtype=conv_out.dtype) + original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype) kh, kw = kernel_size - original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=conv_out.dtype) + original_kernel = tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype) wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 7d820701e1f4..4e7491bd95d8 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -8,6 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad +from .int8Intrinsics import _intrin_reduce4int8_1x1 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) @@ -229,3 +230,110 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): s[O].parallel(parallel_axis) return s + + +def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): + """ Declaration for int8 conv""" + out_dtype = wkl.out_dtype + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + + batch_size = data.shape[0] + out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + + DOPAD = (HPAD != 0 or WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") + else: + data_pad = data + + oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) + + # Intel performs dot product of 2 "4" Int8 values + n_elems = 4 + assert(sch.ic_bn%4 == 0) + ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') + ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') + ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') + + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, ic_f_inner, oc_block, ic_s_inner, 0, 0].astype(out_dtype), + axis=[ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', + tag="conv2d_NCHWc_int8") + + + return conv + + +def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): + """ + Defines the schedule for INT8 for intel machines + Uses the Intel intrinsics to use INT8 operations + More details - https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + """ + + target = tvm.target.current_target(allow_none=False) + avx2_len = -1 + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + avx2_len = 16 + else: + return s + assert(avx2_len != -1) + + # schedule data + A = data + if isinstance(s[A].op, tvm.tensor.ComputeOp): + batch, ic_chunk, ih, iw, ic_block = s[A].op.axis + parallel_axis = s[A].fuse(ic_chunk, ih) + s[A].parallel(parallel_axis) + + C, O = conv_out, last + CC = s.cache_write(C, 'global') + + batch, oc_chunk, oh, ow, oc_block = s[C].op.axis + oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor) + s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + s[C].vectorize(oc_block) + + parallel_axis = s[C].fuse(oc_chunk, oh_outer) + s[CC].compute_at(s[C], parallel_axis) + if C == O: + s[C].parallel(parallel_axis) + + 
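In _declaration_conv_NCHWc_int8 above, the input-channel reduction is deliberately split three ways so the innermost four channels map onto one hardware dot product; the flat channel index is recovered as ic_outer*ic_bn + ic_f_inner*n_elems + ic_s_inner. A quick self-contained check of that decomposition, with illustrative sizes:

    ic_bn, n_elems, in_filter = 16, 4, 64
    flat = []
    for ic_outer in range(in_filter // ic_bn):        # which 16-channel chunk
        for ic_f_inner in range(ic_bn // n_elems):    # which group of 4 in the chunk
            for ic_s_inner in range(n_elems):         # position inside the group of 4
                flat.append(ic_outer * ic_bn + ic_f_inner * n_elems + ic_s_inner)
    assert flat == list(range(in_filter))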
_, oc_chunk, oh, ow, oc_block = s[CC].op.axis + ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis + + # Sylake and future processors have 16 vector lanes + assert(sch.oc_bn % avx2_len == 0) + + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + + oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) + + s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_outer, ic_f_inner, oh_inner, + ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) + s[CC].fuse(oc_chunk, oh_outer) + + n_elems = 4 + pc = _intrin_reduce4int8_1x1(avx2_len, n_elems) + s[CC].tensorize(oc_s_inner, pc) + s[CC].unroll(ow_inner) + s[CC].unroll(oh_inner) + + if C != O: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + + parallel_axis = s[O].fuse(oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + + return s diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 8f8086fdebb4..488ee0e41249 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -8,6 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad +from .int8Intrinsics import _intrin_reduce4int8_common AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) @@ -252,3 +253,123 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): s[O].parallel(parallel_axis) return s + + +def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): + """ + This function sets up the compute for INT8 conv 2d + Inputs are in INT8 datatype + Ouptut is in INT32 datatype + """ + + out_dtype = wkl.out_dtype + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + + batch_size = data.shape[0] + out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + + # pack data + DOPAD = (HPAD != 0 or WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") + else: + data_pad = data + + # convolution + oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) + kh = tvm.reduce_axis((0, wkl.hkernel), name='kh') + kw = tvm.reduce_axis((0, wkl.wkernel), name='kw') + + # Intel performs dot product of 2 "4" Int8 values + # Current implementation requires ic_bn to be a multiple of 4 + n_elems = 4 + assert(sch.ic_bn%4 == 0) + + ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') + ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') + ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), + axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), + name='conv2d_NCHWc_int8', + tag="conv2d_NCHWc_int8") + return conv + +def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): + """ + Defines the schedule for INT8 for intel machines + Uses the Intel intrinsics to use INT8 operations + More details - 
https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + """ + + # Currently INT8 operations are supported for only Skylake + # In future the _intrin_reduce4int8 will be updated for VNNI instructions + # In case of unsupported target, the schedule will go to the original + # compute + + target = tvm.target.current_target(allow_none=False) + avx2_len = -1 + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + avx2_len = 16 + else: + return s + assert(avx2_len != -1) + + A = data + if isinstance(s[A].op, tvm.tensor.ComputeOp): + batch, ic_chunk, ih, iw, _ = s[A].op.axis + parallel_axis = s[A].fuse(ic_chunk, ih) + s[A].parallel(parallel_axis) + + # schedule 5-D NCHW[x]c conv + C, O = conv_out, last + CC = s.cache_write(C, 'global') + + _, oc_chunk, oh, ow, oc_block = s[C].op.axis + ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n) + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + if C == O: + s[C].parallel(parallel_axis) + + s[CC].compute_at(s[C], ow_chunk) + _, oc_chunk, oh, ow, oc_block = s[CC].op.axis + kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis + + ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) + + # Sylake and future processors have 16 vector lanes + assert(sch.oc_bn % avx2_len == 0) + + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + + if sch.unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw, + ow_block, oc_f_inner, oc_s_inner, ic_s_inner) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, kw, ic_f_inner, + ow_block, oc_f_inner, oc_s_inner, ic_s_inner) + + + n_elems = 4 + pc = _intrin_reduce4int8_common(avx2_len, n_elems) + s[CC].tensorize(oc_s_inner, pc) + s[CC].unroll(ow_block) + s[CC].unroll(oc_f_inner) + + if C != O: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + + return s diff --git a/topi/python/topi/x86/int8Intrinsics.py b/topi/python/topi/x86/int8Intrinsics.py new file mode 100644 index 000000000000..abf6e3ace607 --- /dev/null +++ b/topi/python/topi/x86/int8Intrinsics.py @@ -0,0 +1,95 @@ +"""Core kernel of dot product of 4 Int8 operations""" +import tvm + + +def _intrin_reduce4int8_common(vec_size, num_elements_intel): + A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') + B = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='B') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), \ + lambda i: tvm.sum(\ + A[k].astype('int32') * B[i, k].astype('int32'), \ + axis=k), name="C") + s = tvm.create_schedule(C.op) + + Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", + offset_factor=1, + strides=[1]) + Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", + offset_factor=1, + strides=[tvm.var('ldw'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + A_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) + vecA_i32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); + vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) + vecB = 
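The intrinsic body being assembled here reduces to two AVX-512 instructions: vpmaddubsw (unsigned byte times signed byte, adjacent pairs summed to int16) followed by vpmaddwd against a vector of ones (adjacent int16 pairs summed to int32). The numpy sketch below emulates what that pair computes per 512-bit vector; value ranges are kept small so the int16 saturation of vpmaddubsw, which the emulation ignores, cannot trigger:

    import numpy as np

    rng = np.random.RandomState(0)
    a = rng.randint(0, 100, 64).astype(np.uint8)    # data bytes, broadcast 16x
    b = rng.randint(-100, 100, 64).astype(np.int8)  # kernel bytes

    # vpmaddubsw: u8 x s8 products, adjacent pairs summed -> 32 int16 lanes
    pairs = (a.astype(np.int16) * b.astype(np.int16)).reshape(32, 2).sum(axis=1)
    # vpmaddwd against a vector of ones: adjacent int16 pairs -> 16 int32 lanes
    quads = pairs.astype(np.int32).reshape(16, 2).sum(axis=1)

    expected = (a.astype(np.int32) * b.astype(np.int32)).reshape(16, 4).sum(axis=1)
    assert (quads == expected).all()                # 16 four-wide dot products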
ins[1].vload([0, 0], "int8x64") + vecOne = tvm.const(1, "int16x32") + pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) + quadReduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), \ + pairReduction, vecOne); + vecC = outs[0].vload([0], "int32x16") + out = quadReduction + vecC + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) + +def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): + A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') + B = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='B') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), \ + lambda i: tvm.sum(\ + A[k].astype('int32') * B[i, k, 0, 0].astype('int32'), \ + axis=k), name="C") + s = tvm.create_schedule(C.op) + + Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", + offset_factor=1, + strides=[1]) + Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", + offset_factor=1, + strides=[tvm.var('ldw'), tvm.var('ldw'), tvm.var('ldw'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + A_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) + vecA_i32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); + vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) + vecB = ins[1].vload([0, 0, 0, 0], "int8x64") + vecOne = tvm.const(1, "int16x32") + pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) + quadReduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), \ + pairReduction, vecOne); + vecC = outs[0].vload([0], "int32x16") + out = quadReduction + vecC + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) From b268b7f5fb661e86eac263a41bdaa576635f42cf Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 23 Aug 2018 15:08:25 -0700 Subject: [PATCH 02/18] Int8 implementation for convolution operator on Intel Skylake --- topi/python/topi/x86/conv2d.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 1634dde05c5f..735b8400e5a5 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -50,7 +50,6 @@ def _get_schedule_conv(wkl): # workloads of resnet34_v2 on imagenet, no extra workload required ## Following are for INT8 kernels - Workload('uint8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2), Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), @@ -124,7 +123,6 @@ def _get_schedule_conv(wkl): # Following are for INT8 operations # workloads of resnet18_v1 on imagenet - AVXConvCommonFwd(3, fp32_vec_len, 28, False), #TODO AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 
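Putting the pieces of patch 01 together, the loop nest that the common int8 schedule arranges (cache_write, the reg_n and oc_bn splits, the reorder, the unrolls, and the tensorized core) comes out roughly as below; plain Python loops with illustrative sizes, not TVM API:

    out_filter, in_filter, out_h, out_w = 64, 64, 28, 28
    oc_bn, ic_bn, reg_n, n_elems, lanes = 16, 16, 14, 4, 16
    kernel_h, kernel_w = 3, 3

    for oc_chunk in range(out_filter // oc_bn):          # fused with oh, run in parallel
        for oh in range(out_h):
            for ow_chunk in range(out_w // reg_n):
                for ic_outer in range(in_filter // ic_bn):
                    for kh in range(kernel_h):
                        for kw in range(kernel_w):       # hoisted and unrolled when unroll_kw is set
                            for ic_f_inner in range(ic_bn // n_elems):
                                for ow_block in range(reg_n):                # unrolled
                                    for oc_f_inner in range(oc_bn // lanes): # unrolled
                                        pass  # tensorized core: 16 int32 lanes, each a
                                              # 4-wide uint8 x int8 dot product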
1, 28), AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), From 58e9fbb8ed84dd390948b1aa587b6f34ac18b25f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 15:16:37 -0700 Subject: [PATCH 03/18] PR changes --- tests/python/unittest/test_conv_int8_intel.py | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index 62305e914a70..e6113a426cf0 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -1,16 +1,17 @@ import tvm import topi import numpy as np -from tvm.contrib import cc -from tvm.contrib import util import timeit -from collections import namedtuple +import logging +import sys + +logging.basicConfig(stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger('test_conv_int8_intel') +logger.disabled = True # All the workloads from Resnet except first layer # Workload is ['height', 'width', 'in_filter', 'out_filter', # 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) - - workloads = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), (56, 56, 64, 64, 1, 1, 0, 0, 1, 1), (56, 56, 64, 128, 3, 3, 1, 1, 2, 2), @@ -41,67 +42,65 @@ target_name = 'llvm -mcpu=skylake-avx512' avx2_len = 16 -ctx = tvm.context(target_name, 0); +ctx = tvm.context(target_name, 0) -def getShape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, - hstride, wstride, outDtype): +def get_shape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, + hstride, wstride, out_dtype): ## Find shapes - dataShape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) + data_shape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) - if outDtype == 'int32': + if out_dtype == 'int32': if kh != 1: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) else: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) - elif outDtype == 'float32': + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) + elif out_dtype == 'float32': if kh != 1: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len) + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len) else: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw) + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw) out_height = (im_height + 2 * hpad - kh) // hstride + 1 out_width = (im_width + 2 * wpad - kw) // wstride + 1 - oShape = (1, out_filter/avx2_len, out_height, out_width, avx2_len) - return (dataShape, kernelShape, oShape) + o_shape = (1, out_filter/avx2_len, out_height, out_width, avx2_len) + return (data_shape, kernel_shape, o_shape) -def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filter, +def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride): - (dataShape, kernelShape, oShape) = getShape(im_height, im_width, in_filter, + (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, - hstride, wstride, outDtype) + hstride, wstride, out_dtype) # Create TVM placeholders - data = tvm.placeholder(dataShape, name='data', dtype=dataDtype); - kernel = 
tvm.placeholder(kernelShape, name='kernel', dtype=kernelDtype); + data = tvm.placeholder(data_shape, name='data', dtype=data_dtype) + kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) # Create the numpy arrays to be used for executing conv models - if dataDtype == 'float32': - a = tvm.nd.array(np.random.rand(*dataShape).astype(dtype=dataDtype), ctx); - b = tvm.nd.array(np.random.rand(*kernelShape).astype(dtype=kernelDtype), ctx); + if data_dtype == 'float32': + a = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), ctx) + b = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), ctx) else: - a = tvm.nd.array(np.random.randint(100, size=dataShape).astype(dataDtype)); - b = tvm.nd.array(np.random.randint(100, size=kernelShape).astype(kernelDtype)); - #a = tvm.nd.array(np.ones(dataShape, dtype='uint8'), ctx); - #b = tvm.nd.array(np.zeros(kernelShape, dtype='int8'), ctx); + a = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) + b = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # cOrig will be used for declaration ouptut # cSch will be used for scheduled computation output - cOrig = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); - cSch = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); + cOrig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx) + cSch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx) with tvm.target.create(target_name): conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter, kernel_size=(kh, kw), stride=hstride, padding=hpad, layout='NCHWc', - out_layout='NCHWc', out_dtype=outDtype); + out_layout='NCHWc', out_dtype=out_dtype) out = topi.nn.relu(conv) - s = tvm.create_schedule(out.op); + s = tvm.create_schedule(out.op) func = tvm.build(s, [data, kernel, out], target=target_name, name='out') func(a, b, cOrig) - #print(tvm.lower(s, [data, kernel], simple_mode=True)); + logger.debug(tvm.lower(s, [data, kernel], simple_mode=True)) # Generate and run the optimized schedule sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter, @@ -110,20 +109,20 @@ def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filt padding=hpad, layout='NCHWc', out_layout='NCHWc', - outs=[out]); + outs=[out]) func = tvm.build(sconv, [data, kernel, out], target=target_name, name='conv') func(a, b, cSch) # Functional check - if dataDtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) + if data_dtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) else : assert(np.allclose(cOrig.asnumpy(), cSch.asnumpy())) evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) - #print(tvm.lower(sconv, [data, kernel], simple_mode=True)) + logger.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(a, b, cSch).mean if __name__ == "__main__": - print "Workload, kernelSize, FP32_time, INT8_time, Speedup" + logger.info("Workload, kernelSize, FP32_time, INT8_time, Speedup") speedUps = [] for i in range(0, len(workloads)): # workloas[i] -> (im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride) @@ -132,9 +131,9 @@ def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filt int8Time = run_inference('uint8', 'int8', 'int32', *workloads[i]) kh = workloads[i][4] kw = workloads[i][5] - print "Workload#" + str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time) + logger.info("Workload#" 
+ str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time)) speedUps.append(fpTime/int8Time) - print("Average speedup --> ", sum(speedUps)/float(len(speedUps))) + logger.info("Average speedup --> ", sum(speedUps)/float(len(speedUps))) From 541d1550fa78a995d68e812b2c0c282c9aa152ef Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 16:28:37 -0700 Subject: [PATCH 04/18] PR changes --- src/codegen/llvm/codegen_llvm.cc | 2 +- tests/python/unittest/test_conv_int8_intel.py | 138 ++++++++++-------- topi/python/topi/nn/conv2d.py | 2 +- 3 files changed, 76 insertions(+), 66 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 1ef825cf4785..9c788521cdfe 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -692,7 +692,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { llvm::Value *v = MakeValue(op->args[0]); return CreateBroadcast(v, 16); } else if (op->is_intrinsic("bitcast")){ - llvm::Type * target = LLVMType(op->type); + llvm::Type* target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { LOG(FATAL) << "unknown intrinsic " << op->name; diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index e6113a426cf0..1887a65cb236 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -1,18 +1,19 @@ +#pylint: disable-msg=too-many-arguments, too-many-locals, assignment-from-no-return +""" Conv Int8 functional and performance testing""" +import sys +import logging +import numpy as np import tvm import topi -import numpy as np -import timeit -import logging -import sys logging.basicConfig(stream=sys.stdout, level=logging.INFO) -logger = logging.getLogger('test_conv_int8_intel') -logger.disabled = True +LOGGER = logging.getLogger('test_conv_int8_intel') +LOGGER.disabled = True -# All the workloads from Resnet except first layer +# All the WORKLOADS from Resnet except first layer # Workload is ['height', 'width', 'in_filter', 'out_filter', # 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) -workloads = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), +WORKLOADS = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), (56, 56, 64, 64, 1, 1, 0, 0, 1, 1), (56, 56, 64, 128, 3, 3, 1, 1, 2, 2), (56, 56, 64, 128, 1, 1, 0, 0, 2, 2), @@ -40,38 +41,48 @@ ] -target_name = 'llvm -mcpu=skylake-avx512' -avx2_len = 16 -ctx = tvm.context(target_name, 0) +TARGET_NAME = 'llvm -mcpu=skylake-avx512' +NUM_VEC_LANES = 16 +CTX = tvm.context(TARGET_NAME, 0) -def get_shape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, - hstride, wstride, out_dtype): +def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, + hstride, wstride, out_dtype): + """ + Finds out the shape of all data structures + """ ## Find shapes - data_shape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) + data_shape = (1, in_filter/NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES) if out_dtype == 'int32': - if kh != 1: - kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) + if k_h != 1: + kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w, + NUM_VEC_LANES/4, NUM_VEC_LANES, 4) else: - kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) + kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES/4, + 
NUM_VEC_LANES, 4, k_h, k_w)
     elif out_dtype == 'float32':
-        if kh != 1:
-            kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len)
+        if k_h != 1:
+            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w,
+                            NUM_VEC_LANES, NUM_VEC_LANES)
         else:
-            kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw)
-    out_height = (im_height + 2 * hpad - kh) // hstride + 1
-    out_width = (im_width + 2 * wpad - kw) // wstride + 1
-    o_shape = (1, out_filter/avx2_len, out_height, out_width, avx2_len)
+            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES,
+                            NUM_VEC_LANES, k_h, k_w)
+    out_height = (im_height + 2 * hpad - k_h) // hstride + 1
+    out_width = (im_width + 2 * wpad - k_w) // wstride + 1
+    o_shape = (1, out_filter/NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
     return (data_shape, kernel_shape, o_shape)
 
 
 def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
-                  out_filter, kh, kw, hpad, wpad, hstride, wstride):
-
+                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
+    """
+    Runs the inference and checks the functional correctness between
+    compute and schedule outputs
+    """
     (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
-                                                    out_filter, kh, kw, hpad, wpad,
-                                                    hstride, wstride, out_dtype)
+                                                    out_filter, k_h, k_w, hpad, wpad,
+                                                    hstride, wstride, out_dtype)
 
     # Create TVM placeholders
     data = tvm.placeholder(data_shape, name='data', dtype=data_dtype)
@@ -79,61 +90,60 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f
 
     # Create the numpy arrays to be used for executing conv models
     if data_dtype == 'float32':
-        a = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), ctx)
-        b = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), ctx)
+        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
+        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
     else:
-        a = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
-        b = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))
+        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
+        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))
 
-    # cOrig will be used for declaration ouptut
-    # cSch will be used for scheduled computation output
-    cOrig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx)
-    cSch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx)
+    # c_orig will be used for declaration output
+    # c_sch will be used for scheduled computation output
+    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
+    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
 
     with tvm.target.create(TARGET_NAME):
         conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter,
-                                    kernel_size=(kh, kw), stride=hstride,
+                                    kernel_size=(k_h, k_w), stride=hstride,
                                     padding=hpad, layout='NCHWc',
                                     out_layout='NCHWc', out_dtype=out_dtype)
         out = topi.nn.relu(conv)
-        s = tvm.create_schedule(out.op)
-        func = tvm.build(s, [data, kernel, out], target=target_name, name='out')
-        func(a, b, cOrig)
-        logger.debug(tvm.lower(s, [data, kernel], simple_mode=True))
+        sch = tvm.create_schedule(out.op)
+        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
+        func(data_array, kernel_array, c_orig)
+        
LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True)) # Generate and run the optimized schedule sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter, - kernel_size=(kh,kw), + kernel_size=(k_h, k_w), strides=hstride, padding=hpad, layout='NCHWc', out_layout='NCHWc', outs=[out]) - func = tvm.build(sconv, [data, kernel, out], target=target_name, name='conv') - func(a, b, cSch) + func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv') + func(data_array, kernel_array, c_sch) # Functional check - if data_dtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) - else : assert(np.allclose(cOrig.asnumpy(), cSch.asnumpy())) + if data_dtype == 'uint8': + np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy()) + else: + assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) - evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) - logger.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) - return evaluator(a, b, cSch).mean + evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) + LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) + return evaluator(data_array, kernel_array, c_sch).mean if __name__ == "__main__": - logger.info("Workload, kernelSize, FP32_time, INT8_time, Speedup") - speedUps = [] - for i in range(0, len(workloads)): - # workloas[i] -> (im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride) - # Int8 - fpTime = run_inference('float32','float32','float32', *workloads[i]) - int8Time = run_inference('uint8', 'int8', 'int32', *workloads[i]) - kh = workloads[i][4] - kw = workloads[i][5] - logger.info("Workload#" + str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time)) - - speedUps.append(fpTime/int8Time) - logger.info("Average speedup --> ", sum(speedUps)/float(len(speedUps))) - - + LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup") + SPEEDUP_ARRAY = [] + for i in enumerate(len(WORKLOADS)): + fp32_time = run_inference('float32', 'float32', 'float32', *WORKLOADS[i]) + int8_time = run_inference('uint8', 'int8', 'int32', *WORKLOADS[i]) + kernel_h = WORKLOADS[i][4] + kernel_w = WORKLOADS[i][5] + LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", " + + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) + + SPEEDUP_ARRAY.append(fp32_time/int8_time) + LOGGER.info("Average speedup --> %s" % sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 1f3c1c1dd379..809a05851825 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -79,7 +79,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - assert data.dtype == kernel.dtype or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ + assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ "Do not support inputs with different data types now. ' \ '{} vs. 
{}".format(data.dtype, kernel.dtype) return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) From 314333d75b04312ce47d25df85d901c1aec3a89e Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 16:59:05 -0700 Subject: [PATCH 05/18] PR changes --- tests/python/unittest/test_conv_int8_intel.py | 10 +- topi/python/topi/x86/conv2d_avx_common.py | 2 +- topi/python/topi/x86/int8Intrinsics.py | 95 ---------------- topi/python/topi/x86/int8_intrinsics.py | 104 ++++++++++++++++++ 4 files changed, 110 insertions(+), 101 deletions(-) delete mode 100644 topi/python/topi/x86/int8Intrinsics.py create mode 100644 topi/python/topi/x86/int8_intrinsics.py diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index 1887a65cb236..e50a583df51f 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -137,11 +137,11 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f if __name__ == "__main__": LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup") SPEEDUP_ARRAY = [] - for i in enumerate(len(WORKLOADS)): - fp32_time = run_inference('float32', 'float32', 'float32', *WORKLOADS[i]) - int8_time = run_inference('uint8', 'int8', 'int32', *WORKLOADS[i]) - kernel_h = WORKLOADS[i][4] - kernel_w = WORKLOADS[i][5] + for i, wkl in enumerate(WORKLOADS): + fp32_time = run_inference('float32', 'float32', 'float32', *wkl) + int8_time = run_inference('uint8', 'int8', 'int32', *wkl) + kernel_h = wkl[4] + kernel_w = wkl[5] LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", " + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 488ee0e41249..f014aa5719a2 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -8,7 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad -from .int8Intrinsics import _intrin_reduce4int8_common +from .int8_intrinsics import _intrin_reduce4int8_common AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) diff --git a/topi/python/topi/x86/int8Intrinsics.py b/topi/python/topi/x86/int8Intrinsics.py deleted file mode 100644 index abf6e3ace607..000000000000 --- a/topi/python/topi/x86/int8Intrinsics.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Core kernel of dot product of 4 Int8 operations""" -import tvm - - -def _intrin_reduce4int8_common(vec_size, num_elements_intel): - A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') - B = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='B') - k = tvm.reduce_axis((0, 4), name='k') - C = tvm.compute((vec_size,), \ - lambda i: tvm.sum(\ - A[k].astype('int32') * B[i, k].astype('int32'), \ - axis=k), name="C") - s = tvm.create_schedule(C.op) - - Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", - offset_factor=1, - strides=[1]) - Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", - offset_factor=1, - strides=[tvm.var('ldw'), 1]) - - def _intrin_func(ins, outs): - def _instr(index): - ib = tvm.ir_builder.create() - if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) - return ib.get() - - A_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) - vecA_i32 = 
tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); - vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) - vecB = ins[1].vload([0, 0], "int8x64") - vecOne = tvm.const(1, "int16x32") - pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) - quadReduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.pmaddw.d.512', - tvm.const(0, 'uint32'), \ - pairReduction, vecOne); - vecC = outs[0].vload([0], "int32x16") - out = quadReduction + vecC - ib.emit(outs[0].vstore(0, out)) - return ib.get() - - # body, reset, update - return _instr(0), _instr(1), _instr(2) - - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) - -def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): - A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') - B = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='B') - k = tvm.reduce_axis((0, 4), name='k') - C = tvm.compute((vec_size,), \ - lambda i: tvm.sum(\ - A[k].astype('int32') * B[i, k, 0, 0].astype('int32'), \ - axis=k), name="C") - s = tvm.create_schedule(C.op) - - Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", - offset_factor=1, - strides=[1]) - Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", - offset_factor=1, - strides=[tvm.var('ldw'), tvm.var('ldw'), tvm.var('ldw'), 1]) - - def _intrin_func(ins, outs): - def _instr(index): - ib = tvm.ir_builder.create() - if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) - return ib.get() - - A_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) - vecA_i32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); - vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) - vecB = ins[1].vload([0, 0, 0, 0], "int8x64") - vecOne = tvm.const(1, "int16x32") - pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) - quadReduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.pmaddw.d.512', - tvm.const(0, 'uint32'), \ - pairReduction, vecOne); - vecC = outs[0].vload([0], "int32x16") - out = quadReduction + vecC - ib.emit(outs[0].vstore(0, out)) - return ib.get() - - # body, reset, update - return _instr(0), _instr(1), _instr(2) - - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py new file mode 100644 index 000000000000..7fb948e23db6 --- /dev/null +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -0,0 +1,104 @@ +"""Core kernel of dot product of 4 Int8 operations""" +import tvm + + +def _intrin_reduce4int8_common(vec_size, num_elements_intel): + data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') + kernel = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='kernel') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), + lambda i: tvm.sum(data[k].astype('int32') * + kernel[i, k].astype('int32'), + axis=k), + name="C") + + a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1, + strides=[tvm.var('ldw'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if 
index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + a_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_b = ins[1].vload([0, 0], "int8x64") + vec_one = tvm.const(1, "int16x32") + pair_reduction = tvm.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + tvm.const(0, 'uint32'), + vec_a, vec_b) + quad_reduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), + pair_reduction, vec_one) + vec_c = outs[0].vload([0], "int32x16") + out = quad_reduction + vec_c + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) + +def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): + data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') + kernel = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='kernel') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), \ + lambda i: tvm.sum(data[k].astype('int32') * + kernel[i, k, 0, 0].astype('int32'), + axis=k), + name="C") + + a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1, + strides=[tvm.var('ldw'), + tvm.var('ldw'), + tvm.var('ldw'), 1] + ) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + a_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_b = ins[1].vload([0, 0, 0, 0], "int8x64") + vec_one = tvm.const(1, "int16x32") + pair_reduction = tvm.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + tvm.const(0, 'uint32'), + vec_a, vec_b) + quad_reduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), \ + pair_reduction, vec_one) + vec_c = outs[0].vload([0], "int32x16") + out = quad_reduction + vec_c + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) From b24726fb690d885334e52d9c0953358b1a582134 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 17:04:57 -0700 Subject: [PATCH 06/18] Fixing an error --- tests/python/unittest/test_conv_int8_intel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index e50a583df51f..5d77324ba503 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -146,4 +146,4 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) 
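The bitcast -> broadcast16 -> bitcast chain in the renamed intrinsics file does nothing more than replicate the four data bytes across all sixteen 32-bit lanes before the multiply. In numpy terms, as an illustration (independent of endianness, since the bytes are viewed back the same way they were viewed in):

    import numpy as np

    four = np.array([1, 2, 3, 4], dtype=np.uint8)
    as_i32 = four.view(np.int32)       # 'bitcast': uint8x4 -> int32
    vec16 = np.repeat(as_i32, 16)      # 'broadcast16': int32 -> int32x16
    as_bytes = vec16.view(np.uint8)    # 'bitcast': int32x16 -> 64 bytes
    assert (as_bytes.reshape(16, 4) == four).all()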
SPEEDUP_ARRAY.append(fp32_time/int8_time) - LOGGER.info("Average speedup --> %s" % sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))) + LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY)))) From 6d4aac21c53458b654188b8dc2d24ca622126eeb Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 17:04:57 -0700 Subject: [PATCH 07/18] Fixing an error --- src/codegen/llvm/codegen_llvm.cc | 4 +-- tests/python/unittest/test_conv_int8_intel.py | 2 +- topi/python/topi/x86/conv2d.py | 16 ++++----- topi/python/topi/x86/conv2d_avx_1x1.py | 36 ++++++++++--------- topi/python/topi/x86/conv2d_avx_common.py | 33 +++++++++-------- topi/python/topi/x86/int8_intrinsics.py | 1 + 6 files changed, 50 insertions(+), 42 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 9c788521cdfe..f8b402d78b03 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,10 +688,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->is_intrinsic("broadcast16")){ + } else if (op->is_intrinsic("broadcast16")) { llvm::Value *v = MakeValue(op->args[0]); return CreateBroadcast(v, 16); - } else if (op->is_intrinsic("bitcast")){ + } else if (op->is_intrinsic("bitcast")) { llvm::Type* target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index e50a583df51f..5d77324ba503 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -146,4 +146,4 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) SPEEDUP_ARRAY.append(fp32_time/int8_time) - LOGGER.info("Average speedup --> %s" % sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))) + LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY)))) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 735b8400e5a5..dbc4b678c19a 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -48,7 +48,7 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required - + ## Following are for INT8 kernels Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), @@ -119,8 +119,8 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required - - + + # Following are for INT8 operations # workloads of resnet18_v1 on imagenet AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), @@ -233,7 +233,7 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc, AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc } - + # Use int8 schedules if the input data is of int8 dtype if data.dtype == 'uint8': _AVX_SCH_TO_DECL_FUNC = { @@ -246,7 +246,7 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, kh, kw 
= kernel_size wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype), + dtype=kernel.dtype), stride, padding, out_dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) @@ -385,7 +385,7 @@ def traverse(op): if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] - + _AVX_SCH_TO_SCH_FUNC = { AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc @@ -397,14 +397,14 @@ def traverse(op): AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8, AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8 } - + n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype) kh, kw = kernel_size original_kernel = tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype) + dtype=kernel.dtype) wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 4e7491bd95d8..20dd162f3ee8 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -252,16 +252,19 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): # Intel performs dot product of 2 "4" Int8 values n_elems = 4 - assert(sch.ic_bn%4 == 0) + assert sch.ic_bn%4 == 0 ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') - + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * - kernel[oc_chunk, ic_outer, ic_f_inner, oc_block, ic_s_inner, 0, 0].astype(out_dtype), - axis=[ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', - tag="conv2d_NCHWc_int8") + tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR, + ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, ic_f_inner, + oc_block, ic_s_inner, 0, 0].astype(out_dtype), + axis=[ic_outer, ic_f_inner, ic_s_inner]), + name='conv2d_NCHWc_int8', + tag="conv2d_NCHWc_int8") return conv @@ -271,9 +274,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): """ Defines the schedule for INT8 for intel machines Uses the Intel intrinsics to use INT8 operations - More details - https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + More details - https://software.intel.com/en-us/articles/ + lower-numerical-precision-deep-learning-inference-and-training """ - + target = tvm.target.current_target(allow_none=False) avx2_len = -1 for opt in target.options: @@ -281,8 +285,8 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): avx2_len = 16 else: return s - assert(avx2_len != -1) - + assert avx2_len != -1 + # schedule data A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): @@ -306,19 +310,19 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis - - # Sylake and future processors have 16 vector lanes - assert(sch.oc_bn % 
avx2_len == 0) - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + # Sylake and future processors have 16 vector lanes + assert sch.oc_bn % avx2_len == 0 + + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_outer, ic_f_inner, oh_inner, - ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) + ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) s[CC].fuse(oc_chunk, oh_outer) - + n_elems = 4 pc = _intrin_reduce4int8_1x1(avx2_len, n_elems) s[CC].tensorize(oc_s_inner, pc) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index f014aa5719a2..e59a6132d9f1 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -261,7 +261,7 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): Inputs are in INT8 datatype Ouptut is in INT32 datatype """ - + out_dtype = wkl.out_dtype HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride @@ -285,15 +285,17 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): # Intel performs dot product of 2 "4" Int8 values # Current implementation requires ic_bn to be a multiple of 4 n_elems = 4 - assert(sch.ic_bn%4 == 0) - + assert sch.ic_bn%4 == 0 + ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * - kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), - axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), + tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw, + ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, + oc_block, ic_s_inner].astype(out_dtype), + axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") return conv @@ -302,9 +304,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): """ Defines the schedule for INT8 for intel machines Uses the Intel intrinsics to use INT8 operations - More details - https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + More details - https://software.intel.com/en-us/articles/ + lower-numerical-precision-deep-learning-inference-and-training """ - + # Currently INT8 operations are supported for only Skylake # In future the _intrin_reduce4int8 will be updated for VNNI instructions # In case of unsupported target, the schedule will go to the original @@ -317,8 +320,8 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): avx2_len = 16 else: return s - assert(avx2_len != -1) - + assert avx2_len != -1 + A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): batch, ic_chunk, ih, iw, _ = s[A].op.axis @@ -342,11 +345,11 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) - - # Sylake and future processors have 16 vector lanes - assert(sch.oc_bn % avx2_len == 0) - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + # Sylake and 
future processors have 16 vector lanes
-    assert(sch.oc_bn % avx2_len == 0)
-    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len);
+    # Sylake and future processors have 16 vector lanes
+    assert sch.oc_bn % avx2_len == 0
+
+    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len)

     if sch.unroll_kw:
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw,
@@ -356,7 +359,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, kw, ic_f_inner,
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
-
+
     n_elems = 4
     pc = _intrin_reduce4int8_common(avx2_len, n_elems)
     s[CC].tensorize(oc_s_inner, pc)
diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py
index 7fb948e23db6..0d9f5987804b 100644
--- a/topi/python/topi/x86/int8_intrinsics.py
+++ b/topi/python/topi/x86/int8_intrinsics.py
@@ -1,4 +1,5 @@
 """Core kernel of dot product of 4 Int8 operations"""
+#pylint: disable=invalid-name
 import tvm
From 4cd7f3066a6e162398d9bd5bc8faba7149968c42 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 4 Sep 2018 22:20:14 +0000
Subject: [PATCH 09/18] Minor typos fix

---
 tests/python/unittest/test_conv_int8_intel.py | 16 ++++++++--------
 topi/python/topi/x86/conv2d_avx_1x1.py | 4 ++--
 topi/python/topi/x86/conv2d_avx_common.py | 4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py
index 5d77324ba503..863b3a6a41ab 100644
--- a/tests/python/unittest/test_conv_int8_intel.py
+++ b/tests/python/unittest/test_conv_int8_intel.py
@@ -8,7 +8,7 @@
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 LOGGER = logging.getLogger('test_conv_int8_intel')
-LOGGER.disabled = True
+LOGGER.disabled = False

 # All the WORKLOADS from Resnet except first layer
 # Workload is ['height', 'width', 'in_filter', 'out_filter',
@@ -51,25 +51,25 @@ def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad,
     Finds out the shape of all data structures
     """
     ## Find shapes
-    data_shape = (1, in_filter/NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
+    data_shape = (1, in_filter//NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)

     if out_dtype == 'int32':
         if k_h != 1:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w,
-                            NUM_VEC_LANES/4, NUM_VEC_LANES, 4)
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                            NUM_VEC_LANES//4, NUM_VEC_LANES, 4)
         else:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES/4,
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES//4,
                             NUM_VEC_LANES, 4, k_h, k_w)
     elif out_dtype == 'float32':
         if k_h != 1:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w,
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
                             NUM_VEC_LANES, NUM_VEC_LANES)
         else:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES,
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES,
                             NUM_VEC_LANES, k_h, k_w)
     out_height = (im_height + 2 * hpad - k_h) // hstride + 1
     out_width = (im_width + 2 * wpad - k_w) // wstride + 1 
- o_shape = (1, out_filter/NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES) + o_shape = (1, out_filter//NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES) return (data_shape, kernel_shape, o_shape) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 20dd162f3ee8..b35ec3f6b8f1 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -8,7 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad -from .int8Intrinsics import _intrin_reduce4int8_1x1 +from .int8_intrinsics import _intrin_reduce4int8_1x1 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) @@ -311,7 +311,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis - # Sylake and future processors have 16 vector lanes + # Skylake and future processors have 16 vector lanes assert sch.oc_bn % avx2_len == 0 oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index e59a6132d9f1..052b6ec14060 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -259,7 +259,7 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): """ This function sets up the compute for INT8 conv 2d Inputs are in INT8 datatype - Ouptut is in INT32 datatype + Output is in INT32 datatype """ out_dtype = wkl.out_dtype @@ -346,7 +346,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) - # Sylake and future processors have 16 vector lanes + # Skylake and future processors have 16 vector lanes assert sch.oc_bn % avx2_len == 0 oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) From 621f7bb478f77f77b7badc271d82c2a0fbf7a964 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Sep 2018 17:32:52 +0000 Subject: [PATCH 10/18] Removing the broadcast16 CPP code. 
Using astype feature instead --- src/codegen/llvm/codegen_llvm.cc | 3 --- topi/python/topi/x86/int8_intrinsics.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index f8b402d78b03..799193b1dede 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,9 +688,6 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->is_intrinsic("broadcast16")) { - llvm::Value *v = MakeValue(op->args[0]); - return CreateBroadcast(v, 16); } else if (op->is_intrinsic("bitcast")) { llvm::Type* target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index 0d9f5987804b..b802137205a1 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -29,7 +29,7 @@ def _instr(index): a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) - vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") @@ -81,7 +81,7 @@ def _instr(index): a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) - vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) vec_b = ins[1].vload([0, 0, 0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") From 74516c053fa39039df9403c707b8c8d12ae1cd0f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Sep 2018 21:34:56 +0000 Subject: [PATCH 11/18] Replacing constant by variable name num_elements_intel --- topi/python/topi/x86/int8_intrinsics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index b802137205a1..83a574283048 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -6,7 +6,7 @@ def _intrin_reduce4int8_common(vec_size, num_elements_intel): data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') kernel = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, 4), name='k') + k = tvm.reduce_axis((0, num_elements_intel), name='k') C = tvm.compute((vec_size,), lambda i: tvm.sum(data[k].astype('int32') * kernel[i, k].astype('int32'), @@ -55,7 +55,7 @@ def _instr(index): def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') kernel = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, 4), name='k') + k = tvm.reduce_axis((0, num_elements_intel), name='k') C = tvm.compute((vec_size,), \ lambda i: tvm.sum(data[k].astype('int32') * kernel[i, k, 0, 0].astype('int32'), From f68a6fa72f6cd0686621e74946e6354da86f3068 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 6 Sep 2018 01:21:29 +0000 Subject: [PATCH 12/18] Name fixes and tensorize update rule updated --- topi/python/topi/x86/conv2d_avx_1x1.py | 12 ++++++------ topi/python/topi/x86/conv2d_avx_common.py | 12 ++++++------ 
topi/python/topi/x86/int8_intrinsics.py | 14 ++++++++------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index b35ec3f6b8f1..b43da7372eed 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -279,13 +279,13 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): """ target = tvm.target.current_target(allow_none=False) - avx2_len = -1 + int32_lanes = -1 for opt in target.options: if opt == '-mcpu=skylake-avx512': - avx2_len = 16 + int32_lanes = 16 else: return s - assert avx2_len != -1 + assert int32_lanes != -1 # schedule data A = data @@ -312,9 +312,9 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis # Skylake and future processors have 16 vector lanes - assert sch.oc_bn % avx2_len == 0 + assert sch.oc_bn % int32_lanes == 0 - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes) oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) @@ -324,7 +324,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): s[CC].fuse(oc_chunk, oh_outer) n_elems = 4 - pc = _intrin_reduce4int8_1x1(avx2_len, n_elems) + pc = _intrin_reduce4int8_1x1(int32_lanes, n_elems) s[CC].tensorize(oc_s_inner, pc) s[CC].unroll(ow_inner) s[CC].unroll(oh_inner) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 052b6ec14060..ae4ed0270a76 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -314,13 +314,13 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): # compute target = tvm.target.current_target(allow_none=False) - avx2_len = -1 + int32_lanes = -1 for opt in target.options: if opt == '-mcpu=skylake-avx512': - avx2_len = 16 + int32_lanes = 16 else: return s - assert avx2_len != -1 + assert int32_lanes != -1 A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): @@ -347,9 +347,9 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) # Skylake and future processors have 16 vector lanes - assert sch.oc_bn % avx2_len == 0 + assert sch.oc_bn % int32_lanes == 0 - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes) if sch.unroll_kw: s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw, @@ -361,7 +361,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): n_elems = 4 - pc = _intrin_reduce4int8_common(avx2_len, n_elems) + pc = _intrin_reduce4int8_common(int32_lanes, n_elems) s[CC].tensorize(oc_s_inner, pc) s[CC].unroll(ow_block) s[CC].unroll(oc_f_inner) diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index 83a574283048..b0b5523c2980 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -41,9 +41,10 @@ def _instr(index): 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), pair_reduction, vec_one) - vec_c = outs[0].vload([0], "int32x16") - out = quad_reduction + vec_c - ib.emit(outs[0].vstore(0, out)) + if index == 0: + ib.emit(outs[0].vstore(0, quad_reduction)) + else: + ib.emit(outs[0].vstore(0, 
quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update @@ -93,9 +94,10 @@ def _instr(index): 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), \ pair_reduction, vec_one) - vec_c = outs[0].vload([0], "int32x16") - out = quad_reduction + vec_c - ib.emit(outs[0].vstore(0, out)) + if index == 0: + ib.emit(outs[0].vstore(0, quad_reduction)) + else: + ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update From ed984533008ad1cd5c6f6efccde71a3463b06514 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 11 Sep 2018 16:56:41 +0000 Subject: [PATCH 13/18] Fixing the bug about checking skylake --- topi/python/topi/nn/conv2d.py | 10 ++++++++++ topi/python/topi/x86/conv2d_avx_1x1.py | 11 +++++------ topi/python/topi/x86/conv2d_avx_common.py | 11 +++++------ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 809a05851825..c67828b79e26 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -397,3 +397,13 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, 4-D with shape [batch, out_height, out_width, out_channel] """ raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") + +def check_skylake(target): + """ + Checks if the target is skylake + """ + + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + return True + return False diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index b43da7372eed..8dc46d16c2f3 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -5,7 +5,7 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload +from ..nn.conv2d import _get_schedule, _get_workload, check_skylake from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_1x1 @@ -280,11 +280,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): target = tvm.target.current_target(allow_none=False) int32_lanes = -1 - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - int32_lanes = 16 - else: - return s + if check_skylake(target): + int32_lanes = 16 + else: + return s assert int32_lanes != -1 # schedule data diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index ae4ed0270a76..03c86701a131 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -5,7 +5,7 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload +from ..nn.conv2d import _get_schedule, _get_workload, check_skylake from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_common @@ -315,11 +315,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): target = tvm.target.current_target(allow_none=False) int32_lanes = -1 - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - int32_lanes = 16 - else: - return s + if check_skylake(target): + int32_lanes = 16 + else: + return s assert int32_lanes != -1 A = data From 3a53b510dd7db2e5e6277dd41639e9ffc0c4a237 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Sep 2018 01:06:54 +0000 Subject: [PATCH 14/18] Replacing bitcast with reinterpret --- src/codegen/llvm/codegen_llvm.cc | 
3 --- topi/python/topi/x86/int8_intrinsics.py | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 799193b1dede..c1b1fe24f0a8 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,9 +688,6 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->is_intrinsic("bitcast")) { - llvm::Type* target = LLVMType(op->type); - return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index b0b5523c2980..26657abe8160 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -28,9 +28,9 @@ def _instr(index): return ib.get() a_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') - vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin('int16x32', @@ -81,9 +81,9 @@ def _instr(index): return ib.get() a_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') - vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0, 0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin('int16x32', From 9acbd753c3e3e5f6c1559464673a39eaf83b1c0b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Sep 2018 01:56:17 +0000 Subject: [PATCH 15/18] Isolating INT8 and FP32 schedules to ease out future AutoTVM PR merge --- topi/python/topi/nn/conv2d.py | 26 ++++++++ topi/python/topi/x86/conv2d.py | 111 ++++++++++++++++++++++----------- 2 files changed, 99 insertions(+), 38 deletions(-) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index c67828b79e26..5f58cf153640 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -84,6 +84,21 @@ def _get_workload(data, kernel, stride, padding, out_dtype): '{} vs. {}".format(data.dtype, kernel.dtype) return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) +def _get_workload_int8(data, kernel, stride, padding, out_dtype): + """ Get the workload structure. """ + _, CI, IH, IW = [x.value for x in data.shape] + CO, _, KH, KW = [x.value for x in kernel.shape] + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ + "Do not support inputs with different data types now. ' \ + '{} vs. 
{}".format(data.dtype, kernel.dtype) + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + + @tvm.target.generic_func def _get_alter_layout_schedule(wkl): @@ -118,6 +133,17 @@ def _get_schedule_NCHWc(wkl, layout, out_layout): return wkl +@tvm.target.generic_func +def _get_schedule_NCHWc_int8(wkl, layout, out_layout): + # pylint: disable=unreachable + """ Get the platform specific schedule. """ + target = tvm.target.current_target() + raise RuntimeError( + "No schedule for current target:{}".format(target)) + # This return has no use, merely to supress pylint warning + return wkl + + def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None): """Convolution operator in NCHW layout. diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index dbc4b678c19a..257ad0819bb8 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -5,8 +5,8 @@ from .. import nn from ..nn.util import infer_pad, infer_stride from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \ - _get_workload, _get_schedule, _get_schedule_NCHWc, \ - _get_alter_layout_schedule, Workload + _get_workload, _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \ + _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload from . import conv2d_avx_1x1, conv2d_avx_common from .conv2d_avx_common import AVXConvCommonFwd @@ -48,35 +48,6 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required - - ## Following are for INT8 kernels - Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - # workloads of resnet34_v1 on imagenet, no extra workload required - # workloads of resnet50_v1 on imagenet - Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1), ] fp32_vec_len = 8 @@ -119,8 +90,55 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, 
no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required + ] + if wkl not in _WORKLOADS_AVX: + if wkl.hkernel == 1 and wkl.wkernel == 1: + return conv2d_avx_1x1._get_default_schedule(wkl, fp32_vec_len) + return conv2d_avx_common._get_default_schedule(wkl, fp32_vec_len) + idx = _WORKLOADS_AVX.index(wkl) + sch = _SCHEDULES_AVX[idx] + return sch +def _get_schedule_conv_int8(wkl): + _WORKLOADS_AVX = [ + ## Following are for INT8 kernels + Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + # workloads of resnet34_v1 on imagenet, no extra workload required + # workloads of resnet50_v1 on imagenet + Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1), + ] + + fp32_vec_len = 8 + target = tvm.target.current_target(allow_none=False) + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + fp32_vec_len = 16 + + _SCHEDULES_AVX = [ # Following are for INT8 operations # workloads of resnet18_v1 on imagenet AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), @@ -168,6 +186,10 @@ def _get_schedule_conv(wkl): def _get_schedule_NCHWc_x86(wkl, layout, out_layout): return _get_schedule_conv(wkl) +@_get_schedule_NCHWc_int8.register("cpu") +def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout): + return _get_schedule_conv_int8(wkl) + @_get_alter_layout_schedule.register("cpu") def _get_alter_layout_schedule_x86(wkl): return _get_schedule_conv(wkl) @@ -226,6 +248,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + @conv2d_NCHWc.register("cpu") def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, padding, layout, out_layout, out_dtype): @@ -244,11 +267,18 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block kh, kw = kernel_size - wkl = _get_workload(tvm.placeholder((n, ic, h, w), 
dtype=data.dtype), - tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype), - stride, padding, out_dtype) - sch = _get_schedule_NCHWc(wkl, layout, out_layout) + if data.dtype == 'uint8': + wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), + stride, padding, out_dtype) + sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) + else: + wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), + stride, padding, out_dtype) + sch = _get_schedule_NCHWc(wkl, layout, out_layout) return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) @@ -406,8 +436,13 @@ def traverse(op): original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=kernel.dtype) - wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) - sch = _get_schedule_NCHWc(wkl, layout, out_layout) + if data.dtype == 'uint8': + wkl = _get_workload_int8(original_data, original_kernel, + stride, padding, conv_out.dtype) + sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) + else: + wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) + sch = _get_schedule_NCHWc(wkl, layout, out_layout) _AVX_SCH_TO_SCH_FUNC[type(sch)](s, wkl, sch, data_vec, kernel, conv_out, outs[0]) From cdfce1fc58946338568b542990b0e353b8089c13 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Sep 2018 23:28:53 +0000 Subject: [PATCH 16/18] Putting check_skylake function in the x86 directory --- topi/python/topi/nn/conv2d.py | 10 ---------- topi/python/topi/x86/check_targets.py | 12 ++++++++++++ topi/python/topi/x86/conv2d.py | 6 +++--- topi/python/topi/x86/conv2d_avx_1x1.py | 3 ++- topi/python/topi/x86/conv2d_avx_common.py | 3 ++- 5 files changed, 19 insertions(+), 15 deletions(-) create mode 100644 topi/python/topi/x86/check_targets.py diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 5f58cf153640..3e06f6f6fed5 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -423,13 +423,3 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, 4-D with shape [batch, out_height, out_width, out_channel] """ raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") - -def check_skylake(target): - """ - Checks if the target is skylake - """ - - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - return True - return False diff --git a/topi/python/topi/x86/check_targets.py b/topi/python/topi/x86/check_targets.py new file mode 100644 index 000000000000..fad74eaf582a --- /dev/null +++ b/topi/python/topi/x86/check_targets.py @@ -0,0 +1,12 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument +"""Checks different x86 targets for target specific schedules""" + +def check_skylake(target): + """ + Checks if the target is skylake + """ + + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + return True + return False diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 257ad0819bb8..6fe59a909510 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -11,6 +11,7 @@ from . 
import conv2d_avx_1x1, conv2d_avx_common from .conv2d_avx_common import AVXConvCommonFwd from .conv2d_avx_1x1 import AVXConv1x1Fwd +from .check_targets import check_skylake @_get_schedule.register("cpu") def _get_schedule_conv(wkl): @@ -134,9 +135,8 @@ def _get_schedule_conv_int8(wkl): fp32_vec_len = 8 target = tvm.target.current_target(allow_none=False) - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - fp32_vec_len = 16 + if check_skylake(target): + fp32_vec_len = 16 _SCHEDULES_AVX = [ # Following are for INT8 operations diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 8dc46d16c2f3..50c451949026 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -5,10 +5,11 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload, check_skylake +from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_1x1 +from .check_targets import check_skylake AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 03c86701a131..bb5f11b7cf1f 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -5,10 +5,11 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload, check_skylake +from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_common +from .check_targets import check_skylake AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) From 1fdef3891eb886eb95c7445bb6e741095b1f4037 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 20 Sep 2018 16:31:35 +0000 Subject: [PATCH 17/18] Added documentation and organizing files to better locations --- topi/python/topi/x86/conv2d_avx_1x1.py | 5 +- topi/python/topi/x86/conv2d_avx_common.py | 5 +- .../{int8_intrinsics.py => tensor_intrin.py} | 80 ++++++++++++++++--- .../recipe/conv}/test_conv_int8_intel.py | 0 4 files changed, 74 insertions(+), 16 deletions(-) rename topi/python/topi/x86/{int8_intrinsics.py => tensor_intrin.py} (62%) rename {tests/python/unittest => topi/recipe/conv}/test_conv_int8_intel.py (100%) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 50c451949026..e471b89b98ea 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -8,7 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad -from .int8_intrinsics import _intrin_reduce4int8_1x1 +from .tensor_intrin import reduce_4int8_1x1 from .check_targets import check_skylake AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) @@ -323,8 +323,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) s[CC].fuse(oc_chunk, oh_outer) - n_elems = 4 - pc = _intrin_reduce4int8_1x1(int32_lanes, n_elems) + pc = reduce_4int8_1x1() s[CC].tensorize(oc_s_inner, pc) s[CC].unroll(ow_inner) s[CC].unroll(oh_inner) diff --git a/topi/python/topi/x86/conv2d_avx_common.py 
b/topi/python/topi/x86/conv2d_avx_common.py
index bb5f11b7cf1f..ec2a79b28d26 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -8,7 +8,7 @@
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
-from .int8_intrinsics import _intrin_reduce4int8_common
+from .tensor_intrin import reduce_4int8_common
 from .check_targets import check_skylake

 AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
@@ -360,8 +360,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)

-    n_elems = 4
-    pc = _intrin_reduce4int8_common(int32_lanes, n_elems)
+    pc = reduce_4int8_common()
     s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_block)
     s[CC].unroll(oc_f_inner)
diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/tensor_intrin.py
similarity index 62%
rename from topi/python/topi/x86/int8_intrinsics.py
rename to topi/python/topi/x86/tensor_intrin.py
index 26657abe8160..2a62cd543302 100644
--- a/topi/python/topi/x86/int8_intrinsics.py
+++ b/topi/python/topi/x86/tensor_intrin.py
@@ -3,11 +3,41 @@
 import tvm

-def _intrin_reduce4int8_common(vec_size, num_elements_intel):
-    data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data')
-    kernel = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='kernel')
-    k = tvm.reduce_axis((0, num_elements_intel), name='k')
-    C = tvm.compute((vec_size,),
+def reduce_4int8_common():
+    """
+    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
+    This function takes two arrays of int8 datatype -- data[4] and
+    kernel[16][4] -- and computes a dot product of data[4] with every
+    4 elements of kernels, resulting in output[16] of int32 datatype.
+    The pseudo code is as follows.
+    .. code-block:: c
+        void reduce_4_int8_common(int8 data[4], int8 kernel[16][4],
+                int32 output[16]){
+            for (int i = 0; i < 16; i++){
+                out[i] = 0;
+                for (int k = 0; k < 4; k++){
+                    out[i] += data[k] * kernel[i][k]
+                }
+            }
+        }
+
+    Physically, the kernel array sits in an AVX512 vector register and
+    the data[4] is broadcasted to another AVX512 vector register. This
+    function returns a TensorIntrin that can be used to tensorize
+    a schedule.
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
+    """
+
+    int32_lanes = 16 # 16 int32 lanes in AVX512
+    num_int8_elements = 4 # 4 int8 elements in int32
+    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
+    k = tvm.reduce_axis((0, num_int8_elements), name='k')
+    C = tvm.compute((int32_lanes,),
                     lambda i: tvm.sum(data[k].astype('int32') *
                                       kernel[i, k].astype('int32'),
                                       axis=k),
@@ -53,11 +83,41 @@ def _instr(index):

     with tvm.build_config(offset_factor=1, partition_const_loop=True):
         return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})

-def _intrin_reduce4int8_1x1(vec_size, num_elements_intel):
-    data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data')
-    kernel = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='kernel')
-    k = tvm.reduce_axis((0, num_elements_intel), name='k')
-    C = tvm.compute((vec_size,), \
+def reduce_4int8_1x1():
+    """
+    Int8 dot product by every 4 elements using AVX512 Skylake instructions. 
+    This function takes two arrays of int8 datatype -- data[4] and
+    kernel[16][4] -- and computes a dot product of data[4] with every
+    4 elements of kernels, resulting in output[16] of int32 datatype.
+    The pseudo code is as follows.
+    .. code-block:: c
+        void reduce_4_int8_1x1(int8 data[4], int8 kernel[16][4],
+                int32 output[16]){
+            for (int i = 0; i < 16; i++){
+                out[i] = 0;
+                for (int k = 0; k < 4; k++){
+                    out[i] += data[k] * kernel[i][k]
+                }
+            }
+        }
+
+    Physically, the kernel array sits in an AVX512 vector register and
+    the data[4] is broadcasted to another AVX512 vector register. This
+    function returns a TensorIntrin that can be used to tensorize
+    a schedule.
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
+    """
+
+    int32_lanes = 16 # 16 int32 lanes in AVX512
+    num_int8_elements = 4 # 4 int8 elements in int32
+    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+    kernel = tvm.placeholder((int32_lanes, num_int8_elements, 1, 1), dtype='int8', name='kernel')
+    k = tvm.reduce_axis((0, num_int8_elements), name='k')
+    C = tvm.compute((int32_lanes,), \
                     lambda i: tvm.sum(data[k].astype('int32') *
                                       kernel[i, k, 0, 0].astype('int32'),
                                       axis=k),
diff --git a/tests/python/unittest/test_conv_int8_intel.py b/topi/recipe/conv/test_conv_int8_intel.py
similarity index 100%
rename from tests/python/unittest/test_conv_int8_intel.py
rename to topi/recipe/conv/test_conv_int8_intel.py

From abd99da083571b727f41d53537f12bc2586166d0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 24 Sep 2018 01:16:38 +0000
Subject: [PATCH 18/18] Tensor intrin renaming. Avoid code duplication for intrin by kernel reshaping

---
 topi/python/topi/x86/conv2d_avx_1x1.py | 16 +++--
 topi/python/topi/x86/conv2d_avx_common.py | 8 +--
 topi/python/topi/x86/tensor_intrin.py | 87 +----------------------
 3 files changed, 17 insertions(+), 94 deletions(-)

diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index e471b89b98ea..bace7451d665 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -3,12 +3,13 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
+import topi

 from ..util import get_const_tuple
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
-from .tensor_intrin import reduce_4int8_1x1
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake

 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor'])
@@ -253,16 +254,21 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):

     # Intel performs dot product of 2 "4" Int8 values
     n_elems = 4
-    assert sch.ic_bn%4 == 0
+    assert sch.ic_bn%n_elems == 0
     ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
     ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
-    ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner')
+    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
+
+    # Reshaping kernel as the last 2 dimensions are 1x1 (k_h x k_w)
+    k_shape = kernel.shape
+    kernel = topi.reshape(kernel, (k_shape[0], k_shape[1], k_shape[2], k_shape[3],
+                                   k_shape[4] * k_shape[5] * k_shape[6]))

     conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                        tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR,
                                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
                                kernel[oc_chunk, ic_outer, 
ic_f_inner,
-                                      oc_block, ic_s_inner, 0, 0].astype(out_dtype),
+                                      oc_block, ic_s_inner].astype(out_dtype),
                                axis=[ic_outer, ic_f_inner, ic_s_inner]),
                        name='conv2d_NCHWc_int8',
                        tag="conv2d_NCHWc_int8")
@@ -323,7 +329,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
                   ow_inner, oc_f_inner, oc_s_inner, ic_s_inner)
     s[CC].fuse(oc_chunk, oh_outer)

-    pc = reduce_4int8_1x1()
+    pc = dot_16x1x16_int8_int8_int32()
     s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_inner)
     s[CC].unroll(oh_inner)
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index ec2a79b28d26..0d7aba23d236 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -8,7 +8,7 @@
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
-from .tensor_intrin import reduce_4int8_common
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake

 AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
@@ -286,11 +286,11 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):
     # Intel performs dot product of 2 "4" Int8 values
     # Current implementation requires ic_bn to be a multiple of 4
     n_elems = 4
-    assert sch.ic_bn%4 == 0
+    assert sch.ic_bn%n_elems == 0

     ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
     ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
-    ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner')
+    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
     conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                        tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw,
                                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
@@ -360,7 +360,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)

-    pc = reduce_4int8_common()
+    pc = dot_16x1x16_int8_int8_int32()
     s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_block)
     s[CC].unroll(oc_f_inner)
diff --git a/topi/python/topi/x86/tensor_intrin.py b/topi/python/topi/x86/tensor_intrin.py
index 2a62cd543302..28e57f1c10f8 100644
--- a/topi/python/topi/x86/tensor_intrin.py
+++ b/topi/python/topi/x86/tensor_intrin.py
@@ -3,7 +3,7 @@
 import tvm

-def reduce_4int8_common():
+def dot_16x1x16_int8_int8_int32():
     """
     Int8 dot product by every 4 elements using AVX512 Skylake instructions.
     This function takes two arrays of int8 datatype -- data[4] and
@@ -11,7 +11,7 @@
     The pseudo code is as follows.
     .. code-block:: c
-        void reduce_4_int8_common(int8 data[4], int8 kernel[16][4],
+        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                 int32 output[16]){
             for (int i = 0; i < 16; i++){
                 out[i] = 0;
@@ -82,86 +82,3 @@ def _instr(index):

     with tvm.build_config(offset_factor=1, partition_const_loop=True):
         return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
-
-def reduce_4int8_1x1():
-    """
-    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
-    This function takes two arrays of int8 datatype -- data[4] and
-    kernel[16][4] -- and computes a dot product of data[4] with every
-    4 elements of kernels, resulting in output[16] of int32 datatype.
-    The pseudo code is as follows.
-    .. 
code-block:: c
-        void reduce_4_int8_1x1(int8 data[4], int8 kernel[16][4],
-                int32 output[16]){
-            for (int i = 0; i < 16; i++){
-                out[i] = 0;
-                for (int k = 0; k < 4; k++){
-                    out[i] += data[k] * kernel[i][k]
-                }
-            }
-        }
-
-    Physically, the kernel array sits in an AVX512 vector register and
-    the data[4] is broadcasted to another AVX512 vector register. This
-    function returns a TensorIntrin that can be used to tensorize
-    a schedule.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
-    """
-
-    int32_lanes = 16 # 16 int32 lanes in AVX512
-    num_int8_elements = 4 # 4 int8 elements in int32
-    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
-    kernel = tvm.placeholder((int32_lanes, num_int8_elements, 1, 1), dtype='int8', name='kernel')
-    k = tvm.reduce_axis((0, num_int8_elements), name='k')
-    C = tvm.compute((int32_lanes,), \
-                    lambda i: tvm.sum(data[k].astype('int32') *
-                                      kernel[i, k, 0, 0].astype('int32'),
-                                      axis=k),
-                    name="C")
-
-    a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
-                               offset_factor=1,
-                               strides=[1])
-    b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
-                               offset_factor=1,
-                               strides=[tvm.var('ldw'),
-                                        tvm.var('ldw'),
-                                        tvm.var('ldw'), 1]
-                              )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
-                return ib.get()
-
-            a_int8 = ins[0].vload([0], "uint8x4")
-            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
-            vec_ai32 = re_int32.astype('int32x16')
-            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
-            vec_b = ins[1].vload([0, 0, 0, 0], "int8x64")
-            vec_one = tvm.const(1, "int16x32")
-            pair_reduction = tvm.call_llvm_intrin('int16x32',
-                                                  'llvm.x86.avx512.pmaddubs.w.512',
-                                                  tvm.const(0, 'uint32'),
-                                                  vec_a, vec_b)
-            quad_reduction = tvm.call_llvm_intrin('int32x16',
-                                                  'llvm.x86.avx512.pmaddw.d.512',
-                                                  tvm.const(0, 'uint32'), \
-                                                  pair_reduction, vec_one)
-            if index == 0:
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:
-                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    with tvm.build_config(offset_factor=1, partition_const_loop=True):
-        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
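
A note on the semantics the series converges on: dot_16x1x16_int8_int8_int32 lowers to the vpmaddubsw (llvm.x86.avx512.pmaddubs.w.512) and vpmaddwd (llvm.x86.avx512.pmaddw.d.512) pair, which together reduce four uint8 x int8 products into each of the 16 int32 lanes. A minimal NumPy sketch of that arithmetic, mirroring the docstring's pseudo code (the function name and the use of NumPy here are illustrative, not part of the series):

    import numpy as np

    def dot_16x1x16_reference(data, kernel):
        # data: (4,) uint8, kernel: (16, 4) int8 -> (16,) int32
        # out[i] = sum over k of data[k] * kernel[i][k]
        assert data.shape == (4,) and kernel.shape == (16, 4)
        return (data.astype(np.int32) * kernel.astype(np.int32)).sum(axis=1)

One caveat when validating against such a reference: pmaddubsw saturates each uint8 x int8 pair sum to int16, and 255 * 127 + 255 * 127 = 64770 exceeds the int16 maximum of 32767, so the hardware path can clip where this plain int32 reference does not. Keeping the uint8 activations within 7 bits is one common way to stay inside the exact range.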
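
The 7-D kernel layout that the int8 compute declarations index, kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], can be produced offline from a plain OIHW array. A hypothetical NumPy packing helper (not part of the series) for the oc_bn = ic_bn = 16, n_elems = 4 configuration used throughout:

    import numpy as np

    def pack_kernel_oihw_to_nchwc_int8(kernel, oc_bn=16, ic_bn=16, n_elems=4):
        # kernel: (out_filter, in_filter, k_h, k_w) int8 array in OIHW layout
        out_filter, in_filter, k_h, k_w = kernel.shape
        packed = kernel.reshape(out_filter // oc_bn, oc_bn,
                                in_filter // ic_bn, ic_bn // n_elems, n_elems,
                                k_h, k_w)
        # reorder to (oc_chunk, ic_outer, k_h, k_w, ic_f_inner, oc_block, ic_s_inner)
        return np.ascontiguousarray(packed.transpose(0, 2, 5, 6, 3, 1, 4))

With this ordering the innermost ic_s_inner axis walks the four int8 values that one vpmaddubsw step consumes, and oc_block spans the 16 int32 lanes that one vpmaddwd step produces.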
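
PATCH 18's kernel reshape in the 1x1 path is easiest to read as a pure view change: the packed 1x1 kernel carries ic_s_inner followed by two unit spatial dimensions, and merging the trailing three axes lets both the common and 1x1 declarations feed the same (16, 4) access pattern of the shared intrinsic. A small NumPy illustration (shapes hypothetical) of why no data moves:

    import numpy as np

    # A packed 1x1 kernel: (oc_chunk, ic_outer, ic_f_inner, oc_block, ic_s_inner, 1, 1)
    k7 = (np.arange(4 * 4 * 4 * 16 * 4) % 128).astype(np.int8)
    k7 = k7.reshape(4, 4, 4, 16, 4, 1, 1)
    # Merge the last three axes, as topi.reshape does in the patch
    k5 = k7.reshape(k7.shape[0], k7.shape[1], k7.shape[2], k7.shape[3],
                    k7.shape[4] * k7.shape[5] * k7.shape[6])
    assert k5.shape == (4, 4, 4, 16, 4)
    assert np.shares_memory(k5, k7)  # same bytes, only the unit k_h x k_w axes dropped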