From e7a870215cb4d2163061cb2c9c3e1f2f1be88376 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 23 Aug 2018 15:08:25 -0700 Subject: [PATCH 01/18] Int8 implementation for convolution operator on Intel Skylake --- src/codegen/llvm/codegen_llvm.cc | 6 + tests/python/unittest/test_conv_int8_intel.py | 140 ++++++++++++++++++ topi/python/topi/nn/conv2d.py | 2 +- topi/python/topi/x86/conv2d.py | 102 +++++++++++-- topi/python/topi/x86/conv2d_avx_1x1.py | 108 ++++++++++++++ topi/python/topi/x86/conv2d_avx_common.py | 121 +++++++++++++++ topi/python/topi/x86/int8Intrinsics.py | 95 ++++++++++++ 7 files changed, 564 insertions(+), 10 deletions(-) create mode 100644 tests/python/unittest/test_conv_int8_intel.py create mode 100644 topi/python/topi/x86/int8Intrinsics.py diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index ae576c981395..1a57fb1a34d4 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,6 +688,12 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); + } else if (op->is_intrinsic("broadcast16")){ + llvm::Value *v = MakeValue(op->args[0]); + return CreateBroadcast(v, 16); + } else if (op->is_intrinsic("bitcast")){ + llvm::Type * target = LLVMType(op->type); + return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py new file mode 100644 index 000000000000..62305e914a70 --- /dev/null +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -0,0 +1,140 @@ +import tvm +import topi +import numpy as np +from tvm.contrib import cc +from tvm.contrib import util +import timeit +from collections import namedtuple + +# All the workloads from Resnet except first layer +# Workload is ['height', 'width', 'in_filter', 'out_filter', +# 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + + +workloads = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + (56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + (56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + (56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + (28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + (28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + (28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + (14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + (14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + (14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + (7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + (56, 56, 64, 256, 1, 1, 0, 0, 1, 1), + (56, 56, 256, 64, 1, 1, 0, 0, 1, 1), + (56, 56, 256, 128, 1, 1, 0, 0, 2, 2), + (28, 28, 128, 512, 1, 1, 0, 0, 1, 1), + (56, 56, 256, 512, 1, 1, 0, 0, 2, 2), + (28, 28, 512, 128, 1, 1, 0, 0, 1, 1), + (28, 28, 512, 256, 1, 1, 0, 0, 2, 2), + (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), + (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), + (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), + (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), + (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), + (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), + (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1) + ] + + +target_name = 'llvm -mcpu=skylake-avx512' +avx2_len = 16 +ctx = tvm.context(target_name, 0); + +def getShape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, + hstride, wstride, outDtype): + ## Find shapes + dataShape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) + + if outDtype == 'int32': + if kh != 1: + kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) + else: + kernelShape = (out_filter/avx2_len, 
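The getShape helper above assumes TOPI's NCHW[x]c blocked layout: the channel axis is split into chunks of the vector width, and the chunk's lanes become the innermost axis, so one load of the innermost axis fills exactly one AVX-512 register. A minimal numpy sketch of that relayout; sizes are illustrative and not taken from the patch:

    import numpy as np

    N, C, H, W, c = 1, 64, 56, 56, 16          # c = vector lanes (avx2_len)
    nchw = np.arange(N * C * H * W, dtype=np.float32).reshape(N, C, H, W)
    # Split C into C//c chunks of c channels, then move the c lanes innermost
    nchwc = nchw.reshape(N, C // c, c, H, W).transpose(0, 1, 3, 4, 2)
    assert nchwc.shape == (1, 4, 56, 56, 16)   # == (1, in_filter/avx2_len, H, W, avx2_len)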
in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) + elif outDtype == 'float32': + if kh != 1: + kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len) + else: + kernelShape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw) + out_height = (im_height + 2 * hpad - kh) // hstride + 1 + out_width = (im_width + 2 * wpad - kw) // wstride + 1 + oShape = (1, out_filter/avx2_len, out_height, out_width, avx2_len) + return (dataShape, kernelShape, oShape) + + + +def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filter, + out_filter, kh, kw, hpad, wpad, hstride, wstride): + + (dataShape, kernelShape, oShape) = getShape(im_height, im_width, in_filter, + out_filter, kh, kw, hpad, wpad, + hstride, wstride, outDtype) + + # Create TVM placeholders + data = tvm.placeholder(dataShape, name='data', dtype=dataDtype); + kernel = tvm.placeholder(kernelShape, name='kernel', dtype=kernelDtype); + + # Create the numpy arrays to be used for executing conv models + if dataDtype == 'float32': + a = tvm.nd.array(np.random.rand(*dataShape).astype(dtype=dataDtype), ctx); + b = tvm.nd.array(np.random.rand(*kernelShape).astype(dtype=kernelDtype), ctx); + else: + a = tvm.nd.array(np.random.randint(100, size=dataShape).astype(dataDtype)); + b = tvm.nd.array(np.random.randint(100, size=kernelShape).astype(kernelDtype)); + #a = tvm.nd.array(np.ones(dataShape, dtype='uint8'), ctx); + #b = tvm.nd.array(np.zeros(kernelShape, dtype='int8'), ctx); + + # cOrig will be used for declaration ouptut + # cSch will be used for scheduled computation output + cOrig = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); + cSch = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); + + + with tvm.target.create(target_name): + conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter, + kernel_size=(kh, kw), stride=hstride, + padding=hpad, layout='NCHWc', + out_layout='NCHWc', out_dtype=outDtype); + out = topi.nn.relu(conv) + s = tvm.create_schedule(out.op); + func = tvm.build(s, [data, kernel, out], target=target_name, name='out') + func(a, b, cOrig) + #print(tvm.lower(s, [data, kernel], simple_mode=True)); + + # Generate and run the optimized schedule + sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter, + kernel_size=(kh,kw), + strides=hstride, + padding=hpad, + layout='NCHWc', + out_layout='NCHWc', + outs=[out]); + func = tvm.build(sconv, [data, kernel, out], target=target_name, name='conv') + func(a, b, cSch) + + # Functional check + if dataDtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) + else : assert(np.allclose(cOrig.asnumpy(), cSch.asnumpy())) + + evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) + #print(tvm.lower(sconv, [data, kernel], simple_mode=True)) + return evaluator(a, b, cSch).mean + +if __name__ == "__main__": + print "Workload, kernelSize, FP32_time, INT8_time, Speedup" + speedUps = [] + for i in range(0, len(workloads)): + # workloas[i] -> (im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride) + # Int8 + fpTime = run_inference('float32','float32','float32', *workloads[i]) + int8Time = run_inference('uint8', 'int8', 'int32', *workloads[i]) + kh = workloads[i][4] + kw = workloads[i][5] + print "Workload#" + str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time) + + speedUps.append(fpTime/int8Time) + print("Average speedup --> ", sum(speedUps)/float(len(speedUps))) + + diff --git 
a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index e0d2c403d4b4..1f3c1c1dd379 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -79,7 +79,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - assert data.dtype == kernel.dtype, \ + assert data.dtype == kernel.dtype or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ "Do not support inputs with different data types now. ' \ '{} vs. {}".format(data.dtype, kernel.dtype) return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 721c7c169d99..1634dde05c5f 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -48,6 +48,36 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required + + ## Following are for INT8 kernels + Workload('uint8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2), + Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + # workloads of resnet34_v1 on imagenet, no extra workload required + # workloads of resnet50_v1 on imagenet + Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1), ] fp32_vec_len = 8 @@ -90,6 +120,42 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required + + + # Following are for INT8 operations + # workloads of resnet18_v1 on imagenet + AVXConvCommonFwd(3, fp32_vec_len, 28, False), #TODO + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 
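Each INT8 Workload entry above is matched by list position against the schedule list added below; the lookup in _get_schedule_conv relies on the Workload namedtuple being hashable and on the two lists staying index-aligned. A condensed sketch of that mechanism, with field names taken from _get_workload and illustrative registry contents:

    from collections import namedtuple

    Workload = namedtuple('Workload',
                          ['in_dtype', 'out_dtype', 'height', 'width',
                           'in_filter', 'out_filter', 'hkernel', 'wkernel',
                           'hpad', 'wpad', 'hstride', 'wstride'])
    _WORKLOADS = [Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)]
    _SCHEDULES = ['schedule for workload 0']   # parallel list, same index
    wkl = Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)
    if wkl not in _WORKLOADS:
        raise ValueError("no schedule for such workload: {}".format(wkl))
    sch = _SCHEDULES[_WORKLOADS.index(wkl)]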
28), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, False), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, True), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 7), + AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True), + # workloads of resnet34_v1 on imagenet, no extra workload required + # workloads of resnet50_v1 on imagenet + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7), + # workloads of resnet101_v1 on imagenet, no extra workload required + # workloads of resnet152_v1 on imagenet, no extra workload required + # workloads of resnet18_v2 on imagenet, no extra workload required + # workloads of resnet34_v2 on imagenet, no extra workload required ] if wkl not in _WORKLOADS_AVX: @@ -169,11 +235,20 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc, AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc } + + # Use int8 schedules if the input data is of int8 dtype + if data.dtype == 'uint8': + _AVX_SCH_TO_DECL_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc_int8, + AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc_int8 + } + n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block kh, kw = kernel_size - wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=out_dtype), - tvm.placeholder((num_filter, ic, kh, kw), dtype=out_dtype), + wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), stride, padding, out_dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) @@ -289,10 +364,6 @@ def traverse(op): def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, layout, out_layout, outs): """Create schedule for tensors""" - _AVX_SCH_TO_SCH_FUNC = { - AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, - AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc - } s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] @@ -316,13 +387,26 @@ def traverse(op): if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] - + + _AVX_SCH_TO_SCH_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, + AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc + } + + # Use int8 schedules if the input data is of int8 dtype + if data.dtype == 'uint8': + _AVX_SCH_TO_SCH_FUNC = { + AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8, + AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8 + } + n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block - 
original_data = tvm.placeholder((n, ic, h, w), dtype=conv_out.dtype) + original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype) kh, kw = kernel_size - original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=conv_out.dtype) + original_kernel = tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype) wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 7d820701e1f4..4e7491bd95d8 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -8,6 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad +from .int8Intrinsics import _intrin_reduce4int8_1x1 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) @@ -229,3 +230,110 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): s[O].parallel(parallel_axis) return s + + +def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): + """ Declaration for int8 conv""" + out_dtype = wkl.out_dtype + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + + batch_size = data.shape[0] + out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + + DOPAD = (HPAD != 0 or WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") + else: + data_pad = data + + oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) + + # Intel performs dot product of 2 "4" Int8 values + n_elems = 4 + assert(sch.ic_bn%4 == 0) + ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') + ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') + ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') + + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, ic_f_inner, oc_block, ic_s_inner, 0, 0].astype(out_dtype), + axis=[ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', + tag="conv2d_NCHWc_int8") + + + return conv + + +def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): + """ + Defines the schedule for INT8 for intel machines + Uses the Intel intrinsics to use INT8 operations + More details - https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + """ + + target = tvm.target.current_target(allow_none=False) + avx2_len = -1 + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + avx2_len = 16 + else: + return s + assert(avx2_len != -1) + + # schedule data + A = data + if isinstance(s[A].op, tvm.tensor.ComputeOp): + batch, ic_chunk, ih, iw, ic_block = s[A].op.axis + parallel_axis = s[A].fuse(ic_chunk, ih) + s[A].parallel(parallel_axis) + + C, O = conv_out, last + CC = s.cache_write(C, 'global') + + batch, oc_chunk, oh, ow, oc_block = s[C].op.axis + oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor) + s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + s[C].vectorize(oc_block) + + parallel_axis = s[C].fuse(oc_chunk, oh_outer) + s[CC].compute_at(s[C], parallel_axis) + if C == O: + s[C].parallel(parallel_axis) + + 
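In _declaration_conv_NCHWc_int8 above, the input-channel reduction is deliberately split three ways so the innermost four channels map onto one hardware dot product; the flat channel index is recovered as ic_outer*ic_bn + ic_f_inner*n_elems + ic_s_inner. A quick self-contained check of that decomposition, with illustrative sizes:

    ic_bn, n_elems, in_filter = 16, 4, 64
    flat = []
    for ic_outer in range(in_filter // ic_bn):        # which 16-channel chunk
        for ic_f_inner in range(ic_bn // n_elems):    # which group of 4 in the chunk
            for ic_s_inner in range(n_elems):         # position inside the group of 4
                flat.append(ic_outer * ic_bn + ic_f_inner * n_elems + ic_s_inner)
    assert flat == list(range(in_filter))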
_, oc_chunk, oh, ow, oc_block = s[CC].op.axis + ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis + + # Sylake and future processors have 16 vector lanes + assert(sch.oc_bn % avx2_len == 0) + + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + + oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) + + s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_outer, ic_f_inner, oh_inner, + ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) + s[CC].fuse(oc_chunk, oh_outer) + + n_elems = 4 + pc = _intrin_reduce4int8_1x1(avx2_len, n_elems) + s[CC].tensorize(oc_s_inner, pc) + s[CC].unroll(ow_inner) + s[CC].unroll(oh_inner) + + if C != O: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + + parallel_axis = s[O].fuse(oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + + return s diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 8f8086fdebb4..488ee0e41249 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -8,6 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad +from .int8Intrinsics import _intrin_reduce4int8_common AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) @@ -252,3 +253,123 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last): s[O].parallel(parallel_axis) return s + + +def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): + """ + This function sets up the compute for INT8 conv 2d + Inputs are in INT8 datatype + Ouptut is in INT32 datatype + """ + + out_dtype = wkl.out_dtype + HPAD, WPAD = wkl.hpad, wkl.wpad + HSTR, WSTR = wkl.hstride, wkl.wstride + + batch_size = data.shape[0] + out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 + out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + + # pack data + DOPAD = (HPAD != 0 or WPAD != 0) + if DOPAD: + data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") + else: + data_pad = data + + # convolution + oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn) + kh = tvm.reduce_axis((0, wkl.hkernel), name='kh') + kw = tvm.reduce_axis((0, wkl.wkernel), name='kw') + + # Intel performs dot product of 2 "4" Int8 values + # Current implementation requires ic_bn to be a multiple of 4 + n_elems = 4 + assert(sch.ic_bn%4 == 0) + + ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') + ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') + ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), + axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), + name='conv2d_NCHWc_int8', + tag="conv2d_NCHWc_int8") + return conv + +def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): + """ + Defines the schedule for INT8 for intel machines + Uses the Intel intrinsics to use INT8 operations + More details - 
https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + """ + + # Currently INT8 operations are supported for only Skylake + # In future the _intrin_reduce4int8 will be updated for VNNI instructions + # In case of unsupported target, the schedule will go to the original + # compute + + target = tvm.target.current_target(allow_none=False) + avx2_len = -1 + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + avx2_len = 16 + else: + return s + assert(avx2_len != -1) + + A = data + if isinstance(s[A].op, tvm.tensor.ComputeOp): + batch, ic_chunk, ih, iw, _ = s[A].op.axis + parallel_axis = s[A].fuse(ic_chunk, ih) + s[A].parallel(parallel_axis) + + # schedule 5-D NCHW[x]c conv + C, O = conv_out, last + CC = s.cache_write(C, 'global') + + _, oc_chunk, oh, ow, oc_block = s[C].op.axis + ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n) + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + if C == O: + s[C].parallel(parallel_axis) + + s[CC].compute_at(s[C], ow_chunk) + _, oc_chunk, oh, ow, oc_block = s[CC].op.axis + kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis + + ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) + + # Sylake and future processors have 16 vector lanes + assert(sch.oc_bn % avx2_len == 0) + + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + + if sch.unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw, + ow_block, oc_f_inner, oc_s_inner, ic_s_inner) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, kw, ic_f_inner, + ow_block, oc_f_inner, oc_s_inner, ic_s_inner) + + + n_elems = 4 + pc = _intrin_reduce4int8_common(avx2_len, n_elems) + s[CC].tensorize(oc_s_inner, pc) + s[CC].unroll(ow_block) + s[CC].unroll(oc_f_inner) + + if C != O: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + + return s diff --git a/topi/python/topi/x86/int8Intrinsics.py b/topi/python/topi/x86/int8Intrinsics.py new file mode 100644 index 000000000000..abf6e3ace607 --- /dev/null +++ b/topi/python/topi/x86/int8Intrinsics.py @@ -0,0 +1,95 @@ +"""Core kernel of dot product of 4 Int8 operations""" +import tvm + + +def _intrin_reduce4int8_common(vec_size, num_elements_intel): + A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') + B = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='B') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), \ + lambda i: tvm.sum(\ + A[k].astype('int32') * B[i, k].astype('int32'), \ + axis=k), name="C") + s = tvm.create_schedule(C.op) + + Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", + offset_factor=1, + strides=[1]) + Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", + offset_factor=1, + strides=[tvm.var('ldw'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + A_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) + vecA_i32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); + vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) + vecB = 
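The intrinsic body being assembled here reduces to two AVX-512 instructions: vpmaddubsw (unsigned byte times signed byte, adjacent pairs summed to int16) followed by vpmaddwd against a vector of ones (adjacent int16 pairs summed to int32). The numpy sketch below emulates what that pair computes per 512-bit vector; value ranges are kept small so the int16 saturation of vpmaddubsw, which the emulation ignores, cannot trigger:

    import numpy as np

    rng = np.random.RandomState(0)
    a = rng.randint(0, 100, 64).astype(np.uint8)    # data bytes, broadcast 16x
    b = rng.randint(-100, 100, 64).astype(np.int8)  # kernel bytes

    # vpmaddubsw: u8 x s8 products, adjacent pairs summed -> 32 int16 lanes
    pairs = (a.astype(np.int16) * b.astype(np.int16)).reshape(32, 2).sum(axis=1)
    # vpmaddwd against a vector of ones: adjacent int16 pairs -> 16 int32 lanes
    quads = pairs.astype(np.int32).reshape(16, 2).sum(axis=1)

    expected = (a.astype(np.int32) * b.astype(np.int32)).reshape(16, 4).sum(axis=1)
    assert (quads == expected).all()                # 16 four-wide dot products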
ins[1].vload([0, 0], "int8x64") + vecOne = tvm.const(1, "int16x32") + pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) + quadReduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), \ + pairReduction, vecOne); + vecC = outs[0].vload([0], "int32x16") + out = quadReduction + vecC + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) + +def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): + A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') + B = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='B') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), \ + lambda i: tvm.sum(\ + A[k].astype('int32') * B[i, k, 0, 0].astype('int32'), \ + axis=k), name="C") + s = tvm.create_schedule(C.op) + + Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", + offset_factor=1, + strides=[1]) + Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", + offset_factor=1, + strides=[tvm.var('ldw'), tvm.var('ldw'), tvm.var('ldw'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + A_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) + vecA_i32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); + vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) + vecB = ins[1].vload([0, 0, 0, 0], "int8x64") + vecOne = tvm.const(1, "int16x32") + pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) + quadReduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), \ + pairReduction, vecOne); + vecC = outs[0].vload([0], "int32x16") + out = quadReduction + vecC + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) From b268b7f5fb661e86eac263a41bdaa576635f42cf Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 23 Aug 2018 15:08:25 -0700 Subject: [PATCH 02/18] Int8 implementation for convolution operator on Intel Skylake --- topi/python/topi/x86/conv2d.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 1634dde05c5f..735b8400e5a5 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -50,7 +50,6 @@ def _get_schedule_conv(wkl): # workloads of resnet34_v2 on imagenet, no extra workload required ## Following are for INT8 kernels - Workload('uint8', 'int32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2), Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), @@ -124,7 +123,6 @@ def _get_schedule_conv(wkl): # Following are for INT8 operations # workloads of resnet18_v1 on imagenet - AVXConvCommonFwd(3, fp32_vec_len, 28, False), #TODO AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 
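Putting the pieces of patch 01 together, the loop nest that the common int8 schedule arranges (cache_write, the reg_n and oc_bn splits, the reorder, the unrolls, and the tensorized core) comes out roughly as below; plain Python loops with illustrative sizes, not TVM API:

    out_filter, in_filter, out_h, out_w = 64, 64, 28, 28
    oc_bn, ic_bn, reg_n, n_elems, lanes = 16, 16, 14, 4, 16
    kernel_h, kernel_w = 3, 3

    for oc_chunk in range(out_filter // oc_bn):          # fused with oh, run in parallel
        for oh in range(out_h):
            for ow_chunk in range(out_w // reg_n):
                for ic_outer in range(in_filter // ic_bn):
                    for kh in range(kernel_h):
                        for kw in range(kernel_w):       # hoisted and unrolled when unroll_kw is set
                            for ic_f_inner in range(ic_bn // n_elems):
                                for ow_block in range(reg_n):                # unrolled
                                    for oc_f_inner in range(oc_bn // lanes): # unrolled
                                        pass  # tensorized core: 16 int32 lanes, each a
                                              # 4-wide uint8 x int8 dot product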
1, 28), AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), From 58e9fbb8ed84dd390948b1aa587b6f34ac18b25f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 15:16:37 -0700 Subject: [PATCH 03/18] PR changes --- tests/python/unittest/test_conv_int8_intel.py | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index 62305e914a70..e6113a426cf0 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -1,16 +1,17 @@ import tvm import topi import numpy as np -from tvm.contrib import cc -from tvm.contrib import util import timeit -from collections import namedtuple +import logging +import sys + +logging.basicConfig(stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger('test_conv_int8_intel') +logger.disabled = True # All the workloads from Resnet except first layer # Workload is ['height', 'width', 'in_filter', 'out_filter', # 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) - - workloads = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), (56, 56, 64, 64, 1, 1, 0, 0, 1, 1), (56, 56, 64, 128, 3, 3, 1, 1, 2, 2), @@ -41,67 +42,65 @@ target_name = 'llvm -mcpu=skylake-avx512' avx2_len = 16 -ctx = tvm.context(target_name, 0); +ctx = tvm.context(target_name, 0) -def getShape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, - hstride, wstride, outDtype): +def get_shape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, + hstride, wstride, out_dtype): ## Find shapes - dataShape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) + data_shape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) - if outDtype == 'int32': + if out_dtype == 'int32': if kh != 1: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) else: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) - elif outDtype == 'float32': + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) + elif out_dtype == 'float32': if kh != 1: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len) + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len) else: - kernelShape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw) + kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw) out_height = (im_height + 2 * hpad - kh) // hstride + 1 out_width = (im_width + 2 * wpad - kw) // wstride + 1 - oShape = (1, out_filter/avx2_len, out_height, out_width, avx2_len) - return (dataShape, kernelShape, oShape) + o_shape = (1, out_filter/avx2_len, out_height, out_width, avx2_len) + return (data_shape, kernel_shape, o_shape) -def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filter, +def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride): - (dataShape, kernelShape, oShape) = getShape(im_height, im_width, in_filter, + (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, - hstride, wstride, outDtype) + hstride, wstride, out_dtype) # Create TVM placeholders - data = tvm.placeholder(dataShape, name='data', dtype=dataDtype); - kernel = 
tvm.placeholder(kernelShape, name='kernel', dtype=kernelDtype); + data = tvm.placeholder(data_shape, name='data', dtype=data_dtype) + kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) # Create the numpy arrays to be used for executing conv models - if dataDtype == 'float32': - a = tvm.nd.array(np.random.rand(*dataShape).astype(dtype=dataDtype), ctx); - b = tvm.nd.array(np.random.rand(*kernelShape).astype(dtype=kernelDtype), ctx); + if data_dtype == 'float32': + a = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), ctx) + b = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), ctx) else: - a = tvm.nd.array(np.random.randint(100, size=dataShape).astype(dataDtype)); - b = tvm.nd.array(np.random.randint(100, size=kernelShape).astype(kernelDtype)); - #a = tvm.nd.array(np.ones(dataShape, dtype='uint8'), ctx); - #b = tvm.nd.array(np.zeros(kernelShape, dtype='int8'), ctx); + a = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) + b = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # cOrig will be used for declaration ouptut # cSch will be used for scheduled computation output - cOrig = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); - cSch = tvm.nd.array(np.zeros(oShape, dtype=outDtype), ctx); + cOrig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx) + cSch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx) with tvm.target.create(target_name): conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter, kernel_size=(kh, kw), stride=hstride, padding=hpad, layout='NCHWc', - out_layout='NCHWc', out_dtype=outDtype); + out_layout='NCHWc', out_dtype=out_dtype) out = topi.nn.relu(conv) - s = tvm.create_schedule(out.op); + s = tvm.create_schedule(out.op) func = tvm.build(s, [data, kernel, out], target=target_name, name='out') func(a, b, cOrig) - #print(tvm.lower(s, [data, kernel], simple_mode=True)); + logger.debug(tvm.lower(s, [data, kernel], simple_mode=True)) # Generate and run the optimized schedule sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter, @@ -110,20 +109,20 @@ def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filt padding=hpad, layout='NCHWc', out_layout='NCHWc', - outs=[out]); + outs=[out]) func = tvm.build(sconv, [data, kernel, out], target=target_name, name='conv') func(a, b, cSch) # Functional check - if dataDtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) + if data_dtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) else : assert(np.allclose(cOrig.asnumpy(), cSch.asnumpy())) evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) - #print(tvm.lower(sconv, [data, kernel], simple_mode=True)) + logger.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(a, b, cSch).mean if __name__ == "__main__": - print "Workload, kernelSize, FP32_time, INT8_time, Speedup" + logger.info("Workload, kernelSize, FP32_time, INT8_time, Speedup") speedUps = [] for i in range(0, len(workloads)): # workloas[i] -> (im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride) @@ -132,9 +131,9 @@ def run_inference(dataDtype, kernelDtype, outDtype, im_height, im_width, in_filt int8Time = run_inference('uint8', 'int8', 'int32', *workloads[i]) kh = workloads[i][4] kw = workloads[i][5] - print "Workload#" + str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time) + logger.info("Workload#" 
+ str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time)) speedUps.append(fpTime/int8Time) - print("Average speedup --> ", sum(speedUps)/float(len(speedUps))) + logger.info("Average speedup --> ", sum(speedUps)/float(len(speedUps))) From 541d1550fa78a995d68e812b2c0c282c9aa152ef Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 16:28:37 -0700 Subject: [PATCH 04/18] PR changes --- src/codegen/llvm/codegen_llvm.cc | 2 +- tests/python/unittest/test_conv_int8_intel.py | 138 ++++++++++-------- topi/python/topi/nn/conv2d.py | 2 +- 3 files changed, 76 insertions(+), 66 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 1ef825cf4785..9c788521cdfe 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -692,7 +692,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { llvm::Value *v = MakeValue(op->args[0]); return CreateBroadcast(v, 16); } else if (op->is_intrinsic("bitcast")){ - llvm::Type * target = LLVMType(op->type); + llvm::Type* target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { LOG(FATAL) << "unknown intrinsic " << op->name; diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index e6113a426cf0..1887a65cb236 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -1,18 +1,19 @@ +#pylint: disable-msg=too-many-arguments, too-many-locals, assignment-from-no-return +""" Conv Int8 functional and performance testing""" +import sys +import logging +import numpy as np import tvm import topi -import numpy as np -import timeit -import logging -import sys logging.basicConfig(stream=sys.stdout, level=logging.INFO) -logger = logging.getLogger('test_conv_int8_intel') -logger.disabled = True +LOGGER = logging.getLogger('test_conv_int8_intel') +LOGGER.disabled = True -# All the workloads from Resnet except first layer +# All the WORKLOADS from Resnet except first layer # Workload is ['height', 'width', 'in_filter', 'out_filter', # 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) -workloads = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), +WORKLOADS = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1), (56, 56, 64, 64, 1, 1, 0, 0, 1, 1), (56, 56, 64, 128, 3, 3, 1, 1, 2, 2), (56, 56, 64, 128, 1, 1, 0, 0, 2, 2), @@ -40,38 +41,48 @@ ] -target_name = 'llvm -mcpu=skylake-avx512' -avx2_len = 16 -ctx = tvm.context(target_name, 0) +TARGET_NAME = 'llvm -mcpu=skylake-avx512' +NUM_VEC_LANES = 16 +CTX = tvm.context(TARGET_NAME, 0) -def get_shape(im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, - hstride, wstride, out_dtype): +def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, + hstride, wstride, out_dtype): + """ + Finds out the shape of all data structures + """ ## Find shapes - data_shape = (1, in_filter/avx2_len, im_height, im_width, avx2_len) + data_shape = (1, in_filter/NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES) if out_dtype == 'int32': - if kh != 1: - kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len/4, avx2_len, 4) + if k_h != 1: + kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w, + NUM_VEC_LANES/4, NUM_VEC_LANES, 4) else: - kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len/4, avx2_len, 4, kh, kw) + kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES/4, + 
NUM_VEC_LANES, 4, k_h, k_w)
     elif out_dtype == 'float32':
-        if kh != 1:
-            kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, kh, kw, avx2_len, avx2_len)
+        if k_h != 1:
+            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w,
+                            NUM_VEC_LANES, NUM_VEC_LANES)
         else:
-            kernel_shape = (out_filter/avx2_len, in_filter/avx2_len, avx2_len, avx2_len, kh, kw)
-    out_height = (im_height + 2 * hpad - kh) // hstride + 1
-    out_width = (im_width + 2 * wpad - kw) // wstride + 1
-    o_shape = (1, out_filter/avx2_len, out_height, out_width, avx2_len)
+            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES,
+                            NUM_VEC_LANES, k_h, k_w)
+    out_height = (im_height + 2 * hpad - k_h) // hstride + 1
+    out_width = (im_width + 2 * wpad - k_w) // wstride + 1
+    o_shape = (1, out_filter/NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
     return (data_shape, kernel_shape, o_shape)
 
 
 def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
-                  out_filter, kh, kw, hpad, wpad, hstride, wstride):
-
+                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
+    """
+    Runs the inference and checks the functional correctness between
+    compute and schedule outputs
+    """
     (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
-                                                    out_filter, kh, kw, hpad, wpad,
-                                                    hstride, wstride, out_dtype)
+                                                    out_filter, k_h, k_w, hpad, wpad,
+                                                    hstride, wstride, out_dtype)
 
     # Create TVM placeholders
     data = tvm.placeholder(data_shape, name='data', dtype=data_dtype)
@@ -79,61 +90,60 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f
 
     # Create the numpy arrays to be used for executing conv models
     if data_dtype == 'float32':
-        a = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), ctx)
-        b = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), ctx)
+        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
+        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
     else:
-        a = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
-        b = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))
+        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
+        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))
 
-    # cOrig will be used for declaration ouptut
-    # cSch will be used for scheduled computation output
-    cOrig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx)
-    cSch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), ctx)
+    # c_orig will be used for declaration output
+    # c_sch will be used for scheduled computation output
+    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
+    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
 
     with tvm.target.create(TARGET_NAME):
         conv = topi.nn.conv2d_NCHWc(data, kernel, num_filter=out_filter,
-                                    kernel_size=(kh, kw), stride=hstride,
+                                    kernel_size=(k_h, k_w), stride=hstride,
                                     padding=hpad, layout='NCHWc',
                                     out_layout='NCHWc', out_dtype=out_dtype)
         out = topi.nn.relu(conv)
-        s = tvm.create_schedule(out.op)
-        func = tvm.build(s, [data, kernel, out], target=target_name, name='out')
-        func(a, b, cOrig)
-        logger.debug(tvm.lower(s, [data, kernel], simple_mode=True))
+        sch = tvm.create_schedule(out.op)
+        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
+        func(data_array, kernel_array, c_orig)
+        
LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True)) # Generate and run the optimized schedule sconv = topi.generic.nn.schedule_conv2d_NCHWc(num_filter=out_filter, - kernel_size=(kh,kw), + kernel_size=(k_h, k_w), strides=hstride, padding=hpad, layout='NCHWc', out_layout='NCHWc', outs=[out]) - func = tvm.build(sconv, [data, kernel, out], target=target_name, name='conv') - func(a, b, cSch) + func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv') + func(data_array, kernel_array, c_sch) # Functional check - if data_dtype == 'uint8': np.testing.assert_equal(cOrig.asnumpy(), cSch.asnumpy()) - else : assert(np.allclose(cOrig.asnumpy(), cSch.asnumpy())) + if data_dtype == 'uint8': + np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy()) + else: + assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) - evaluator = func.time_evaluator(func.entry_name, ctx, number=1000) - logger.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) - return evaluator(a, b, cSch).mean + evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) + LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) + return evaluator(data_array, kernel_array, c_sch).mean if __name__ == "__main__": - logger.info("Workload, kernelSize, FP32_time, INT8_time, Speedup") - speedUps = [] - for i in range(0, len(workloads)): - # workloas[i] -> (im_height, im_width, in_filter, out_filter, kh, kw, hpad, wpad, hstride, wstride) - # Int8 - fpTime = run_inference('float32','float32','float32', *workloads[i]) - int8Time = run_inference('uint8', 'int8', 'int32', *workloads[i]) - kh = workloads[i][4] - kw = workloads[i][5] - logger.info("Workload#" + str(i) + ", " + str(kh) + "x" + str(kw) + ", " + str(fpTime) + ", " + str(int8Time) + ", " + str(fpTime/int8Time)) - - speedUps.append(fpTime/int8Time) - logger.info("Average speedup --> ", sum(speedUps)/float(len(speedUps))) - - + LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup") + SPEEDUP_ARRAY = [] + for i in enumerate(len(WORKLOADS)): + fp32_time = run_inference('float32', 'float32', 'float32', *WORKLOADS[i]) + int8_time = run_inference('uint8', 'int8', 'int32', *WORKLOADS[i]) + kernel_h = WORKLOADS[i][4] + kernel_w = WORKLOADS[i][5] + LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", " + + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) + + SPEEDUP_ARRAY.append(fp32_time/int8_time) + LOGGER.info("Average speedup --> %s" % sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 1f3c1c1dd379..809a05851825 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -79,7 +79,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - assert data.dtype == kernel.dtype or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ + assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ "Do not support inputs with different data types now. ' \ '{} vs. 
{}".format(data.dtype, kernel.dtype) return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) From 314333d75b04312ce47d25df85d901c1aec3a89e Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 16:59:05 -0700 Subject: [PATCH 05/18] PR changes --- tests/python/unittest/test_conv_int8_intel.py | 10 +- topi/python/topi/x86/conv2d_avx_common.py | 2 +- topi/python/topi/x86/int8Intrinsics.py | 95 ---------------- topi/python/topi/x86/int8_intrinsics.py | 104 ++++++++++++++++++ 4 files changed, 110 insertions(+), 101 deletions(-) delete mode 100644 topi/python/topi/x86/int8Intrinsics.py create mode 100644 topi/python/topi/x86/int8_intrinsics.py diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index 1887a65cb236..e50a583df51f 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -137,11 +137,11 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f if __name__ == "__main__": LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup") SPEEDUP_ARRAY = [] - for i in enumerate(len(WORKLOADS)): - fp32_time = run_inference('float32', 'float32', 'float32', *WORKLOADS[i]) - int8_time = run_inference('uint8', 'int8', 'int32', *WORKLOADS[i]) - kernel_h = WORKLOADS[i][4] - kernel_w = WORKLOADS[i][5] + for i, wkl in enumerate(WORKLOADS): + fp32_time = run_inference('float32', 'float32', 'float32', *wkl) + int8_time = run_inference('uint8', 'int8', 'int32', *wkl) + kernel_h = wkl[4] + kernel_w = wkl[5] LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", " + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 488ee0e41249..f014aa5719a2 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -8,7 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad -from .int8Intrinsics import _intrin_reduce4int8_common +from .int8_intrinsics import _intrin_reduce4int8_common AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) diff --git a/topi/python/topi/x86/int8Intrinsics.py b/topi/python/topi/x86/int8Intrinsics.py deleted file mode 100644 index abf6e3ace607..000000000000 --- a/topi/python/topi/x86/int8Intrinsics.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Core kernel of dot product of 4 Int8 operations""" -import tvm - - -def _intrin_reduce4int8_common(vec_size, num_elements_intel): - A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') - B = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='B') - k = tvm.reduce_axis((0, 4), name='k') - C = tvm.compute((vec_size,), \ - lambda i: tvm.sum(\ - A[k].astype('int32') * B[i, k].astype('int32'), \ - axis=k), name="C") - s = tvm.create_schedule(C.op) - - Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", - offset_factor=1, - strides=[1]) - Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", - offset_factor=1, - strides=[tvm.var('ldw'), 1]) - - def _intrin_func(ins, outs): - def _instr(index): - ib = tvm.ir_builder.create() - if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) - return ib.get() - - A_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) - vecA_i32 = 
tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); - vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) - vecB = ins[1].vload([0, 0], "int8x64") - vecOne = tvm.const(1, "int16x32") - pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) - quadReduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.pmaddw.d.512', - tvm.const(0, 'uint32'), \ - pairReduction, vecOne); - vecC = outs[0].vload([0], "int32x16") - out = quadReduction + vecC - ib.emit(outs[0].vstore(0, out)) - return ib.get() - - # body, reset, update - return _instr(0), _instr(1), _instr(2) - - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) - -def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): - A = tvm.placeholder((num_elements_intel,), dtype='uint8', name='A') - B = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='B') - k = tvm.reduce_axis((0, 4), name='k') - C = tvm.compute((vec_size,), \ - lambda i: tvm.sum(\ - A[k].astype('int32') * B[i, k, 0, 0].astype('int32'), \ - axis=k), name="C") - s = tvm.create_schedule(C.op) - - Ab = tvm.decl_buffer(A.shape, dtype='uint8', name="Ab", - offset_factor=1, - strides=[1]) - Bb = tvm.decl_buffer(B.shape, dtype='int8', name="Bb", - offset_factor=1, - strides=[tvm.var('ldw'), tvm.var('ldw'), tvm.var('ldw'), 1]) - - def _intrin_func(ins, outs): - def _instr(index): - ib = tvm.ir_builder.create() - if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) - return ib.get() - - A_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', A_int8) - vecA_i32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32); - vecA = tvm.call_pure_intrin('int8x64', 'bitcast', vecA_i32) - vecB = ins[1].vload([0, 0, 0, 0], "int8x64") - vecOne = tvm.const(1, "int16x32") - pairReduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vecA, vecB) - quadReduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.pmaddw.d.512', - tvm.const(0, 'uint32'), \ - pairReduction, vecOne); - vecC = outs[0].vload([0], "int32x16") - out = quadReduction + vecC - ib.emit(outs[0].vstore(0, out)) - return ib.get() - - # body, reset, update - return _instr(0), _instr(1), _instr(2) - - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:Ab, B:Bb}) diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py new file mode 100644 index 000000000000..7fb948e23db6 --- /dev/null +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -0,0 +1,104 @@ +"""Core kernel of dot product of 4 Int8 operations""" +import tvm + + +def _intrin_reduce4int8_common(vec_size, num_elements_intel): + data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') + kernel = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='kernel') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), + lambda i: tvm.sum(data[k].astype('int32') * + kernel[i, k].astype('int32'), + axis=k), + name="C") + + a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1, + strides=[tvm.var('ldw'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if 
index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + a_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_b = ins[1].vload([0, 0], "int8x64") + vec_one = tvm.const(1, "int16x32") + pair_reduction = tvm.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + tvm.const(0, 'uint32'), + vec_a, vec_b) + quad_reduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), + pair_reduction, vec_one) + vec_c = outs[0].vload([0], "int32x16") + out = quad_reduction + vec_c + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) + +def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): + data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') + kernel = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='kernel') + k = tvm.reduce_axis((0, 4), name='k') + C = tvm.compute((vec_size,), \ + lambda i: tvm.sum(data[k].astype('int32') * + kernel[i, k, 0, 0].astype('int32'), + axis=k), + name="C") + + a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1, + strides=[tvm.var('ldw'), + tvm.var('ldw'), + tvm.var('ldw'), 1] + ) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + return ib.get() + + a_int8 = ins[0].vload([0], "uint8x4") + re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_b = ins[1].vload([0, 0, 0, 0], "int8x64") + vec_one = tvm.const(1, "int16x32") + pair_reduction = tvm.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + tvm.const(0, 'uint32'), + vec_a, vec_b) + quad_reduction = tvm.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.const(0, 'uint32'), \ + pair_reduction, vec_one) + vec_c = outs[0].vload([0], "int32x16") + out = quad_reduction + vec_c + ib.emit(outs[0].vstore(0, out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) From b24726fb690d885334e52d9c0953358b1a582134 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 17:04:57 -0700 Subject: [PATCH 06/18] Fixing an error --- tests/python/unittest/test_conv_int8_intel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index e50a583df51f..5d77324ba503 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -146,4 +146,4 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) 
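The bitcast -> broadcast16 -> bitcast chain in the renamed intrinsics file does nothing more than replicate the four data bytes across all sixteen 32-bit lanes before the multiply. In numpy terms, as an illustration (independent of endianness, since the bytes are viewed back the same way they were viewed in):

    import numpy as np

    four = np.array([1, 2, 3, 4], dtype=np.uint8)
    as_i32 = four.view(np.int32)       # 'bitcast': uint8x4 -> int32
    vec16 = np.repeat(as_i32, 16)      # 'broadcast16': int32 -> int32x16
    as_bytes = vec16.view(np.uint8)    # 'bitcast': int32x16 -> 64 bytes
    assert (as_bytes.reshape(16, 4) == four).all()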
SPEEDUP_ARRAY.append(fp32_time/int8_time) - LOGGER.info("Average speedup --> %s" % sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))) + LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY)))) From 6d4aac21c53458b654188b8dc2d24ca622126eeb Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 31 Aug 2018 17:04:57 -0700 Subject: [PATCH 07/18] Fixing an error --- src/codegen/llvm/codegen_llvm.cc | 4 +-- tests/python/unittest/test_conv_int8_intel.py | 2 +- topi/python/topi/x86/conv2d.py | 16 ++++----- topi/python/topi/x86/conv2d_avx_1x1.py | 36 ++++++++++--------- topi/python/topi/x86/conv2d_avx_common.py | 33 +++++++++-------- topi/python/topi/x86/int8_intrinsics.py | 1 + 6 files changed, 50 insertions(+), 42 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 9c788521cdfe..f8b402d78b03 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,10 +688,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->is_intrinsic("broadcast16")){ + } else if (op->is_intrinsic("broadcast16")) { llvm::Value *v = MakeValue(op->args[0]); return CreateBroadcast(v, 16); - } else if (op->is_intrinsic("bitcast")){ + } else if (op->is_intrinsic("bitcast")) { llvm::Type* target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py index e50a583df51f..5d77324ba503 100644 --- a/tests/python/unittest/test_conv_int8_intel.py +++ b/tests/python/unittest/test_conv_int8_intel.py @@ -146,4 +146,4 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time)) SPEEDUP_ARRAY.append(fp32_time/int8_time) - LOGGER.info("Average speedup --> %s" % sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))) + LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY)))) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 735b8400e5a5..dbc4b678c19a 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -48,7 +48,7 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required - + ## Following are for INT8 kernels Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), @@ -119,8 +119,8 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required - - + + # Following are for INT8 operations # workloads of resnet18_v1 on imagenet AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), @@ -233,7 +233,7 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc, AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc } - + # Use int8 schedules if the input data is of int8 dtype if data.dtype == 'uint8': _AVX_SCH_TO_DECL_FUNC = { @@ -246,7 +246,7 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, kh, kw 
= kernel_size wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype), + dtype=kernel.dtype), stride, padding, out_dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) @@ -385,7 +385,7 @@ def traverse(op): if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] - + _AVX_SCH_TO_SCH_FUNC = { AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc, AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc @@ -397,14 +397,14 @@ def traverse(op): AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc_int8, AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc_int8 } - + n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block original_data = tvm.placeholder((n, ic, h, w), dtype=data.dtype) kh, kw = kernel_size original_kernel = tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype) + dtype=kernel.dtype) wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) sch = _get_schedule_NCHWc(wkl, layout, out_layout) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 4e7491bd95d8..20dd162f3ee8 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -252,16 +252,19 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): # Intel performs dot product of 2 "4" Int8 values n_elems = 4 - assert(sch.ic_bn%4 == 0) + assert sch.ic_bn%4 == 0 ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') - + conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * - kernel[oc_chunk, ic_outer, ic_f_inner, oc_block, ic_s_inner, 0, 0].astype(out_dtype), - axis=[ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', - tag="conv2d_NCHWc_int8") + tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR, + ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, ic_f_inner, + oc_block, ic_s_inner, 0, 0].astype(out_dtype), + axis=[ic_outer, ic_f_inner, ic_s_inner]), + name='conv2d_NCHWc_int8', + tag="conv2d_NCHWc_int8") return conv @@ -271,9 +274,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): """ Defines the schedule for INT8 for intel machines Uses the Intel intrinsics to use INT8 operations - More details - https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + More details - https://software.intel.com/en-us/articles/ + lower-numerical-precision-deep-learning-inference-and-training """ - + target = tvm.target.current_target(allow_none=False) avx2_len = -1 for opt in target.options: @@ -281,8 +285,8 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): avx2_len = 16 else: return s - assert(avx2_len != -1) - + assert avx2_len != -1 + # schedule data A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): @@ -306,19 +310,19 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis - - # Sylake and future processors have 16 vector lanes - assert(sch.oc_bn % 
avx2_len == 0) - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + # Sylake and future processors have 16 vector lanes + assert sch.oc_bn % avx2_len == 0 + + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_outer, ic_f_inner, oh_inner, - ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) + ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) s[CC].fuse(oc_chunk, oh_outer) - + n_elems = 4 pc = _intrin_reduce4int8_1x1(avx2_len, n_elems) s[CC].tensorize(oc_s_inner, pc) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index f014aa5719a2..e59a6132d9f1 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -261,7 +261,7 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): Inputs are in INT8 datatype Ouptut is in INT32 datatype """ - + out_dtype = wkl.out_dtype HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride @@ -285,15 +285,17 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): # Intel performs dot product of 2 "4" Int8 values # Current implementation requires ic_bn to be a multiple of 4 n_elems = 4 - assert(sch.ic_bn%4 == 0) - + assert sch.ic_bn%4 == 0 + ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer') ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner') ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * - kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), - axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), + tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw, + ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * + kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, + oc_block, ic_s_inner].astype(out_dtype), + axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") return conv @@ -302,9 +304,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): """ Defines the schedule for INT8 for intel machines Uses the Intel intrinsics to use INT8 operations - More details - https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training + More details - https://software.intel.com/en-us/articles/ + lower-numerical-precision-deep-learning-inference-and-training """ - + # Currently INT8 operations are supported for only Skylake # In future the _intrin_reduce4int8 will be updated for VNNI instructions # In case of unsupported target, the schedule will go to the original @@ -317,8 +320,8 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): avx2_len = 16 else: return s - assert(avx2_len != -1) - + assert avx2_len != -1 + A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): batch, ic_chunk, ih, iw, _ = s[A].op.axis @@ -342,11 +345,11 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) - - # Sylake and future processors have 16 vector lanes - assert(sch.oc_bn % avx2_len == 0) - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len); + # Sylake and 
future processors have 16 vector lanes
-    assert(sch.oc_bn % avx2_len == 0)
-    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len);
+    # Sylake and future processors have 16 vector lanes
+    assert sch.oc_bn % avx2_len == 0
+
+    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len)

     if sch.unroll_kw:
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw,
@@ -356,7 +359,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, kw, ic_f_inner,
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
-
+
     n_elems = 4
     pc = _intrin_reduce4int8_common(avx2_len, n_elems)
     s[CC].tensorize(oc_s_inner, pc)
diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py
index 7fb948e23db6..0d9f5987804b 100644
--- a/topi/python/topi/x86/int8_intrinsics.py
+++ b/topi/python/topi/x86/int8_intrinsics.py
@@ -1,4 +1,5 @@
 """Core kernel of dot product of 4 Int8 operations"""
+#pylint: disable=invalid-name
 import tvm
From 4cd7f3066a6e162398d9bd5bc8faba7149968c42 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 4 Sep 2018 22:20:14 +0000
Subject: [PATCH 09/18] Minor typos fix

---
 tests/python/unittest/test_conv_int8_intel.py | 16 ++++++++--------
 topi/python/topi/x86/conv2d_avx_1x1.py | 4 ++--
 topi/python/topi/x86/conv2d_avx_common.py | 4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/python/unittest/test_conv_int8_intel.py b/tests/python/unittest/test_conv_int8_intel.py
index 5d77324ba503..863b3a6a41ab 100644
--- a/tests/python/unittest/test_conv_int8_intel.py
+++ b/tests/python/unittest/test_conv_int8_intel.py
@@ -8,7 +8,7 @@
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 LOGGER = logging.getLogger('test_conv_int8_intel')
-LOGGER.disabled = True
+LOGGER.disabled = False

 # All the WORKLOADS from Resnet except first layer
 # Workload is ['height', 'width', 'in_filter', 'out_filter',
@@ -51,25 +51,25 @@ def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad,
     Finds out the shape of all data structures
     """
     ## Find shapes
-    data_shape = (1, in_filter/NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
+    data_shape = (1, in_filter//NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)

     if out_dtype == 'int32':
         if k_h != 1:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w,
-                            NUM_VEC_LANES/4, NUM_VEC_LANES, 4)
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                            NUM_VEC_LANES//4, NUM_VEC_LANES, 4)
         else:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES/4,
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES//4,
                             NUM_VEC_LANES, 4, k_h, k_w)
     elif out_dtype == 'float32':
         if k_h != 1:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, k_h, k_w,
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
                             NUM_VEC_LANES, NUM_VEC_LANES)
         else:
-            kernel_shape = (out_filter/NUM_VEC_LANES, in_filter/NUM_VEC_LANES, NUM_VEC_LANES,
+            kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, NUM_VEC_LANES,
                             NUM_VEC_LANES, k_h, k_w)
     out_height = (im_height + 2 * hpad - k_h) // hstride + 1
     out_width = (im_width + 2 * wpad - k_w) // wstride + 1 
- o_shape = (1, out_filter/NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES) + o_shape = (1, out_filter//NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES) return (data_shape, kernel_shape, o_shape) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 20dd162f3ee8..b35ec3f6b8f1 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -8,7 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad -from .int8Intrinsics import _intrin_reduce4int8_1x1 +from .int8_intrinsics import _intrin_reduce4int8_1x1 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) @@ -311,7 +311,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): _, oc_chunk, oh, ow, oc_block = s[CC].op.axis ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis - # Sylake and future processors have 16 vector lanes + # Skylake and future processors have 16 vector lanes assert sch.oc_bn % avx2_len == 0 oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index e59a6132d9f1..052b6ec14060 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -259,7 +259,7 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel): """ This function sets up the compute for INT8 conv 2d Inputs are in INT8 datatype - Ouptut is in INT32 datatype + Output is in INT32 datatype """ out_dtype = wkl.out_dtype @@ -346,7 +346,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) - # Sylake and future processors have 16 vector lanes + # Skylake and future processors have 16 vector lanes assert sch.oc_bn % avx2_len == 0 oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) From 621f7bb478f77f77b7badc271d82c2a0fbf7a964 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Sep 2018 17:32:52 +0000 Subject: [PATCH 10/18] Removing the broadcast16 CPP code. 
Using astype feature instead --- src/codegen/llvm/codegen_llvm.cc | 3 --- topi/python/topi/x86/int8_intrinsics.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index f8b402d78b03..799193b1dede 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,9 +688,6 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->is_intrinsic("broadcast16")) { - llvm::Value *v = MakeValue(op->args[0]); - return CreateBroadcast(v, 16); } else if (op->is_intrinsic("bitcast")) { llvm::Type* target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index 0d9f5987804b..b802137205a1 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -29,7 +29,7 @@ def _instr(index): a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) - vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") @@ -81,7 +81,7 @@ def _instr(index): a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) - vec_ai32 = tvm.call_pure_intrin('int32x16', 'broadcast16', re_int32) + vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) vec_b = ins[1].vload([0, 0, 0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") From 74516c053fa39039df9403c707b8c8d12ae1cd0f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Sep 2018 21:34:56 +0000 Subject: [PATCH 11/18] Replacing constant by variable name num_elements_intel --- topi/python/topi/x86/int8_intrinsics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index b802137205a1..83a574283048 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -6,7 +6,7 @@ def _intrin_reduce4int8_common(vec_size, num_elements_intel): data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') kernel = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, 4), name='k') + k = tvm.reduce_axis((0, num_elements_intel), name='k') C = tvm.compute((vec_size,), lambda i: tvm.sum(data[k].astype('int32') * kernel[i, k].astype('int32'), @@ -55,7 +55,7 @@ def _instr(index): def _intrin_reduce4int8_1x1(vec_size, num_elements_intel): data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data') kernel = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, 4), name='k') + k = tvm.reduce_axis((0, num_elements_intel), name='k') C = tvm.compute((vec_size,), \ lambda i: tvm.sum(data[k].astype('int32') * kernel[i, k, 0, 0].astype('int32'), From f68a6fa72f6cd0686621e74946e6354da86f3068 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 6 Sep 2018 01:21:29 +0000 Subject: [PATCH 12/18] Name fixes and tensorize update rule updated --- topi/python/topi/x86/conv2d_avx_1x1.py | 12 ++++++------ topi/python/topi/x86/conv2d_avx_common.py | 12 ++++++------ 
topi/python/topi/x86/int8_intrinsics.py | 14 ++++++++------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index b35ec3f6b8f1..b43da7372eed 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -279,13 +279,13 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): """ target = tvm.target.current_target(allow_none=False) - avx2_len = -1 + int32_lanes = -1 for opt in target.options: if opt == '-mcpu=skylake-avx512': - avx2_len = 16 + int32_lanes = 16 else: return s - assert avx2_len != -1 + assert int32_lanes != -1 # schedule data A = data @@ -312,9 +312,9 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis # Skylake and future processors have 16 vector lanes - assert sch.oc_bn % avx2_len == 0 + assert sch.oc_bn % int32_lanes == 0 - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes) oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor) ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor) @@ -324,7 +324,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): s[CC].fuse(oc_chunk, oh_outer) n_elems = 4 - pc = _intrin_reduce4int8_1x1(avx2_len, n_elems) + pc = _intrin_reduce4int8_1x1(int32_lanes, n_elems) s[CC].tensorize(oc_s_inner, pc) s[CC].unroll(ow_inner) s[CC].unroll(oh_inner) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 052b6ec14060..ae4ed0270a76 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -314,13 +314,13 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): # compute target = tvm.target.current_target(allow_none=False) - avx2_len = -1 + int32_lanes = -1 for opt in target.options: if opt == '-mcpu=skylake-avx512': - avx2_len = 16 + int32_lanes = 16 else: return s - assert avx2_len != -1 + assert int32_lanes != -1 A = data if isinstance(s[A].op, tvm.tensor.ComputeOp): @@ -347,9 +347,9 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n) # Skylake and future processors have 16 vector lanes - assert sch.oc_bn % avx2_len == 0 + assert sch.oc_bn % int32_lanes == 0 - oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=avx2_len) + oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes) if sch.unroll_kw: s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw, @@ -361,7 +361,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): n_elems = 4 - pc = _intrin_reduce4int8_common(avx2_len, n_elems) + pc = _intrin_reduce4int8_common(int32_lanes, n_elems) s[CC].tensorize(oc_s_inner, pc) s[CC].unroll(ow_block) s[CC].unroll(oc_f_inner) diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index 83a574283048..b0b5523c2980 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -41,9 +41,10 @@ def _instr(index): 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), pair_reduction, vec_one) - vec_c = outs[0].vload([0], "int32x16") - out = quad_reduction + vec_c - ib.emit(outs[0].vstore(0, out)) + if index == 0: + ib.emit(outs[0].vstore(0, quad_reduction)) + else: + ib.emit(outs[0].vstore(0, 
quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update @@ -93,9 +94,10 @@ def _instr(index): 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), \ pair_reduction, vec_one) - vec_c = outs[0].vload([0], "int32x16") - out = quad_reduction + vec_c - ib.emit(outs[0].vstore(0, out)) + if index == 0: + ib.emit(outs[0].vstore(0, quad_reduction)) + else: + ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update From ed984533008ad1cd5c6f6efccde71a3463b06514 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 11 Sep 2018 16:56:41 +0000 Subject: [PATCH 13/18] Fixing the bug about checking skylake --- topi/python/topi/nn/conv2d.py | 10 ++++++++++ topi/python/topi/x86/conv2d_avx_1x1.py | 11 +++++------ topi/python/topi/x86/conv2d_avx_common.py | 11 +++++------ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 809a05851825..c67828b79e26 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -397,3 +397,13 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, 4-D with shape [batch, out_height, out_width, out_channel] """ raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") + +def check_skylake(target): + """ + Checks if the target is skylake + """ + + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + return True + return False diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index b43da7372eed..8dc46d16c2f3 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -5,7 +5,7 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload +from ..nn.conv2d import _get_schedule, _get_workload, check_skylake from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_1x1 @@ -280,11 +280,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): target = tvm.target.current_target(allow_none=False) int32_lanes = -1 - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - int32_lanes = 16 - else: - return s + if check_skylake(target): + int32_lanes = 16 + else: + return s assert int32_lanes != -1 # schedule data diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index ae4ed0270a76..03c86701a131 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -5,7 +5,7 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload +from ..nn.conv2d import _get_schedule, _get_workload, check_skylake from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_common @@ -315,11 +315,10 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): target = tvm.target.current_target(allow_none=False) int32_lanes = -1 - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - int32_lanes = 16 - else: - return s + if check_skylake(target): + int32_lanes = 16 + else: + return s assert int32_lanes != -1 A = data From 3a53b510dd7db2e5e6277dd41639e9ffc0c4a237 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Sep 2018 01:06:54 +0000 Subject: [PATCH 14/18] Replacing bitcast with reinterpret --- src/codegen/llvm/codegen_llvm.cc | 
3 --- topi/python/topi/x86/int8_intrinsics.py | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 799193b1dede..c1b1fe24f0a8 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -688,9 +688,6 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { indices.push_back(i); } return builder_->CreateShuffleVector(v0, v1, indices); - } else if (op->is_intrinsic("bitcast")) { - llvm::Type* target = LLVMType(op->type); - return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/int8_intrinsics.py index b0b5523c2980..26657abe8160 100644 --- a/topi/python/topi/x86/int8_intrinsics.py +++ b/topi/python/topi/x86/int8_intrinsics.py @@ -28,9 +28,9 @@ def _instr(index): return ib.get() a_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') - vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin('int16x32', @@ -81,9 +81,9 @@ def _instr(index): return ib.get() a_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'bitcast', a_int8) + re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') - vec_a = tvm.call_pure_intrin('int8x64', 'bitcast', vec_ai32) + vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0, 0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin('int16x32', From 9acbd753c3e3e5f6c1559464673a39eaf83b1c0b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 13 Sep 2018 01:56:17 +0000 Subject: [PATCH 15/18] Isolating INT8 and FP32 schedules to ease out future AutoTVM PR merge --- topi/python/topi/nn/conv2d.py | 26 ++++++++ topi/python/topi/x86/conv2d.py | 111 ++++++++++++++++++++++----------- 2 files changed, 99 insertions(+), 38 deletions(-) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index c67828b79e26..5f58cf153640 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -84,6 +84,21 @@ def _get_workload(data, kernel, stride, padding, out_dtype): '{} vs. {}".format(data.dtype, kernel.dtype) return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) +def _get_workload_int8(data, kernel, stride, padding, out_dtype): + """ Get the workload structure. """ + _, CI, IH, IW = [x.value for x in data.shape] + CO, _, KH, KW = [x.value for x in kernel.shape] + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \ + "Do not support inputs with different data types now. ' \ + '{} vs. 
{}".format(data.dtype, kernel.dtype) + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + + @tvm.target.generic_func def _get_alter_layout_schedule(wkl): @@ -118,6 +133,17 @@ def _get_schedule_NCHWc(wkl, layout, out_layout): return wkl +@tvm.target.generic_func +def _get_schedule_NCHWc_int8(wkl, layout, out_layout): + # pylint: disable=unreachable + """ Get the platform specific schedule. """ + target = tvm.target.current_target() + raise RuntimeError( + "No schedule for current target:{}".format(target)) + # This return has no use, merely to supress pylint warning + return wkl + + def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None): """Convolution operator in NCHW layout. diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index dbc4b678c19a..257ad0819bb8 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -5,8 +5,8 @@ from .. import nn from ..nn.util import infer_pad, infer_stride from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \ - _get_workload, _get_schedule, _get_schedule_NCHWc, \ - _get_alter_layout_schedule, Workload + _get_workload, _get_workload_int8, _get_schedule, _get_schedule_NCHWc, \ + _get_schedule_NCHWc_int8, _get_alter_layout_schedule, Workload from . import conv2d_avx_1x1, conv2d_avx_common from .conv2d_avx_common import AVXConvCommonFwd @@ -48,35 +48,6 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required - - ## Following are for INT8 kernels - Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - # workloads of resnet34_v1 on imagenet, no extra workload required - # workloads of resnet50_v1 on imagenet - Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), - Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), - Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1), ] fp32_vec_len = 8 @@ -119,8 +90,55 @@ def _get_schedule_conv(wkl): # workloads of resnet152_v1 on imagenet, 
no extra workload required # workloads of resnet18_v2 on imagenet, no extra workload required # workloads of resnet34_v2 on imagenet, no extra workload required + ] + if wkl not in _WORKLOADS_AVX: + if wkl.hkernel == 1 and wkl.wkernel == 1: + return conv2d_avx_1x1._get_default_schedule(wkl, fp32_vec_len) + return conv2d_avx_common._get_default_schedule(wkl, fp32_vec_len) + idx = _WORKLOADS_AVX.index(wkl) + sch = _SCHEDULES_AVX[idx] + return sch +def _get_schedule_conv_int8(wkl): + _WORKLOADS_AVX = [ + ## Following are for INT8 kernels + Workload('uint8', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('uint8', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + # workloads of resnet34_v1 on imagenet, no extra workload required + # workloads of resnet50_v1 on imagenet + Workload('uint8', 'int32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1), + Workload('uint8', 'int32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2), + Workload('uint8', 'int32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1), + ] + + fp32_vec_len = 8 + target = tvm.target.current_target(allow_none=False) + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + fp32_vec_len = 16 + + _SCHEDULES_AVX = [ # Following are for INT8 operations # workloads of resnet18_v1 on imagenet AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False), @@ -168,6 +186,10 @@ def _get_schedule_conv(wkl): def _get_schedule_NCHWc_x86(wkl, layout, out_layout): return _get_schedule_conv(wkl) +@_get_schedule_NCHWc_int8.register("cpu") +def _get_schedule_NCHWc_x86_int8(wkl, layout, out_layout): + return _get_schedule_conv_int8(wkl) + @_get_alter_layout_schedule.register("cpu") def _get_alter_layout_schedule_x86(wkl): return _get_schedule_conv(wkl) @@ -226,6 +248,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos): return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + @conv2d_NCHWc.register("cpu") def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, padding, layout, out_layout, out_dtype): @@ -244,11 +267,18 @@ def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride, n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block kh, kw = kernel_size - wkl = _get_workload(tvm.placeholder((n, ic, h, w), 
dtype=data.dtype), - tvm.placeholder((num_filter, ic, kh, kw), - dtype=kernel.dtype), - stride, padding, out_dtype) - sch = _get_schedule_NCHWc(wkl, layout, out_layout) + if data.dtype == 'uint8': + wkl = _get_workload_int8(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), + stride, padding, out_dtype) + sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) + else: + wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=data.dtype), + tvm.placeholder((num_filter, ic, kh, kw), + dtype=kernel.dtype), + stride, padding, out_dtype) + sch = _get_schedule_NCHWc(wkl, layout, out_layout) return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel) @@ -406,8 +436,13 @@ def traverse(op): original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=kernel.dtype) - wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) - sch = _get_schedule_NCHWc(wkl, layout, out_layout) + if data.dtype == 'uint8': + wkl = _get_workload_int8(original_data, original_kernel, + stride, padding, conv_out.dtype) + sch = _get_schedule_NCHWc_int8(wkl, layout, out_layout) + else: + wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype) + sch = _get_schedule_NCHWc(wkl, layout, out_layout) _AVX_SCH_TO_SCH_FUNC[type(sch)](s, wkl, sch, data_vec, kernel, conv_out, outs[0]) From cdfce1fc58946338568b542990b0e353b8089c13 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 14 Sep 2018 23:28:53 +0000 Subject: [PATCH 16/18] Putting check_skylake function in the x86 directory --- topi/python/topi/nn/conv2d.py | 10 ---------- topi/python/topi/x86/check_targets.py | 12 ++++++++++++ topi/python/topi/x86/conv2d.py | 6 +++--- topi/python/topi/x86/conv2d_avx_1x1.py | 3 ++- topi/python/topi/x86/conv2d_avx_common.py | 3 ++- 5 files changed, 19 insertions(+), 15 deletions(-) create mode 100644 topi/python/topi/x86/check_targets.py diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 5f58cf153640..3e06f6f6fed5 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -423,13 +423,3 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, 4-D with shape [batch, out_height, out_width, out_channel] """ raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") - -def check_skylake(target): - """ - Checks if the target is skylake - """ - - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - return True - return False diff --git a/topi/python/topi/x86/check_targets.py b/topi/python/topi/x86/check_targets.py new file mode 100644 index 000000000000..fad74eaf582a --- /dev/null +++ b/topi/python/topi/x86/check_targets.py @@ -0,0 +1,12 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument +"""Checks different x86 targets for target specific schedules""" + +def check_skylake(target): + """ + Checks if the target is skylake + """ + + for opt in target.options: + if opt == '-mcpu=skylake-avx512': + return True + return False diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 257ad0819bb8..6fe59a909510 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -11,6 +11,7 @@ from . 
import conv2d_avx_1x1, conv2d_avx_common from .conv2d_avx_common import AVXConvCommonFwd from .conv2d_avx_1x1 import AVXConv1x1Fwd +from .check_targets import check_skylake @_get_schedule.register("cpu") def _get_schedule_conv(wkl): @@ -134,9 +135,8 @@ def _get_schedule_conv_int8(wkl): fp32_vec_len = 8 target = tvm.target.current_target(allow_none=False) - for opt in target.options: - if opt == '-mcpu=skylake-avx512': - fp32_vec_len = 16 + if check_skylake(target): + fp32_vec_len = 16 _SCHEDULES_AVX = [ # Following are for INT8 operations diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 8dc46d16c2f3..50c451949026 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -5,10 +5,11 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload, check_skylake +from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_1x1 +from .check_targets import check_skylake AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 03c86701a131..bb5f11b7cf1f 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -5,10 +5,11 @@ import tvm from ..util import get_const_tuple -from ..nn.conv2d import _get_schedule, _get_workload, check_skylake +from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad from .int8_intrinsics import _intrin_reduce4int8_common +from .check_targets import check_skylake AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw']) From 1fdef3891eb886eb95c7445bb6e741095b1f4037 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 20 Sep 2018 16:31:35 +0000 Subject: [PATCH 17/18] Added documentation and organizing files to better locations --- topi/python/topi/x86/conv2d_avx_1x1.py | 5 +- topi/python/topi/x86/conv2d_avx_common.py | 5 +- .../{int8_intrinsics.py => tensor_intrin.py} | 80 ++++++++++++++++--- .../recipe/conv}/test_conv_int8_intel.py | 0 4 files changed, 74 insertions(+), 16 deletions(-) rename topi/python/topi/x86/{int8_intrinsics.py => tensor_intrin.py} (62%) rename {tests/python/unittest => topi/recipe/conv}/test_conv_int8_intel.py (100%) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 50c451949026..e471b89b98ea 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -8,7 +8,7 @@ from ..nn.conv2d import _get_schedule, _get_workload from ..nn.util import infer_pad, infer_stride from ..nn.pad import pad -from .int8_intrinsics import _intrin_reduce4int8_1x1 +from .tensor_intrin import reduce_4int8_1x1 from .check_targets import check_skylake AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor']) @@ -323,8 +323,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last): ow_inner, oc_f_inner, oc_s_inner, ic_s_inner) s[CC].fuse(oc_chunk, oh_outer) - n_elems = 4 - pc = _intrin_reduce4int8_1x1(int32_lanes, n_elems) + pc = reduce_4int8_1x1() s[CC].tensorize(oc_s_inner, pc) s[CC].unroll(ow_inner) s[CC].unroll(oh_inner) diff --git a/topi/python/topi/x86/conv2d_avx_common.py 
b/topi/python/topi/x86/conv2d_avx_common.py
index bb5f11b7cf1f..ec2a79b28d26 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -8,7 +8,7 @@
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
-from .int8_intrinsics import _intrin_reduce4int8_common
+from .tensor_intrin import reduce_4int8_common
 from .check_targets import check_skylake

 AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
@@ -360,8 +360,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)

-    n_elems = 4
-    pc = _intrin_reduce4int8_common(int32_lanes, n_elems)
+    pc = reduce_4int8_common()
     s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_block)
     s[CC].unroll(oc_f_inner)
diff --git a/topi/python/topi/x86/int8_intrinsics.py b/topi/python/topi/x86/tensor_intrin.py
similarity index 62%
rename from topi/python/topi/x86/int8_intrinsics.py
rename to topi/python/topi/x86/tensor_intrin.py
index 26657abe8160..2a62cd543302 100644
--- a/topi/python/topi/x86/int8_intrinsics.py
+++ b/topi/python/topi/x86/tensor_intrin.py
@@ -3,11 +3,41 @@
 import tvm

-def _intrin_reduce4int8_common(vec_size, num_elements_intel):
-    data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data')
-    kernel = tvm.placeholder((vec_size, num_elements_intel), dtype='int8', name='kernel')
-    k = tvm.reduce_axis((0, num_elements_intel), name='k')
-    C = tvm.compute((vec_size,),
+def reduce_4int8_common():
+    """
+    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
+    This function takes two arrays of int8 datatype -- data[4] and
+    kernel[16][4] -- and computes a dot product of data[4] with every
+    4 elements of kernels, resulting in output[16] of int32 datatype.
+    The pseudo code is as follows.
+    .. code-block:: c
+        void reduce_4_int8_common(int8 data[4], int8 kernel[16][4],
+                int32 output[16]){
+            for (int i = 0; i < 16; i++){
+                out[i] = 0;
+                for (int k = 0; k < 4; k++){
+                    out[i] += data[k] * kernel[i][k]
+                }
+            }
+        }
+
+    Physically, the kernel array sits in an AVX512 vector register and
+    the data[4] is broadcasted to another AVX512 vector register. This
+    function returns a TensorIntrin that can be used to tensorize
+    a schedule.
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
+    """
+
+    int32_lanes = 16 # 16 int32 lanes in AVX512
+    num_int8_elements = 4 # 4 int8 elements in int32
+    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
+    k = tvm.reduce_axis((0, num_int8_elements), name='k')
+    C = tvm.compute((int32_lanes,),
                     lambda i: tvm.sum(data[k].astype('int32') *
                                       kernel[i, k].astype('int32'),
                                       axis=k),
@@ -53,11 +83,41 @@ def _instr(index):

     with tvm.build_config(offset_factor=1, partition_const_loop=True):
         return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})

-def _intrin_reduce4int8_1x1(vec_size, num_elements_intel):
-    data = tvm.placeholder((num_elements_intel,), dtype='uint8', name='data')
-    kernel = tvm.placeholder((vec_size, num_elements_intel, 1, 1), dtype='int8', name='kernel')
-    k = tvm.reduce_axis((0, num_elements_intel), name='k')
-    C = tvm.compute((vec_size,), \
+def reduce_4int8_1x1():
+    """
+    Int8 dot product by every 4 elements using AVX512 Skylake instructions. 
+    This function takes two arrays of int8 datatype -- data[4] and
+    kernel[16][4] -- and computes a dot product of data[4] with every
+    4 elements of kernels, resulting in output[16] of int32 datatype.
+    The pseudo code is as follows.
+    .. code-block:: c
+        void reduce_4_int8_1x1(int8 data[4], int8 kernel[16][4],
+                int32 output[16]){
+            for (int i = 0; i < 16; i++){
+                out[i] = 0;
+                for (int k = 0; k < 4; k++){
+                    out[i] += data[k] * kernel[i][k]
+                }
+            }
+        }
+
+    Physically, the kernel array sits in an AVX512 vector register and
+    the data[4] is broadcasted to another AVX512 vector register. This
+    function returns a TensorIntrin that can be used to tensorize
+    a schedule.
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
+    """
+
+    int32_lanes = 16 # 16 int32 lanes in AVX512
+    num_int8_elements = 4 # 4 int8 elements in int32
+    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+    kernel = tvm.placeholder((int32_lanes, num_int8_elements, 1, 1), dtype='int8', name='kernel')
+    k = tvm.reduce_axis((0, num_int8_elements), name='k')
+    C = tvm.compute((int32_lanes,), \
                     lambda i: tvm.sum(data[k].astype('int32') *
                                       kernel[i, k, 0, 0].astype('int32'),
                                       axis=k),
diff --git a/tests/python/unittest/test_conv_int8_intel.py b/topi/recipe/conv/test_conv_int8_intel.py
similarity index 100%
rename from tests/python/unittest/test_conv_int8_intel.py
rename to topi/recipe/conv/test_conv_int8_intel.py

From abd99da083571b727f41d53537f12bc2586166d0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 24 Sep 2018 01:16:38 +0000
Subject: [PATCH 18/18] Tensor intrin renaming. Avoid code duplication for intrin by kernel reshaping

---
 topi/python/topi/x86/conv2d_avx_1x1.py | 16 +++--
 topi/python/topi/x86/conv2d_avx_common.py | 8 +--
 topi/python/topi/x86/tensor_intrin.py | 87 +----------------------
 3 files changed, 17 insertions(+), 94 deletions(-)

diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index e471b89b98ea..bace7451d665 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -3,12 +3,13 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
+import topi

 from ..util import get_const_tuple
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
-from .tensor_intrin import reduce_4int8_1x1
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake

 AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor'])
@@ -253,16 +254,21 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):

     # Intel performs dot product of 2 "4" Int8 values
     n_elems = 4
-    assert sch.ic_bn%4 == 0
+    assert sch.ic_bn%n_elems == 0
     ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
     ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
-    ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner')
+    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
+
+    # Reshaping kernel as the last 2 dimensions are 1x1 (k_h x k_w)
+    k_shape = kernel.shape
+    kernel = topi.reshape(kernel, (k_shape[0], k_shape[1], k_shape[2], k_shape[3],
+                                   k_shape[4] * k_shape[5] * k_shape[6]))

     conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                        tvm.sum(data_pad[n, ic_outer, oh*HSTR, ow*WSTR,
                                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
                                kernel[oc_chunk, ic_outer, 
ic_f_inner,
-                                      oc_block, ic_s_inner, 0, 0].astype(out_dtype),
+                                      oc_block, ic_s_inner].astype(out_dtype),
                                axis=[ic_outer, ic_f_inner, ic_s_inner]),
                        name='conv2d_NCHWc_int8',
                        tag="conv2d_NCHWc_int8")
@@ -323,7 +329,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
                   ow_inner, oc_f_inner, oc_s_inner, ic_s_inner)
     s[CC].fuse(oc_chunk, oh_outer)

-    pc = reduce_4int8_1x1()
+    pc = dot_16x1x16_int8_int8_int32()
     s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_inner)
     s[CC].unroll(oh_inner)
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index ec2a79b28d26..0d7aba23d236 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -8,7 +8,7 @@
 from ..nn.conv2d import _get_schedule, _get_workload
 from ..nn.util import infer_pad, infer_stride
 from ..nn.pad import pad
-from .tensor_intrin import reduce_4int8_common
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
 from .check_targets import check_skylake

 AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
@@ -286,11 +286,11 @@ def _declaration_conv_NCHWc_int8(wkl, sch, data, kernel):
     # Intel performs dot product of 2 "4" Int8 values
     # Current implementation requires ic_bn to be a multiple of 4
     n_elems = 4
-    assert sch.ic_bn%4 == 0
+    assert sch.ic_bn%n_elems == 0

     ic_outer = tvm.reduce_axis((0, wkl.in_filter//(sch.ic_bn)), name='ic_outer')
     ic_f_inner = tvm.reduce_axis((0, sch.ic_bn//n_elems), name='ic_f_inner')
-    ic_s_inner = tvm.reduce_axis((0, 4), name='ic_s_inner')
+    ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
     conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
                        tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw,
                                         ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) *
@@ -360,7 +360,7 @@ def _schedule_conv_NCHWc_int8(s, wkl, sch, data, kernel, conv_out, last):
                       ow_block, oc_f_inner, oc_s_inner, ic_s_inner)

-    pc = reduce_4int8_common()
+    pc = dot_16x1x16_int8_int8_int32()
     s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_block)
     s[CC].unroll(oc_f_inner)
diff --git a/topi/python/topi/x86/tensor_intrin.py b/topi/python/topi/x86/tensor_intrin.py
index 2a62cd543302..28e57f1c10f8 100644
--- a/topi/python/topi/x86/tensor_intrin.py
+++ b/topi/python/topi/x86/tensor_intrin.py
@@ -3,7 +3,7 @@
 import tvm

-def reduce_4int8_common():
+def dot_16x1x16_int8_int8_int32():
     """
     Int8 dot product by every 4 elements using AVX512 Skylake instructions.
     This function takes two arrays of int8 datatype -- data[4] and
@@ -11,7 +11,7 @@
     The pseudo code is as follows.
     .. code-block:: c
-        void reduce_4_int8_common(int8 data[4], int8 kernel[16][4],
+        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                 int32 output[16]){
             for (int i = 0; i < 16; i++){
                 out[i] = 0;
@@ -82,86 +82,3 @@ def _instr(index):

     with tvm.build_config(offset_factor=1, partition_const_loop=True):
         return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
-
-def reduce_4int8_1x1():
-    """
-    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
-    This function takes two arrays of int8 datatype -- data[4] and
-    kernel[16][4] -- and computes a dot product of data[4] with every
-    4 elements of kernels, resulting in output[16] of int32 datatype.
-    The pseudo code is as follows.
-    .. 
code-block:: c
-        void reduce_4_int8_1x1(int8 data[4], int8 kernel[16][4],
-                int32 output[16]){
-            for (int i = 0; i < 16; i++){
-                out[i] = 0;
-                for (int k = 0; k < 4; k++){
-                    out[i] += data[k] * kernel[i][k]
-                }
-            }
-        }
-
-    Physically, the kernel array sits in an AVX512 vector register and
-    the data[4] is broadcasted to another AVX512 vector register. This
-    function returns a TensorIntrin that can be used to tensorize
-    a schedule.
-
-    Returns
-    -------
-    intrin : TensorIntrin
-        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
-    """
-
-    int32_lanes = 16 # 16 int32 lanes in AVX512
-    num_int8_elements = 4 # 4 int8 elements in int32
-    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
-    kernel = tvm.placeholder((int32_lanes, num_int8_elements, 1, 1), dtype='int8', name='kernel')
-    k = tvm.reduce_axis((0, num_int8_elements), name='k')
-    C = tvm.compute((int32_lanes,), \
-                    lambda i: tvm.sum(data[k].astype('int32') *
-                                      kernel[i, k, 0, 0].astype('int32'),
-                                      axis=k),
-                    name="C")
-
-    a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
-                               offset_factor=1,
-                               strides=[1])
-    b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
-                               offset_factor=1,
-                               strides=[tvm.var('ldw'),
-                                        tvm.var('ldw'),
-                                        tvm.var('ldw'), 1]
-                              )
-
-    def _intrin_func(ins, outs):
-        def _instr(index):
-            ib = tvm.ir_builder.create()
-            if index == 1:
-                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
-                return ib.get()
-
-            a_int8 = ins[0].vload([0], "uint8x4")
-            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
-            vec_ai32 = re_int32.astype('int32x16')
-            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
-            vec_b = ins[1].vload([0, 0, 0, 0], "int8x64")
-            vec_one = tvm.const(1, "int16x32")
-            pair_reduction = tvm.call_llvm_intrin('int16x32',
-                                                  'llvm.x86.avx512.pmaddubs.w.512',
-                                                  tvm.const(0, 'uint32'),
-                                                  vec_a, vec_b)
-            quad_reduction = tvm.call_llvm_intrin('int32x16',
-                                                  'llvm.x86.avx512.pmaddw.d.512',
-                                                  tvm.const(0, 'uint32'), \
-                                                  pair_reduction, vec_one)
-            if index == 0:
-                ib.emit(outs[0].vstore(0, quad_reduction))
-            else:
-                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
-            return ib.get()
-
-        # body, reset, update
-        return _instr(0), _instr(1), _instr(2)
-
-    with tvm.build_config(offset_factor=1, partition_const_loop=True):
-        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
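
A note on the semantics the series converges on: dot_16x1x16_int8_int8_int32 lowers to the vpmaddubsw (llvm.x86.avx512.pmaddubs.w.512) and vpmaddwd (llvm.x86.avx512.pmaddw.d.512) pair, which together reduce four uint8 x int8 products into each of the 16 int32 lanes. A minimal NumPy sketch of that arithmetic, mirroring the docstring's pseudo code (the function name and the use of NumPy here are illustrative, not part of the series):

    import numpy as np

    def dot_16x1x16_reference(data, kernel):
        # data: (4,) uint8, kernel: (16, 4) int8 -> (16,) int32
        # out[i] = sum over k of data[k] * kernel[i][k]
        assert data.shape == (4,) and kernel.shape == (16, 4)
        return (data.astype(np.int32) * kernel.astype(np.int32)).sum(axis=1)

One caveat when validating against such a reference: pmaddubsw saturates each uint8 x int8 pair sum to int16, and 255 * 127 + 255 * 127 = 64770 exceeds the int16 maximum of 32767, so the hardware path can clip where this plain int32 reference does not. Keeping the uint8 activations within 7 bits is one common way to stay inside the exact range.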
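
The 7-D kernel layout that the int8 compute declarations index, kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], can be produced offline from a plain OIHW array. A hypothetical NumPy packing helper (not part of the series) for the oc_bn = ic_bn = 16, n_elems = 4 configuration used throughout:

    import numpy as np

    def pack_kernel_oihw_to_nchwc_int8(kernel, oc_bn=16, ic_bn=16, n_elems=4):
        # kernel: (out_filter, in_filter, k_h, k_w) int8 array in OIHW layout
        out_filter, in_filter, k_h, k_w = kernel.shape
        packed = kernel.reshape(out_filter // oc_bn, oc_bn,
                                in_filter // ic_bn, ic_bn // n_elems, n_elems,
                                k_h, k_w)
        # reorder to (oc_chunk, ic_outer, k_h, k_w, ic_f_inner, oc_block, ic_s_inner)
        return np.ascontiguousarray(packed.transpose(0, 2, 5, 6, 3, 1, 4))

With this ordering the innermost ic_s_inner axis walks the four int8 values that one vpmaddubsw step consumes, and oc_block spans the 16 int32 lanes that one vpmaddwd step produces.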
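
PATCH 18's kernel reshape in the 1x1 path is easiest to read as a pure view change: the packed 1x1 kernel carries ic_s_inner followed by two unit spatial dimensions, and merging the trailing three axes lets both the common and 1x1 declarations feed the same (16, 4) access pattern of the shared intrinsic. A small NumPy illustration (shapes hypothetical) of why no data moves:

    import numpy as np

    # A packed 1x1 kernel: (oc_chunk, ic_outer, ic_f_inner, oc_block, ic_s_inner, 1, 1)
    k7 = (np.arange(4 * 4 * 4 * 16 * 4) % 128).astype(np.int8)
    k7 = k7.reshape(4, 4, 4, 16, 4, 1, 1)
    # Merge the last three axes, as topi.reshape does in the patch
    k5 = k7.reshape(k7.shape[0], k7.shape[1], k7.shape[2], k7.shape[3],
                    k7.shape[4] * k7.shape[5] * k7.shape[6])
    assert k5.shape == (4, 4, 4, 16, 4)
    assert np.shares_memory(k5, k7)  # same bytes, only the unit k_h x k_w axes dropped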