From 34d39adba23a6683d41f74b3fa89cb0a06c39ffc Mon Sep 17 00:00:00 2001
From: "shoubhikbhatti@gmail.com" <shoubhikbhatti@gmail.com>
Date: Thu, 27 Jun 2019 16:40:50 -0700
Subject: [PATCH 1/4] [Relay] [Quantization] WIP - Prototyping Quantize and
 Dequantize operators with type inference, lowering and test cases.

---
 include/tvm/relay/attrs/nn_quantize.h       | 156 ++++++++++++++++++++
 include/tvm/relay/quantize_util.h           |  98 ++++++++++++
 python/tvm/relay/op/nn/__init__.py          |   1 +
 python/tvm/relay/op/nn/_make_quantize.py    |  20 +++
 python/tvm/relay/op/nn/_quantize.py         |  73 +++++++++
 python/tvm/relay/quantize/__init__.py       |   1 +
 src/relay/op/nn/dequantize.cc               |  78 ++++++++++
 src/relay/op/nn/quantize_op.cc              |  91 ++++++++++++
 src/relay/pass/quantize_rewrite.cc          |  93 ++++++++++++
 tests/python/unittest/test_quantized_ops.py | 117 +++++++++++++++
 10 files changed, 728 insertions(+)
 create mode 100644 include/tvm/relay/attrs/nn_quantize.h
 create mode 100644 include/tvm/relay/quantize_util.h
 create mode 100644 python/tvm/relay/op/nn/_make_quantize.py
 create mode 100644 python/tvm/relay/op/nn/_quantize.py
 create mode 100644 src/relay/op/nn/dequantize.cc
 create mode 100644 src/relay/op/nn/quantize_op.cc
 create mode 100644 src/relay/pass/quantize_rewrite.cc
 create mode 100644 tests/python/unittest/test_quantized_ops.py

diff --git a/include/tvm/relay/attrs/nn_quantize.h b/include/tvm/relay/attrs/nn_quantize.h
new file mode 100644
index 000000000000..af420dca8139
--- /dev/null
+++ b/include/tvm/relay/attrs/nn_quantize.h
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/relay/attrs/nn_quantize.h
+ * \brief Auxiliary attributes for quantize operators.
+ */
+#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_
+#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+struct QuantizeAttrs : public tvm::AttrsNode<QuantizeAttrs> {
+  DataType input_dtype;
+  int32_t output_zero_point;
+  double output_scale;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(QuantizeAttrs, "relay.attrs.QuantizeAttrs") {
+    TVM_ATTR_FIELD(out_dtype)
+      .describe("Output data type, can be one of [int8 or uint8].");
+
+    TVM_ATTR_FIELD(input_dtype)
+      .describe("Input data type, can be one of [float32, int8, uint8].");
+
+    TVM_ATTR_FIELD(output_zero_point)
+      .describe("The zero_point for the activation of this op.");
+
+    TVM_ATTR_FIELD(output_scale)
+      .describe("The scale for the activation of this op.");
+  }
+};
+
+struct DequantizeAttrs : public tvm::AttrsNode<DequantizeAttrs> {
+  int32_t input_zero_point;
+  double input_scale;
+
+  TVM_DECLARE_ATTRS(DequantizeAttrs, "relay.attrs.DequantizeAttrs") {
+    TVM_ATTR_FIELD(input_zero_point)
+      .describe("The zero_point for the input tensor of this op.");
+
+    TVM_ATTR_FIELD(input_scale)
+      .describe("The scale for the input tensor of this op.");
+  }
+};
+
+// TODO(anijain2305) - Copy of QuantizedConv2DAttrs. Should we inherit?
+/*! \brief Attribute for quantized conv2d operator */
+struct QuantizedConv2DAttrs : public tvm::AttrsNode<QuantizedConv2DAttrs> {
+  // Traditional conv2d attributes.
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> dilation;
+  int groups;
+  IndexExpr channels;
+  Array<IndexExpr> kernel_size;
+  std::string data_layout;
+  std::string kernel_layout;
+  std::string out_layout;
+  DataType out_dtype;
+
+  // Quantization related attributes.
+  int32_t input_zero_point;
+  int32_t kernel_zero_point;
+  int32_t output_zero_point;
+  double input_scale;
+  double kernel_scale;
+  double output_scale;
+  bool use_int_compute_for_requantize;
+  std::string rounding;
+
+  TVM_DECLARE_ATTRS(QuantizedConv2DAttrs, "relay.attrs.QuantizedConv2DAttrs") {
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                " on both sides for padding number of points.");
+    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the dilation rate to use for dilated convolution.");
+    TVM_ATTR_FIELD(groups).set_default(1)
+      .describe("Controls the connections between inputs and outputs."
+                " At groups=1, all inputs are convolved to all outputs."
+                " At groups=2, the operation becomes equivalent to having two convolution"
+                " layers side by side, each seeing half the input channels, and producing"
+                " half the output channels, and both subsequently concatenated.");
+    TVM_ATTR_FIELD(channels)
+      .describe("The number of output channels in the convolution."
+                " If it is not set, inferred by shape of the weight.")
+      .set_default(NullValue<IndexExpr>());
+    TVM_ATTR_FIELD(kernel_size)
+      .describe("Specifies the dimensions of the convolution window.")
+      .set_default(NullValue<Array<IndexExpr> >());
+    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
+      .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                " 'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                " dimensions respectively. Convolution is applied on the 'H' and"
+                " 'W' dimensions.");
+    TVM_ATTR_FIELD(kernel_layout).set_default("OIHW")
+      .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                " 'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                " dimensions respectively.");
+    TVM_ATTR_FIELD(out_layout).set_default("")
+      .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                " 'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                " dimensions respectively. Default to be same as input layout.");
+
+    // use 0 bits to indicate none.
+    TVM_ATTR_FIELD(out_dtype)
+      .set_default(NullValue<DataType>())
+      .describe("Output data type, set to explicit type under mixed precision setting.");
+
+    TVM_ATTR_FIELD(input_zero_point)
+      .describe("The zero point of the input tensor.");
+    TVM_ATTR_FIELD(kernel_zero_point)
+      .describe("The zero point of the kernel tensor.");
+    TVM_ATTR_FIELD(output_zero_point)
+      .describe("The zero point of the output tensor.");
+    TVM_ATTR_FIELD(input_scale)
+      .describe("The scale of the input tensor.");
+    TVM_ATTR_FIELD(kernel_scale)
+      .describe("The scale of the kernel tensor.");
+    TVM_ATTR_FIELD(output_scale)
+      .describe("The scale of the output tensor.");
+    TVM_ATTR_FIELD(use_int_compute_for_requantize).set_default(false)
+      .describe("When true, integer computation is used to handle the output scale.");
+    TVM_ATTR_FIELD(rounding).set_default("ceil")
+      .describe("The rounding that has to be used for handling scales.");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_NN_QUANTIZE_H_
\ No newline at end of file
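The two structs above carry the affine quantization parameters (a float scale and an integer zero point) that every op in this patch consumes. For reference, a minimal NumPy sketch of the forward and inverse mappings these attributes describe; the concrete scale/zero-point values below are illustrative only, not taken from the patch:

```python
import numpy as np

# Affine quantization: q = clamp(round(x / scale) + zero_point, qmin, qmax).
def quantize_np(x, scale, zero_point, dtype=np.uint8):
    info = np.iinfo(dtype)
    q = np.round(x / scale) + zero_point
    return np.clip(q, info.min, info.max).astype(dtype)

# Inverse mapping: x_hat = (q - zero_point) * scale.
def dequantize_np(q, scale, zero_point):
    return (q.astype(np.int32) - zero_point).astype(np.float32) * scale

x = np.array([-63.5, 0.0, 64.0], dtype=np.float32)
q = quantize_np(x, scale=0.5, zero_point=127)        # -> [0, 127, 255]
print(dequantize_np(q, scale=0.5, zero_point=127))   # round-trips to the inputs
```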
+ "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width" + "dimensions respectively."); + TVM_ATTR_FIELD(out_layout).set_default("") + .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Default to be same as input layout."); + + // use 0 bits to indicate none. + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); + + + TVM_ATTR_FIELD(input_zero_point) + .describe("The zero point of the input tensor."); + TVM_ATTR_FIELD(kernel_zero_point) + .describe("The zero point of the kernel tensor."); + TVM_ATTR_FIELD(output_zero_point) + .describe("The zero point of the output tensor."); + TVM_ATTR_FIELD(input_scale) + .describe("The scale of the input tensor."); + TVM_ATTR_FIELD(kernel_scale) + .describe("The scale of the kernel tensor."); + TVM_ATTR_FIELD(output_scale) + .describe("The scale of the output tensor."); + TVM_ATTR_FIELD(use_int_compute_for_requantize).set_default(false) + .describe("When true, the integer computation is used to handle output scale"); + TVM_ATTR_FIELD(rounding).set_default("ceil") + .describe("The rounding that has to be used for handling scales."); + + } +}; + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ \ No newline at end of file diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h new file mode 100644 index 000000000000..85b7e55b4461 --- /dev/null +++ b/include/tvm/relay/quantize_util.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file nnvm/compiler/quantize_util.h + * \brief Utility methods needs for quantized ops that can be shared + */ + +#ifndef TVM_QUANTIZE_UTIL_H +#define TVM_QUANTIZE_UTIL_H + +#include +#include "./base.h" + +namespace tvm { +namespace relay { + +inline bool is_Int8(const DataType& dtype) { + return dtype == Int(8); +} + +inline bool is_UInt8(const DataType& dtype) { + return dtype == UInt(8); +} + +inline bool is_Float32(const DataType& dtype) { + return dtype == Float(32); +} + +inline bool is_qauntized_type(const DataType& dtype) { + return is_Int8(dtype) || is_UInt8(dtype); +} + +enum class QuantizeOpType : uint8_t { + Quantize_Requantize, + Dequantize +}; + +inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) { + switch(op_type) { + case QuantizeOpType::Quantize_Requantize: + return is_Float32(in_dtype) || is_qauntized_type(in_dtype); + case QuantizeOpType ::Dequantize: + return is_qauntized_type(in_dtype); + default: + return false; + } +} + +inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) { + switch(op_type) { + case QuantizeOpType::Quantize_Requantize: + return is_qauntized_type(in_dtype); + case QuantizeOpType::Dequantize: + return is_Float32(in_dtype); + default: + return false; + } +} + +inline const int32_t get_qmin(const DataType& dtype) { + CHECK(is_qauntized_type(dtype)) << "Expected quantized data type [int8, uint8] but was " << dtype; + if(is_Int8(dtype)) { + return std::numeric_limits::min(); + } else { + return std::numeric_limits::min(); + } +} + + +inline const int32_t get_qmax(const DataType& dtype) { + CHECK(is_qauntized_type(dtype)) << "Expected quantized data type [int8, uint8] but was " << dtype; + if(dtype == Int(8)) { + return std::numeric_limits::max(); + } else { + return std::numeric_limits::max(); + } +} + +} // namespace relay +} // namespace tvm +#endif //TVM_QUANTIZE_UTIL_H \ No newline at end of file diff --git a/python/tvm/relay/op/nn/__init__.py b/python/tvm/relay/op/nn/__init__.py index ebabbbcd9d3a..25ab07b48d67 100644 --- a/python/tvm/relay/op/nn/__init__.py +++ b/python/tvm/relay/op/nn/__init__.py @@ -19,3 +19,4 @@ from __future__ import absolute_import as _abs from .nn import * from . import _nn +from . import _quantize diff --git a/python/tvm/relay/op/nn/_make_quantize.py b/python/tvm/relay/op/nn/_make_quantize.py new file mode 100644 index 000000000000..2480c99068c4 --- /dev/null +++ b/python/tvm/relay/op/nn/_make_quantize.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
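The `get_qmin`/`get_qmax` helpers above reduce to the numeric limits of the quantized storage type. A NumPy equivalent for reference (the Python helper names here are hypothetical, mirroring the C++ ones):

```python
import numpy as np

def get_qmin(dtype):
    # Smallest representable value of the quantized type: int8 -> -128, uint8 -> 0.
    assert dtype in ('int8', 'uint8'), "expected a quantized type [int8, uint8]"
    return int(np.iinfo(dtype).min)

def get_qmax(dtype):
    # Largest representable value of the quantized type: int8 -> 127, uint8 -> 255.
    assert dtype in ('int8', 'uint8'), "expected a quantized type [int8, uint8]"
    return int(np.iinfo(dtype).max)
```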
+"""Constructor APIs""" +from ...._ffi.function import _init_api + +_init_api("relay.op.nn._quantize._make", __name__) diff --git a/python/tvm/relay/op/nn/_quantize.py b/python/tvm/relay/op/nn/_quantize.py new file mode 100644 index 000000000000..767628dee21e --- /dev/null +++ b/python/tvm/relay/op/nn/_quantize.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=invalid-name, too-many-lines +"""Neural network operations.""" +from __future__ import absolute_import as _abs +from . import _make_quantize + +def quantize(input_data, output_zero_point, output_scale, out_dtype='int8'): + r""" Quantize op + + This operator takes floating point 32 or quantized int8 and unit8 as input and produces + quantized int8 or unit8 as output. The output shape is the same as input shape. The input + tensor can be of any shape. + + ..math:: + \mbox{out}[x] = + \mbox{clamp(round(input_tensor/output_scale) + output_zero_point); out_dtype::min, out_dtype::max} + + Parameters + ---------- + input_data : tvm.relay.Expr + The input tensor to be quantized. Can be of type [float32, int8, uint8]. + output_zero_point : + The output zero_point. + output_scale: + The output scale. + input_dtype: + The data type of the input tensor. Can be [int8, uint8, float32] + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make_quantize.quantize(input_data, output_zero_point, output_scale, out_dtype) + + +def dequantize(input_data, input_zero_point, input_scale): + r""" Dequantize op + + This operator takes quantized int8 and unit8 as input and produces + dequantized float32 as output. The output shape is the same as input shape. The input + tensor can be of any shape. + + Parameters + ---------- + input_data : tvm.relay.Expr + The input tensor to be quantized. Can be of type [float32, int8, uint8]. + input_zero_point : + The output zero_point. + input_scale: + The output scale. + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make_quantize.dequantize(input_data, input_zero_point, input_scale) + + diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py index 45bb62e66853..ddb99efe057c 100644 --- a/python/tvm/relay/quantize/__init__.py +++ b/python/tvm/relay/quantize/__init__.py @@ -20,3 +20,4 @@ from .quantize import * from ._annotate import register_annotate_function +from ._quantize import * diff --git a/src/relay/op/nn/dequantize.cc b/src/relay/op/nn/dequantize.cc new file mode 100644 index 000000000000..8fd25efd6bad --- /dev/null +++ b/src/relay/op/nn/dequantize.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
diff --git a/src/relay/op/nn/dequantize.cc b/src/relay/op/nn/dequantize.cc
new file mode 100644
index 000000000000..8fd25efd6bad
--- /dev/null
+++ b/src/relay/op/nn/dequantize.cc
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file dequantize.cc
+ * \brief Dequantize operator
+ */
+
+#include <tvm/relay/base.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/attrs/nn_quantize.h>
+#include <tvm/relay/quantize_util.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(DequantizeAttrs);
+
+bool DequantizeRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto input_dtype = data->dtype;
+  CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Dequantize, input_dtype))
+    << "Input type should be one of the quantized types [uint8, int8] but was " << input_dtype;
+  const Array<IndexExpr> oshape = data->shape;
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, Float(32)));
+  return true;
+}
+
+Expr MakeDequantize(Expr data,
+                    int32_t input_zero_point,
+                    double input_scale) {
+  auto attrs = make_node<DequantizeAttrs>();
+  attrs->input_scale = input_scale;
+  attrs->input_zero_point = input_zero_point;
+  static const Op& op = Op::Get("nn_quantized.dequantize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("nn_quantized.dequantize")
+.describe(R"code(Dequantizes the input and produces float32 output.
+
+The input is always quantized (int8, uint8) and is converted to float32 given
+the input scale and zero point.
+
+- **data**: Quantized tensor of any shape to dequantize.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.DequantizeAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The tensor to dequantize.")
+.set_support_level(10)
+.add_type_rel("Dequantize", DequantizeRel);
+
+TVM_REGISTER_API("relay.op.nn._quantize._make.dequantize")
+.set_body_typed<Expr(Expr, int32_t, double)>(MakeDequantize);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/quantize_op.cc b/src/relay/op/nn/quantize_op.cc
new file mode 100644
index 000000000000..dd79895d1601
--- /dev/null
+++ b/src/relay/op/nn/quantize_op.cc
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file quantize_op.cc
+ * \brief Quantize and requantize operator
+ */
+
+#include <tvm/relay/base.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/attrs/nn_quantize.h>
+#include <tvm/relay/quantize_util.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(QuantizeAttrs);
+
+bool QuantizeRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto input_dtype = data->dtype;
+  CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Quantize_Requantize, input_dtype))
+    << "Input type should be one of [float32, uint8, int8] but was " << input_dtype;
+  const auto* param = attrs.as<QuantizeAttrs>();
+  const Array<IndexExpr> oshape = data->shape;
+  const DataType out_dtype = param->out_dtype;
+  CHECK(is_valid_quantized_op_output_type(QuantizeOpType::Quantize_Requantize, out_dtype))
+    << "Output type should be one of [int8, uint8] but was " << out_dtype;
+  // assign output type. Note: types[0] cannot simply be reused here because
+  // the output dtype can differ from the input dtype.
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, out_dtype));
+  return true;
+}
+
+Expr MakeQuantize(Expr data,
+                  int32_t output_zero_point,
+                  double output_scale,
+                  DataType out_dtype) {
+  auto attrs = make_node<QuantizeAttrs>();
+  attrs->output_scale = output_scale;
+  attrs->output_zero_point = output_zero_point;
+  attrs->out_dtype = std::move(out_dtype);
+  static const Op& op = Op::Get("nn_quantized.quantize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("nn_quantized.quantize")
+.describe(R"code(Quantizes the input and produces quantized output.
+
+The input can be either float or quantized (int8, uint8). If the input is
+float, this op takes a scale and a zero point and quantizes the float values
+to quantized output in int8 or uint8 format. If the input is already
+quantized, the op requantizes the input (of a given type, scale and zero
+point) to an output of the same or a different type, with the same or a
+different scale and zero point.
+
+- **data**: Tensor of any shape to quantize. The input data can be of floating
+  point or quantized type.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.QuantizeAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The tensor to quantize.")
+.set_support_level(10)
+.add_type_rel("Quantize", QuantizeRel);
+
+TVM_REGISTER_API("relay.op.nn._quantize._make.quantize")
+.set_body_typed<Expr(Expr, int32_t, double, DataType)>(MakeQuantize);
+
+}  // namespace relay
+}  // namespace tvm
\ No newline at end of file
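The requantize path described above is still a TODO in the lowering pass that follows, so nothing in this patch pins down its semantics yet. As an assumption about the intended arithmetic (rescale from one (scale, zero_point) pair to another), a NumPy sketch:

```python
import numpy as np

def requantize_np(q_in, in_scale, in_zp, out_scale, out_zp, out_dtype=np.uint8):
    # Assumed semantics, not implemented in this patch: map q_in from
    # (in_scale, in_zp) to (out_scale, out_zp) and clamp to the output range.
    info = np.iinfo(out_dtype)
    real = (q_in.astype(np.int32) - in_zp) * in_scale   # back to real values
    q_out = np.round(real / out_scale) + out_zp         # quantize with new params
    return np.clip(q_out, info.min, info.max).astype(out_dtype)
```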
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
new file mode 100644
index 000000000000..7e8491c0da7e
--- /dev/null
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file quantize_rewrite.cc
+ * \brief Lower quantized ops to existing Relay ops.
+ */
+
+#include <tvm/relay/pass.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/attrs/nn_quantize.h>
+#include <tvm/relay/quantize_util.h>
+#include "pattern_util.h"
+
+namespace tvm {
+namespace relay {
+
+Expr QuantizeForwardRewrite(const Call& ref_call,
+                            const Array<Expr>& new_args,
+                            const NodeRef& ctx) {
+  CHECK_EQ(new_args.size(), 1);
+  Expr data = new_args[0];
+  const auto* attrs = ref_call->attrs.as<QuantizeAttrs>();
+  const auto out_dtype = attrs->out_dtype;
+  const auto* new_tesnor = data.operator->()->checked_type().as<TensorTypeNode>();
+  CHECK(new_tesnor) << "Expected TensorTypeNode but was " << data.operator->()->checked_type();
+  const auto input_dtype = new_tesnor->dtype;
+  if (is_Float32(input_dtype)) {  // this is quantization, float32 -> [int8, uint8]
+    const auto output_zero_point = MakeConstantScalar(Int(32), attrs->output_zero_point);
+    const auto scale = MakeConstantScalar(Float(32), attrs->output_scale);
+    const int32_t min_val = get_qmin(out_dtype);
+    const int32_t max_val = get_qmax(out_dtype);
+    auto scale_data = Cast(Round(Divide(data, scale)), Int(32));
+    // equivalent to std::min(std::max(unclamped, min_val), max_val);
+    // clamp in int32 before the final cast so out-of-range values cannot overflow
+    auto unclamped = Add(scale_data, output_zero_point);
+    auto clamped_output = Cast(Clip(unclamped, min_val, max_val), out_dtype);
+    return clamped_output;
+  } else {
+    // this is requantization, [int8, uint8] -> [int8, uint8] with possibly
+    // different scales. TODO: Implement requantization; see how to take the
+    // input scale and zero point as inputs.
+    return Expr();  // to hide the warning.
+  }
+}
+
+RELAY_REGISTER_OP("nn_quantized.quantize")
+.set_attr<FForwardRewrite>("FQuantizeForwardRewrite", QuantizeForwardRewrite);
+
+Expr DequantizeForwardRewrite(const Call& ref_call,
+                              const Array<Expr>& new_args,
+                              const NodeRef& ctx) {
+  CHECK_EQ(new_args.size(), 1);
+  Expr data = new_args[0];
+  const auto* attrs = ref_call->attrs.as<DequantizeAttrs>();
+  const auto* new_tesnor = data.operator->()->checked_type().as<TensorTypeNode>();
+  CHECK(new_tesnor) << "Expected TensorTypeNode but was " << data.operator->()->checked_type();
+  const auto input_zero_point = MakeConstantScalar(Int(32), attrs->input_zero_point);
+  const auto input_scale = MakeConstantScalar(Float(32), attrs->input_scale);
+  auto shift = Subtract(Cast(data, Int(32)), input_zero_point);
+  auto scale = Multiply(Cast(shift, Float(32)), input_scale);
+  return scale;
+}
+
+RELAY_REGISTER_OP("nn_quantized.dequantize")
+.set_attr<FForwardRewrite>("FQuantizeForwardRewrite", DequantizeForwardRewrite);
+
+
+TVM_REGISTER_API("relay._quantize.quantize_rewrite")
+.set_body_typed<Expr(Expr)>([](const Expr& e) {
+  Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr);
+  return ret;
+});
+
+
+}  // namespace relay
+}  // namespace tvm
\ No newline at end of file
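As a sanity check on the quantize lowering above (divide by scale, round, add the zero point, clamp, then cast), tracing one element of the float32 → uint8 test case that follows:

```python
# x = -63.5 with output_scale = 0.5, output_zero_point = 127 (first test below).
scale_data = round(-63.5 / 0.5)          # -127, Round(Divide(data, scale))
unclamped = scale_data + 127             # 0,    Add(scale_data, output_zero_point)
clamped = min(max(unclamped, 0), 255)    # 0,    Clip to [qmin, qmax] of uint8
```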
diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py
new file mode 100644
index 000000000000..a8300d27136a
--- /dev/null
+++ b/tests/python/unittest/test_quantized_ops.py
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import relay
+from tvm.contrib import graph_runtime
+import numpy as np
+
+
+def test_quantize_op():
+
+    def quantize_test_driver(in_dtype, quant_args, out_dtype, in_data, verify_output_data):
+        shape = in_data.shape
+        input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
+        output_zero_point = quant_args['out_zero_point']
+        output_scale = quant_args['out_scale']
+        quantized_output = relay.op.nn._quantize.quantize(input_data,
+                                                          output_zero_point=output_zero_point,
+                                                          output_scale=output_scale,
+                                                          out_dtype=out_dtype)
+        func = relay.Function(relay.ir_pass.free_vars(quantized_output), quantized_output)
+        func = relay.ir_pass.infer_type(func)
+        func = relay.quantize.quantize_rewrite(func)
+        func = relay.ir_pass.infer_type(func)
+        graph, lib, params = relay.build(func, "llvm", params=None)
+        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod.set_input(input_data=in_data)
+        mod.run()
+        res = mod.get_output(0).asnumpy()
+        np.testing.assert_equal(res, verify_output_data)
+        assert res.dtype == out_dtype
+
+    def test_float32_to_uint8():
+        data = np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64]) \
+            .astype('float32') \
+            .reshape((2, 5))
+        output = np.array([0, 1, 2, 3, 4, 251, 252, 253, 254, 255]) \
+            .astype('uint8') \
+            .reshape((2, 5))
+        quant_args = {"out_zero_point": 127, "out_scale": 0.5}
+        quantize_test_driver(in_dtype='float32', quant_args=quant_args, out_dtype='uint8',
+                             in_data=data, verify_output_data=output)
+
+    def test_float32_to_int8():
+        data = np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64]) \
+            .astype('float32') \
+            .reshape((2, 5))
+        output = np.array([-128, -127, -126, -125, -124, 123, 124, 125, 126, 127]) \
+            .astype('int8') \
+            .reshape((2, 5))
+        quant_args = {"out_zero_point": -1, "out_scale": 0.5}
+        quantize_test_driver(in_dtype='float32', quant_args=quant_args, out_dtype='int8',
+                             in_data=data, verify_output_data=output)
+
+    test_float32_to_uint8()
+    test_float32_to_int8()
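The dequantize tests below invert the vectors above using the lowering's `(q - zero_point) * scale` arithmetic; for the first uint8 element:

```python
# q = 0 with input_zero_point = 127, input_scale = 0.5 (first test below).
(0 - 127) * 0.5   # -> -63.5, the first element of the expected float32 output
```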
+
+
+def test_dequantize_op():
+
+    def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data):
+        shape = in_data.shape
+        input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
+        input_zero_point = quant_args['in_zero_point']
+        input_scale = quant_args['in_scale']
+        dequantized_output = relay.op.nn._quantize.dequantize(input_data,
+                                                              input_zero_point=input_zero_point,
+                                                              input_scale=input_scale)
+        func = relay.Function(relay.ir_pass.free_vars(dequantized_output), dequantized_output)
+        func = relay.ir_pass.infer_type(func)
+        func = relay.quantize.quantize_rewrite(func)
+        func = relay.ir_pass.infer_type(func)
+        graph, lib, params = relay.build(func, "llvm", params=None)
+        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod.set_input(input_data=in_data)
+        mod.run()
+        res = mod.get_output(0).asnumpy()
+        np.testing.assert_allclose(res, verify_output_data)
+        assert res.dtype == np.float32
+
+    def test_uint8_to_float32():
+        data = np.array([0, 1, 2, 3, 4, 251, 252, 253, 254, 255]) \
+            .astype('uint8') \
+            .reshape((2, 5))
+        output = np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64]) \
+            .astype('float32') \
+            .reshape((2, 5))
+        quant_args = {"in_zero_point": 127, "in_scale": 0.5}
+        dequantize_test_driver(in_dtype='uint8', quant_args=quant_args, in_data=data,
+                               verify_output_data=output)
+
+    def test_int8_to_float32():
+        data = np.array([-128, -127, -126, -125, -124, 123, 124, 125, 126, 127]) \
+            .astype('int8') \
+            .reshape((2, 5))
+        output = np.array([-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64]) \
+            .astype('float32') \
+            .reshape((2, 5))
+        quant_args = {"in_zero_point": -1, "in_scale": 0.5}
+        dequantize_test_driver(in_dtype='int8', quant_args=quant_args, in_data=data,
+                               verify_output_data=output)
+
+    test_uint8_to_float32()
+    test_int8_to_float32()
+
+
+if __name__ == "__main__":
+    test_quantize_op()
+    test_dequantize_op()

From f9b043dfadc5d144d977e1092d6fe003983d28bc Mon Sep 17 00:00:00 2001
From: "shoubhikbhatti@gmail.com" <shoubhikbhatti@gmail.com>
Date: Fri, 28 Jun 2019 10:01:53 -0700
Subject: [PATCH 2/4] [Relay] [Quantization] WIP - Fixing typos and removing
 redundant code.

[Relay] [Quantization] WIP - Removing redundant code.
---
 include/tvm/relay/attrs/nn_quantize.h | 89 ---------------------------
 src/relay/pass/quantize_rewrite.cc    | 10 +--
 2 files changed, 5 insertions(+), 94 deletions(-)

diff --git a/include/tvm/relay/attrs/nn_quantize.h b/include/tvm/relay/attrs/nn_quantize.h
index af420dca8139..8f86d4019aaa 100644
--- a/include/tvm/relay/attrs/nn_quantize.h
+++ b/include/tvm/relay/attrs/nn_quantize.h
@@ -31,7 +31,6 @@ namespace tvm {
 namespace relay {
 
 struct QuantizeAttrs : public tvm::AttrsNode<QuantizeAttrs> {
-  DataType input_dtype;
   int32_t output_zero_point;
   double output_scale;
   DataType out_dtype;
@@ -40,9 +39,6 @@ struct QuantizeAttrs : public tvm::AttrsNode<QuantizeAttrs> {
     TVM_ATTR_FIELD(out_dtype)
       .describe("Output data type, can be one of [int8 or uint8].");
 
-    TVM_ATTR_FIELD(input_dtype)
-      .describe("Input data type, can be one of [float32, int8, uint8].");
-
     TVM_ATTR_FIELD(output_zero_point)
       .describe("The zero_point for the activation of this op.");
 
@@ -65,91 +61,6 @@ struct DequantizeAttrs : public tvm::AttrsNode<DequantizeAttrs> {
   }
 };
 
-// TODO(anijain2305) - Copy of QuantizedConv2DAttrs. Should we inherit?
-/*! \brief Attribute for quantized conv2d operator */
-struct QuantizedConv2DAttrs : public tvm::AttrsNode<QuantizedConv2DAttrs> {
-  // Traditional conv2d attributes.
-  Array<IndexExpr> strides;
-  Array<IndexExpr> padding;
-  Array<IndexExpr> dilation;
-  int groups;
-  IndexExpr channels;
-  Array<IndexExpr> kernel_size;
-  std::string data_layout;
-  std::string kernel_layout;
-  std::string out_layout;
-  DataType out_dtype;
-
-  // Quantization related attributes.
-  int32_t input_zero_point;
-  int32_t kernel_zero_point;
-  int32_t output_zero_point;
-  double input_scale;
-  double kernel_scale;
-  double output_scale;
-  bool use_int_compute_for_requantize;
-  std::string rounding;
-
-  TVM_DECLARE_ATTRS(QuantizedConv2DAttrs, "relay.attrs.QuantizedConv2DAttrs") {
-    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
-      .describe("Specifies the strides of the convolution.");
-    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
-      .describe("If padding is non-zero, then the input is implicitly zero-padded"
-                " on both sides for padding number of points.");
-    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
-      .describe("Specifies the dilation rate to use for dilated convolution.");
-    TVM_ATTR_FIELD(groups).set_default(1)
-      .describe("Controls the connections between inputs and outputs."
-                " At groups=1, all inputs are convolved to all outputs."
-                " At groups=2, the operation becomes equivalent to having two convolution"
-                " layers side by side, each seeing half the input channels, and producing"
-                " half the output channels, and both subsequently concatenated.");
-    TVM_ATTR_FIELD(channels)
-      .describe("The number of output channels in the convolution."
-                " If it is not set, inferred by shape of the weight.")
-      .set_default(NullValue<IndexExpr>());
-    TVM_ATTR_FIELD(kernel_size)
-      .describe("Specifies the dimensions of the convolution window.")
-      .set_default(NullValue<Array<IndexExpr> >());
-    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
-      .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
-                " 'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
-                " dimensions respectively. Convolution is applied on the 'H' and"
-                " 'W' dimensions.");
-    TVM_ATTR_FIELD(kernel_layout).set_default("OIHW")
-      .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
-                " 'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
-                " dimensions respectively.");
-    TVM_ATTR_FIELD(out_layout).set_default("")
-      .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
-                " 'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
-                " dimensions respectively. Default to be same as input layout.");
-
-    // use 0 bits to indicate none.
-    TVM_ATTR_FIELD(out_dtype)
-      .set_default(NullValue<DataType>())
-      .describe("Output data type, set to explicit type under mixed precision setting.");
-
-    TVM_ATTR_FIELD(input_zero_point)
-      .describe("The zero point of the input tensor.");
-    TVM_ATTR_FIELD(kernel_zero_point)
-      .describe("The zero point of the kernel tensor.");
-    TVM_ATTR_FIELD(output_zero_point)
-      .describe("The zero point of the output tensor.");
-    TVM_ATTR_FIELD(input_scale)
-      .describe("The scale of the input tensor.");
-    TVM_ATTR_FIELD(kernel_scale)
-      .describe("The scale of the kernel tensor.");
-    TVM_ATTR_FIELD(output_scale)
-      .describe("The scale of the output tensor.");
-    TVM_ATTR_FIELD(use_int_compute_for_requantize).set_default(false)
-      .describe("When true, integer computation is used to handle the output scale.");
-    TVM_ATTR_FIELD(rounding).set_default("ceil")
-      .describe("The rounding that has to be used for handling scales.");
-  }
-};
-
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
index 7e8491c0da7e..f93658bee4c1 100644
--- a/src/relay/pass/quantize_rewrite.cc
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -39,9 +39,9 @@ Expr QuantizeForwardRewrite(const Call& ref_call,
   Expr data = new_args[0];
   const auto* attrs = ref_call->attrs.as<QuantizeAttrs>();
   const auto out_dtype = attrs->out_dtype;
-  const auto* new_tesnor = data.operator->()->checked_type().as<TensorTypeNode>();
-  CHECK(new_tesnor) << "Expected TensorTypeNode but was " << data.operator->()->checked_type();
-  const auto input_dtype = new_tesnor->dtype;
+  const auto* new_tensor = data.operator->()->checked_type().as<TensorTypeNode>();
+  CHECK(new_tensor) << "Expected TensorTypeNode but was " << data.operator->()->checked_type();
+  const auto input_dtype = new_tensor->dtype;
   if (is_Float32(input_dtype)) {  // this is quantization, float32 -> [int8, uint8]
     const auto output_zero_point = MakeConstantScalar(Int(32), attrs->output_zero_point);
     const auto scale = MakeConstantScalar(Float(32), attrs->output_scale);
@@ -69,8 +69,8 @@ Expr DequantizeForwardRewrite(const Call& ref_call,
   CHECK_EQ(new_args.size(), 1);
   Expr data = new_args[0];
   const auto* attrs = ref_call->attrs.as<DequantizeAttrs>();
-  const auto* new_tesnor = data.operator->()->checked_type().as<TensorTypeNode>();
-  CHECK(new_tesnor) << "Expected TensorTypeNode but was " << data.operator->()->checked_type();
+  const auto* new_tensor = data.operator->()->checked_type().as<TensorTypeNode>();
+  CHECK(new_tensor) << "Expected TensorTypeNode but was " << data.operator->()->checked_type();
   const auto input_zero_point = MakeConstantScalar(Int(32), attrs->input_zero_point);
   const auto input_scale = MakeConstantScalar(Float(32), attrs->input_scale);
   auto shift = Subtract(Cast(data, Int(32)), input_zero_point);

From 0b00e96c8ed01e046239f70a4e75f007e9b58de4 Mon Sep 17 00:00:00 2001
From: "shoubhikbhatti@gmail.com" <shoubhikbhatti@gmail.com>
Date: Sun, 30 Jun 2019 20:47:06 -0700
Subject: [PATCH 3/4] merge from upstream

---
 3rdparty/HalideIR | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
index c4e5bc77bd7b..32057b53eee8 160000
--- a/3rdparty/HalideIR
+++ b/3rdparty/HalideIR
@@ -1 +1 @@
-Subproject commit c4e5bc77bd7bca05e45664b35c6ce88246c43b1b
+Subproject commit 32057b53eee870d73c6c21dc820d6546b4d9a13f

From 6e1f5af0faa7e9ab8827f0eea760a6f1190a1c65 Mon Sep 17 00:00:00 2001
From: "shoubhikbhatti@gmail.com" <shoubhikbhatti@gmail.com>
Date: Sun, 30 Jun 2019 21:13:17 -0700
Subject: [PATCH 4/4] merge from upstream/mainline

---
 3rdparty/HalideIR | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
index c4e5bc77bd7b..32057b53eee8 160000
--- a/3rdparty/HalideIR
+++ b/3rdparty/HalideIR
@@ -1 +1 @@
-Subproject commit c4e5bc77bd7bca05e45664b35c6ce88246c43b1b
+Subproject commit 32057b53eee870d73c6c21dc820d6546b4d9a13f