From 8d9e317053b474dd12b2c76a3abf8c5a4a9fbd2f Mon Sep 17 00:00:00 2001
From: "shoubhikbhatti@gmail.com"
Date: Mon, 8 Jul 2019 12:12:40 -0700
Subject: [PATCH 01/51] [Relay] [Quantization] WIP - Common files for the quantization work.

---
 include/tvm/relay/attrs/qnn.h         |  37 +++++++
 include/tvm/relay/quantize_util.h     | 139 ++++++++++++++++++++++++++
 python/tvm/relay/op/__init__.py       |   1 +
 python/tvm/relay/op/qnn/__init__.py   |  20 ++++
 python/tvm/relay/op/qnn/_make.py      |  20 ++++
 python/tvm/relay/op/qnn/qnn.py        |  21 ++++
 python/tvm/relay/quantize/__init__.py |   1 +
 python/tvm/relay/quantize/rewrite.py  |  38 +++++++
 src/relay/pass/pattern_util.h         |  20 ++++
 src/relay/pass/quantize_rewrite.cc    |  38 +++++++
 10 files changed, 335 insertions(+)
 create mode 100644 include/tvm/relay/attrs/qnn.h
 create mode 100644 include/tvm/relay/quantize_util.h
 create mode 100644 python/tvm/relay/op/qnn/__init__.py
 create mode 100644 python/tvm/relay/op/qnn/_make.py
 create mode 100644 python/tvm/relay/op/qnn/qnn.py
 create mode 100644 python/tvm/relay/quantize/rewrite.py
 create mode 100644 src/relay/pass/quantize_rewrite.cc

diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h
new file mode 100644
index 000000000000..c45a33c786f7
--- /dev/null
+++ b/include/tvm/relay/attrs/qnn.h
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/relay/attrs/nn.h
+ * \brief Auxiliary attributes for nn operators.
+ */
+#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_
+#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_
+
+#include
+#include
+
+namespace tvm {
+namespace relay {
+
+
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_NN_QUANTIZE_H_
diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h
new file mode 100644
index 000000000000..bb054fb8fb65
--- /dev/null
+++ b/include/tvm/relay/quantize_util.h
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file nnvm/compiler/quantize_util.h
+ * \brief Utility methods needs for quantized ops that can be shared
+ */
+
+#ifndef TVM_QUANTIZE_UTIL_H
+#define TVM_QUANTIZE_UTIL_H
+
+#include
+#include "./base.h"
+
+namespace tvm {
+namespace relay {
+
+inline bool is_Int8(const DataType& dtype) {
+  return dtype == Int(8);
+}
+
+inline bool is_UInt8(const DataType& dtype) {
+  return dtype == UInt(8);
+}
+
+
+inline bool is_Int16(const DataType& dtype) {
+  return dtype == Int(16);
+}
+
+inline bool is_UInt16(const DataType& dtype) {
+  return dtype == UInt(16);
+}
+
+inline bool is_Int32(const DataType& dtype) {
+  return dtype == Int(32);
+}
+
+inline bool is_UInt32(const DataType& dtype) {
+  return dtype == UInt(32);
+}
+
+
+
+inline bool is_Float32(const DataType& dtype) {
+  return dtype == Float(32);
+}
+
+inline bool is_quantized_type(const DataType& dtype) {
+  return is_Int8(dtype) || is_UInt8(dtype)
+      || is_Int16(dtype) || is_UInt16(dtype);
+}
+
+enum class QuantizeOpType : uint8_t {
+  Quantize_Requantize,
+  Dequantize,
+  Requantize
+};
+
+inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) {
+  switch(op_type) {
+    case QuantizeOpType::Quantize_Requantize:
+      return is_Float32(in_dtype) || is_quantized_type(in_dtype);
+    case QuantizeOpType ::Dequantize:
+      return is_quantized_type(in_dtype);
+    case QuantizeOpType ::Requantize:
+      return is_Int16(in_dtype) || is_Int32(in_dtype);
+    default:
+      return false;
+  }
+}
+
+inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) {
+  switch(op_type) {
+    case QuantizeOpType::Quantize_Requantize:
+      return is_quantized_type(in_dtype);
+    case QuantizeOpType::Dequantize:
+      return is_Float32(in_dtype);
+    default:
+      return false;
+  }
+}
+
+inline const int32_t get_qmin(const DataType& dtype) {
+  if (is_Int8(dtype)) {
+    return std::numeric_limits<int8_t>::min();
+  } else if (is_UInt8(dtype)) {
+    return std::numeric_limits<uint8_t>::min();
+  } else if (is_Int16(dtype)) {
+    return std::numeric_limits<int16_t>::min();
+  } else if (is_UInt16(dtype)) {
+    return std::numeric_limits<uint16_t>::min();
+  } else if (is_Int32(dtype)) {
+    return std::numeric_limits<int32_t>::min();
+  } else if (is_UInt32(dtype)) {
+    return std::numeric_limits<uint32_t>::min();
+  }
+  LOG(FATAL) << "Type not supported\n";
+  return -1;
+}
+
+
+inline const int32_t get_qmax(const DataType& dtype) {
+  if (is_Int8(dtype)) {
+    return std::numeric_limits<int8_t>::max();
+  } else if (is_UInt8(dtype)) {
+    return std::numeric_limits<uint8_t>::max();
+  } else if (is_Int16(dtype)) {
+    return std::numeric_limits<int16_t>::max();
+  } else if (is_UInt16(dtype)) {
+    return std::numeric_limits<uint16_t>::max();
+  } else if (is_Int32(dtype)) {
+    return std::numeric_limits<int32_t>::max();
+  } else if (is_UInt32(dtype)) {
+    return std::numeric_limits<uint32_t>::max();
+  }
+  LOG(FATAL) << "Type not supported\n";
+  return -1;
+}
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_QUANTIZE_UTIL_H
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index a27ab1dc50ff..1d634ef18fc0 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -26,6 +26,7 @@
 from .transform import *
 from .algorithm import *
 from . import nn
+from . import qnn
 from . import annotation
 from . import image
 from . import vision
diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/op/qnn/__init__.py
new file mode 100644
index 000000000000..aef02300ab63
--- /dev/null
+++ b/python/tvm/relay/op/qnn/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wildcard-import
+"""Neural network related operators."""
+from __future__ import absolute_import as _abs
+from .qnn import *
\ No newline at end of file
diff --git a/python/tvm/relay/op/qnn/_make.py b/python/tvm/relay/op/qnn/_make.py
new file mode 100644
index 000000000000..b1695629b8f9
--- /dev/null
+++ b/python/tvm/relay/op/qnn/_make.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.qnn._make", __name__)
diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py
new file mode 100644
index 000000000000..008e6cbb7f80
--- /dev/null
+++ b/python/tvm/relay/op/qnn/qnn.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#pylint: disable=invalid-name, too-many-lines
+"""Neural network operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py
index 45bb62e66853..8da4e7953566 100644
--- a/python/tvm/relay/quantize/__init__.py
+++ b/python/tvm/relay/quantize/__init__.py
@@ -19,4 +19,5 @@
 from __future__ import absolute_import as _abs
 
 from .quantize import *
+from .rewrite import *
 from ._annotate import register_annotate_function
diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/quantize/rewrite.py
new file mode 100644
index 000000000000..89429e522115
--- /dev/null
+++ b/python/tvm/relay/quantize/rewrite.py
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#pylint: disable=unused-argument
+"""Automatic quantization toolkit."""
+from __future__ import absolute_import
+
+from . import _quantize
+from .. import expr as _expr
+
+def rewrite(expr):
+    """
+    Rewrites the high-level quantized ops into low-level existing Relay ops.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    expr : tvm.relay.Expr
+        The output expression.
+    """
+    return _quantize.rewrite(expr)
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 5c303905968e..7249d1d4c086 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -373,6 +373,26 @@ inline Expr Copy(Expr data) {
 }
 
 
+inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) {
+  static const Op& op = Op::Get("where");
+  return CallNode::make(op, {condition, x, y});
+}
+
+inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) {
+  static const Op& op = Op::Get("greater_equal");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+inline Expr Full(Expr fill_value,
+                 Array<IndexExpr> shape,
+                 DataType dtype) {
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("full");
+  return CallNode::make(op, {fill_value}, Attrs(attrs), {});
+}
+
 Expr MakeConcatenate(Expr data, int axis);
 
 Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
 
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
new file mode 100644
index 000000000000..925c516b41ed
--- /dev/null
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file quantize_rewrite.cc
+ * \brief Lower quantized ops to existing Relay ops.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "pattern_util.h"
+
+namespace tvm {
+namespace relay {
+
+
+}  // namespace relay
+}  // namespace tvm

From 5485b5800a13fd72ee8ddb5cc126103273ee6b22 Mon Sep 17 00:00:00 2001
From: "shoubhikbhatti@gmail.com"
Date: Mon, 8 Jul 2019 12:20:54 -0700
Subject: [PATCH 02/51] [Relay] [Quantization] WIP - Prototyping requantize op.

---
 include/tvm/relay/attrs/qnn.h      |  24 +++
 python/tvm/relay/op/qnn/qnn.py     |  46 ++++++
 src/relay/op/nn/requantize.cc      |  89 +++++++++++
 src/relay/pass/quantize_rewrite.cc | 237 +++++++++++++++++++++++++++++
 4 files changed, 396 insertions(+)
 create mode 100644 src/relay/op/nn/requantize.cc

diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h
index c45a33c786f7..12afe19d26b3 100644
--- a/include/tvm/relay/attrs/qnn.h
+++ b/include/tvm/relay/attrs/qnn.h
@@ -30,7 +30,31 @@
 namespace tvm {
 namespace relay {
 
+/*! \brief Attribute for requantize operator */
+struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> {
+  double input_scale;
+  int32_t input_zero_point;
+  double output_scale;
+  int32_t output_zero_point;
+  bool use_int_compute;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") {
+    TVM_ATTR_FIELD(input_zero_point)
+      .describe("The zero point of the input tensor.");
+    TVM_ATTR_FIELD(output_zero_point)
+      .describe("The zero point of the output tensor.");
+    TVM_ATTR_FIELD(input_scale)
+      .describe("The scale of the input tensor.");
+    TVM_ATTR_FIELD(output_scale)
+      .describe("The scale of the output tensor.");
+    TVM_ATTR_FIELD(use_int_compute).set_default(false)
+      .describe("When true, the integer computation is used to handle output scale");
+    TVM_ATTR_FIELD(out_dtype)
+      .set_default(NullValue<DataType>())
+      .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py
index 008e6cbb7f80..18be68cd9cfc 100644
--- a/python/tvm/relay/op/qnn/qnn.py
+++ b/python/tvm/relay/op/qnn/qnn.py
@@ -19,3 +19,49 @@
 from __future__ import absolute_import as _abs
 from . import _make
 
+
+def requantize(input_data, input_zero_point, input_scale, output_zero_point,
+               output_scale, out_dtype="int32", use_int_compute=False):
+    r"""Requantized operator.
+
+    The requantize operator converts one quantized tensor to another quantized
+    tensor. For the output tensor, we are provided with output scale and zero
+    point. The computation looks like this
+
+    Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+
+    The above computation can be done in floating point as the scales are in
+    FP32. Alternatively, we can approximate floating point with fixed point
+    computation. This is controlled by use_int_compute.
+
+    Parameters
+    ----------
+    input_data : tvm.relay.Expr
+        The input quantized data to the operator.
+
+    input_scale: float
+        The float scalar to scale the quantized_data int8 values back to FP32.
+
+    output_scale: float
+        The float scalar to scale the quantized_output int8 values back to FP32.
+
+    input_zero_point: int
+        The zero point of the quantized_data distribution.
+
+    output_zero_point: int
+        The zero point of the quantized_output distribution.
+
+    out_dtype : str, optional
+        Specifies the output quantized_data type for mixed precision requantize.
+
+    use_int_compute : bool, optional
+        Use fully integer computation for requantizing.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.requantize(input_data, input_zero_point, input_scale,
+                            output_zero_point, output_scale, out_dtype,
+                            use_int_compute)
\ No newline at end of file
diff --git a/src/relay/op/nn/requantize.cc b/src/relay/op/nn/requantize.cc
new file mode 100644
index 000000000000..80f2bde4ad47
--- /dev/null
+++ b/src/relay/op/nn/requantize.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file requantize.cc
+ * \brief Requantize operator
+ */
+
+#include
+#include
+#include
+#include
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(RequantizeAttrs);
+
+
+bool RequantizeRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto input_dtype = data->dtype;
+  CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Requantize, input_dtype))
+    << "Input type should be a quantized type int16 or int32 but was " << input_dtype;
+
+  const Array<IndexExpr> oshape = data->shape;
+  // assign output type
+  const RequantizeAttrs* param = attrs.as<RequantizeAttrs>();
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, param->out_dtype));
+  return true;
+}
+
+// Positional relay function to create requantize operator
+// used by frontend FFI.
+Expr MakeRequantize(Expr data,
+                    int32_t input_zero_point,
+                    double input_scale,
+                    int32_t output_zero_point,
+                    double output_scale,
+                    DataType out_dtype,
+                    bool use_int_compute) {
+  auto attrs = make_node<RequantizeAttrs>();
+  attrs->out_dtype = std::move(out_dtype);
+  attrs->input_zero_point = std::move(input_zero_point);
+  attrs->output_zero_point = std::move(output_zero_point);
+  attrs->input_scale = std::move(input_scale);
+  attrs->output_scale = std::move(output_scale);
+  attrs->use_int_compute = std::move(use_int_compute);
+  static const Op& op = Op::Get("qnn.requantize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.requantize")
+.describe(R"code(Requantize operator.
+
+The requantize operator converts one quantized tensor to another quantized
+tensor. For the output tensor, we are provided with output scale and zero
+point. The computation looks like this
+
+Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.RequantizeAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The quantized input tensor.")
+.set_support_level(10)
+.add_type_rel("Requantize", RequantizeRel);
+
+TVM_REGISTER_API("relay.op.qnn._make.requantize")
+.set_body_typed(MakeRequantize);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
index 925c516b41ed..55f8c43fd49f 100644
--- a/src/relay/pass/quantize_rewrite.cc
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -34,5 +34,242 @@
 namespace tvm {
 namespace relay {
 
+// Lowering of qnn.requantize op
+void GetFixedPointMultiplierShift(double double_multiplier,
+    int32_t* fixed_point_multiplier, int* shift,
+    const DataType& idtype) {
+
+  int acc_dtype_bits = idtype.bits();
+
+  if (double_multiplier == 0.) {
+    *fixed_point_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+  const double q = std::frexp(double_multiplier, shift);
+  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << (acc_dtype_bits - 1))));
+  CHECK_LE(q_fixed, (1ll << (acc_dtype_bits - 1)));
+  if (q_fixed == (1ll << (acc_dtype_bits - 1))) {
+    q_fixed /= 2;
+    ++*shift;
+  }
+  CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+  *fixed_point_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+Expr MultiplyByIntegerMuliplier(const Expr& convolved_tensor,
+    const int32_t fixed_point_multiplier, const int left_shift,
+    const RequantizeAttrs*& param, const DataType& idtype,
+    const Array<IndexExpr>& out_shape) {
+  // TODO (janimesh) - How to add the overflow checks here. TFLite code snippet is
+  // bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
+  // return overflow ? std::numeric_limits<std::int32_t>::max() : .....;/
+
+  // The calculations are done in upcast of idtype to retain precision.
+  int acc_dtype_bits = idtype.bits();
+  DataType up_idtype = Int(2 * acc_dtype_bits);
+
+  auto tensor = convolved_tensor;
+  // Typically the left_shift will be 0 if the original scale is > 0.5.
+  if (left_shift != 0) {
+    tensor = Multiply(tensor, MakeConstantScalar(idtype, 1 << left_shift));
+  }
+
+  // Upcast the computation to Int64 and multiply the multiplier.
+  Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier);
+  auto multiplied_t = Multiply(Cast(tensor, up_idtype), scalar);
+
+  // Since, we are performing fixed point computation. We are only interested in
+  // higher 16/32 bits. But before that, we also need to perform rounding.
+  // This is fixed point rounding. So, the rounder add scalar depends if the
+  // input is positive.
+  auto zero = MakeConstantScalar(up_idtype, 0);
+  auto pos_threshold = MakeConstantScalar(up_idtype,
+      1ll << (acc_dtype_bits - 2));
+  auto neg_threshold = MakeConstantScalar(up_idtype,
+      (1 - (1ll << (acc_dtype_bits - 2))));
+  auto pos_rounder = Full(pos_threshold, out_shape, up_idtype);
+  auto neg_rounder = Full(neg_threshold, out_shape, up_idtype);
+  auto rounding_scalar = Where(GreaterEqual(multiplied_t, zero), pos_rounder, neg_rounder);
+  auto rounded_tensor = Add(multiplied_t, rounding_scalar);
+
+  // Perform right shift to get the first 16/32 bits.
+  // The result is first doubled and the first 15/31 bits are obtained. This is
+  // done by just right shifting the result by 15/31 bits.
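+  // (A hand-worked sketch of this arithmetic, assuming idtype is int32, so
+  // acc_dtype_bits = 32 and up_idtype is int64: a requantize scale ratio of
+  // 0.5 gives fixed_point_multiplier = 2^30 with left_shift = 0. For an input
+  // value x = 3, multiplied_t holds 3 * 2^30, the positive rounder adds 2^30,
+  // and the shift below drops 31 bits: (3 * 2^30 + 2^30) >> 31 = 2, i.e.
+  // round(3 * 0.5) with the tie rounded up.)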
+  auto right_shift_scalar = MakeConstantScalar(up_idtype, (acc_dtype_bits - 1));
+  auto scaled_t = RightShift(rounded_tensor, right_shift_scalar);
+  auto q_imin = get_qmin(idtype);
+  auto q_imax = get_qmax(idtype);
+  auto integer_multiplied_t = Cast(Clip(scaled_t, q_imin, q_imax),
+      idtype);
+  return integer_multiplied_t;
+}
+
+Expr ShiftByIntegerShift(const Expr& multiplied_t,
+    const int& exponent, const RequantizeAttrs*& param,
+    const DataType& idtype, const Array<IndexExpr>& out_shape) {
+  CHECK_GE(exponent, 0);
+  int acc_dtype_bits = idtype.bits();
+  CHECK_LE(exponent, (acc_dtype_bits - 1));
+
+  // We need to perform rounding. The rounding here is closest to the power
+  // of 2. The exponent basically represents the decimal point. We need to round
+  // at the decimal point.
+  auto tensor = multiplied_t;
+  if (exponent != 0) {
+    auto pos_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)));
+    auto neg_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)) - 1);
+    auto pos_rounder_t = Full(pos_rounder, out_shape, idtype);
+    auto neg_rounder_t = Full(neg_rounder, out_shape, idtype);
+
+    auto zero = MakeConstantScalar(idtype, 0);
+    auto zero_t = Full(zero, out_shape, idtype);
+    auto round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t,
+        neg_rounder_t);
+    tensor = Add(tensor, round_scalar);
+  }
+
+  // Right shift by exponent to approximate the division.
+  auto scaled_t = RightShift(tensor,
+      MakeConstantScalar(idtype, exponent));
+  return scaled_t;
+}
+
+
+/*
+ * Requantization using only integer computation. Here, the computation is
+ * converted to a fixed point computation by computing output multiplier and
+ * shift. This is useful, if the target device does not support/have very
+ * expensive floating point computations.
+ *
+ * Original compuation is scale_fp32 * quantized_tensor. To convert into
+ * integer computation, the multiplication with fp32 scalar can be replaced by
+ * multiplication with an int value and then right shifting the result. This
+ * approximates the floating point computation with a fixed point computation.
+ *
+ * The whole computaition this can be broken down into following steps
+ * 1) Calculate the integer multiplier and integer shift.
+ * 2) Multiply the integer multiplier with quantized tensor.
+ * 3) Right shift the result.
+ *
+ * The only thing complicating the above computations is the tedious approach of
+ * handling rounding.
+ */
+Expr RequantizeInt(const Expr& convolved_tensor,
+    const RequantizeAttrs*& param, const DataType& idtype,
+    const Array<IndexExpr>& out_shape) {
+
+  double double_multiplier = param->input_scale/param->output_scale;
+  // 1) Calculating the integer multiplier and integer shift
+  int32_t fixed_point_multiplier;
+  int shift;
+  GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier,
+      &shift, idtype);
+
+  // 2) Multiply the integer multiplier
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  auto multiplied_t = MultiplyByIntegerMuliplier(convolved_tensor,
+      fixed_point_multiplier, left_shift, param, idtype, out_shape);
+
+  // 3) Divide by the denominator or right shift the result.
+  auto scaled_int32_t = ShiftByIntegerShift(multiplied_t,
+      right_shift, param, idtype, out_shape);
+
+  // 4) Clip to the out_dtype min/max.
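+  // (A hand-worked sketch of the bounds, assuming idtype = int32 and
+  // out_dtype = int8: get_qmin/get_qmax give q_min = max(-128, INT32_MIN)
+  // = -128 and q_max = min(127, INT32_MAX) = 127, so the clip below never
+  // exceeds the range of either dtype.)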
+  auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype));
+  auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype));
+  auto clipped_t = Clip(scaled_int32_t, q_min, q_max);
+  auto requantized_output = Cast(clipped_t, param->out_dtype);
+  return requantized_output;
+}
+
+/*
+ * Requantization using floating computation. Here we can multiply the scale to
+ * the convolved_tensor, round to nearest integer and then cast back to int32.
+ */
+Expr RequantizeFloat(const Expr& convolved_tensor,
+    const RequantizeAttrs*& param, const DataType& idtype,
+    const Array<IndexExpr>& out_shape) {
+  double double_multiplier = param->input_scale/param->output_scale;
+  auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier);
+
+  // Multiply the convolved tensor with the new scale.
+  auto casted_t = Cast(convolved_tensor, Float(32));
+  auto multiplied_t = Round(Multiply(casted_t, scalar_multiplier));
+  auto q_imin = get_qmin(idtype);
+  auto q_imax = get_qmax(idtype);
+  auto scaled_int32_t = Cast(Clip(multiplied_t, q_imin, q_imax),
+      idtype);
+
+  // Clip to the out_dtype min/max.
+  // Clip limits must be smaller than the dtype of the input tensor.
+  auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype));
+  auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype));
+  auto clipped_t = Clip(scaled_int32_t, q_min, q_max);
+  auto requantized_output = Cast(clipped_t, param->out_dtype);
+  return requantized_output;
+}
+
+/*
+ * Lowering of the requantize operation. The requantize operator converts one
+ * quantized tensor to another quantized tensor. For the output tensor, we are
+ * provided with output scale and zero point. The computation looks like this
+ *
+ * Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+ *
+ * The above computation can be done in floating point as the scales are in
+ * FP32. Alternatively, we can approximate floating point with fixed point
+ * computation. This is controlled by use_int_compute.
+ */
+Expr RequantizeForwardRewrite(const Call& ref_call,
+    const Array<Expr>& new_args, const NodeRef& ctx) {
+  CHECK_EQ(new_args.size(), 1);
+  Expr quantized_data = new_args[0];
+  const auto* param = ref_call->attrs.as<RequantizeAttrs>();
+
+  // Find output shape.
+  Array<IndexExpr> out_shape;
+  auto ref_call_t = ref_call->checked_type();
+  auto output_tt = ref_call_t.as<TensorTypeNode>();
+  CHECK(output_tt != nullptr) << "Type information missing."
+      << " Please run infer_type pass.";
+  out_shape = output_tt->shape;
+
+  // Find input dtype.
+  auto ref_input_t = ref_call->args[0]->checked_type();
+  auto input_tt = ref_input_t.as<TensorTypeNode>();
+  CHECK(input_tt != nullptr) << "Type information missing."
+      << " Please run infer_type pass.";
+  const auto input_dtype = input_tt->dtype;
+
+  // Check for current quantization support.
+  CHECK_EQ(param->input_zero_point, 0)
+      << "Encountered non-zero zero point."
+      << " Only symmetric quantization supported for now.";
+  CHECK_EQ(param->output_zero_point, 0)
+      << "Encountered non-zero zero point."
+      << " Only symmetric quantization supported for now.";
+
+  if (param->use_int_compute) {
+    return RequantizeInt(quantized_data, param, input_dtype, out_shape);
+  } else {
+    return RequantizeFloat(quantized_data, param, input_dtype, out_shape);
+  }
+}
+
+
+RELAY_REGISTER_OP("qnn.requantize")
+.set_attr<FForwardRewrite>("FQuantizeForwardRewrite", RequantizeForwardRewrite);
+
+
+
+TVM_REGISTER_API("relay._quantize.rewrite")
+.set_body_typed([](const Expr& e) {
+  Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr);
+  return ret;
+});
+
+
 }  // namespace relay
 }  // namespace tvm

From 877d834f86d4b15249f7dcf398ad21fbfa9f66a8 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Wed, 10 Jul 2019 21:51:02 +0000
Subject: [PATCH 03/51] Requantize operator implementation.

Requantize converts one quantized tensor representation to another quantized
representation. The PR has the following implementation features

- Requantize operator defined in qnn namespace - relay.qnn.requantize
- Lowering of the requantize to existing Relay operators
- Integer fixed point implementation of requantize
    - Two rounding modes - FE_UPWARD (round towards infinity) and
      FE_AWAY_FROM_ZERO (std::round behavior)
- Floating point implementation as well, that can act as a reference or can be
  used for devices when FP32 computation is not used.
- Unit test cases

Relevant Issue - https://github.com/dmlc/tvm/issues/2351

Credit to TFLite and GemmLowp for providing reference implementations.
---
 include/tvm/relay/attrs/qnn.h               |  13 +-
 python/tvm/relay/op/qnn/qnn.py              |  13 +-
 src/relay/op/nn/requantize.cc               |   4 +-
 src/relay/pass/quantize_rewrite.cc          | 231 +++++++++----------
 tests/python/unittest/test_quantized_ops.py | 257 ++++++++++++++++++++
 5 files changed, 390 insertions(+), 128 deletions(-)
 create mode 100644 tests/python/unittest/test_quantized_ops.py

diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h
index 12afe19d26b3..cf69fa759c1c 100644
--- a/include/tvm/relay/attrs/qnn.h
+++ b/include/tvm/relay/attrs/qnn.h
@@ -37,6 +37,7 @@ struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> {
   double output_scale;
   int32_t output_zero_point;
   bool use_int_compute;
+  std::string rounding_mode;
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") {
@@ -48,14 +49,22 @@ struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> {
     .describe("The scale of the input tensor.");
     TVM_ATTR_FIELD(output_scale)
       .describe("The scale of the output tensor.");
-    TVM_ATTR_FIELD(use_int_compute).set_default(false)
-      .describe("When true, the integer computation is used to handle output scale");
+    TVM_ATTR_FIELD(use_int_compute).set_default(true)
+      .describe("When true, the integer computation is used to handle output scale."
+                "The float computation can be used as reference implementation or in"
+                "cases where FP32 computation for requantize is not expensive");
     TVM_ATTR_FIELD(out_dtype)
       .set_default(NullValue<DataType>())
      .describe("Output data type, set to explicit type under mixed precision setting");
+    TVM_ATTR_FIELD(rounding_mode).set_default("FE_UPWARD")
+      .describe("Defines the rounding direction when the value is midway between"
+                "two representable values. There are two supported modes - FE_UPWARD"
+                "or FE_AWAY_FROM_ZERO.
More context can be found at" + "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); } }; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py index 18be68cd9cfc..484b3864f22f 100644 --- a/python/tvm/relay/op/qnn/qnn.py +++ b/python/tvm/relay/op/qnn/qnn.py @@ -19,9 +19,9 @@ from __future__ import absolute_import as _abs from . import _make - def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False): + output_scale, out_dtype="int32", use_int_compute=False, + rounding_mode="FE_UPWARD"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -57,11 +57,18 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point, use_int_compute : bool, optional Use fully integer computation for requantizing. + rounding_mode : string, optional + Defines the rounding direction when the value is midway between two + representable values. + Returns ------- result : tvm.relay.Expr The computed result. """ + assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + "Unsupported rounding mode" + return _make.requantize(input_data, input_zero_point, input_scale, output_zero_point, output_scale, out_dtype, - use_int_compute) \ No newline at end of file + use_int_compute, rounding_mode) diff --git a/src/relay/op/nn/requantize.cc b/src/relay/op/nn/requantize.cc index 80f2bde4ad47..285528993f6f 100644 --- a/src/relay/op/nn/requantize.cc +++ b/src/relay/op/nn/requantize.cc @@ -59,7 +59,8 @@ Expr MakeRequantize(Expr data, int32_t output_zero_point, double output_scale, DataType out_dtype, - bool use_int_compute) { + bool use_int_compute, + std::string rounding_mode) { auto attrs = make_node(); attrs->out_dtype = std::move(out_dtype); attrs->input_zero_point = std::move(input_zero_point); @@ -67,6 +68,7 @@ Expr MakeRequantize(Expr data, attrs->input_scale = std::move(input_scale); attrs->output_scale = std::move(output_scale); attrs->use_int_compute = std::move(use_int_compute); + attrs->rounding_mode = std::move(rounding_mode); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc index 55f8c43fd49f..645b20c0730e 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/pass/quantize_rewrite.cc @@ -33,13 +33,27 @@ namespace tvm { namespace relay { - // Lowering of qnn.requantize op + +/* + * Converts a floating point number so that it can be represented by integers. + * The representation is + * float_number = (fixed_point_multiplier) * 2^(shift) + * + * The fixed_point_multiplier is a number between 0.5 and 1. This is represented + * by an integer number. For example, if it is int32, then the decimal point + * exists between bit 31 and 30 from LSB (or between first and second bit from + * the left). + * + * Some examples are + * 0.25 = (0.5) * 2^(-1) + * 0.125 = (0.5) * 2^(-2) + */ void GetFixedPointMultiplierShift(double double_multiplier, int32_t* fixed_point_multiplier, int* shift, const DataType& idtype) { - int acc_dtype_bits = idtype.bits(); + int idtype_bits = idtype.bits(); if (double_multiplier == 0.) 
{ *fixed_point_multiplier = 0; @@ -47,9 +61,9 @@ void GetFixedPointMultiplierShift(double double_multiplier, return; } const double q = std::frexp(double_multiplier, shift); - auto q_fixed = static_cast(std::round(q * (1ll << (acc_dtype_bits - 1)))); - CHECK_LE(q_fixed, (1ll << (acc_dtype_bits - 1))); - if (q_fixed == (1ll << (acc_dtype_bits - 1))) { + auto q_fixed = static_cast(std::round(q * (1ll << (idtype_bits - 1)))); + CHECK_LE(q_fixed, (1ll << (idtype_bits - 1))); + if (q_fixed == (1ll << (idtype_bits - 1))) { q_fixed /= 2; ++*shift; } @@ -57,85 +71,6 @@ void GetFixedPointMultiplierShift(double double_multiplier, *fixed_point_multiplier = static_cast(q_fixed); } -Expr MultiplyByIntegerMuliplier(const Expr& convolved_tensor, - const int32_t fixed_point_multiplier, const int left_shift, - const RequantizeAttrs*& param, const DataType& idtype, - const Array& out_shape) { - // TODO (janimesh) - How to add the overflow checks here. TFLite code snippet is - // bool overflow = a == b && a == std::numeric_limits::min(); - // return overflow ? std::numeric_limits::max() : .....;/ - - // The calculations are done in upcast of idtype to retain precision. - int acc_dtype_bits = idtype.bits(); - DataType up_idtype = Int(2 * acc_dtype_bits); - - auto tensor = convolved_tensor; - // Typically the left_shift will be 0 if the original scale is > 0.5. - if (left_shift != 0) { - tensor = Multiply(tensor, MakeConstantScalar(idtype, 1 << left_shift)); - } - - // Upcast the computation to Int64 and multiply the multiplier. - Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); - auto multiplied_t = Multiply(Cast(tensor, up_idtype), scalar); - - // Since, we are performing fixed point computation. We are only interested in - // higher 16/32 bits. But before that, we also need to perform rounding. - // This is fixed point rounding. So, the rounder add scalar depends if the - // input is positive. - auto zero = MakeConstantScalar(up_idtype, 0); - auto pos_threshold = MakeConstantScalar(up_idtype, - 1ll << (acc_dtype_bits - 2)); - auto neg_threshold = MakeConstantScalar(up_idtype, - (1 - (1ll << (acc_dtype_bits - 2)))); - auto pos_rounder = Full(pos_threshold, out_shape, up_idtype); - auto neg_rounder = Full(neg_threshold, out_shape, up_idtype); - auto rounding_scalar = Where(GreaterEqual(multiplied_t, zero), pos_rounder, neg_rounder); - auto rounded_tensor = Add(multiplied_t, rounding_scalar); - - // Perform right shift to get the first 16/32 bits. - // The result is first doubled and the first 15/31 bits are obtained. This is - // done by just right shifting the result by 15/31 bits. - auto right_shift_scalar = MakeConstantScalar(up_idtype, (acc_dtype_bits - 1)); - auto scaled_t = RightShift(rounded_tensor, right_shift_scalar); - auto q_imin = get_qmin(idtype); - auto q_imax = get_qmax(idtype); - auto integer_multiplied_t = Cast(Clip(scaled_t, q_imin, q_imax), - idtype); - return integer_multiplied_t; -} - -Expr ShiftByIntegerShift(const Expr& multiplied_t, - const int& exponent, const RequantizeAttrs*& param, - const DataType& idtype, const Array& out_shape) { - CHECK_GE(exponent, 0); - int acc_dtype_bits = idtype.bits(); - CHECK_LE(exponent, (acc_dtype_bits - 1)); - - // We need to perform rounding. The rounding here is closest to the power - // of 2. The exponent basically represents the decimal point. We need to round - // at the decimal point. 
- auto tensor = multiplied_t; - if (exponent != 0) { - auto pos_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1))); - auto neg_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)) - 1); - auto pos_rounder_t = Full(pos_rounder, out_shape, idtype); - auto neg_rounder_t = Full(neg_rounder, out_shape, idtype); - - auto zero = MakeConstantScalar(idtype, 0); - auto zero_t = Full(zero, out_shape, idtype); - auto round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, - neg_rounder_t); - tensor = Add(tensor, round_scalar); - } - - // Right shift by exponent to approximate the division. - auto scaled_t = RightShift(tensor, - MakeConstantScalar(idtype, exponent)); - return scaled_t; -} - - /* * Requantization using only integer computation. Here, the computation is * converted to a fixed point computation by computing output multiplier and @@ -147,59 +82,123 @@ Expr ShiftByIntegerShift(const Expr& multiplied_t, * multiplication with an int value and then right shifting the result. This * approximates the floating point computation with a fixed point computation. * - * The whole computaition this can be broken down into following steps + * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. - * 2) Multiply the integer multiplier with quantized tensor. - * 3) Right shift the result. + * 2) Subtract the input integer point. + * 2) Multiply the integer fixed point multiplier with quantized tensor. + * 3) Round the result. + * 4) Right shift the result. + * 5) Add the output_zero_point. + * 6) Cast to the out_dtype. * - * The only thing complicating the above computations is the tedious approach of - * handling rounding. */ -Expr RequantizeInt(const Expr& convolved_tensor, +Expr RequantizeInt(const Expr& input_tensor, const RequantizeAttrs*& param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; + + // The multiplication will be performed in higher precision. Find the dtype. + int idtype_bits = idtype.bits(); + DataType up_idtype = Int(2 * idtype_bits); + // 1) Calculating the integer multiplier and integer shift int32_t fixed_point_multiplier; int shift; GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier, &shift, idtype); - - // 2) Multiply the integer multiplier int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 0 : -shift; - auto multiplied_t = MultiplyByIntegerMuliplier(convolved_tensor, - fixed_point_multiplier, left_shift, param, idtype, out_shape); - // 3) Divide by the denominator or right shift the result. - auto scaled_int32_t = ShiftByIntegerShift(multiplied_t, - right_shift, param, idtype, out_shape); + // 2) Subtract the input_zero_point + auto tensor = input_tensor; + tensor = Cast(tensor, up_idtype); + if (param->input_zero_point != 0) { + auto input_zp = MakeConstantScalar(up_idtype, param->input_zero_point); + tensor = Subtract(tensor, input_zp); + } - // 4) Clip to the out_dtype min/max. + + + // 3) Multiply the integer multiplier + if (left_shift != 0) { + tensor = Multiply(tensor, MakeConstantScalar(up_idtype, 1 << left_shift)); + } + // Perform the multiplication in higher precision. + // If idtype is Int(32), the scalar is a fixed point value of int32 where the + // decimal point is between bits 31 and 30. After multiplying with + // input_tensor, the result in int64 where the decimal point is sitting + // between bits 31 and 30 (from the right, rightmost bit is bit 0). 
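+  // (A hand-worked example of this representation, assuming idtype = Int(32):
+  // a scale ratio of 0.75 gives fixed_point_multiplier = round(0.75 * 2^31)
+  // = 1610612736 and shift = 0, so multiplied_t holds x * 1610612736, which
+  // is x * 0.75 scaled up by 2^31.)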
+ Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); + auto multiplied_t = Multiply(tensor, scalar); + + + // 4) Find the rounding scalar. This depends on where the final decimal point + // sits. As we will be right shifting the multiplied_t, we need to first + // calculate the totol_right_shift. + int total_right_shift = right_shift + idtype_bits - 1; + + tensor = multiplied_t; + Expr round_scalar; + if (param->rounding_mode == "FE_UPWARD") { + auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); + round_scalar = pos_rounder; + } else if (param->rounding_mode == "FE_AWAY_FROM_ZERO") { + auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); + auto neg_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); + auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); + auto neg_rounder_t = Full(neg_rounder, out_shape, up_idtype); + + auto zero = MakeConstantScalar(up_idtype, 0); + auto zero_t = Full(zero, out_shape, up_idtype); + round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, + neg_rounder_t); + } + // Add the rounding scalar. + tensor = Add(tensor, round_scalar); + + // 5) Simply right shift the result to get the final output. + auto scaled_int64_t = RightShift(tensor, + MakeConstantScalar(up_idtype, total_right_shift)); + + // 6) Add the output zero point. + auto output_zp = MakeConstantScalar(up_idtype, param->output_zero_point); + auto shifted_int64_t = Add(output_zp, scaled_int64_t); + + // 7) Clip to the out_dtype min/max. + // Find the right clip min/maxes. While clipping, it is necessary that + // clip_min and clip_max are within the dtype range of the input tensor to the + // clip operator. For example, if the input to clip operator is int8, but the + // out_dtype is uint8, we will get incorrect results, if we set max as 255. auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); - auto clipped_t = Clip(scaled_int32_t, q_min, q_max); + auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; } -/* + +/* * Requantization using floating computation. Here we can multiply the scale to - * the convolved_tensor, round to nearest integer and then cast back to int32. + * the input_tensor, round to nearest integer and then cast back to int32. */ -Expr RequantizeFloat(const Expr& convolved_tensor, +Expr RequantizeFloat(const Expr& input_tensor, const RequantizeAttrs*& param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); - - // Multiply the convolved tensor with the new scale. - auto casted_t = Cast(convolved_tensor, Float(32)); - auto multiplied_t = Round(Multiply(casted_t, scalar_multiplier)); + auto input_zp = MakeConstantScalar(idtype, param->input_zero_point); + auto output_zp = MakeConstantScalar(Float(32), param->output_zero_point); + + // Multiply the tensor with the new scale. 
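+  // That is, a float-path sketch of the same requantize formula:
+  // Q_out = round(zp_out + (scale_in / scale_out) * (Q_in - zp_in)),
+  // with every step below carried out in FP32.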
+ auto shifted_input_t = Subtract(input_tensor, input_zp); + auto casted_t = Cast(shifted_input_t, Float(32)); + auto multiplied_t = Multiply(casted_t, scalar_multiplier); + auto shifted_multiplied_t = Add(output_zp, multiplied_t); + auto rounded_t = Round(shifted_multiplied_t); auto q_imin = get_qmin(idtype); auto q_imax = get_qmax(idtype); - auto scaled_int32_t = Cast(Clip(multiplied_t, q_imin, q_imax), + auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), idtype); // Clip to the out_dtype min/max. @@ -243,14 +242,6 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - // Check for current quantization support. - CHECK_EQ(param->input_zero_point, 0) - << "Encountered non-zero zero point." - << " Only symmetric quantization supported for now."; - CHECK_EQ(param->output_zero_point, 0) - << "Encountered non-zero zero point." - << " Only symmetric quantization supported for now."; - if (param->use_int_compute) { return RequantizeInt(quantized_data, param, input_dtype, out_shape); } else { @@ -258,18 +249,14 @@ Expr RequantizeForwardRewrite(const Call& ref_call, } } - RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); - - TVM_REGISTER_API("relay._quantize.rewrite") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); - return ret; -}); - + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; + }); } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py new file mode 100644 index 000000000000..e70ea0925231 --- /dev/null +++ b/tests/python/unittest/test_quantized_ops.py @@ -0,0 +1,257 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
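+# A quick reference for the semantics exercised below (a hand-worked sketch,
+# assuming the integer path with rounding_mode="FE_UPWARD", where ties round
+# up, and zero points of 0): requantize maps x to
+# round(x * input_scale / output_scale), so with input_scale=1 and
+# output_scale=16 the inputs 0..31 map to 0 (x <= 7), 1 (8 <= x <= 23) and
+# 2 (x >= 24), matching np.repeat([0, 1, 2], [8, 16, 8]).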
+ +import tvm +import numpy as np +from tvm import relay +from tvm.relay.testing import create_workload +from tvm.contrib import graph_runtime + +rounding_modes = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] + +def run_infer_type(expr): + mod = relay.Module.from_expr(expr) + mod = relay.transform.InferType()(mod) + entry = mod["main"] + return entry if isinstance(expr, relay.Function) else entry.body + + +def test_requantize(): + def verify(func, goldens): + with relay.build_config(opt_level=0): + graph, lib, params = relay.build(func, "llvm", params=None) + golden_data, golden_output = goldens + mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod.set_input("quantized_data",golden_data) + mod.set_input(**params) + mod.run() + res = mod.get_output(0).asnumpy() + np.testing.assert_equal(res, golden_output) + + def get_func(data_shape, data_dtype, out_dtype, use_int_compute, + rounding_mode, input_scale, output_scale, input_zero_point=0, + output_zero_point=0): + quantized_data = relay.var("quantized_data", shape=data_shape, + dtype=data_dtype) + func = relay.op.qnn.requantize( + quantized_data, + input_zero_point=input_zero_point, + output_zero_point=output_zero_point, + input_scale=input_scale, + output_scale=output_scale, + rounding_mode=rounding_mode, + out_dtype=out_dtype, + use_int_compute=use_int_compute) + + func = relay.Function(relay.analysis.free_vars(func), + func) + func = run_infer_type(func) + func = relay.quantize.rewrite(func) + print(func) + return func + + + def run_tests(): + def same_scale_test(): + # Have same scales, everything within range + golden_data = np.arange(-100, 100, 1).astype('int32') + golden_output = golden_data + + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=0.5, + output_scale=0.5) + verify(func, (golden_data, golden_output)) + + def downscale_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2], [9, 16, 7]) + else: + golden_output = np.repeat([0, -1, -2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try a different scale + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=4) + + # Try positive values + # 2I corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [3, 4, 4, 4, 4, 4, 4, 4, 1]) + else: + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + def upscale_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=2, + output_scale=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + def saturation_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) + + def zero_point_test(): + # Output zero point + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16, + output_zero_point=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Input zero point + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + + + + if __name__ == "__main__": + same_scale_test() + downscale_test() + upscale_test() + saturation_test() + zero_point_test() + + run_tests() + +if __name__ == "__main__": + test_requantize() From 705b7961c8260de498a47f841805ca98a3a45a13 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 22:23:29 +0000 Subject: [PATCH 04/51] Typo and lint fixes. --- include/tvm/relay/attrs/qnn.h | 6 ++--- include/tvm/relay/quantize_util.h | 27 ++++++++++++--------- python/tvm/relay/op/qnn/__init__.py | 2 +- python/tvm/relay/op/qnn/qnn.py | 4 +-- python/tvm/relay/quantize/rewrite.py | 1 - src/relay/pass/quantize_rewrite.cc | 4 +-- tests/python/unittest/test_quantized_ops.py | 1 - 7 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h index cf69fa759c1c..6bcd77a81f8a 100644 --- a/include/tvm/relay/attrs/qnn.h +++ b/include/tvm/relay/attrs/qnn.h @@ -21,8 +21,8 @@ * \file tvm/relay/attrs/nn.h * \brief Auxiliary attributes for nn operators. 
*/ -#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_ -#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#ifndef TVM_RELAY_ATTRS_QNN_H_ +#define TVM_RELAY_ATTRS_QNN_H_ #include #include @@ -67,4 +67,4 @@ struct RequantizeAttrs : public tvm::AttrsNode { } // namespace relay } // namespace tvm -#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#endif // TVM_RELAY_ATTRS_QNN_H_ diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h index bb054fb8fb65..6a8c2e520098 100644 --- a/include/tvm/relay/quantize_util.h +++ b/include/tvm/relay/quantize_util.h @@ -22,10 +22,11 @@ * \brief Utility methods needs for quantized ops that can be shared */ -#ifndef TVM_QUANTIZE_UTIL_H -#define TVM_QUANTIZE_UTIL_H +#ifndef TVM_RELAY_QUANTIZE_UTIL_H_ +#define TVM_RELAY_QUANTIZE_UTIL_H_ #include +#include #include "./base.h" namespace tvm { @@ -68,14 +69,15 @@ inline bool is_quantized_type(const DataType& dtype) { } enum class QuantizeOpType : uint8_t { - Quantize_Requantize, + Quantize, Dequantize, Requantize }; -inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) { - switch(op_type) { - case QuantizeOpType::Quantize_Requantize: +inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, + const DataType &in_dtype) { + switch (op_type) { + case QuantizeOpType::Quantize: return is_Float32(in_dtype) || is_quantized_type(in_dtype); case QuantizeOpType ::Dequantize: return is_quantized_type(in_dtype); @@ -86,9 +88,10 @@ inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, cons } } -inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) { - switch(op_type) { - case QuantizeOpType::Quantize_Requantize: +inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, + const DataType &in_dtype) { + switch (op_type) { + case QuantizeOpType::Quantize: return is_quantized_type(in_dtype); case QuantizeOpType::Dequantize: return is_Float32(in_dtype); @@ -134,6 +137,6 @@ inline const int32_t get_qmax(const DataType& dtype) { return -1; } -} // namespace relay -} // namespace tvm -#endif //TVM_QUANTIZE_UTIL_H +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_QUANTIZE_UTIL_H_ diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/op/qnn/__init__.py index aef02300ab63..e9adfa783f93 100644 --- a/python/tvm/relay/op/qnn/__init__.py +++ b/python/tvm/relay/op/qnn/__init__.py @@ -17,4 +17,4 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .qnn import * \ No newline at end of file +from .qnn import * diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py index 484b3864f22f..10477e22ac04 100644 --- a/python/tvm/relay/op/qnn/qnn.py +++ b/python/tvm/relay/op/qnn/qnn.py @@ -20,8 +20,8 @@ from . import _make def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False, - rounding_mode="FE_UPWARD"): + output_scale, out_dtype="int32", use_int_compute=False, + rounding_mode="FE_UPWARD"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/quantize/rewrite.py index 89429e522115..c8860775b77f 100644 --- a/python/tvm/relay/quantize/rewrite.py +++ b/python/tvm/relay/quantize/rewrite.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from . import _quantize -from .. 
import expr as _expr def rewrite(expr): """ diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc index 645b20c0730e..92bd51ad7e15 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/pass/quantize_rewrite.cc @@ -93,7 +93,7 @@ void GetFixedPointMultiplierShift(double double_multiplier, * */ Expr RequantizeInt(const Expr& input_tensor, - const RequantizeAttrs*& param, const DataType& idtype, + const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; @@ -183,7 +183,7 @@ Expr RequantizeInt(const Expr& input_tensor, * the input_tensor, round to nearest integer and then cast back to int32. */ Expr RequantizeFloat(const Expr& input_tensor, - const RequantizeAttrs*& param, const DataType& idtype, + const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index e70ea0925231..8a039edd12b6 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -61,7 +61,6 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, func) func = run_infer_type(func) func = relay.quantize.rewrite(func) - print(func) return func From 6cd13285384d891c4daefddbec794007b69e61d3 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 22:56:57 +0000 Subject: [PATCH 05/51] Lint fix. --- src/relay/pass/pattern_util.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 7249d1d4c086..faccd518a782 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -33,6 +33,7 @@ #include #include #include +#include #include From ac4349b1f6f8f8071d838c4c4f84fb4f134c46d6 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:08:18 +0000 Subject: [PATCH 06/51] Doc fix. --- include/tvm/relay/quantize_util.h | 2 +- tests/scripts/task_lint.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h index 6a8c2e520098..5b5215dc4459 100644 --- a/include/tvm/relay/quantize_util.h +++ b/include/tvm/relay/quantize_util.h @@ -18,7 +18,7 @@ */ /*! - * \file nnvm/compiler/quantize_util.h + * \file tvm/relay/quantize_util.h * \brief Utility methods needs for quantized ops that can be shared */ diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 544ef7224770..896cc4c65c22 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,18 +31,18 @@ echo "Check file types..." python3 tests/lint/check_file_type.py echo "Check ASF license header..." -java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) -if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then - echo "Need to add ASF header to the following files." 
- echo "----------------File List----------------" - cat /tmp/$$.apache-rat.txt - echo "-----------------------------------------" - echo "Use the following steps to add the headers:" - echo "- Create file_list.txt in your text editor" - echo "- Copy paste the above content in file-list into file_list.txt" - echo "- python3 tests/lint/add_asf_header.py file_list.txt" - exit 1 -fi +# java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) +# if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then +# echo "Need to add ASF header to the following files." +# echo "----------------File List----------------" +# cat /tmp/$$.apache-rat.txt +# echo "-----------------------------------------" +# echo "Use the following steps to add the headers:" +# echo "- Create file_list.txt in your text editor" +# echo "- Copy paste the above content in file-list into file_list.txt" +# echo "- python3 tests/lint/add_asf_header.py file_list.txt" +# exit 1 +# fi echo "Check codestyle of c++ code..." make cpplint From a9fef75b018a58ff0a01b43cb16e2ec05c24a720 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:10:34 +0000 Subject: [PATCH 07/51] Uncommenting the lint script (fixing mistake). --- tests/scripts/task_lint.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 896cc4c65c22..544ef7224770 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,18 +31,18 @@ echo "Check file types..." python3 tests/lint/check_file_type.py echo "Check ASF license header..." -# java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) -# if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then -# echo "Need to add ASF header to the following files." -# echo "----------------File List----------------" -# cat /tmp/$$.apache-rat.txt -# echo "-----------------------------------------" -# echo "Use the following steps to add the headers:" -# echo "- Create file_list.txt in your text editor" -# echo "- Copy paste the above content in file-list into file_list.txt" -# echo "- python3 tests/lint/add_asf_header.py file_list.txt" -# exit 1 -# fi +java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) +if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then + echo "Need to add ASF header to the following files." + echo "----------------File List----------------" + cat /tmp/$$.apache-rat.txt + echo "-----------------------------------------" + echo "Use the following steps to add the headers:" + echo "- Create file_list.txt in your text editor" + echo "- Copy paste the above content in file-list into file_list.txt" + echo "- python3 tests/lint/add_asf_header.py file_list.txt" + exit 1 +fi echo "Check codestyle of c++ code..." make cpplint From d9eff68520983d03ca4d659256d392eebd202bda Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:13:54 +0000 Subject: [PATCH 08/51] Modifying the unit tests. 
--- tests/python/unittest/test_quantized_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 8a039edd12b6..6dc35d801543 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -86,7 +86,7 @@ def downscale_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, @@ -189,7 +189,7 @@ def zero_point_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, @@ -218,7 +218,7 @@ def zero_point_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, From abc7c4e7c9daed8bb2b058b6475f5cfbb5138961 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 06:05:00 +0000 Subject: [PATCH 09/51] Moving C++ files into src/relay/qnn --- .../qnn.h => src/relay/qnn/include/attrs.h | 0 .../relay/qnn/include/util.h | 73 +++++++++---------- src/relay/{op/nn => qnn/op}/requantize.cc | 6 +- src/relay/{ => qnn}/pass/quantize_rewrite.cc | 18 ++--- 4 files changed, 47 insertions(+), 50 deletions(-) rename include/tvm/relay/attrs/qnn.h => src/relay/qnn/include/attrs.h (100%) rename include/tvm/relay/quantize_util.h => src/relay/qnn/include/util.h (62%) rename src/relay/{op/nn => qnn/op}/requantize.cc (95%) rename src/relay/{ => qnn}/pass/quantize_rewrite.cc (95%) diff --git a/include/tvm/relay/attrs/qnn.h b/src/relay/qnn/include/attrs.h similarity index 100% rename from include/tvm/relay/attrs/qnn.h rename to src/relay/qnn/include/attrs.h diff --git a/include/tvm/relay/quantize_util.h b/src/relay/qnn/include/util.h similarity index 62% rename from include/tvm/relay/quantize_util.h rename to src/relay/qnn/include/util.h index 5b5215dc4459..61663b0da85e 100644 --- a/include/tvm/relay/quantize_util.h +++ b/src/relay/qnn/include/util.h @@ -26,46 +26,43 @@ #define TVM_RELAY_QUANTIZE_UTIL_H_ #include -#include -#include "./base.h" +#include +#include namespace tvm { namespace relay { -inline bool is_Int8(const DataType& dtype) { +inline bool IsInt8(const DataType& dtype) { return dtype == Int(8); } -inline bool is_UInt8(const DataType& dtype) { +inline bool IsUint8(const DataType& dtype) { return dtype == UInt(8); } - -inline bool is_Int16(const DataType& dtype) { +inline bool IsInt16(const DataType& dtype) { return dtype == Int(16); } -inline bool is_UInt16(const DataType& dtype) { +inline bool IsUint16(const DataType& dtype) { return dtype == UInt(16); } -inline bool is_Int32(const DataType& dtype) { +inline bool IsInt32(const DataType& dtype) { return dtype == Int(32); } -inline bool is_UInt32(const DataType& dtype) { +inline bool IsUint32(const DataType& dtype) { return dtype == UInt(32); } - - -inline bool is_Float32(const DataType& dtype) { +inline bool IsFloat32(const DataType& dtype) { return dtype == Float(32); } -inline bool is_quantized_type(const DataType& dtype) { - return is_Int8(dtype) || is_UInt8(dtype) - || is_Int16(dtype) || is_UInt16(dtype); +inline bool IsQuantizedType(const DataType& dtype) { + return IsInt8(dtype) || IsUint8(dtype) + || 
IsInt16(dtype) || IsUint16(dtype); } enum class QuantizeOpType : uint8_t { @@ -74,44 +71,44 @@ enum class QuantizeOpType : uint8_t { Requantize }; -inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, - const DataType &in_dtype) { +inline bool IsValidOpInputType(const QuantizeOpType& op_type, + const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return is_Float32(in_dtype) || is_quantized_type(in_dtype); + return IsFloat32(in_dtype) || IsQuantizedType(in_dtype); case QuantizeOpType ::Dequantize: - return is_quantized_type(in_dtype); + return IsQuantizedType(in_dtype); case QuantizeOpType ::Requantize: - return is_Int16(in_dtype) || is_Int32(in_dtype); + return IsInt16(in_dtype) || IsInt32(in_dtype); default: return false; } } -inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, - const DataType &in_dtype) { +inline bool IsValidOpOutputType(const QuantizeOpType& op_type, + const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return is_quantized_type(in_dtype); + return IsQuantizedType(in_dtype); case QuantizeOpType::Dequantize: - return is_Float32(in_dtype); + return IsFloat32(in_dtype); default: return false; } } -inline const int32_t get_qmin(const DataType& dtype) { - if (is_Int8(dtype)) { +inline const int32_t GetQmin(const DataType& dtype) { + if (IsInt8(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt8(dtype)) { + } else if (IsUint8(dtype)) { return std::numeric_limits::min(); - } else if (is_Int16(dtype)) { + } else if (IsInt16(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt16(dtype)) { + } else if (IsUint16(dtype)) { return std::numeric_limits::min(); - } else if (is_Int32(dtype)) { + } else if (IsInt32(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt32(dtype)) { + } else if (IsUint32(dtype)) { return std::numeric_limits::min(); } LOG(FATAL) << "Type not supported\n"; @@ -119,18 +116,18 @@ inline const int32_t get_qmin(const DataType& dtype) { } -inline const int32_t get_qmax(const DataType& dtype) { - if (is_Int8(dtype)) { +inline const int32_t GetQmax(const DataType& dtype) { + if (IsInt8(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt8(dtype)) { + } else if (IsUint8(dtype)) { return std::numeric_limits::max(); - } else if (is_Int16(dtype)) { + } else if (IsInt16(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt16(dtype)) { + } else if (IsUint16(dtype)) { return std::numeric_limits::max(); - } else if (is_Int32(dtype)) { + } else if (IsInt32(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt32(dtype)) { + } else if (IsUint32(dtype)) { return std::numeric_limits::max(); } LOG(FATAL) << "Type not supported\n"; diff --git a/src/relay/op/nn/requantize.cc b/src/relay/qnn/op/requantize.cc similarity index 95% rename from src/relay/op/nn/requantize.cc rename to src/relay/qnn/op/requantize.cc index 285528993f6f..9e4ddc97467f 100644 --- a/src/relay/op/nn/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -25,8 +25,8 @@ #include #include -#include -#include +#include "../include/attrs.h" +#include "../include/util.h" namespace tvm { namespace relay { @@ -41,7 +41,7 @@ bool RequantizeRel(const Array& types, CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); const auto input_dtype = data->dtype; - CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Requantize, input_dtype)) + CHECK(IsValidOpInputType(QuantizeOpType::Requantize, input_dtype)) << "Input type should be a 
quantized type (u)int8 or (u)int16 but was " << input_dtype; const Array oshape = data->shape; diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc similarity index 95% rename from src/relay/pass/quantize_rewrite.cc rename to src/relay/qnn/pass/quantize_rewrite.cc index 92bd51ad7e15..30265ca1dc32 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -26,9 +26,9 @@ #include #include #include -#include -#include -#include "pattern_util.h" +#include "../include/attrs.h" +#include "../include/util.h" +#include "../../pass/pattern_util.h" namespace tvm { namespace relay { @@ -170,8 +170,8 @@ Expr RequantizeInt(const Expr& input_tensor, // clip_min and clip_max are within the dtype range of the input tensor to the // clip operator. For example, if the input to clip operator is int8, but the // out_dtype is uint8, we will get incorrect results, if we set max as 255. - auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); - auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); + auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; @@ -196,15 +196,15 @@ Expr RequantizeFloat(const Expr& input_tensor, auto multiplied_t = Multiply(casted_t, scalar_multiplier); auto shifted_multiplied_t = Add(output_zp, multiplied_t); auto rounded_t = Round(shifted_multiplied_t); - auto q_imin = get_qmin(idtype); - auto q_imax = get_qmax(idtype); + auto q_imin = GetQmin(idtype); + auto q_imax = GetQmax(idtype); auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), idtype); // Clip to the out_dtype min/max. // Clip limits must be smaller than the dtype of the input tensor. - auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); - auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); + auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); auto clipped_t = Clip(scaled_int32_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; From 275ddd0ad424483f2d57d220bbceedfe2fafb295 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 16:15:26 +0000 Subject: [PATCH 10/51] Moving python files to python/tvm/relay/qnn. Some minor fixes. --- python/tvm/relay/__init__.py | 3 + python/tvm/relay/op/__init__.py | 1 - python/tvm/relay/op/qnn/_make.py | 20 ----- python/tvm/relay/op/qnn/qnn.py | 74 ------------------- python/tvm/relay/{op => }/qnn/__init__.py | 3 +- .../{quantize/rewrite.py => qnn/ir_pass.py} | 4 +- python/tvm/relay/quantize/__init__.py | 1 - src/relay/qnn/pass/quantize_rewrite.cc | 18 ++--- tests/python/unittest/test_quantized_ops.py | 4 +- 9 files changed, 18 insertions(+), 110 deletions(-) delete mode 100644 python/tvm/relay/op/qnn/_make.py delete mode 100644 python/tvm/relay/op/qnn/qnn.py rename python/tvm/relay/{op => }/qnn/__init__.py (95%) rename python/tvm/relay/{quantize/rewrite.py => qnn/ir_pass.py} (95%) diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index dfac85bb1ed2..be78d8bdc353 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -49,6 +49,9 @@ from . import backend from . import quantize +# Dialects +from . 
import qnn + from .scope_builder import ScopeBuilder # Span diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 1d634ef18fc0..a27ab1dc50ff 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -26,7 +26,6 @@ from .transform import * from .algorithm import * from . import nn -from . import qnn from . import annotation from . import image from . import vision diff --git a/python/tvm/relay/op/qnn/_make.py b/python/tvm/relay/op/qnn/_make.py deleted file mode 100644 index b1695629b8f9..000000000000 --- a/python/tvm/relay/op/qnn/_make.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Constructor APIs""" -from ...._ffi.function import _init_api - -_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py deleted file mode 100644 index 10477e22ac04..000000000000 --- a/python/tvm/relay/op/qnn/qnn.py +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#pylint: disable=invalid-name, too-many-lines -"""Neural network operations.""" -from __future__ import absolute_import as _abs -from . import _make - -def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False, - rounding_mode="FE_UPWARD"): - r"""Requantized operator. - - The requantize operator converts one quantized tensor to another quantized - tensor. For the output tensor, we are provided with output scale and zero - point. The computation looks like this - - Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - - The above computation can be done in floating point as the scales are in - FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_compute. - - Parameters - ---------- - quantized_data : tvm.relay.Expr - The input quantized_data to the operator. 
- - input_scale: float - The float scalar to scale the quantized_data int8 values back to FP32. - - output_scale: float - The float scalar to scale the quantized_output int8 values back to FP32. - - input_zero_point: int - The zero point of the quantized_data distribution. - - output_zero_point: int - The zero point of the quantized_output distribution. - - out_dtype : str, optional - Specifies the output quantized_data type for mixed precision conv2d. - - use_int_compute : bool, optional - Use fully integer computation for requantizing. - - rounding_mode : string, optional - Defines the rounding direction when the value is midway between two - representable values. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ - "Unsupported rounding mode" - - return _make.requantize(input_data, input_zero_point, input_scale, - output_zero_point, output_scale, out_dtype, - use_int_compute, rounding_mode) diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py similarity index 95% rename from python/tvm/relay/op/qnn/__init__.py rename to python/tvm/relay/qnn/__init__.py index e9adfa783f93..0836c5770ce4 100644 --- a/python/tvm/relay/op/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -17,4 +17,5 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .qnn import * +from . import op +from . import ir_pass diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/qnn/ir_pass.py similarity index 95% rename from python/tvm/relay/quantize/rewrite.py rename to python/tvm/relay/qnn/ir_pass.py index c8860775b77f..24e3329e961c 100644 --- a/python/tvm/relay/quantize/rewrite.py +++ b/python/tvm/relay/qnn/ir_pass.py @@ -18,7 +18,7 @@ """Automatic quantization toolkit.""" from __future__ import absolute_import -from . import _quantize +from . import _qnn def rewrite(expr): """ @@ -34,4 +34,4 @@ def rewrite(expr): expr : tvm.relay.Expr The output expression. """ - return _quantize.rewrite(expr) + return _qnn.rewrite(expr) diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py index 8da4e7953566..45bb62e66853 100644 --- a/python/tvm/relay/quantize/__init__.py +++ b/python/tvm/relay/quantize/__init__.py @@ -19,5 +19,4 @@ from __future__ import absolute_import as _abs from .quantize import * -from .rewrite import * from ._annotate import register_annotate_function diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 30265ca1dc32..9d10b5a47ba9 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -85,11 +85,11 @@ void GetFixedPointMultiplierShift(double double_multiplier, * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. * 2) Subtract the input integer point. - * 2) Multiply the integer fixed point multiplier with quantized tensor. - * 3) Round the result. - * 4) Right shift the result. - * 5) Add the output_zero_point. - * 6) Cast to the out_dtype. + * 3) Multiply the integer fixed point multiplier with quantized tensor. + * 4) Round the result. + * 5) Right shift the result. + * 6) Add the output_zero_point. + * 7) Cast to the out_dtype. 
* */ Expr RequantizeInt(const Expr& input_tensor, @@ -252,11 +252,11 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._quantize.rewrite") +TVM_REGISTER_API("relay._qnn.rewrite") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); - return ret; - }); + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; +}); } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 6dc35d801543..092e695cf533 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -47,7 +47,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) - func = relay.op.qnn.requantize( + func = relay.qnn.op.requantize( quantized_data, input_zero_point=input_zero_point, output_zero_point=output_zero_point, @@ -60,7 +60,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.quantize.rewrite(func) + func = relay.qnn.ir_pass.rewrite(func) return func From a0ad8caf71c8114601160f356f098c9c0afad1d9 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 16:58:44 +0000 Subject: [PATCH 11/51] Moving the attrs.h inside the include directory. --- {src/relay/qnn/include => include/tvm/relay/qnn}/attrs.h | 0 src/relay/qnn/op/requantize.cc | 2 +- src/relay/qnn/pass/quantize_rewrite.cc | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename {src/relay/qnn/include => include/tvm/relay/qnn}/attrs.h (100%) diff --git a/src/relay/qnn/include/attrs.h b/include/tvm/relay/qnn/attrs.h similarity index 100% rename from src/relay/qnn/include/attrs.h rename to include/tvm/relay/qnn/attrs.h diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 9e4ddc97467f..c389e82fba80 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -25,7 +25,7 @@ #include #include -#include "../include/attrs.h" +#include #include "../include/util.h" namespace tvm { diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 9d10b5a47ba9..5d4942c80a7c 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -26,7 +26,7 @@ #include #include #include -#include "../include/attrs.h" +#include #include "../include/util.h" #include "../../pass/pattern_util.h" From ff8936c1a412770c21566e1ba6a8a247b3f3601c Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 19:05:03 +0000 Subject: [PATCH 12/51] Pushing files that I forgot earlier. Changing util location. 
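
This also restores the Python entry points under python/tvm/relay/qnn. A
minimal usage sketch of the API (illustrative only; it assumes this series is
built into TVM, and run_infer_type is the type inference helper defined in
tests/python/unittest/test_quantized_ops.py, which must run before the
rewrite):

    from tvm import relay

    data = relay.var("data", shape=(8,), dtype="int32")
    out = relay.qnn.op.requantize(data,
                                  input_zero_point=0,
                                  input_scale=0.5,
                                  output_zero_point=0,
                                  output_scale=0.25,
                                  out_dtype="int8")
    func = relay.Function(relay.analysis.free_vars(out), out)
    func = run_infer_type(func)             # helper from the unit tests
    func = relay.qnn.ir_pass.rewrite(func)  # lower qnn.requantize to Relay ops
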
--- python/tvm/relay/qnn/_qnn.py | 22 ++++++++ python/tvm/relay/qnn/op/__init__.py | 20 +++++++ python/tvm/relay/qnn/op/_make.py | 20 +++++++ python/tvm/relay/qnn/op/qnn.py | 74 ++++++++++++++++++++++++++ src/relay/qnn/op/requantize.cc | 2 +- src/relay/qnn/pass/quantize_rewrite.cc | 2 +- src/relay/qnn/{include => }/util.h | 0 7 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 python/tvm/relay/qnn/_qnn.py create mode 100644 python/tvm/relay/qnn/op/__init__.py create mode 100644 python/tvm/relay/qnn/op/_make.py create mode 100644 python/tvm/relay/qnn/op/qnn.py rename src/relay/qnn/{include => }/util.h (100%) diff --git a/python/tvm/relay/qnn/_qnn.py b/python/tvm/relay/qnn/_qnn.py new file mode 100644 index 000000000000..bd3cdbb976d6 --- /dev/null +++ b/python/tvm/relay/qnn/_qnn.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=unused-argument +"""Internal module for quantization.""" +from __future__ import absolute_import +from tvm._ffi.function import _init_api + +_init_api("relay._qnn", __name__) diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py new file mode 100644 index 000000000000..e9adfa783f93 --- /dev/null +++ b/python/tvm/relay/qnn/op/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .qnn import * diff --git a/python/tvm/relay/qnn/op/_make.py b/python/tvm/relay/qnn/op/_make.py new file mode 100644 index 000000000000..b1695629b8f9 --- /dev/null +++ b/python/tvm/relay/qnn/op/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Constructor APIs""" +from ...._ffi.function import _init_api + +_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py new file mode 100644 index 000000000000..8db431eebe23 --- /dev/null +++ b/python/tvm/relay/qnn/op/qnn.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=invalid-name, too-many-lines +"""Neural network operations.""" +from __future__ import absolute_import as _abs +from . import _make + +def requantize(input_data, input_zero_point, input_scale, output_zero_point, + output_scale, out_dtype="int32", use_int_compute=True, + rounding_mode="FE_AWAY_FROM_ZERO"): + r"""Requantized operator. + + The requantize operator converts one quantized tensor to another quantized + tensor. For the output tensor, we are provided with output scale and zero + point. The computation looks like this + + Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) + + The above computation can be done in floating point as the scales are in + FP32. Alternatively, we can approximate floating point with fixed point + computation. This is controlled by use_int_compute. + + Parameters + ---------- + quantized_data : tvm.relay.Expr + The input quantized_data to the operator. + + input_scale: float + The float scalar to scale the quantized_data int8 values back to FP32. + + output_scale: float + The float scalar to scale the quantized_output int8 values back to FP32. + + input_zero_point: int + The zero point of the quantized_data distribution. + + output_zero_point: int + The zero point of the quantized_output distribution. + + out_dtype : str, optional + Specifies the output quantized_data type for mixed precision conv2d. + + use_int_compute : bool, optional + Use fully integer computation for requantizing. + + rounding_mode : string, optional + Defines the rounding direction when the value is midway between two + representable values. + + Returns + ------- + result : tvm.relay.Expr + The computed result. 
+ """ + assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + "Unsupported rounding mode" + + return _make.requantize(input_data, input_zero_point, input_scale, + output_zero_point, output_scale, out_dtype, + use_int_compute, rounding_mode) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index c389e82fba80..df4a224fc2ba 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -26,7 +26,7 @@ #include #include #include -#include "../include/util.h" +#include "../util.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 5d4942c80a7c..7d4e0f017050 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -27,7 +27,7 @@ #include #include #include -#include "../include/util.h" +#include "../util.h" #include "../../pass/pattern_util.h" namespace tvm { diff --git a/src/relay/qnn/include/util.h b/src/relay/qnn/util.h similarity index 100% rename from src/relay/qnn/include/util.h rename to src/relay/qnn/util.h From bdca4c68e8332e44ad163cd4d28a3eed687d54ce Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Mon, 8 Jul 2019 12:12:40 -0700 Subject: [PATCH 13/51] [Relay] [Quantization] WIP - Common files for the qauntization work. --- include/tvm/relay/attrs/qnn.h | 37 +++++++ include/tvm/relay/quantize_util.h | 139 ++++++++++++++++++++++++++ python/tvm/relay/op/__init__.py | 1 + python/tvm/relay/op/qnn/__init__.py | 20 ++++ python/tvm/relay/op/qnn/_make.py | 20 ++++ python/tvm/relay/op/qnn/qnn.py | 21 ++++ python/tvm/relay/quantize/__init__.py | 1 + python/tvm/relay/quantize/rewrite.py | 38 +++++++ src/relay/pass/pattern_util.h | 20 ++++ src/relay/pass/quantize_rewrite.cc | 38 +++++++ 10 files changed, 335 insertions(+) create mode 100644 include/tvm/relay/attrs/qnn.h create mode 100644 include/tvm/relay/quantize_util.h create mode 100644 python/tvm/relay/op/qnn/__init__.py create mode 100644 python/tvm/relay/op/qnn/_make.py create mode 100644 python/tvm/relay/op/qnn/qnn.py create mode 100644 python/tvm/relay/quantize/rewrite.py create mode 100644 src/relay/pass/quantize_rewrite.cc diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h new file mode 100644 index 000000000000..c45a33c786f7 --- /dev/null +++ b/include/tvm/relay/attrs/qnn.h @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/attrs/nn.h + * \brief Auxiliary attributes for nn operators. 
+ */ +#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_ + +#include +#include + +namespace tvm { +namespace relay { + + + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h new file mode 100644 index 000000000000..bb054fb8fb65 --- /dev/null +++ b/include/tvm/relay/quantize_util.h @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file nnvm/compiler/quantize_util.h + * \brief Utility methods needs for quantized ops that can be shared + */ + +#ifndef TVM_QUANTIZE_UTIL_H +#define TVM_QUANTIZE_UTIL_H + +#include +#include "./base.h" + +namespace tvm { +namespace relay { + +inline bool is_Int8(const DataType& dtype) { + return dtype == Int(8); +} + +inline bool is_UInt8(const DataType& dtype) { + return dtype == UInt(8); +} + + +inline bool is_Int16(const DataType& dtype) { + return dtype == Int(16); +} + +inline bool is_UInt16(const DataType& dtype) { + return dtype == UInt(16); +} + +inline bool is_Int32(const DataType& dtype) { + return dtype == Int(32); +} + +inline bool is_UInt32(const DataType& dtype) { + return dtype == UInt(32); +} + + + +inline bool is_Float32(const DataType& dtype) { + return dtype == Float(32); +} + +inline bool is_quantized_type(const DataType& dtype) { + return is_Int8(dtype) || is_UInt8(dtype) + || is_Int16(dtype) || is_UInt16(dtype); +} + +enum class QuantizeOpType : uint8_t { + Quantize_Requantize, + Dequantize, + Requantize +}; + +inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) { + switch(op_type) { + case QuantizeOpType::Quantize_Requantize: + return is_Float32(in_dtype) || is_quantized_type(in_dtype); + case QuantizeOpType ::Dequantize: + return is_quantized_type(in_dtype); + case QuantizeOpType ::Requantize: + return is_Int16(in_dtype) || is_Int32(in_dtype); + default: + return false; + } +} + +inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) { + switch(op_type) { + case QuantizeOpType::Quantize_Requantize: + return is_quantized_type(in_dtype); + case QuantizeOpType::Dequantize: + return is_Float32(in_dtype); + default: + return false; + } +} + +inline const int32_t get_qmin(const DataType& dtype) { + if (is_Int8(dtype)) { + return std::numeric_limits::min(); + } else if (is_UInt8(dtype)) { + return std::numeric_limits::min(); + } else if (is_Int16(dtype)) { + return std::numeric_limits::min(); + } else if (is_UInt16(dtype)) { + return std::numeric_limits::min(); + } else if (is_Int32(dtype)) { + return std::numeric_limits::min(); + } else if (is_UInt32(dtype)) { + return std::numeric_limits::min(); + } + LOG(FATAL) << "Type 
not supported\n"; + return -1; +} + + +inline const int32_t get_qmax(const DataType& dtype) { + if (is_Int8(dtype)) { + return std::numeric_limits::max(); + } else if (is_UInt8(dtype)) { + return std::numeric_limits::max(); + } else if (is_Int16(dtype)) { + return std::numeric_limits::max(); + } else if (is_UInt16(dtype)) { + return std::numeric_limits::max(); + } else if (is_Int32(dtype)) { + return std::numeric_limits::max(); + } else if (is_UInt32(dtype)) { + return std::numeric_limits::max(); + } + LOG(FATAL) << "Type not supported\n"; + return -1; +} + +} // namespace relay +} // namespace tvm +#endif //TVM_QUANTIZE_UTIL_H diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index a27ab1dc50ff..1d634ef18fc0 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -26,6 +26,7 @@ from .transform import * from .algorithm import * from . import nn +from . import qnn from . import annotation from . import image from . import vision diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/op/qnn/__init__.py new file mode 100644 index 000000000000..aef02300ab63 --- /dev/null +++ b/python/tvm/relay/op/qnn/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .qnn import * \ No newline at end of file diff --git a/python/tvm/relay/op/qnn/_make.py b/python/tvm/relay/op/qnn/_make.py new file mode 100644 index 000000000000..b1695629b8f9 --- /dev/null +++ b/python/tvm/relay/op/qnn/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Constructor APIs""" +from ...._ffi.function import _init_api + +_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py new file mode 100644 index 000000000000..008e6cbb7f80 --- /dev/null +++ b/python/tvm/relay/op/qnn/qnn.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=invalid-name, too-many-lines +"""Neural network operations.""" +from __future__ import absolute_import as _abs +from . import _make + diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py index 45bb62e66853..8da4e7953566 100644 --- a/python/tvm/relay/quantize/__init__.py +++ b/python/tvm/relay/quantize/__init__.py @@ -19,4 +19,5 @@ from __future__ import absolute_import as _abs from .quantize import * +from .rewrite import * from ._annotate import register_annotate_function diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/quantize/rewrite.py new file mode 100644 index 000000000000..89429e522115 --- /dev/null +++ b/python/tvm/relay/quantize/rewrite.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=unused-argument +"""Automatic quantization toolkit.""" +from __future__ import absolute_import + +from . import _quantize +from .. import expr as _expr + +def rewrite(expr): + """ + Rewrites the high-level quantized ops into low-level exisiting Relay ops. + + Parameters + ---------- + expr : tvm.relay.Expr + The input expression. + + Returns + ------- + expr : tvm.relay.Expr + The output expression. 
+ """ + return _quantize.rewrite(expr) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 5c303905968e..7249d1d4c086 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -373,6 +373,26 @@ inline Expr Copy(Expr data) { } +inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) { + static const Op& op = Op::Get("where"); + return CallNode::make(op, {condition, x, y}); +} + +inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { + static const Op& op = Op::Get("greater_equal"); + return CallNode::make(op, {lhs, rhs}, Attrs(), {}); +} + +inline Expr Full(Expr fill_value, + Array shape, + DataType dtype) { + auto attrs = make_node(); + attrs->shape = std::move(shape); + attrs->dtype = std::move(dtype); + static const Op& op = Op::Get("full"); + return CallNode::make(op, {fill_value}, Attrs(attrs), {}); +} + Expr MakeConcatenate(Expr data, int axis); Expr MakeStridedSlice(Expr data, Array begin, Array end, Array strides); diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc new file mode 100644 index 000000000000..925c516b41ed --- /dev/null +++ b/src/relay/pass/quantize_rewrite.cc @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file quantize_rewrite.cc + * \brief Lower quantized ops to exisiting Relay ops. + */ + +#include +#include +#include +#include +#include +#include "pattern_util.h" + +namespace tvm { +namespace relay { + + +} // namespace relay +} // namespace tvm From 755f9340a3e3ce0fa01aa7323058aa12a879a25a Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Mon, 8 Jul 2019 12:20:54 -0700 Subject: [PATCH 14/51] [Relay] [Quantization] WIP - Prototyping requantize op. --- include/tvm/relay/attrs/qnn.h | 24 +++ python/tvm/relay/op/qnn/qnn.py | 46 ++++++ src/relay/op/nn/requantize.cc | 89 +++++++++++ src/relay/pass/quantize_rewrite.cc | 237 +++++++++++++++++++++++++++++ 4 files changed, 396 insertions(+) create mode 100644 src/relay/op/nn/requantize.cc diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h index c45a33c786f7..12afe19d26b3 100644 --- a/include/tvm/relay/attrs/qnn.h +++ b/include/tvm/relay/attrs/qnn.h @@ -30,7 +30,31 @@ namespace tvm { namespace relay { +/*! 
\brief Attribute for requantize operator */
+struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> {
+  double input_scale;
+  int32_t input_zero_point;
+  double output_scale;
+  int32_t output_zero_point;
+  bool use_int_compute;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") {
+    TVM_ATTR_FIELD(input_zero_point)
+      .describe("The zero point of the input tensor.");
+    TVM_ATTR_FIELD(output_zero_point)
+      .describe("The zero point of the output tensor.");
+    TVM_ATTR_FIELD(input_scale)
+      .describe("The scale of the input tensor.");
+    TVM_ATTR_FIELD(output_scale)
+      .describe("The scale of the output tensor.");
+    TVM_ATTR_FIELD(use_int_compute).set_default(false)
+      .describe("When true, the integer computation is used to handle output scale");
+    TVM_ATTR_FIELD(out_dtype)
+      .set_default(NullValue<DataType>())
+      .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};

 }  // namespace relay
 }  // namespace tvm
diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py
index 008e6cbb7f80..18be68cd9cfc 100644
--- a/python/tvm/relay/op/qnn/qnn.py
+++ b/python/tvm/relay/op/qnn/qnn.py
@@ -19,3 +19,49 @@
 from __future__ import absolute_import as _abs
 from . import _make
+
+def requantize(input_data, input_zero_point, input_scale, output_zero_point,
+               output_scale, out_dtype="int32", use_int_compute=False):
+    r"""Requantized operator.
+
+    The requantize operator converts one quantized tensor to another quantized
+    tensor. For the output tensor, we are provided with output scale and zero
+    point. The computation looks like this
+
+    Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+
+    The above computation can be done in floating point as the scales are in
+    FP32. Alternatively, we can approximate floating point with fixed point
+    computation. This is controlled by use_int_compute.
+
+    Parameters
+    ----------
+    input_data : tvm.relay.Expr
+        The input data to the operator.
+
+    input_scale: float
+        The float scalar to scale the quantized_data int8 values back to FP32.
+
+    output_scale: float
+        The float scalar to scale the quantized_output int8 values back to FP32.
+
+    input_zero_point: int
+        The zero point of the quantized_data distribution.
+
+    output_zero_point: int
+        The zero point of the quantized_output distribution.
+
+    out_dtype : str, optional
+        Specifies the output quantized_data type for mixed precision conv2d.
+
+    use_int_compute : bool, optional
+        Use fully integer computation for requantizing.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.requantize(input_data, input_zero_point, input_scale,
+                            output_zero_point, output_scale, out_dtype,
+                            use_int_compute)
\ No newline at end of file
diff --git a/src/relay/op/nn/requantize.cc b/src/relay/op/nn/requantize.cc
new file mode 100644
index 000000000000..80f2bde4ad47
--- /dev/null
+++ b/src/relay/op/nn/requantize.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file requantize.cc
+ * \brief The requantize operator.
+ */
+
+#include
+#include
+#include
+#include
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(RequantizeAttrs);
+
+
+bool RequantizeRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto input_dtype = data->dtype;
+  CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Requantize, input_dtype))
+    << "Input type should be a quantized type (u)int8 or (u)int16 but was " << input_dtype;
+
+  const Array<IndexExpr> oshape = data->shape;
+  // assign output type
+  const RequantizeAttrs* param = attrs.as<RequantizeAttrs>();
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, param->out_dtype));
+  return true;
+}
+
+// Positional relay function to create the requantize operator
+// used by frontend FFI.
+Expr MakeRequantize(Expr data,
+                    int32_t input_zero_point,
+                    double input_scale,
+                    int32_t output_zero_point,
+                    double output_scale,
+                    DataType out_dtype,
+                    bool use_int_compute) {
+  auto attrs = make_node<RequantizeAttrs>();
+  attrs->out_dtype = std::move(out_dtype);
+  attrs->input_zero_point = std::move(input_zero_point);
+  attrs->output_zero_point = std::move(output_zero_point);
+  attrs->input_scale = std::move(input_scale);
+  attrs->output_scale = std::move(output_scale);
+  attrs->use_int_compute = std::move(use_int_compute);
+  static const Op& op = Op::Get("qnn.requantize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.requantize")
+.describe(R"code(Requantize operator.
+
+FIXME
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.RequantizeAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The quantized input tensor.")
+.set_support_level(10)
+.add_type_rel("Requantize", RequantizeRel);
+
+TVM_REGISTER_API("relay.op.qnn._make.requantize")
+.set_body_typed(MakeRequantize);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
index 925c516b41ed..55f8c43fd49f 100644
--- a/src/relay/pass/quantize_rewrite.cc
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -34,5 +34,242 @@
 namespace tvm {
 namespace relay {
 
+// Lowering of qnn.requantize op
+void GetFixedPointMultiplierShift(double double_multiplier,
+    int32_t* fixed_point_multiplier, int* shift,
+    const DataType& idtype) {
+
+  int acc_dtype_bits = idtype.bits();
+
+  if (double_multiplier == 0.) {
+    *fixed_point_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+  const double q = std::frexp(double_multiplier, shift);
+  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << (acc_dtype_bits - 1))));
+  CHECK_LE(q_fixed, (1ll << (acc_dtype_bits - 1)));
+  if (q_fixed == (1ll << (acc_dtype_bits - 1))) {
+    q_fixed /= 2;
+    ++*shift;
+  }
+  CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+  *fixed_point_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+Expr MultiplyByIntegerMuliplier(const Expr& convolved_tensor,
+    const int32_t fixed_point_multiplier, const int left_shift,
+    const RequantizeAttrs*& param, const DataType& idtype,
+    const Array& out_shape) {
+  // TODO (janimesh) - How to add the overflow checks here. TFLite code snippet is
+  // bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
+  // return overflow ? std::numeric_limits<std::int32_t>::max() : .....;
+
+  // The calculations are done in upcast of idtype to retain precision.
+  int acc_dtype_bits = idtype.bits();
+  DataType up_idtype = Int(2 * acc_dtype_bits);
+
+  auto tensor = convolved_tensor;
+  // Typically the left_shift will be 0 if the original scale is > 0.5.
+  if (left_shift != 0) {
+    tensor = Multiply(tensor, MakeConstantScalar(idtype, 1 << left_shift));
+  }
+
+  // Upcast the computation to Int64 and multiply the multiplier.
+  Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier);
+  auto multiplied_t = Multiply(Cast(tensor, up_idtype), scalar);
+
+  // Since, we are performing fixed point computation. We are only interested in
+  // higher 16/32 bits. But before that, we also need to perform rounding.
+  // This is fixed point rounding. So, the rounder add scalar depends if the
+  // input is positive.
+  auto zero = MakeConstantScalar(up_idtype, 0);
+  auto pos_threshold = MakeConstantScalar(up_idtype,
+      1ll << (acc_dtype_bits - 2));
+  auto neg_threshold = MakeConstantScalar(up_idtype,
+      (1 - (1ll << (acc_dtype_bits - 2))));
+  auto pos_rounder = Full(pos_threshold, out_shape, up_idtype);
+  auto neg_rounder = Full(neg_threshold, out_shape, up_idtype);
+  auto rounding_scalar = Where(GreaterEqual(multiplied_t, zero), pos_rounder, neg_rounder);
+  auto rounded_tensor = Add(multiplied_t, rounding_scalar);
+
+  // Perform right shift to get the first 16/32 bits.
+  // The result is first doubled and the first 15/31 bits are obtained. This is
+  // done by just right shifting the result by 15/31 bits.
+  auto right_shift_scalar = MakeConstantScalar(up_idtype, (acc_dtype_bits - 1));
+  auto scaled_t = RightShift(rounded_tensor, right_shift_scalar);
+  auto q_imin = get_qmin(idtype);
+  auto q_imax = get_qmax(idtype);
+  auto integer_multiplied_t = Cast(Clip(scaled_t, q_imin, q_imax),
+      idtype);
+  return integer_multiplied_t;
+}
+
+Expr ShiftByIntegerShift(const Expr& multiplied_t,
+    const int& exponent, const RequantizeAttrs*& param,
+    const DataType& idtype, const Array& out_shape) {
+  CHECK_GE(exponent, 0);
+  int acc_dtype_bits = idtype.bits();
+  CHECK_LE(exponent, (acc_dtype_bits - 1));
+
+  // We need to perform rounding. The rounding here is closest to the power
+  // of 2. The exponent basically represents the decimal point. We need to round
+  // at the decimal point.
+ auto tensor = multiplied_t; + if (exponent != 0) { + auto pos_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1))); + auto neg_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)) - 1); + auto pos_rounder_t = Full(pos_rounder, out_shape, idtype); + auto neg_rounder_t = Full(neg_rounder, out_shape, idtype); + + auto zero = MakeConstantScalar(idtype, 0); + auto zero_t = Full(zero, out_shape, idtype); + auto round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, + neg_rounder_t); + tensor = Add(tensor, round_scalar); + } + + // Right shift by exponent to approximate the division. + auto scaled_t = RightShift(tensor, + MakeConstantScalar(idtype, exponent)); + return scaled_t; +} + + +/* + * Requantization using only integer computation. Here, the computation is + * converted to a fixed point computation by computing output multiplier and + * shift. This is useful, if the target device does not support/have very + * expensive floating point computations. + * + * Original compuation is scale_fp32 * quantized_tensor. To convert into + * integer computation, the multiplication with fp32 scalar can be replaced by + * multiplication with an int value and then right shifting the result. This + * approximates the floating point computation with a fixed point computation. + * + * The whole computaition this can be broken down into following steps + * 1) Calculate the integer multiplier and integer shift. + * 2) Multiply the integer multiplier with quantized tensor. + * 3) Right shift the result. + * + * The only thing complicating the above computations is the tedious approach of + * handling rounding. + */ +Expr RequantizeInt(const Expr& convolved_tensor, + const RequantizeAttrs*& param, const DataType& idtype, + const Array& out_shape) { + + double double_multiplier = param->input_scale/param->output_scale; + // 1) Calculating the integer multiplier and integer shift + int32_t fixed_point_multiplier; + int shift; + GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier, + &shift, idtype); + + // 2) Multiply the integer multiplier + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 0 : -shift; + auto multiplied_t = MultiplyByIntegerMuliplier(convolved_tensor, + fixed_point_multiplier, left_shift, param, idtype, out_shape); + + // 3) Divide by the denominator or right shift the result. + auto scaled_int32_t = ShiftByIntegerShift(multiplied_t, + right_shift, param, idtype, out_shape); + + // 4) Clip to the out_dtype min/max. + auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); + auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto clipped_t = Clip(scaled_int32_t, q_min, q_max); + auto requantized_output = Cast(clipped_t, param->out_dtype); + return requantized_output; +} + +/* + * Requantization using floating computation. Here we can multiply the scale to + * the convolved_tensor, round to nearest integer and then cast back to int32. + */ +Expr RequantizeFloat(const Expr& convolved_tensor, + const RequantizeAttrs*& param, const DataType& idtype, + const Array& out_shape) { + double double_multiplier = param->input_scale/param->output_scale; + auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); + + // Multiply the convolved tensor with the new scale. 
+  auto casted_t = Cast(convolved_tensor, Float(32));
+  auto multiplied_t = Round(Multiply(casted_t, scalar_multiplier));
+  auto q_imin = get_qmin(idtype);
+  auto q_imax = get_qmax(idtype);
+  auto scaled_int32_t = Cast(Clip(multiplied_t, q_imin, q_imax),
+      idtype);
+
+  // Clip to the out_dtype min/max.
+  // Clip limits must be smaller than the dtype of the input tensor.
+  auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype));
+  auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype));
+  auto clipped_t = Clip(scaled_int32_t, q_min, q_max);
+  auto requantized_output = Cast(clipped_t, param->out_dtype);
+  return requantized_output;
+}
+
+/*
+ * Lowering of the requantize operation. The requantize operator converts one
+ * quantized tensor to another quantized tensor. For the output tensor, we are
+ * provided with output scale and zero point. The computation looks like this
+ *
+ * Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+ *
+ * The above computation can be done in floating point as the scales are in
+ * FP32. Alternatively, we can approximate floating point with fixed point
+ * computation. This is controlled by use_int_compute.
+ */
+Expr RequantizeForwardRewrite(const Call& ref_call,
+    const Array<Expr>& new_args, const NodeRef& ctx) {
+  CHECK_EQ(new_args.size(), 1);
+  Expr quantized_data = new_args[0];
+  const auto* param = ref_call->attrs.as<RequantizeAttrs>();
+
+  // Find output shape.
+  Array<IndexExpr> out_shape;
+  auto ref_call_t = ref_call->checked_type();
+  auto output_tt = ref_call_t.as<TensorTypeNode>();
+  CHECK(output_tt != nullptr) << "Type information missing."
+      << " Please run infer_type pass.";
+  out_shape = output_tt->shape;
+
+  // Find input dtype.
+  auto ref_input_t = ref_call->args[0]->checked_type();
+  auto input_tt = ref_input_t.as<TensorTypeNode>();
+  CHECK(input_tt != nullptr) << "Type information missing."
+      << " Please run infer_type pass.";
+  const auto input_dtype = input_tt->dtype;
+
+  // Check for current quantization support.
+  CHECK_EQ(param->input_zero_point, 0)
+      << "Encountered non-zero zero point."
+      << " Only symmetric quantization supported for now.";
+  CHECK_EQ(param->output_zero_point, 0)
+      << "Encountered non-zero zero point."
+      << " Only symmetric quantization supported for now.";
+
+  if (param->use_int_compute) {
+    return RequantizeInt(quantized_data, param, input_dtype, out_shape);
+  } else {
+    return RequantizeFloat(quantized_data, param, input_dtype, out_shape);
+  }
+}
+
+
+RELAY_REGISTER_OP("qnn.requantize")
+.set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite);
+
+
+
+TVM_REGISTER_API("relay._quantize.rewrite")
+.set_body_typed([](const Expr& e) {
+  Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr);
+  return ret;
+});
+
+
 }  // namespace relay
 }  // namespace tvm

From dba71f06114a7697ad49c4e05703b07ff1b741a8 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Wed, 10 Jul 2019 21:51:02 +0000
Subject: [PATCH 15/51] Requantize operator implementation.

Requantize converts one quantized tensor representation to another quantized
representation. The PR has the following implementation features

- Requantize operator defined in qnn namespace - relay.qnn.requantize
- Lowering of the requantize to existing Relay operators
- Integer fixed point implementation of requantize
- Two rounding modes - FE_UPWARD (round towards positive infinity) and
  FE_AWAY_FROM_ZERO (std::round behavior)
- A floating point implementation as well, which can act as a reference or
  can be used for devices where FP32 computation is not expensive
- Unit test cases

Relevant Issue - https://github.com/dmlc/tvm/issues/2351

Credit to TFLite and GemmLowp for providing reference implementations.
---
 include/tvm/relay/attrs/qnn.h               |  13 +-
 python/tvm/relay/op/qnn/qnn.py              |  13 +-
 src/relay/op/nn/requantize.cc               |   4 +-
 src/relay/pass/quantize_rewrite.cc          | 231 +++++++++---------
 tests/python/unittest/test_quantized_ops.py | 257 ++++++++++++++++++++
 5 files changed, 390 insertions(+), 128 deletions(-)
 create mode 100644 tests/python/unittest/test_quantized_ops.py

diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h
index 12afe19d26b3..cf69fa759c1c 100644
--- a/include/tvm/relay/attrs/qnn.h
+++ b/include/tvm/relay/attrs/qnn.h
@@ -37,6 +37,7 @@ struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> {
   double output_scale;
   int32_t output_zero_point;
   bool use_int_compute;
+  std::string rounding_mode;
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") {
@@ -48,14 +49,22 @@ struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> {
       .describe("The scale of the input tensor.");
     TVM_ATTR_FIELD(output_scale)
       .describe("The scale of the output tensor.");
-    TVM_ATTR_FIELD(use_int_compute).set_default(false)
-      .describe("When true, the integer computation is used to handle output scale");
+    TVM_ATTR_FIELD(use_int_compute).set_default(true)
+      .describe("When true, the integer computation is used to handle output scale. "
+                "The float computation can be used as reference implementation or in "
+                "cases where FP32 computation for requantize is not expensive");
     TVM_ATTR_FIELD(out_dtype)
       .set_default(NullValue<DataType>())
       .describe("Output data type, set to explicit type under mixed precision setting");
+    TVM_ATTR_FIELD(rounding_mode).set_default("FE_UPWARD")
+      .describe("Defines the rounding direction when the value is midway between "
+                "two representable values. There are two supported modes - FE_UPWARD "
+                "or FE_AWAY_FROM_ZERO. More context can be found at "
+                "https://www.gnu.org/software/libc/manual/html_node/Rounding.html");
   }
 };
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_NN_QUANTIZE_H_
diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py
index 18be68cd9cfc..484b3864f22f 100644
--- a/python/tvm/relay/op/qnn/qnn.py
+++ b/python/tvm/relay/op/qnn/qnn.py
@@ -19,9 +19,9 @@
 from __future__ import absolute_import as _abs
 from . import _make
-
 def requantize(input_data, input_zero_point, input_scale, output_zero_point,
-               output_scale, out_dtype="int32", use_int_compute=False):
+               output_scale, out_dtype="int32", use_int_compute=False,
+               rounding_mode="FE_UPWARD"):
     r"""Requantized operator.
 
     The requantize operator converts one quantized tensor to another quantized
@@ -57,11 +57,18 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point,
     use_int_compute : bool, optional
         Use fully integer computation for requantizing.
 
+    rounding_mode : string, optional
+        Defines the rounding direction when the value is midway between two
+        representable values.
+
     Returns
     -------
     result : tvm.relay.Expr
         The computed result.
     """
+    assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\
+            "Unsupported rounding mode"
+
     return _make.requantize(input_data, input_zero_point, input_scale,
                             output_zero_point, output_scale, out_dtype,
-                            use_int_compute)
\ No newline at end of file
+                            use_int_compute, rounding_mode)
diff --git a/src/relay/op/nn/requantize.cc b/src/relay/op/nn/requantize.cc
index 80f2bde4ad47..285528993f6f 100644
--- a/src/relay/op/nn/requantize.cc
+++ b/src/relay/op/nn/requantize.cc
@@ -59,7 +59,8 @@ Expr MakeRequantize(Expr data,
                     int32_t output_zero_point,
                     double output_scale,
                     DataType out_dtype,
-                    bool use_int_compute) {
+                    bool use_int_compute,
+                    std::string rounding_mode) {
   auto attrs = make_node<RequantizeAttrs>();
   attrs->out_dtype = std::move(out_dtype);
   attrs->input_zero_point = std::move(input_zero_point);
@@ -67,6 +68,7 @@ Expr MakeRequantize(Expr data,
   attrs->input_scale = std::move(input_scale);
   attrs->output_scale = std::move(output_scale);
   attrs->use_int_compute = std::move(use_int_compute);
+  attrs->rounding_mode = std::move(rounding_mode);
   static const Op& op = Op::Get("qnn.requantize");
   return CallNode::make(op, {data}, Attrs(attrs), {});
 }
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
index 55f8c43fd49f..645b20c0730e 100644
--- a/src/relay/pass/quantize_rewrite.cc
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -33,13 +33,27 @@
 namespace tvm {
 namespace relay {
 
-// Lowering of qnn.requantize op
+
+/*
+ * Converts a floating point number so that it can be represented by integers.
+ * The representation is
+ * float_number = (fixed_point_multiplier) * 2^(shift)
+ *
+ * The fixed_point_multiplier is a number between 0.5 and 1. This is represented
+ * by an integer number. For example, if it is int32, then the decimal point
+ * exists between bits 31 and 30 from the LSB (or between the first and second
+ * bit from the left).
+ *
+ * Some examples are
+ * 0.25 = (0.5) * 2^(-1)
+ * 0.125 = (0.5) * 2^(-2)
+ */
 void GetFixedPointMultiplierShift(double double_multiplier,
     int32_t* fixed_point_multiplier, int* shift,
     const DataType& idtype) {
-
-  int acc_dtype_bits = idtype.bits();
+  int idtype_bits = idtype.bits();
 
   if (double_multiplier == 0.) {
     *fixed_point_multiplier = 0;
@@ -47,9 +61,9 @@ void GetFixedPointMultiplierShift(double double_multiplier,
     return;
   }
   const double q = std::frexp(double_multiplier, shift);
-  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << (acc_dtype_bits - 1))));
-  CHECK_LE(q_fixed, (1ll << (acc_dtype_bits - 1)));
-  if (q_fixed == (1ll << (acc_dtype_bits - 1))) {
+  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << (idtype_bits - 1))));
+  CHECK_LE(q_fixed, (1ll << (idtype_bits - 1)));
+  if (q_fixed == (1ll << (idtype_bits - 1))) {
     q_fixed /= 2;
     ++*shift;
   }
@@ -57,85 +71,6 @@ void GetFixedPointMultiplierShift(double double_multiplier,
   CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
   *fixed_point_multiplier = static_cast<int32_t>(q_fixed);
 }
 
-Expr MultiplyByIntegerMuliplier(const Expr& convolved_tensor,
-    const int32_t fixed_point_multiplier, const int left_shift,
-    const RequantizeAttrs*& param, const DataType& idtype,
-    const Array& out_shape) {
-  // TODO (janimesh) - How to add the overflow checks here. TFLite code snippet is
-  // bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
-  // return overflow ? std::numeric_limits<std::int32_t>::max() : .....;
-
-  // The calculations are done in upcast of idtype to retain precision.
-  int acc_dtype_bits = idtype.bits();
-  DataType up_idtype = Int(2 * acc_dtype_bits);
-
-  auto tensor = convolved_tensor;
-  // Typically the left_shift will be 0 if the original scale is > 0.5.
- if (left_shift != 0) { - tensor = Multiply(tensor, MakeConstantScalar(idtype, 1 << left_shift)); - } - - // Upcast the computation to Int64 and multiply the multiplier. - Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); - auto multiplied_t = Multiply(Cast(tensor, up_idtype), scalar); - - // Since, we are performing fixed point computation. We are only interested in - // higher 16/32 bits. But before that, we also need to perform rounding. - // This is fixed point rounding. So, the rounder add scalar depends if the - // input is positive. - auto zero = MakeConstantScalar(up_idtype, 0); - auto pos_threshold = MakeConstantScalar(up_idtype, - 1ll << (acc_dtype_bits - 2)); - auto neg_threshold = MakeConstantScalar(up_idtype, - (1 - (1ll << (acc_dtype_bits - 2)))); - auto pos_rounder = Full(pos_threshold, out_shape, up_idtype); - auto neg_rounder = Full(neg_threshold, out_shape, up_idtype); - auto rounding_scalar = Where(GreaterEqual(multiplied_t, zero), pos_rounder, neg_rounder); - auto rounded_tensor = Add(multiplied_t, rounding_scalar); - - // Perform right shift to get the first 16/32 bits. - // The result is first doubled and the first 15/31 bits are obtained. This is - // done by just right shifting the result by 15/31 bits. - auto right_shift_scalar = MakeConstantScalar(up_idtype, (acc_dtype_bits - 1)); - auto scaled_t = RightShift(rounded_tensor, right_shift_scalar); - auto q_imin = get_qmin(idtype); - auto q_imax = get_qmax(idtype); - auto integer_multiplied_t = Cast(Clip(scaled_t, q_imin, q_imax), - idtype); - return integer_multiplied_t; -} - -Expr ShiftByIntegerShift(const Expr& multiplied_t, - const int& exponent, const RequantizeAttrs*& param, - const DataType& idtype, const Array& out_shape) { - CHECK_GE(exponent, 0); - int acc_dtype_bits = idtype.bits(); - CHECK_LE(exponent, (acc_dtype_bits - 1)); - - // We need to perform rounding. The rounding here is closest to the power - // of 2. The exponent basically represents the decimal point. We need to round - // at the decimal point. - auto tensor = multiplied_t; - if (exponent != 0) { - auto pos_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1))); - auto neg_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)) - 1); - auto pos_rounder_t = Full(pos_rounder, out_shape, idtype); - auto neg_rounder_t = Full(neg_rounder, out_shape, idtype); - - auto zero = MakeConstantScalar(idtype, 0); - auto zero_t = Full(zero, out_shape, idtype); - auto round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, - neg_rounder_t); - tensor = Add(tensor, round_scalar); - } - - // Right shift by exponent to approximate the division. - auto scaled_t = RightShift(tensor, - MakeConstantScalar(idtype, exponent)); - return scaled_t; -} - - /* * Requantization using only integer computation. Here, the computation is * converted to a fixed point computation by computing output multiplier and @@ -147,59 +82,123 @@ Expr ShiftByIntegerShift(const Expr& multiplied_t, * multiplication with an int value and then right shifting the result. This * approximates the floating point computation with a fixed point computation. * - * The whole computaition this can be broken down into following steps + * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. - * 2) Multiply the integer multiplier with quantized tensor. - * 3) Right shift the result. + * 2) Subtract the input integer point. + * 2) Multiply the integer fixed point multiplier with quantized tensor. 
+ * 3) Round the result.
+ * 4) Right shift the result.
+ * 5) Add the output_zero_point.
+ * 6) Cast to the out_dtype.
  *
- * The only thing complicating the above computations is the tedious approach of
- * handling rounding.
  */
-Expr RequantizeInt(const Expr& convolved_tensor,
+Expr RequantizeInt(const Expr& input_tensor,
     const RequantizeAttrs*& param, const DataType& idtype,
     const Array& out_shape) {
 
   double double_multiplier = param->input_scale/param->output_scale;
+
+  // The multiplication will be performed in higher precision. Find the dtype.
+  int idtype_bits = idtype.bits();
+  DataType up_idtype = Int(2 * idtype_bits);
+
   // 1) Calculating the integer multiplier and integer shift
   int32_t fixed_point_multiplier;
   int shift;
   GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier,
       &shift, idtype);
-
-  // 2) Multiply the integer multiplier
   int left_shift = shift > 0 ? shift : 0;
   int right_shift = shift > 0 ? 0 : -shift;
-  auto multiplied_t = MultiplyByIntegerMuliplier(convolved_tensor,
-      fixed_point_multiplier, left_shift, param, idtype, out_shape);
 
-  // 3) Divide by the denominator or right shift the result.
-  auto scaled_int32_t = ShiftByIntegerShift(multiplied_t,
-      right_shift, param, idtype, out_shape);
+  // 2) Subtract the input_zero_point
+  auto tensor = input_tensor;
+  tensor = Cast(tensor, up_idtype);
+  if (param->input_zero_point != 0) {
+    auto input_zp = MakeConstantScalar(up_idtype, param->input_zero_point);
+    tensor = Subtract(tensor, input_zp);
+  }
 
-  // 4) Clip to the out_dtype min/max.
+
+
+  // 3) Multiply by the integer multiplier
+  if (left_shift != 0) {
+    tensor = Multiply(tensor, MakeConstantScalar(up_idtype, 1 << left_shift));
+  }
+  // Perform the multiplication in higher precision.
+  // If idtype is Int(32), the scalar is a fixed point value of int32 where the
+  // decimal point is between bits 31 and 30. After multiplying with
+  // input_tensor, the result is in int64 where the decimal point is sitting
+  // between bits 31 and 30 (from the right, rightmost bit is bit 0).
+  Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier);
+  auto multiplied_t = Multiply(tensor, scalar);
+
+
+  // 4) Find the rounding scalar. This depends on where the final decimal point
+  // sits. As we will be right shifting the multiplied_t, we need to first
+  // calculate the total_right_shift.
+  int total_right_shift = right_shift + idtype_bits - 1;
+
+  tensor = multiplied_t;
+  Expr round_scalar;
+  if (param->rounding_mode == "FE_UPWARD") {
+    auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)));
+    round_scalar = pos_rounder;
+  } else if (param->rounding_mode == "FE_AWAY_FROM_ZERO") {
+    auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)));
+    auto neg_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1);
+    auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype);
+    auto neg_rounder_t = Full(neg_rounder, out_shape, up_idtype);
+
+    auto zero = MakeConstantScalar(up_idtype, 0);
+    auto zero_t = Full(zero, out_shape, up_idtype);
+    round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t,
+        neg_rounder_t);
+  }
+  // Add the rounding scalar.
+  tensor = Add(tensor, round_scalar);
+
+  // 5) Simply right shift the result to get the final output.
+  auto scaled_int64_t = RightShift(tensor,
+      MakeConstantScalar(up_idtype, total_right_shift));
+
+  // 6) Add the output zero point.
+ auto output_zp = MakeConstantScalar(up_idtype, param->output_zero_point); + auto shifted_int64_t = Add(output_zp, scaled_int64_t); + + // 7) Clip to the out_dtype min/max. + // Find the right clip min/maxes. While clipping, it is necessary that + // clip_min and clip_max are within the dtype range of the input tensor to the + // clip operator. For example, if the input to clip operator is int8, but the + // out_dtype is uint8, we will get incorrect results, if we set max as 255. auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); - auto clipped_t = Clip(scaled_int32_t, q_min, q_max); + auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; } -/* + +/* * Requantization using floating computation. Here we can multiply the scale to - * the convolved_tensor, round to nearest integer and then cast back to int32. + * the input_tensor, round to nearest integer and then cast back to int32. */ -Expr RequantizeFloat(const Expr& convolved_tensor, +Expr RequantizeFloat(const Expr& input_tensor, const RequantizeAttrs*& param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); - - // Multiply the convolved tensor with the new scale. - auto casted_t = Cast(convolved_tensor, Float(32)); - auto multiplied_t = Round(Multiply(casted_t, scalar_multiplier)); + auto input_zp = MakeConstantScalar(idtype, param->input_zero_point); + auto output_zp = MakeConstantScalar(Float(32), param->output_zero_point); + + // Multiply the tensor with the new scale. + auto shifted_input_t = Subtract(input_tensor, input_zp); + auto casted_t = Cast(shifted_input_t, Float(32)); + auto multiplied_t = Multiply(casted_t, scalar_multiplier); + auto shifted_multiplied_t = Add(output_zp, multiplied_t); + auto rounded_t = Round(shifted_multiplied_t); auto q_imin = get_qmin(idtype); auto q_imax = get_qmax(idtype); - auto scaled_int32_t = Cast(Clip(multiplied_t, q_imin, q_imax), + auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), idtype); // Clip to the out_dtype min/max. @@ -243,14 +242,6 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - // Check for current quantization support. - CHECK_EQ(param->input_zero_point, 0) - << "Encountered non-zero zero point." - << " Only symmetric quantization supported for now."; - CHECK_EQ(param->output_zero_point, 0) - << "Encountered non-zero zero point." 
- << " Only symmetric quantization supported for now."; - if (param->use_int_compute) { return RequantizeInt(quantized_data, param, input_dtype, out_shape); } else { @@ -258,18 +249,14 @@ Expr RequantizeForwardRewrite(const Call& ref_call, } } - RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); - - TVM_REGISTER_API("relay._quantize.rewrite") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); - return ret; -}); - + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; + }); } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py new file mode 100644 index 000000000000..e70ea0925231 --- /dev/null +++ b/tests/python/unittest/test_quantized_ops.py @@ -0,0 +1,257 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import numpy as np +from tvm import relay +from tvm.relay.testing import create_workload +from tvm.contrib import graph_runtime + +rounding_modes = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] + +def run_infer_type(expr): + mod = relay.Module.from_expr(expr) + mod = relay.transform.InferType()(mod) + entry = mod["main"] + return entry if isinstance(expr, relay.Function) else entry.body + + +def test_requantize(): + def verify(func, goldens): + with relay.build_config(opt_level=0): + graph, lib, params = relay.build(func, "llvm", params=None) + golden_data, golden_output = goldens + mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod.set_input("quantized_data",golden_data) + mod.set_input(**params) + mod.run() + res = mod.get_output(0).asnumpy() + np.testing.assert_equal(res, golden_output) + + def get_func(data_shape, data_dtype, out_dtype, use_int_compute, + rounding_mode, input_scale, output_scale, input_zero_point=0, + output_zero_point=0): + quantized_data = relay.var("quantized_data", shape=data_shape, + dtype=data_dtype) + func = relay.op.qnn.requantize( + quantized_data, + input_zero_point=input_zero_point, + output_zero_point=output_zero_point, + input_scale=input_scale, + output_scale=output_scale, + rounding_mode=rounding_mode, + out_dtype=out_dtype, + use_int_compute=use_int_compute) + + func = relay.Function(relay.analysis.free_vars(func), + func) + func = run_infer_type(func) + func = relay.quantize.rewrite(func) + print(func) + return func + + + def run_tests(): + def same_scale_test(): + # Have same scales, everything within range + golden_data = np.arange(-100, 100, 1).astype('int32') + golden_output = golden_data + + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + 
use_int_compute=use_int_compute,
+                                rounding_mode=rounding_mode,
+                                input_scale=0.5,
+                                output_scale=0.5)
+                verify(func, (golden_data, golden_output))
+
+        def downscale_test():
+            for rounding_mode in rounding_modes:
+                for use_int_compute in [True, False]:
+                    func = get_func(data_shape=(32, ),
+                                    data_dtype='int32',
+                                    out_dtype="int32",
+                                    use_int_compute=use_int_compute,
+                                    rounding_mode=rounding_mode,
+                                    input_scale=1,
+                                    output_scale=16)
+
+                    # Try positive values
+                    # 8 corresponds to 0.5, resulting in 1
+                    golden_data = np.arange(0, 32, 1).astype('int32')
+                    golden_output = np.repeat([0, 1, 2], [8, 16, 8])
+                    verify(func, (golden_data, golden_output))
+
+                    # Try negative values
+                    # -8 corresponds to -0.5. For FE_UPWARD, this is 0
+                    golden_data = np.arange(0, -32, -1).astype('int32')
+                    if use_int_compute == True and rounding_mode == "FE_UPWARD":
+                        golden_output = np.repeat([0, -1, -2], [9, 16, 7])
+                    else:
+                        golden_output = np.repeat([0, -1, -2], [8, 16, 8])
+                    verify(func, (golden_data, golden_output))
+
+                # Try a different scale
+                for use_int_compute in [True, False]:
+                    func = get_func(data_shape=(32, ),
+                                    data_dtype='int32',
+                                    out_dtype="int8",
+                                    use_int_compute=use_int_compute,
+                                    rounding_mode=rounding_mode,
+                                    input_scale=1,
+                                    output_scale=4)
+
+                    # Try positive values
+                    # 2 corresponds to 0.5, resulting in 1
+                    golden_data = np.arange(0, 32, 1).astype('int32')
+                    golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8],
+                                              [2, 4, 4, 4, 4, 4, 4, 4, 2])
+                    verify(func, (golden_data, golden_output))
+
+                    # Try negative values
+                    # -2 corresponds to -0.5.
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + def saturation_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) + + def zero_point_test(): + # Output zero point + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16, + output_zero_point=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Input zero point + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + + + + if __name__ == "__main__": + same_scale_test() + downscale_test() + upscale_test() + saturation_test() + zero_point_test() + + run_tests() + +if __name__ == "__main__": + test_requantize() From 6016b2a573818a83f2b23155486c03fb91cabfd9 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 22:23:29 +0000 Subject: [PATCH 16/51] Typo and lint fixes. 
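
A quick note on the two rounding modes exercised by the golden outputs in
the tests above. The following is a minimal NumPy sketch (illustrative
only, not part of the patch; the helper names are hypothetical) of how
midway values such as -0.5 are resolved under each mode:

    import numpy as np

    def round_fe_upward(x):
        # Ties are rounded towards +inf: -0.5 -> 0, 0.5 -> 1.
        return np.floor(x + 0.5)

    def round_fe_away_from_zero(x):
        # Ties are rounded away from zero (std::round): -0.5 -> -1, 0.5 -> 1.
        return np.sign(x) * np.floor(np.abs(x) + 0.5)

    print(round_fe_upward(np.array([-1.5, -0.5, 0.5, 1.5])))          # [-1.  0.  1.  2.]
    print(round_fe_away_from_zero(np.array([-1.5, -0.5, 0.5, 1.5])))  # [-2. -1.  1.  2.]

This is why the FE_UPWARD golden outputs shift by one element at the
negative tie points (e.g. np.repeat([0, -1, -2], [9, 16, 7]) instead of
[8, 16, 8]) while the positive halves agree between the two modes.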
--- include/tvm/relay/attrs/qnn.h | 6 ++--- include/tvm/relay/quantize_util.h | 27 ++++++++++++--------- python/tvm/relay/op/qnn/__init__.py | 2 +- python/tvm/relay/op/qnn/qnn.py | 4 +-- python/tvm/relay/quantize/rewrite.py | 1 - src/relay/pass/quantize_rewrite.cc | 4 +-- tests/python/unittest/test_quantized_ops.py | 1 - 7 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h index cf69fa759c1c..6bcd77a81f8a 100644 --- a/include/tvm/relay/attrs/qnn.h +++ b/include/tvm/relay/attrs/qnn.h @@ -21,8 +21,8 @@ * \file tvm/relay/attrs/nn.h * \brief Auxiliary attributes for nn operators. */ -#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_ -#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#ifndef TVM_RELAY_ATTRS_QNN_H_ +#define TVM_RELAY_ATTRS_QNN_H_ #include #include @@ -67,4 +67,4 @@ struct RequantizeAttrs : public tvm::AttrsNode { } // namespace relay } // namespace tvm -#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#endif // TVM_RELAY_ATTRS_QNN_H_ diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h index bb054fb8fb65..6a8c2e520098 100644 --- a/include/tvm/relay/quantize_util.h +++ b/include/tvm/relay/quantize_util.h @@ -22,10 +22,11 @@ * \brief Utility methods needs for quantized ops that can be shared */ -#ifndef TVM_QUANTIZE_UTIL_H -#define TVM_QUANTIZE_UTIL_H +#ifndef TVM_RELAY_QUANTIZE_UTIL_H_ +#define TVM_RELAY_QUANTIZE_UTIL_H_ #include +#include #include "./base.h" namespace tvm { @@ -68,14 +69,15 @@ inline bool is_quantized_type(const DataType& dtype) { } enum class QuantizeOpType : uint8_t { - Quantize_Requantize, + Quantize, Dequantize, Requantize }; -inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) { - switch(op_type) { - case QuantizeOpType::Quantize_Requantize: +inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, + const DataType &in_dtype) { + switch (op_type) { + case QuantizeOpType::Quantize: return is_Float32(in_dtype) || is_quantized_type(in_dtype); case QuantizeOpType ::Dequantize: return is_quantized_type(in_dtype); @@ -86,9 +88,10 @@ inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, cons } } -inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) { - switch(op_type) { - case QuantizeOpType::Quantize_Requantize: +inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, + const DataType &in_dtype) { + switch (op_type) { + case QuantizeOpType::Quantize: return is_quantized_type(in_dtype); case QuantizeOpType::Dequantize: return is_Float32(in_dtype); @@ -134,6 +137,6 @@ inline const int32_t get_qmax(const DataType& dtype) { return -1; } -} // namespace relay -} // namespace tvm -#endif //TVM_QUANTIZE_UTIL_H +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_QUANTIZE_UTIL_H_ diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/op/qnn/__init__.py index aef02300ab63..e9adfa783f93 100644 --- a/python/tvm/relay/op/qnn/__init__.py +++ b/python/tvm/relay/op/qnn/__init__.py @@ -17,4 +17,4 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .qnn import * \ No newline at end of file +from .qnn import * diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py index 484b3864f22f..10477e22ac04 100644 --- a/python/tvm/relay/op/qnn/qnn.py +++ b/python/tvm/relay/op/qnn/qnn.py @@ -20,8 +20,8 @@ from . 
import _make def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False, - rounding_mode="FE_UPWARD"): + output_scale, out_dtype="int32", use_int_compute=False, + rounding_mode="FE_UPWARD"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/quantize/rewrite.py index 89429e522115..c8860775b77f 100644 --- a/python/tvm/relay/quantize/rewrite.py +++ b/python/tvm/relay/quantize/rewrite.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from . import _quantize -from .. import expr as _expr def rewrite(expr): """ diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc index 645b20c0730e..92bd51ad7e15 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/pass/quantize_rewrite.cc @@ -93,7 +93,7 @@ void GetFixedPointMultiplierShift(double double_multiplier, * */ Expr RequantizeInt(const Expr& input_tensor, - const RequantizeAttrs*& param, const DataType& idtype, + const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; @@ -183,7 +183,7 @@ Expr RequantizeInt(const Expr& input_tensor, * the input_tensor, round to nearest integer and then cast back to int32. */ Expr RequantizeFloat(const Expr& input_tensor, - const RequantizeAttrs*& param, const DataType& idtype, + const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index e70ea0925231..8a039edd12b6 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -61,7 +61,6 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, func) func = run_infer_type(func) func = relay.quantize.rewrite(func) - print(func) return func From d54cea80cbe3c1f3111abc2f347fa64fae2eb21a Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 22:56:57 +0000 Subject: [PATCH 17/51] Lint fix. --- src/relay/pass/pattern_util.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 7249d1d4c086..faccd518a782 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -33,6 +33,7 @@ #include #include #include +#include #include From ca954e0abf296c30e58ec2651712ba7525579991 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:08:18 +0000 Subject: [PATCH 18/51] Doc fix. --- include/tvm/relay/quantize_util.h | 2 +- tests/scripts/task_lint.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h index 6a8c2e520098..5b5215dc4459 100644 --- a/include/tvm/relay/quantize_util.h +++ b/include/tvm/relay/quantize_util.h @@ -18,7 +18,7 @@ */ /*! - * \file nnvm/compiler/quantize_util.h + * \file tvm/relay/quantize_util.h * \brief Utility methods needs for quantized ops that can be shared */ diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 544ef7224770..896cc4c65c22 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,18 +31,18 @@ echo "Check file types..." 
python3 tests/lint/check_file_type.py echo "Check ASF license header..." -java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) -if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then - echo "Need to add ASF header to the following files." - echo "----------------File List----------------" - cat /tmp/$$.apache-rat.txt - echo "-----------------------------------------" - echo "Use the following steps to add the headers:" - echo "- Create file_list.txt in your text editor" - echo "- Copy paste the above content in file-list into file_list.txt" - echo "- python3 tests/lint/add_asf_header.py file_list.txt" - exit 1 -fi +# java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) +# if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then +# echo "Need to add ASF header to the following files." +# echo "----------------File List----------------" +# cat /tmp/$$.apache-rat.txt +# echo "-----------------------------------------" +# echo "Use the following steps to add the headers:" +# echo "- Create file_list.txt in your text editor" +# echo "- Copy paste the above content in file-list into file_list.txt" +# echo "- python3 tests/lint/add_asf_header.py file_list.txt" +# exit 1 +# fi echo "Check codestyle of c++ code..." make cpplint From db24f1ea1a4859e749ed51830f572615dd9a7470 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:10:34 +0000 Subject: [PATCH 19/51] Uncommenting the lint script (fixing mistake). --- tests/scripts/task_lint.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 896cc4c65c22..544ef7224770 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,18 +31,18 @@ echo "Check file types..." python3 tests/lint/check_file_type.py echo "Check ASF license header..." -# java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) -# if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then -# echo "Need to add ASF header to the following files." -# echo "----------------File List----------------" -# cat /tmp/$$.apache-rat.txt -# echo "-----------------------------------------" -# echo "Use the following steps to add the headers:" -# echo "- Create file_list.txt in your text editor" -# echo "- Copy paste the above content in file-list into file_list.txt" -# echo "- python3 tests/lint/add_asf_header.py file_list.txt" -# exit 1 -# fi +java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) +if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then + echo "Need to add ASF header to the following files." + echo "----------------File List----------------" + cat /tmp/$$.apache-rat.txt + echo "-----------------------------------------" + echo "Use the following steps to add the headers:" + echo "- Create file_list.txt in your text editor" + echo "- Copy paste the above content in file-list into file_list.txt" + echo "- python3 tests/lint/add_asf_header.py file_list.txt" + exit 1 +fi echo "Check codestyle of c++ code..." make cpplint From 523e16a5fed86a1d13f4489226164b48017050d5 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:13:54 +0000 Subject: [PATCH 20/51] Modifying the unit tests. 
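
For reference, the integer path exercised by these tests relies on the
frexp-based decomposition performed by GetFixedPointMultiplierShift in
quantize_rewrite.cc. A minimal Python sketch of that decomposition
(illustrative only, not part of the patch; it assumes a 32-bit input
dtype, i.e. a Q31 fixed point multiplier):

    import math

    def fixed_point_multiplier_shift(double_multiplier, idtype_bits=32):
        if double_multiplier == 0.0:
            return 0, 0
        # frexp returns q in [0.5, 1) with double_multiplier = q * 2**shift.
        q, shift = math.frexp(double_multiplier)
        q_fixed = int(round(q * (1 << (idtype_bits - 1))))
        if q_fixed == 1 << (idtype_bits - 1):  # q rounded up to 1.0
            q_fixed //= 2
            shift += 1
        return q_fixed, shift

    # 1/16 = 0.5 * 2**-3, so the multiplier is 0.5 in Q31 and shift is -3;
    # the lowered graph multiplies by 2**30 and right shifts by 3 + 31 bits.
    print(fixed_point_multiplier_shift(1.0 / 16.0))  # (1073741824, -3)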
--- tests/python/unittest/test_quantized_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 8a039edd12b6..6dc35d801543 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -86,7 +86,7 @@ def downscale_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, @@ -189,7 +189,7 @@ def zero_point_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, @@ -218,7 +218,7 @@ def zero_point_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, From 18bff7633b7e58c767537136dcbf2e3e8e94d1fe Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 06:05:00 +0000 Subject: [PATCH 21/51] Moving C++ files into src/relay/qnn --- .../qnn.h => src/relay/qnn/include/attrs.h | 0 .../relay/qnn/include/util.h | 73 +++++++++---------- src/relay/{op/nn => qnn/op}/requantize.cc | 6 +- src/relay/{ => qnn}/pass/quantize_rewrite.cc | 18 ++--- 4 files changed, 47 insertions(+), 50 deletions(-) rename include/tvm/relay/attrs/qnn.h => src/relay/qnn/include/attrs.h (100%) rename include/tvm/relay/quantize_util.h => src/relay/qnn/include/util.h (62%) rename src/relay/{op/nn => qnn/op}/requantize.cc (95%) rename src/relay/{ => qnn}/pass/quantize_rewrite.cc (95%) diff --git a/include/tvm/relay/attrs/qnn.h b/src/relay/qnn/include/attrs.h similarity index 100% rename from include/tvm/relay/attrs/qnn.h rename to src/relay/qnn/include/attrs.h diff --git a/include/tvm/relay/quantize_util.h b/src/relay/qnn/include/util.h similarity index 62% rename from include/tvm/relay/quantize_util.h rename to src/relay/qnn/include/util.h index 5b5215dc4459..61663b0da85e 100644 --- a/include/tvm/relay/quantize_util.h +++ b/src/relay/qnn/include/util.h @@ -26,46 +26,43 @@ #define TVM_RELAY_QUANTIZE_UTIL_H_ #include -#include -#include "./base.h" +#include +#include namespace tvm { namespace relay { -inline bool is_Int8(const DataType& dtype) { +inline bool IsInt8(const DataType& dtype) { return dtype == Int(8); } -inline bool is_UInt8(const DataType& dtype) { +inline bool IsUint8(const DataType& dtype) { return dtype == UInt(8); } - -inline bool is_Int16(const DataType& dtype) { +inline bool IsInt16(const DataType& dtype) { return dtype == Int(16); } -inline bool is_UInt16(const DataType& dtype) { +inline bool IsUint16(const DataType& dtype) { return dtype == UInt(16); } -inline bool is_Int32(const DataType& dtype) { +inline bool IsInt32(const DataType& dtype) { return dtype == Int(32); } -inline bool is_UInt32(const DataType& dtype) { +inline bool IsUint32(const DataType& dtype) { return dtype == UInt(32); } - - -inline bool is_Float32(const DataType& dtype) { +inline bool IsFloat32(const DataType& dtype) { return dtype == Float(32); } -inline bool is_quantized_type(const DataType& dtype) { - return is_Int8(dtype) || is_UInt8(dtype) - || is_Int16(dtype) || is_UInt16(dtype); +inline bool IsQuantizedType(const DataType& dtype) { + return IsInt8(dtype) || IsUint8(dtype) + || 
IsInt16(dtype) || IsUint16(dtype); } enum class QuantizeOpType : uint8_t { @@ -74,44 +71,44 @@ enum class QuantizeOpType : uint8_t { Requantize }; -inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, - const DataType &in_dtype) { +inline bool IsValidOpInputType(const QuantizeOpType& op_type, + const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return is_Float32(in_dtype) || is_quantized_type(in_dtype); + return IsFloat32(in_dtype) || IsQuantizedType(in_dtype); case QuantizeOpType ::Dequantize: - return is_quantized_type(in_dtype); + return IsQuantizedType(in_dtype); case QuantizeOpType ::Requantize: - return is_Int16(in_dtype) || is_Int32(in_dtype); + return IsInt16(in_dtype) || IsInt32(in_dtype); default: return false; } } -inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, - const DataType &in_dtype) { +inline bool IsValidOpOutputType(const QuantizeOpType& op_type, + const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return is_quantized_type(in_dtype); + return IsQuantizedType(in_dtype); case QuantizeOpType::Dequantize: - return is_Float32(in_dtype); + return IsFloat32(in_dtype); default: return false; } } -inline const int32_t get_qmin(const DataType& dtype) { - if (is_Int8(dtype)) { +inline const int32_t GetQmin(const DataType& dtype) { + if (IsInt8(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt8(dtype)) { + } else if (IsUint8(dtype)) { return std::numeric_limits::min(); - } else if (is_Int16(dtype)) { + } else if (IsInt16(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt16(dtype)) { + } else if (IsUint16(dtype)) { return std::numeric_limits::min(); - } else if (is_Int32(dtype)) { + } else if (IsInt32(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt32(dtype)) { + } else if (IsUint32(dtype)) { return std::numeric_limits::min(); } LOG(FATAL) << "Type not supported\n"; @@ -119,18 +116,18 @@ inline const int32_t get_qmin(const DataType& dtype) { } -inline const int32_t get_qmax(const DataType& dtype) { - if (is_Int8(dtype)) { +inline const int32_t GetQmax(const DataType& dtype) { + if (IsInt8(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt8(dtype)) { + } else if (IsUint8(dtype)) { return std::numeric_limits::max(); - } else if (is_Int16(dtype)) { + } else if (IsInt16(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt16(dtype)) { + } else if (IsUint16(dtype)) { return std::numeric_limits::max(); - } else if (is_Int32(dtype)) { + } else if (IsInt32(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt32(dtype)) { + } else if (IsUint32(dtype)) { return std::numeric_limits::max(); } LOG(FATAL) << "Type not supported\n"; diff --git a/src/relay/op/nn/requantize.cc b/src/relay/qnn/op/requantize.cc similarity index 95% rename from src/relay/op/nn/requantize.cc rename to src/relay/qnn/op/requantize.cc index 285528993f6f..9e4ddc97467f 100644 --- a/src/relay/op/nn/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -25,8 +25,8 @@ #include #include -#include -#include +#include "../include/attrs.h" +#include "../include/util.h" namespace tvm { namespace relay { @@ -41,7 +41,7 @@ bool RequantizeRel(const Array& types, CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); const auto input_dtype = data->dtype; - CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Requantize, input_dtype)) + CHECK(IsValidOpInputType(QuantizeOpType::Requantize, input_dtype)) << "Input type should be a 
quantized type (u)int8 or (u)int16 but was " << input_dtype; const Array oshape = data->shape; diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc similarity index 95% rename from src/relay/pass/quantize_rewrite.cc rename to src/relay/qnn/pass/quantize_rewrite.cc index 92bd51ad7e15..30265ca1dc32 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -26,9 +26,9 @@ #include #include #include -#include -#include -#include "pattern_util.h" +#include "../include/attrs.h" +#include "../include/util.h" +#include "../../pass/pattern_util.h" namespace tvm { namespace relay { @@ -170,8 +170,8 @@ Expr RequantizeInt(const Expr& input_tensor, // clip_min and clip_max are within the dtype range of the input tensor to the // clip operator. For example, if the input to clip operator is int8, but the // out_dtype is uint8, we will get incorrect results, if we set max as 255. - auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); - auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); + auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; @@ -196,15 +196,15 @@ Expr RequantizeFloat(const Expr& input_tensor, auto multiplied_t = Multiply(casted_t, scalar_multiplier); auto shifted_multiplied_t = Add(output_zp, multiplied_t); auto rounded_t = Round(shifted_multiplied_t); - auto q_imin = get_qmin(idtype); - auto q_imax = get_qmax(idtype); + auto q_imin = GetQmin(idtype); + auto q_imax = GetQmax(idtype); auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), idtype); // Clip to the out_dtype min/max. // Clip limits must be smaller than the dtype of the input tensor. - auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); - auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); + auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); auto clipped_t = Clip(scaled_int32_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; From 32b69dfbec402193c875e49a87942ca2972895e3 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 16:15:26 +0000 Subject: [PATCH 22/51] Moving python files to python/tvm/relay/qnn. Some minor fixes. --- python/tvm/relay/__init__.py | 3 + python/tvm/relay/op/__init__.py | 1 - python/tvm/relay/op/qnn/_make.py | 20 ----- python/tvm/relay/op/qnn/qnn.py | 74 ------------------- python/tvm/relay/{op => }/qnn/__init__.py | 3 +- .../{quantize/rewrite.py => qnn/ir_pass.py} | 4 +- python/tvm/relay/quantize/__init__.py | 1 - src/relay/qnn/pass/quantize_rewrite.cc | 18 ++--- tests/python/unittest/test_quantized_ops.py | 4 +- 9 files changed, 18 insertions(+), 110 deletions(-) delete mode 100644 python/tvm/relay/op/qnn/_make.py delete mode 100644 python/tvm/relay/op/qnn/qnn.py rename python/tvm/relay/{op => }/qnn/__init__.py (95%) rename python/tvm/relay/{quantize/rewrite.py => qnn/ir_pass.py} (95%) diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 509196f635b9..7adf82895751 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -49,6 +49,9 @@ from . import backend from . import quantize +# Dialects +from . 
import qnn + from .scope_builder import ScopeBuilder # Span diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 1d634ef18fc0..a27ab1dc50ff 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -26,7 +26,6 @@ from .transform import * from .algorithm import * from . import nn -from . import qnn from . import annotation from . import image from . import vision diff --git a/python/tvm/relay/op/qnn/_make.py b/python/tvm/relay/op/qnn/_make.py deleted file mode 100644 index b1695629b8f9..000000000000 --- a/python/tvm/relay/op/qnn/_make.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Constructor APIs""" -from ...._ffi.function import _init_api - -_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py deleted file mode 100644 index 10477e22ac04..000000000000 --- a/python/tvm/relay/op/qnn/qnn.py +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#pylint: disable=invalid-name, too-many-lines -"""Neural network operations.""" -from __future__ import absolute_import as _abs -from . import _make - -def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False, - rounding_mode="FE_UPWARD"): - r"""Requantized operator. - - The requantize operator converts one quantized tensor to another quantized - tensor. For the output tensor, we are provided with output scale and zero - point. The computation looks like this - - Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - - The above computation can be done in floating point as the scales are in - FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_compute. - - Parameters - ---------- - quantized_data : tvm.relay.Expr - The input quantized_data to the operator. 
- - input_scale: float - The float scalar to scale the quantized_data int8 values back to FP32. - - output_scale: float - The float scalar to scale the quantized_output int8 values back to FP32. - - input_zero_point: int - The zero point of the quantized_data distribution. - - output_zero_point: int - The zero point of the quantized_output distribution. - - out_dtype : str, optional - Specifies the output quantized_data type for mixed precision conv2d. - - use_int_compute : bool, optional - Use fully integer computation for requantizing. - - rounding_mode : string, optional - Defines the rounding direction when the value is midway between two - representable values. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ - "Unsupported rounding mode" - - return _make.requantize(input_data, input_zero_point, input_scale, - output_zero_point, output_scale, out_dtype, - use_int_compute, rounding_mode) diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py similarity index 95% rename from python/tvm/relay/op/qnn/__init__.py rename to python/tvm/relay/qnn/__init__.py index e9adfa783f93..0836c5770ce4 100644 --- a/python/tvm/relay/op/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -17,4 +17,5 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .qnn import * +from . import op +from . import ir_pass diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/qnn/ir_pass.py similarity index 95% rename from python/tvm/relay/quantize/rewrite.py rename to python/tvm/relay/qnn/ir_pass.py index c8860775b77f..24e3329e961c 100644 --- a/python/tvm/relay/quantize/rewrite.py +++ b/python/tvm/relay/qnn/ir_pass.py @@ -18,7 +18,7 @@ """Automatic quantization toolkit.""" from __future__ import absolute_import -from . import _quantize +from . import _qnn def rewrite(expr): """ @@ -34,4 +34,4 @@ def rewrite(expr): expr : tvm.relay.Expr The output expression. """ - return _quantize.rewrite(expr) + return _qnn.rewrite(expr) diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py index 8da4e7953566..45bb62e66853 100644 --- a/python/tvm/relay/quantize/__init__.py +++ b/python/tvm/relay/quantize/__init__.py @@ -19,5 +19,4 @@ from __future__ import absolute_import as _abs from .quantize import * -from .rewrite import * from ._annotate import register_annotate_function diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 30265ca1dc32..9d10b5a47ba9 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -85,11 +85,11 @@ void GetFixedPointMultiplierShift(double double_multiplier, * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. * 2) Subtract the input integer point. - * 2) Multiply the integer fixed point multiplier with quantized tensor. - * 3) Round the result. - * 4) Right shift the result. - * 5) Add the output_zero_point. - * 6) Cast to the out_dtype. + * 3) Multiply the integer fixed point multiplier with quantized tensor. + * 4) Round the result. + * 5) Right shift the result. + * 6) Add the output_zero_point. + * 7) Cast to the out_dtype. 
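The seven steps above translate almost line for line into NumPy. What follows is a minimal illustrative sketch, not the TVM implementation: the helper name requantize_int_sketch is invented, significand-overflow handling is omitted, and only FE_UPWARD-style rounding with an int8 output is shown.

import math
import numpy as np

def requantize_int_sketch(q_input, input_scale, output_scale,
                          input_zp=0, output_zp=0, idtype_bits=32):
    # 1) Calculate the integer multiplier and the shift (frexp splitting).
    significand, exponent = math.frexp(input_scale / output_scale)
    fixed_point_multiplier = int(round(significand * (1 << (idtype_bits - 1))))
    total_right_shift = (idtype_bits - 1) - exponent
    # 2) Subtract the input zero point, in a wider integer type.
    tensor = q_input.astype(np.int64) - input_zp
    # 3) Multiply with the integer fixed point multiplier.
    tensor = tensor * fixed_point_multiplier
    # 4) Round: add half before shifting (this is the FE_UPWARD behaviour).
    tensor = tensor + (1 << (total_right_shift - 1))
    # 5) Right shift; >> on signed integers floors, as required here.
    tensor = tensor >> total_right_shift
    # 6) Add the output zero point.
    tensor = tensor + output_zp
    # 7) Clip and cast to the output dtype (int8 assumed for this sketch).
    return np.clip(tensor, -128, 127).astype(np.int8)

# Mirrors the tests later in the series: with scales 1 and 16,
# 8 -> 1 and -8 -> 0 under FE_UPWARD rounding.
print(requantize_int_sketch(np.array([8, -8]), input_scale=1.0, output_scale=16.0))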
* */ Expr RequantizeInt(const Expr& input_tensor, @@ -252,11 +252,11 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._quantize.rewrite") +TVM_REGISTER_API("relay._qnn.rewrite") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); - return ret; - }); + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; +}); } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 6dc35d801543..092e695cf533 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -47,7 +47,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) - func = relay.op.qnn.requantize( + func = relay.qnn.op.requantize( quantized_data, input_zero_point=input_zero_point, output_zero_point=output_zero_point, @@ -60,7 +60,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.quantize.rewrite(func) + func = relay.qnn.ir_pass.rewrite(func) return func From 21168aea7b8aaa9131e5dacdca780c9091182f05 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 16:58:44 +0000 Subject: [PATCH 23/51] Moving the attrs.h inside the include directory. --- {src/relay/qnn/include => include/tvm/relay/qnn}/attrs.h | 0 src/relay/qnn/op/requantize.cc | 2 +- src/relay/qnn/pass/quantize_rewrite.cc | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename {src/relay/qnn/include => include/tvm/relay/qnn}/attrs.h (100%) diff --git a/src/relay/qnn/include/attrs.h b/include/tvm/relay/qnn/attrs.h similarity index 100% rename from src/relay/qnn/include/attrs.h rename to include/tvm/relay/qnn/attrs.h diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 9e4ddc97467f..c389e82fba80 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -25,7 +25,7 @@ #include #include -#include "../include/attrs.h" +#include #include "../include/util.h" namespace tvm { diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 9d10b5a47ba9..5d4942c80a7c 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -26,7 +26,7 @@ #include #include #include -#include "../include/attrs.h" +#include #include "../include/util.h" #include "../../pass/pattern_util.h" From 4a4beecf78959859fb8dc37199e062a31e6faf29 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 19:05:03 +0000 Subject: [PATCH 24/51] Pushing files that I forgot earlier. Changing util location. 
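For orientation, the dialect these patches introduce is exercised end to end as below. A hedged usage sketch, assuming TVM of this vintage; the shape, scales, and the module entry access are arbitrary choices, and the InferType spelling varied across versions of this era.

import tvm
from tvm import relay

data = relay.var("quantized_data", shape=(32,), dtype="int32")
expr = relay.qnn.op.requantize(data,
                               input_zero_point=0,
                               input_scale=1.0,
                               output_zero_point=0,
                               output_scale=16.0,
                               out_dtype="int8")
func = relay.Function(relay.analysis.free_vars(expr), expr)

# Type inference must run before the qnn lowering pass.
mod = relay.Module.from_expr(func)
mod = relay.transform.InferType()(mod)
func = relay.qnn.ir_pass.rewrite(mod[mod.entry_func])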
--- python/tvm/relay/qnn/_qnn.py | 22 ++++++++ python/tvm/relay/qnn/op/__init__.py | 20 +++++++ python/tvm/relay/qnn/op/_make.py | 20 +++++++ python/tvm/relay/qnn/op/qnn.py | 74 ++++++++++++++++++++++++++ src/relay/qnn/op/requantize.cc | 2 +- src/relay/qnn/pass/quantize_rewrite.cc | 2 +- src/relay/qnn/{include => }/util.h | 0 7 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 python/tvm/relay/qnn/_qnn.py create mode 100644 python/tvm/relay/qnn/op/__init__.py create mode 100644 python/tvm/relay/qnn/op/_make.py create mode 100644 python/tvm/relay/qnn/op/qnn.py rename src/relay/qnn/{include => }/util.h (100%) diff --git a/python/tvm/relay/qnn/_qnn.py b/python/tvm/relay/qnn/_qnn.py new file mode 100644 index 000000000000..bd3cdbb976d6 --- /dev/null +++ b/python/tvm/relay/qnn/_qnn.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=unused-argument +"""Internal module for quantization.""" +from __future__ import absolute_import +from tvm._ffi.function import _init_api + +_init_api("relay._qnn", __name__) diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py new file mode 100644 index 000000000000..e9adfa783f93 --- /dev/null +++ b/python/tvm/relay/qnn/op/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .qnn import * diff --git a/python/tvm/relay/qnn/op/_make.py b/python/tvm/relay/qnn/op/_make.py new file mode 100644 index 000000000000..b1695629b8f9 --- /dev/null +++ b/python/tvm/relay/qnn/op/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.qnn._make", __name__)
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
new file mode 100644
index 000000000000..8db431eebe23
--- /dev/null
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#pylint: disable=invalid-name, too-many-lines
+"""Neural network operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+def requantize(input_data, input_zero_point, input_scale, output_zero_point,
+               output_scale, out_dtype="int32", use_int_compute=True,
+               rounding_mode="FE_AWAY_FROM_ZERO"):
+    r"""Requantized operator.
+
+    The requantize operator converts one quantized tensor to another quantized
+    tensor. For the output tensor, we are provided with output scale and zero
+    point. The computation looks like this
+
+    Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+
+    The above computation can be done in floating point as the scales are in
+    FP32. Alternatively, we can approximate floating point with fixed point
+    computation. This is controlled by use_int_compute.
+
+    Parameters
+    ----------
+    input_data : tvm.relay.Expr
+        The input data to the operator.
+
+    input_scale: float
+        The float scalar to scale the input_data int8 values back to FP32.
+
+    output_scale: float
+        The float scalar to scale the quantized_output int8 values back to FP32.
+
+    input_zero_point: int
+        The zero point of the input_data distribution.
+
+    output_zero_point: int
+        The zero point of the quantized_output distribution.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision conv2d.
+
+    use_int_compute : bool, optional
+        Use fully integer computation for requantizing.
+
+    rounding_mode : string, optional
+        Defines the rounding direction when the value is midway between two
+        representable values.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
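A quick numeric check of the formula above, with invented values:

# Worked example of the requantize equation (all numbers invented):
#   scale_input = 0.5, scale_output = 0.25, zp_input = 1, zp_output = 2
q_output = 2 + (0.5 / 0.25) * (9 - 1)   # Q_input = 9
assert q_output == 18.0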
+ """ + assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + "Unsupported rounding mode" + + return _make.requantize(input_data, input_zero_point, input_scale, + output_zero_point, output_scale, out_dtype, + use_int_compute, rounding_mode) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index c389e82fba80..df4a224fc2ba 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -26,7 +26,7 @@ #include #include #include -#include "../include/util.h" +#include "../util.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 5d4942c80a7c..7d4e0f017050 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -27,7 +27,7 @@ #include #include #include -#include "../include/util.h" +#include "../util.h" #include "../../pass/pattern_util.h" namespace tvm { diff --git a/src/relay/qnn/include/util.h b/src/relay/qnn/util.h similarity index 100% rename from src/relay/qnn/include/util.h rename to src/relay/qnn/util.h From 120c05052d18e78083a34cddee17cc7953bc9b60 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 17:03:12 +0000 Subject: [PATCH 25/51] Incorporating comments. API change. Lint fixes. --- include/tvm/relay/qnn/attrs.h | 21 +- python/tvm/relay/qnn/op/qnn.py | 44 +- src/relay/pass/pattern_util.h | 6 +- src/relay/qnn/op/requantize.cc | 29 +- .../{quantize_rewrite.cc => qnn_lower.cc} | 21 +- src/relay/qnn/util.h | 88 ++-- tests/python/unittest/test_quantized_ops.py | 393 +++++++++--------- 7 files changed, 301 insertions(+), 301 deletions(-) rename src/relay/qnn/pass/{quantize_rewrite.cc => qnn_lower.cc} (96%) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 6bcd77a81f8a..1cd7deb4393f 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -21,8 +21,8 @@ * \file tvm/relay/attrs/nn.h * \brief Auxiliary attributes for nn operators. */ -#ifndef TVM_RELAY_ATTRS_QNN_H_ -#define TVM_RELAY_ATTRS_QNN_H_ +#ifndef TVM_RELAY_QNN_ATTRS_H_ +#define TVM_RELAY_QNN_ATTRS_H_ #include #include @@ -36,8 +36,8 @@ struct RequantizeAttrs : public tvm::AttrsNode { int32_t input_zero_point; double output_scale; int32_t output_zero_point; - bool use_int_compute; - std::string rounding_mode; + bool use_int_domain; + std::string rounding; DataType out_dtype; TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") { @@ -49,17 +49,22 @@ struct RequantizeAttrs : public tvm::AttrsNode { .describe("The scale of the input tensor."); TVM_ATTR_FIELD(output_scale) .describe("The scale of the output tensor."); - TVM_ATTR_FIELD(use_int_compute).set_default(true) + TVM_ATTR_FIELD(use_int_domain).set_default(true) .describe("When true, the integer computation is used to handle output scale." "The float compuation can be used as reference implementation or in" "cases where FP32 computation for requantize is not expensive"); TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(rounding_mode).set_default("FE_UPWARD") + TVM_ATTR_FIELD(rounding).set_default("FE_AWAY_FROM_ZERO") .describe("Defines the rounding direction when the value is midway between" "two representable values. There are two supported modes - FE_UPWARD" - "or FE_AWAY_FROM_ZERO. More context can be found at" + "or FE_AWAY_FROM_ZERO. 
Both modes behave exactly same except at the" + "midpoints between the two representable values. At midpoint, FE_UPWARD" + "rounds towards positive infinity (for example -1.5 will be rounded" + "to -1). FE_AWAY_FROM_ZERO is the standard rounding where the value" + "is rounded away from zero at midpoints (for example, -1.5 rounds to" + "-2). More context can be found at" "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); } }; @@ -67,4 +72,4 @@ struct RequantizeAttrs : public tvm::AttrsNode { } // namespace relay } // namespace tvm -#endif // TVM_RELAY_ATTRS_QNN_H_ +#endif // TVM_RELAY_QNN_ATTRS_H_ diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 8db431eebe23..b0e06e41ed13 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -19,9 +19,14 @@ from __future__ import absolute_import as _abs from . import _make -def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=True, - rounding_mode="FE_AWAY_FROM_ZERO"): +def requantize(data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + out_dtype="int32", + rounding="FE_AWAY_FROM_ZERO", + use_int_domain=True): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -32,32 +37,32 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point, The above computation can be done in floating point as the scales are in FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_compute. + computation. This is controlled by use_int_domain. Parameters ---------- - quantized_data : tvm.relay.Expr - The input quantized_data to the operator. + data : tvm.relay.Expr + The input data to the operator. input_scale: float - The float scalar to scale the quantized_data int8 values back to FP32. + The float scalar to scale the data int8 values back to FP32. + + input_zero_point: int + The zero point of the data distribution. output_scale: float The float scalar to scale the quantized_output int8 values back to FP32. - input_zero_point: int - The zero point of the quantized_data distribution. - output_zero_point: int The zero point of the quantized_output distribution. out_dtype : str, optional - Specifies the output quantized_data type for mixed precision conv2d. + Specifies the output data type for mixed precision conv2d. - use_int_compute : bool, optional + use_int_domain : bool, optional Use fully integer computation for requantizing. - rounding_mode : string, optional + rounding : string, optional Defines the rounding direction when the value is midway between two representable values. @@ -66,9 +71,14 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point, result : tvm.relay.Expr The computed result. """ - assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + assert rounding in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ "Unsupported rounding mode" - return _make.requantize(input_data, input_zero_point, input_scale, - output_zero_point, output_scale, out_dtype, - use_int_compute, rounding_mode) + return _make.requantize(data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + out_dtype, + rounding, + use_int_domain) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index faccd518a782..4492487de119 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -18,7 +18,7 @@ */ /*! 
- * Copyright (c) 2018 by Contributors. + * Copyright (c) 2019 by Contributors. * * \file tvm/relay/pass/pattern_util.h * \brief Header of internal operator functions @@ -385,8 +385,8 @@ inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { } inline Expr Full(Expr fill_value, - Array shape, - DataType dtype) { + Array shape, + DataType dtype) { auto attrs = make_node(); attrs->shape = std::move(shape); attrs->dtype = std::move(dtype); diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index df4a224fc2ba..9d0504631893 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -18,7 +18,7 @@ */ /*! - * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \file requantize.cc * \brief Quantized convolution operators */ @@ -54,29 +54,36 @@ bool RequantizeRel(const Array& types, // Positional relay function to create quantized conv2d operator // used by frontend FFI. Expr MakeRequantize(Expr data, - int32_t input_zero_point, double input_scale, - int32_t output_zero_point, + int32_t input_zero_point, double output_scale, + int32_t output_zero_point, DataType out_dtype, - bool use_int_compute, - std::string rounding_mode) { + std::string rounding, + bool use_int_domain) { auto attrs = make_node(); - attrs->out_dtype = std::move(out_dtype); - attrs->input_zero_point = std::move(input_zero_point); - attrs->output_zero_point = std::move(output_zero_point); attrs->input_scale = std::move(input_scale); + attrs->input_zero_point = std::move(input_zero_point); attrs->output_scale = std::move(output_scale); - attrs->use_int_compute = std::move(use_int_compute); - attrs->rounding_mode = std::move(rounding_mode); + attrs->output_zero_point = std::move(output_zero_point); + attrs->out_dtype = std::move(out_dtype); + attrs->rounding = std::move(rounding); + attrs->use_int_domain = std::move(use_int_domain); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } RELAY_REGISTER_OP("qnn.requantize") .describe(R"code(Requantize operator. +The requantize operator converts one quantized tensor to another quantized +tensor. For the output tensor, we are provided with output scale and zero +point. The computation looks like this + +Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) -FIXME +The above computation can be done in floating point as the scales are in +FP32. Alternatively, we can approximate floating point with fixed point +computation. This is controlled by use_int_domain. )code" TVM_ADD_FILELINE) .set_attrs_type_key("relay.attrs.RequantizeAttrs") .set_num_inputs(1) diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/qnn_lower.cc similarity index 96% rename from src/relay/qnn/pass/quantize_rewrite.cc rename to src/relay/qnn/pass/qnn_lower.cc index 7d4e0f017050..d491e2a817d3 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -18,8 +18,8 @@ */ /*! - * Copyright (c) 2018 by Contributors - * \file quantize_rewrite.cc + * Copyright (c) 2019 by Contributors + * \file qnn_lower.cc * \brief Lower quantized ops to exisiting Relay ops. */ @@ -111,15 +111,12 @@ Expr RequantizeInt(const Expr& input_tensor, int right_shift = shift > 0 ? 
0 : -shift; // 2) Subtract the input_zero_point - auto tensor = input_tensor; - tensor = Cast(tensor, up_idtype); + auto tensor = Cast(input_tensor, up_idtype); if (param->input_zero_point != 0) { auto input_zp = MakeConstantScalar(up_idtype, param->input_zero_point); tensor = Subtract(tensor, input_zp); } - - // 3) Multiply the integer multiplier if (left_shift != 0) { tensor = Multiply(tensor, MakeConstantScalar(up_idtype, 1 << left_shift)); @@ -132,18 +129,17 @@ Expr RequantizeInt(const Expr& input_tensor, Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); - // 4) Find the rounding scalar. This depends on where the final decimal point // sits. As we will be right shifting the multiplied_t, we need to first - // calculate the totol_right_shift. + // calculate the total_right_shift. int total_right_shift = right_shift + idtype_bits - 1; tensor = multiplied_t; Expr round_scalar; - if (param->rounding_mode == "FE_UPWARD") { + if (param->rounding == "FE_UPWARD") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); round_scalar = pos_rounder; - } else if (param->rounding_mode == "FE_AWAY_FROM_ZERO") { + } else if (param->rounding == "FE_AWAY_FROM_ZERO") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); auto neg_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); @@ -219,13 +215,14 @@ Expr RequantizeFloat(const Expr& input_tensor, * * The above computation can be done in floating point as the scales are in * FP32. Alternatively, we can approximate floating point with fixed point - * computation. This is controlled by use_int_compute. + * computation. This is controlled by use_int_domain. */ Expr RequantizeForwardRewrite(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { CHECK_EQ(new_args.size(), 1); Expr quantized_data = new_args[0]; const auto* param = ref_call->attrs.as(); + CHECK(param != nullptr); // Find output shape. Array out_shape; @@ -242,7 +239,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - if (param->use_int_compute) { + if (param->use_int_domain) { return RequantizeInt(quantized_data, param, input_dtype, out_shape); } else { return RequantizeFloat(quantized_data, param, input_dtype, out_shape); diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 61663b0da85e..63e7938c93d8 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -18,54 +18,26 @@ */ /*! 
- * \file tvm/relay/quantize_util.h + * \file src/relay/qnn/util.h * \brief Utility methods needs for quantized ops that can be shared */ -#ifndef TVM_RELAY_QUANTIZE_UTIL_H_ -#define TVM_RELAY_QUANTIZE_UTIL_H_ +#ifndef TVM_RELAY_QNN_UTIL_H_ +#define TVM_RELAY_QNN_UTIL_H_ #include -#include #include +#include namespace tvm { namespace relay { -inline bool IsInt8(const DataType& dtype) { - return dtype == Int(8); +inline bool IsQNNDataType(const DataType& dtype) { + return dtype == Int(8) || dtype == UInt(8) + || dtype == Int(16) || dtype == UInt(16); } -inline bool IsUint8(const DataType& dtype) { - return dtype == UInt(8); -} - -inline bool IsInt16(const DataType& dtype) { - return dtype == Int(16); -} - -inline bool IsUint16(const DataType& dtype) { - return dtype == UInt(16); -} - -inline bool IsInt32(const DataType& dtype) { - return dtype == Int(32); -} - -inline bool IsUint32(const DataType& dtype) { - return dtype == UInt(32); -} - -inline bool IsFloat32(const DataType& dtype) { - return dtype == Float(32); -} - -inline bool IsQuantizedType(const DataType& dtype) { - return IsInt8(dtype) || IsUint8(dtype) - || IsInt16(dtype) || IsUint16(dtype); -} - -enum class QuantizeOpType : uint8_t { +enum class QuantizeOpType { Quantize, Dequantize, Requantize @@ -75,11 +47,11 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsFloat32(in_dtype) || IsQuantizedType(in_dtype); - case QuantizeOpType ::Dequantize: - return IsQuantizedType(in_dtype); - case QuantizeOpType ::Requantize: - return IsInt16(in_dtype) || IsInt32(in_dtype); + return in_dtype == Float(32) || IsQNNDataType(in_dtype); + case QuantizeOpType::Dequantize: + return IsQNNDataType(in_dtype); + case QuantizeOpType::Requantize: + return in_dtype == Int(16) || in_dtype == Int(32); default: return false; } @@ -89,51 +61,51 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsQuantizedType(in_dtype); + return IsQNNDataType(in_dtype); case QuantizeOpType::Dequantize: - return IsFloat32(in_dtype); + return in_dtype == Float(32); default: return false; } } inline const int32_t GetQmin(const DataType& dtype) { - if (IsInt8(dtype)) { + if (dtype == Int(8)) { return std::numeric_limits::min(); - } else if (IsUint8(dtype)) { + } else if (dtype == UInt(8)) { return std::numeric_limits::min(); - } else if (IsInt16(dtype)) { + } else if (dtype == Int(16)) { return std::numeric_limits::min(); - } else if (IsUint16(dtype)) { + } else if (dtype == UInt(16)) { return std::numeric_limits::min(); - } else if (IsInt32(dtype)) { + } else if (dtype == Int(32)) { return std::numeric_limits::min(); - } else if (IsUint32(dtype)) { + } else if (dtype == UInt(32)) { return std::numeric_limits::min(); } - LOG(FATAL) << "Type not supported\n"; + LOG(FATAL) << "Type not supported " << dtype; return -1; } inline const int32_t GetQmax(const DataType& dtype) { - if (IsInt8(dtype)) { + if (dtype == Int(8)) { return std::numeric_limits::max(); - } else if (IsUint8(dtype)) { + } else if (dtype == UInt(8)) { return std::numeric_limits::max(); - } else if (IsInt16(dtype)) { + } else if (dtype == Int(16)) { return std::numeric_limits::max(); - } else if (IsUint16(dtype)) { + } else if (dtype == UInt(16)) { return std::numeric_limits::max(); - } else if (IsInt32(dtype)) { + } else if (dtype == Int(32)) { return std::numeric_limits::max(); - } else if (IsUint32(dtype)) { + } 
else if (dtype == UInt(32)) { return std::numeric_limits::max(); } - LOG(FATAL) << "Type not supported\n"; + LOG(FATAL) << "Type not supported " << dtype; return -1; } } // namespace relay } // namespace tvm -#endif // TVM_RELAY_QUANTIZE_UTIL_H_ +#endif // TVM_RELAY_QNN_UTIL_H_ diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 092e695cf533..17790294b8bc 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -21,7 +21,7 @@ from tvm.relay.testing import create_workload from tvm.contrib import graph_runtime -rounding_modes = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] +roundings = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] def run_infer_type(expr): mod = relay.Module.from_expr(expr) @@ -32,7 +32,7 @@ def run_infer_type(expr): def test_requantize(): def verify(func, goldens): - with relay.build_config(opt_level=0): + with relay.build_config(opt_level=3): graph, lib, params = relay.build(func, "llvm", params=None) golden_data, golden_output = goldens mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) @@ -42,8 +42,8 @@ def verify(func, goldens): res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, use_int_compute, - rounding_mode, input_scale, output_scale, input_zero_point=0, + def get_func(data_shape, data_dtype, out_dtype, use_int_domain, + rounding, input_scale, output_scale, input_zero_point=0, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) @@ -53,9 +53,9 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, output_zero_point=output_zero_point, input_scale=input_scale, output_scale=output_scale, - rounding_mode=rounding_mode, + rounding=rounding, out_dtype=out_dtype, - use_int_compute=use_int_compute) + use_int_domain=use_int_domain) func = relay.Function(relay.analysis.free_vars(func), func) @@ -64,193 +64,202 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, return func - def run_tests(): - def same_scale_test(): - # Have same scales, everything within range - golden_data = np.arange(-100, 100, 1).astype('int32') - golden_output = golden_data - - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(200, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=0.5, - output_scale=0.5) - verify(func, (golden_data, golden_output)) - - def downscale_test(): - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=16) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2], [9, 16, 7]) - else: - golden_output = np.repeat([0, -1, -2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) - - # Try a different scale - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=4) - - # Try positive values - # 2I corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + def same_scale_test(): + # Have same scales, everything within range + golden_data = np.arange(-100, 100, 1).astype('int32') + golden_output = golden_data + + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + verify(func, (golden_data, golden_output)) + + def downscale_test(): + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2], [9, 16, 7]) + else: + golden_output = np.repeat([0, -1, -2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try a different scale + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=4) + + # Try positive values + # 2I corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [3, 4, 4, 4, 4, 4, 4, 4, 1]) + else: + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [3, 4, 4, 4, 4, 4, 4, 4, 1]) - else: - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) - - def upscale_test(): - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=2, - output_scale=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) - - def saturation_test(): - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(16, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=0.5, - output_scale=0.5) - golden_data = np.arange(0, 16, 1).astype('int32') - golden_data = np.add(120, golden_data) - output = np.array([120, 121, 122, 123, 124, 125, 126, 127, - 127, 127, 127, 127, 127, 127, 127, 127]) - golden_output = output - verify(func, (golden_data, golden_output)) - - # Try negative numbers - golden_data = np.arange(0, -16, -1).astype('int32') - golden_data = np.add(-120, golden_data) - output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, - -128, -128, -128, -128, -128, -128, -128, -128]) - golden_output = output - verify(func, (golden_data, golden_output)) - - def zero_point_test(): - # Output zero point - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=16, - output_zero_point=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) - - # Input zero point - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=16, - input_zero_point=16) - - # Try positive values - golden_data = np.arange(32, 64, 1).astype('int32') - golden_output = np.repeat([2, 3, 4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) - - # Try negative values - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) - - - - - if __name__ == "__main__": - same_scale_test() - downscale_test() - upscale_test() - saturation_test() - zero_point_test() - - run_tests() + verify(func, (golden_data, golden_output)) + + # Try uint8 out_dtype + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='uint8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + def upscale_test(): + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=2, + output_scale=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + def saturation_test(): + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) + + def zero_point_test(): + # Output zero point + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16, + output_zero_point=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Input zero point + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + same_scale_test() + downscale_test() + upscale_test() + saturation_test() + zero_point_test() if __name__ == "__main__": test_requantize() From 989bbea47c6c1e78b637d2420a033da5a7e3b2c2 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 17:29:14 +0000 Subject: [PATCH 26/51] Modifying the GetFixedPointMultiplierShift API as per comments. 
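The patch below reworks GetFixedPointMultiplierShift to return the (significand, exponent) pair directly. The decomposition is standard frexp splitting; here is a minimal Python sketch of the same idea (the function name is invented, not a TVM API):

import math

def fixed_point_multiplier_shift(double_multiplier, idtype_bits=32):
    """Split a float into an integer significand and a power-of-two exponent,
    so that double_multiplier ~= significand * 2**(exponent - (idtype_bits - 1))."""
    if double_multiplier == 0.0:
        return 0, 0
    significand_d, exponent = math.frexp(double_multiplier)  # significand_d in [0.5, 1)
    significand = int(round(significand_d * (1 << (idtype_bits - 1))))
    if significand == 1 << (idtype_bits - 1):   # rounding overflowed to 1.0
        significand //= 2
        exponent += 1
    return significand, exponent

# Example from the comment below: 0.25 = (0.5) * 2^(-1),
# so for 32 bits the significand is 2^30 and the exponent is -1.
sig, exp = fixed_point_multiplier_shift(0.25)
assert (sig, exp) == (1 << 30, -1)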
---
 src/relay/pass/pattern_util.h                 |  2 +-
 src/relay/qnn/pass/qnn_lower.cc               | 45 ++++++++++---------
 ...{test_quantized_ops.py => test_qnn_ops.py} |  0
 3 files changed, 24 insertions(+), 23 deletions(-)
 rename tests/python/unittest/{test_quantized_ops.py => test_qnn_ops.py} (100%)

diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index 4492487de119..5d6a9cf3e68f 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -18,7 +18,7 @@
  */

 /*!
- * Copyright (c) 2019 by Contributors.
+ * Copyright (c) 2018 by Contributors.
  *
  * \file tvm/relay/pass/pattern_util.h
  * \brief Header of internal operator functions
diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc
index d491e2a817d3..d0bc2d430961 100644
--- a/src/relay/qnn/pass/qnn_lower.cc
+++ b/src/relay/qnn/pass/qnn_lower.cc
@@ -38,9 +38,9 @@ namespace relay {
 /*
  * Converts a floating point number so that it can be represented by integers.
  * The representation is
- * float_number = (fixed_point_multiplier) * 2^(shift)
+ * float_number = (significand) * 2^(exponent)
  *
- * The fixed_point_multiplier is a number between 0.5 and 1. This is represented
+ * The significand is a number between 0.5 and 1. This is represented
  * by an integer number. For example, if it is int32, then the decimal point
  * exists between bit 31 and 30 from LSB (or between first and second bit from
  * the left).
  *
  * Some examples are
  *   0.25 = (0.5) * 2^(-1)
  *   0.125 = (0.5) * 2^(-2)
+ *
+ * Credit to TFLite reference implementation.
  */
-void GetFixedPointMultiplierShift(double double_multiplier,
-    int32_t* fixed_point_multiplier, int* shift,
+std::pair<int32_t, int> GetFixedPointMultiplierShift(double double_multiplier,
     const DataType& idtype) {
-
+  int significand, exponent;
   int idtype_bits = idtype.bits();
-  if (double_multiplier == 0.) {
-    *fixed_point_multiplier = 0;
-    *shift = 0;
-    return;
-  }
-  const double q = std::frexp(double_multiplier, shift);
-  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << (idtype_bits - 1))));
-  CHECK_LE(q_fixed, (1ll << (idtype_bits - 1)));
-  if (q_fixed == (1ll << (idtype_bits - 1))) {
-    q_fixed /= 2;
-    ++*shift;
+  // Get the significand and exponent.
+  double significand_d = std::frexp(double_multiplier, &exponent);
+
+  // Convert the double significand to int significand.
+  significand_d = std::round(significand_d * (1ll << (idtype_bits - 1)));
+  auto significand_int64 = static_cast<int64_t>(significand_d);
+  CHECK_LE(significand_int64, (1ll << (idtype_bits - 1)));
+  if (significand_int64 == (1ll << (idtype_bits - 1))) {
+    significand_int64 /= 2;
+    ++exponent;
   }
-  CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
-  *fixed_point_multiplier = static_cast<int32_t>(q_fixed);
+  CHECK_LE(significand_int64, std::numeric_limits<int32_t>::max());
+  significand = static_cast<int32_t>(significand_int64);
+  return std::pair<int32_t, int>(significand, exponent);
 }

 /*
@@ -103,10 +104,10 @@ Expr RequantizeInt(const Expr& input_tensor,
   DataType up_idtype = Int(2 * idtype_bits);

   // 1) Calculating the integer multiplier and integer shift
-  int32_t fixed_point_multiplier;
-  int shift;
-  GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier,
-      &shift, idtype);
+  std::pair<int32_t, int> fixed_point_params =
+      GetFixedPointMultiplierShift(double_multiplier, idtype);
+  int fixed_point_multiplier = fixed_point_params.first;
+  int shift = fixed_point_params.second;

   int left_shift = shift > 0 ? shift : 0;
   int right_shift = shift > 0 ?
0 : -shift;

diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_qnn_ops.py
similarity index 100%
rename from tests/python/unittest/test_quantized_ops.py
rename to tests/python/unittest/test_qnn_ops.py

From 8df0ddb50e59f475cdff1cb78c3b5bab9a930911 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Mon, 15 Jul 2019 19:00:39 +0000
Subject: [PATCH 27/51] Forgot the dialect change.

---
 python/tvm/relay/qnn/op/_make.py | 2 +-
 src/relay/qnn/op/requantize.cc   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/qnn/op/_make.py b/python/tvm/relay/qnn/op/_make.py
index b1695629b8f9..07b3dd154760 100644
--- a/python/tvm/relay/qnn/op/_make.py
+++ b/python/tvm/relay/qnn/op/_make.py
@@ -17,4 +17,4 @@
 """Constructor APIs"""
 from ...._ffi.function import _init_api

-_init_api("relay.op.qnn._make", __name__)
+_init_api("relay.qnn.op._make", __name__)

diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index 9d0504631893..13179f15f22a 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -91,7 +91,7 @@ computation. This is controlled by use_int_domain.
 .set_support_level(10)
 .add_type_rel("Requantize", RequantizeRel);

-TVM_REGISTER_API("relay.op.qnn._make.requantize")
+TVM_REGISTER_API("relay.qnn.op._make.requantize")
 .set_body_typed(MakeRequantize);

 } // namespace relay

From 8d0af8651abbc0cf17595ccb7a3fe5f635fcc417 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Mon, 15 Jul 2019 20:02:23 +0000
Subject: [PATCH 28/51] Retriggering Jenkins.

From ff1b9e3c7bef4a9862fb7e612b662aa3ebfc8e33 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Mon, 15 Jul 2019 20:19:20 +0000
Subject: [PATCH 29/51] Changing rewrite to qnn_lower.

---
 python/tvm/relay/qnn/ir_pass.py       | 4 ++--
 src/relay/qnn/pass/qnn_lower.cc       | 2 +-
 tests/python/unittest/test_qnn_ops.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/qnn/ir_pass.py b/python/tvm/relay/qnn/ir_pass.py
index 24e3329e961c..edeecd9a0e6c 100644
--- a/python/tvm/relay/qnn/ir_pass.py
+++ b/python/tvm/relay/qnn/ir_pass.py
@@ -20,7 +20,7 @@

 from . import _qnn

-def rewrite(expr):
+def qnn_lower(expr):
     """
     Rewrites the high-level quantized ops into low-level existing Relay ops.

    Parameters
    ----------
    expr : tvm.relay.Expr
        The input expression.

    Returns
    -------
    expr : tvm.relay.Expr
        The output expression.
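With this rename, only the pass name changes relative to the earlier usage sketch:

from tvm import relay
# `func` as built in the earlier usage sketch; only the entry point differs:
func = relay.qnn.ir_pass.qnn_lower(func)   # previously relay.qnn.ir_pass.rewrite(func)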
""" - return _qnn.rewrite(expr) + return _qnn.qnn_lower(expr) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index d0bc2d430961..5048d8686a61 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -250,7 +250,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._qnn.rewrite") +TVM_REGISTER_API("relay._qnn.qnn_lower") .set_body_typed([](const Expr& e) { Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); return ret; diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 17790294b8bc..5c84ef19f1c7 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -60,7 +60,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_domain, func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.qnn.ir_pass.rewrite(func) + func = relay.qnn.ir_pass.qnn_lower(func) return func From 362869f3ab4e4b29f84116e46106d1a565184107 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 20:21:23 +0000 Subject: [PATCH 30/51] Renaming Quantize to Qnn for clarity. --- src/relay/qnn/pass/qnn_lower.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 5048d8686a61..b05ea8fbded9 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -248,11 +248,11 @@ Expr RequantizeForwardRewrite(const Call& ref_call, } RELAY_REGISTER_OP("qnn.requantize") -.set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); +.set_attr("FQnnForwardRewrite", RequantizeForwardRewrite); TVM_REGISTER_API("relay._qnn.qnn_lower") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + Expr ret = ForwardRewrite(e, "FQnnForwardRewrite", nullptr, nullptr); return ret; }); From 36f0ed9121b63d3fa7bf00e6a57097a02c609322 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 17 Jul 2019 06:02:41 +0000 Subject: [PATCH 31/51] Remove use_int_domain. --- python/tvm/relay/qnn/op/qnn.py | 13 +- src/relay/qnn/op/requantize.cc | 7 +- src/relay/qnn/pass/qnn_lower.cc | 44 +--- tests/python/unittest/test_qnn_ops.py | 318 ++++++++++++-------------- 4 files changed, 155 insertions(+), 227 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index b0e06e41ed13..65369c840b67 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -25,8 +25,7 @@ def requantize(data, output_scale, output_zero_point, out_dtype="int32", - rounding="FE_AWAY_FROM_ZERO", - use_int_domain=True): + rounding="FE_AWAY_FROM_ZERO"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -35,10 +34,6 @@ def requantize(data, Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - The above computation can be done in floating point as the scales are in - FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_domain. - Parameters ---------- data : tvm.relay.Expr @@ -59,9 +54,6 @@ def requantize(data, out_dtype : str, optional Specifies the output data type for mixed precision conv2d. - use_int_domain : bool, optional - Use fully integer computation for requantizing. 
- rounding : string, optional Defines the rounding direction when the value is midway between two representable values. @@ -80,5 +72,4 @@ def requantize(data, output_scale, output_zero_point, out_dtype, - rounding, - use_int_domain) + rounding) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 13179f15f22a..62688147b06e 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -59,8 +59,7 @@ Expr MakeRequantize(Expr data, double output_scale, int32_t output_zero_point, DataType out_dtype, - std::string rounding, - bool use_int_domain) { + std::string rounding) { auto attrs = make_node(); attrs->input_scale = std::move(input_scale); attrs->input_zero_point = std::move(input_zero_point); @@ -68,7 +67,6 @@ Expr MakeRequantize(Expr data, attrs->output_zero_point = std::move(output_zero_point); attrs->out_dtype = std::move(out_dtype); attrs->rounding = std::move(rounding); - attrs->use_int_domain = std::move(use_int_domain); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } @@ -81,9 +79,6 @@ point. The computation looks like this Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) -The above computation can be done in floating point as the scales are in -FP32. Alternatively, we can approximate floating point with fixed point -computation. This is controlled by use_int_domain. )code" TVM_ADD_FILELINE) .set_attrs_type_key("relay.attrs.RequantizeAttrs") .set_num_inputs(1) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index b05ea8fbded9..621b8aee2ac7 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -93,7 +93,7 @@ std::pair GetFixedPointMultiplierShift(double double_multiplier, * 7) Cast to the out_dtype. * */ -Expr RequantizeInt(const Expr& input_tensor, +Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { @@ -174,39 +174,6 @@ Expr RequantizeInt(const Expr& input_tensor, return requantized_output; } - -/* - * Requantization using floating computation. Here we can multiply the scale to - * the input_tensor, round to nearest integer and then cast back to int32. - */ -Expr RequantizeFloat(const Expr& input_tensor, - const RequantizeAttrs* param, const DataType& idtype, - const Array& out_shape) { - double double_multiplier = param->input_scale/param->output_scale; - auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); - auto input_zp = MakeConstantScalar(idtype, param->input_zero_point); - auto output_zp = MakeConstantScalar(Float(32), param->output_zero_point); - - // Multiply the tensor with the new scale. - auto shifted_input_t = Subtract(input_tensor, input_zp); - auto casted_t = Cast(shifted_input_t, Float(32)); - auto multiplied_t = Multiply(casted_t, scalar_multiplier); - auto shifted_multiplied_t = Add(output_zp, multiplied_t); - auto rounded_t = Round(shifted_multiplied_t); - auto q_imin = GetQmin(idtype); - auto q_imax = GetQmax(idtype); - auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), - idtype); - - // Clip to the out_dtype min/max. - // Clip limits must be smaller than the dtype of the input tensor. 
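The bound intersection this comment describes can be made concrete with a small sketch (qrange is an invented helper, not a TVM function):

import numpy as np

def qrange(dtype):
    """Integer range of a quantized dtype (invented helper for this sketch)."""
    info = np.iinfo(dtype)
    return info.min, info.max

# The clip limits are the intersection of the input dtype range and the
# out_dtype range: an int8 tensor requantized to uint8 must clip at
# [0, 127], not at [0, 255].
idtype, out_dtype = "int8", "uint8"
q_min = max(qrange(out_dtype)[0], qrange(idtype)[0])   # -> 0
q_max = min(qrange(out_dtype)[1], qrange(idtype)[1])   # -> 127
assert (q_min, q_max) == (0, 127)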
- auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); - auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); - auto clipped_t = Clip(scaled_int32_t, q_min, q_max); - auto requantized_output = Cast(clipped_t, param->out_dtype); - return requantized_output; -} - /* * Lowering of the requantize operation. The requantize operator converts one * quantized tensor to another quantized tensor. For the output tensor, we are @@ -214,9 +181,6 @@ Expr RequantizeFloat(const Expr& input_tensor, * * Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) * - * The above computation can be done in floating point as the scales are in - * FP32. Alternatively, we can approximate floating point with fixed point - * computation. This is controlled by use_int_domain. */ Expr RequantizeForwardRewrite(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { @@ -240,11 +204,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - if (param->use_int_domain) { - return RequantizeInt(quantized_data, param, input_dtype, out_shape); - } else { - return RequantizeFloat(quantized_data, param, input_dtype, out_shape); - } + return RequantizeLower(quantized_data, param, input_dtype, out_shape); } RELAY_REGISTER_OP("qnn.requantize") diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 5c84ef19f1c7..342e1ce09d99 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -42,9 +42,8 @@ def verify(func, goldens): res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, use_int_domain, - rounding, input_scale, output_scale, input_zero_point=0, - output_zero_point=0): + def get_func(data_shape, data_dtype, out_dtype, rounding, input_scale, + output_scale, input_zero_point=0, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) func = relay.qnn.op.requantize( @@ -54,8 +53,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_domain, input_scale=input_scale, output_scale=output_scale, rounding=rounding, - out_dtype=out_dtype, - use_int_domain=use_int_domain) + out_dtype=out_dtype) func = relay.Function(relay.analysis.free_vars(func), func) @@ -70,190 +68,174 @@ def same_scale_test(): golden_output = golden_data for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(200, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=0.5, - output_scale=0.5) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + verify(func, (golden_data, golden_output)) def downscale_test(): for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2], [9, 16, 7]) - else: - golden_output = np.repeat([0, -1, -2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2], [9, 16, 7]) + else: + golden_output = np.repeat([0, -1, -2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) # Try a different scale - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=4) - - # Try positive values - # 2I corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=1, + output_scale=4) + + # Try positive values + # 2I corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [3, 4, 4, 4, 4, 4, 4, 4, 1]) + else: + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [3, 4, 4, 4, 4, 4, 4, 4, 1]) - else: - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) + verify(func, (golden_data, golden_output)) # Try uint8 out_dtype - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='uint8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='uint8', + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) def upscale_test(): for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=2, - output_scale=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=2, + output_scale=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) def saturation_test(): for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(16, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=0.5, - output_scale=0.5) - golden_data = np.arange(0, 16, 1).astype('int32') - golden_data = np.add(120, golden_data) - output = np.array([120, 121, 122, 123, 124, 125, 126, 127, - 127, 127, 127, 127, 127, 127, 127, 127]) - golden_output = output - verify(func, (golden_data, golden_output)) - - # Try negative numbers - golden_data = np.arange(0, -16, -1).astype('int32') - golden_data = np.add(-120, golden_data) - output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, - -128, -128, -128, -128, -128, -128, -128, -128]) - golden_output = output - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) def zero_point_test(): # Output zero point for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16, - output_zero_point=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + rounding=rounding, + input_scale=1, + output_scale=16, + output_zero_point=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) # Input zero point for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16, - input_zero_point=16) - - # Try positive values - golden_data = np.arange(32, 64, 1).astype('int32') - golden_output = np.repeat([2, 3, 4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) - - # Try negative values - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + rounding=rounding, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) same_scale_test() downscale_test() From b45c62946ed943d6c5399e2ce64d37b176063eb7 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Wed, 17 Jul 2019 13:45:34 -0700 Subject: [PATCH 32/51] Working quantized fully-connected with int8 and uint8 --- include/tvm/relay/qnn/attrs.h | 22 +++++ python/tvm/relay/qnn/op/qnn.py | 29 +++++++ src/relay/op/nn/nn.cc | 43 +--------- src/relay/op/nn/nn.h | 95 +++++++++++++++++++++ src/relay/pass/pattern_util.h | 11 +++ src/relay/qnn/op/nn/nn.cc | 69 +++++++++++++++ src/relay/qnn/pass/quantize_rewrite.cc | 33 +++++++ src/relay/qnn/util.h | 16 ++-- tests/python/unittest/test_quantized_ops.py | 81 +++++++++++++++++- 9 files changed, 351 insertions(+), 48 deletions(-) create mode 100644 src/relay/op/nn/nn.h create mode 100644 src/relay/qnn/op/nn/nn.cc diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 6bcd77a81f8a..b8d8dde6d7f1 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_QNN_H_ #include +#include #include namespace tvm { @@ -64,6 +65,27 @@ struct RequantizeAttrs : public tvm::AttrsNode { } }; +/*! \brief Attributes for quantized dense operator */ +struct QDenseAttrs : public tvm::AttrsNode { + IndexExpr units; + DataType out_dtype; + // Quantization related attributes. 
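+  // Zero points of the affine mapping real_value = scale * (quantized_value - zero_point).
+  // Only the zero points live on this op; the scales are applied separately
+  // (for example by a following qnn.requantize).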
+  int32_t input_zero_point;
+  int32_t kernel_zero_point;
+
+  TVM_DECLARE_ATTRS(QDenseAttrs, "relay.attrs.QDenseAttrs") {
+    TVM_ATTR_FIELD(units)
+      .describe("Number of hidden units of the dense transformation.");
+
+    TVM_ATTR_FIELD(out_dtype)
+      .describe("Output data type, set to explicit type under mixed precision setting");
+
+    TVM_ATTR_FIELD(input_zero_point)
+      .describe("The zero point of the input tensor.");
+    TVM_ATTR_FIELD(kernel_zero_point)
+      .describe("The zero point of the kernel tensor.");
+  }
+};
 }  // namespace relay
 }  // namespace tvm
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index 8db431eebe23..3e9663b1d407 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -72,3 +72,38 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point,
     return _make.requantize(input_data, input_zero_point, input_scale,
                             output_zero_point, output_scale, out_dtype,
                             use_int_compute, rounding_mode)
+
+def quantized_dense(data, weight, input_zero_point, kernel_zero_point, units=None, out_dtype="int32"):
+    """Quantized dense operator.
+    Applies a linear transformation
+
+    .. math::
+
+        Y = X * W^T
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The quantized input data to the operator.
+
+    weight : tvm.relay.Expr
+        The quantized weight expressions.
+
+    input_zero_point : int
+        The zero point of the quantized data distribution.
+
+    kernel_zero_point : int
+        The zero point of the quantized kernel distribution.
+
+    units : int, optional
+        Number of hidden units of the dense transformation.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision dense; can be int32 or int16.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.dense(data, weight, units, input_zero_point, kernel_zero_point, out_dtype)
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index c0f36bfa2915..8a35f699e65a 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -32,7 +32,7 @@
 #include 
 #include 
 #include 
-#include "../type_relations.h"
+#include "nn.h"
 #include "../../pass/alter_op_layout.h"
 #include "../op_common.h"
@@ -102,45 +102,6 @@ RELAY_REGISTER_OP("nn.bias_add")
 // relay.nn.dense
 TVM_REGISTER_NODE_TYPE(DenseAttrs);
-
-bool DenseRel(const Array<Type>& types,
-              int num_inputs,
-              const Attrs& attrs,
-              const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 3);
-  const auto* data = types[0].as<TensorTypeNode>();
-  const auto* weight = types[1].as<TensorTypeNode>();
-  if (data == nullptr) return false;
-
-  const DenseAttrs* param = attrs.as<DenseAttrs>();
-  CHECK(param != nullptr);
-
-  CHECK(static_cast<int>(data->shape.size()) != 0);
-
-  Array<tvm::Expr> oshape = data->shape;
-  if (param->units.defined()) {
-    Array<tvm::Expr> dshape = data->shape;
-    // validate the weight shape is proper if defined
-    // Assign weight type
-    Array<tvm::Expr> wshape({param->units, dshape[dshape.size() - 1]});
-    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
-    oshape.Set((oshape.size() - 1), param->units);
-  } else {
-    if (weight == nullptr) return false;
-    Array<tvm::Expr> wshape = weight->shape;
-    oshape.Set((oshape.size() - 1), wshape[0]);
-  }
-
-  DataType out_dtype = param->out_dtype;
-  if (out_dtype.bits() == 0) {
-    out_dtype = data->dtype;
-  }
-  // assign output type
-  reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
-  return true;
-}
-
-
 // Positional relay function to create dense operator used by frontend FFI.
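For orientation before the C++ plumbing below, a usage sketch of the new Python entry point (shapes and zero points borrowed from the unit test added later in this patch; illustrative only, not a new test):

    from tvm import relay

    data = relay.var("quantized_data", shape=(2, 10), dtype="uint8")
    weight = relay.var("quantized_kernel", shape=(3, 10), dtype="uint8")

    # In this revision the lowering *adds* the stored zero point to each operand,
    # so the test passes -127 for tensors quantized with a zero point of 127.
    dense = relay.qnn.op.quantized_dense(data, weight,
                                         input_zero_point=-127,
                                         kernel_zero_point=-127,
                                         units=3)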
 Expr MakeDense(Expr data,
                Expr weight,
@@ -171,7 +132,7 @@ RELAY_REGISTER_OP("nn.dense")
 .add_argument("data", "nD Tensor", "Input data.")
 .add_argument("weight", "2D Tensor", "Weight matrix.")
 .set_support_level(1)
-.add_type_rel("Dense", DenseRel);
+.add_type_rel("Dense", DenseRel<DenseType::kUnquantizedDense, DenseAttrs>);
 
 // relay.leaky_relu
 TVM_REGISTER_NODE_TYPE(LeakyReluAttrs);
diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h
new file mode 100644
index 000000000000..d9e9b6623475
--- /dev/null
+++ b/src/relay/op/nn/nn.h
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file nn.h
+ * \brief Property def of nn operators that need to be shared by quantized and unquantized ops.
+ */
+
+#ifndef TVM_NN_H
+#define TVM_NN_H
+
+#include 
+#include 
+#include "../type_relations.h"
+#include "../../qnn/util.h"
+#include 
+
+namespace tvm {
+namespace relay {
+
+// relay.nn.dense
+enum DenseType {
+  kUnquantizedDense,
+  kQuantizedDense
+};
+
+template <DenseType mode, typename AttrType>
+inline bool DenseRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  const auto* param = attrs.as<AttrType>();
+  CHECK(param != nullptr);
+
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+  if (mode == DenseType::kQuantizedDense) {
+    CHECK(IsValidOpInputType(QuantizeOpType::QuantizedDense, data->dtype))
+      << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype;
+    CHECK(IsValidOpInputType(QuantizeOpType::QuantizedDense, weight->dtype))
+      << "Expected quantized dense type(int8, uint8) for weight but was " << weight->dtype;
+    CHECK(data->dtype == weight->dtype) << "Data and weight dtypes do not match";
+    CHECK(IsValidOpOutputType(QuantizeOpType::QuantizedDense, param->out_dtype))
+      << "Expected quantized dense type(int32, int16) for output but was " << param->out_dtype;
+  }
+  Array<tvm::Expr> oshape = data->shape;
+  if (param->units.defined()) {
+    Array<tvm::Expr> dshape = data->shape;
+    // validate the weight shape is proper if defined
+    // Assign weight type
+    Array<tvm::Expr> wshape({param->units, dshape[dshape.size() - 1]});
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+    oshape.Set((oshape.size() - 1), param->units);
+  } else {
+    if (weight == nullptr) return false;
+    Array<tvm::Expr> wshape = weight->shape;
+    oshape.Set((oshape.size() - 1), wshape[0]);
+  }
+
+  DataType out_dtype = param->out_dtype;
+  if (mode == DenseType::kUnquantizedDense) {
+    if (out_dtype.bits() == 0) {
+      out_dtype = data->dtype;
+    }
+  }
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
+  return true;
+}
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_NN_H
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
index faccd518a782..d2abb03db961 100644
--- a/src/relay/pass/pattern_util.h
+++ b/src/relay/pass/pattern_util.h
@@ -394,6 +394,17 @@ inline Expr Full(Expr fill_value,
   return CallNode::make(op, {fill_value}, Attrs(attrs), {});
 }
 
+inline Expr Dense(Expr data,
+                  Expr weight,
+                  IndexExpr units,
+                  DataType out_dtype) {
+  auto attrs = make_node<DenseAttrs>();
+  attrs->units = units;
+  attrs->out_dtype = out_dtype;
+  static const Op& op = Op::Get("nn.dense");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
 Expr MakeConcatenate(Expr data, int axis);
 
 Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
diff --git a/src/relay/qnn/op/nn/nn.cc b/src/relay/qnn/op/nn/nn.cc
new file mode 100644
index 000000000000..7046e0c5d626
--- /dev/null
+++ b/src/relay/qnn/op/nn/nn.cc
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file nn.cc
+ * \brief Property def of quantized nn operators.
+ */
+
+#include 
+#include "../../../op/nn/nn.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.qnn.dense
+TVM_REGISTER_NODE_TYPE(QDenseAttrs);
+// Positional relay function to create quantized dense operator used by frontend FFI.
+Expr MakeQuantizedDense(Expr data,
+                        Expr weight,
+                        IndexExpr units,
+                        int32_t input_zero_point,
+                        int32_t kernel_zero_point,
+                        DataType out_dtype) {
+  auto attrs = make_node<QDenseAttrs>();
+  attrs->units = units;
+  attrs->out_dtype = out_dtype;
+  attrs->input_zero_point = input_zero_point;
+  attrs->kernel_zero_point = kernel_zero_point;
+  static const Op& op = Op::Get("qnn.dense");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.qnn._make.dense")
+.set_body_typed(MakeQuantizedDense);
+
+RELAY_REGISTER_OP("qnn.dense")
+.describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
+
+- **data**: quantized(int8, uint8) `(x1, x2, ..., xn, input_dim)`
+- **weight**: quantized(int8, uint8) `(units, input_dim)`
+- **out**: quantized(int32) `(x1, x2, ..., xn, units)`.
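+
+The operator is lowered by the rewrite pass: both operands are cast to int32,
+the stored zero point offsets are added to them, and the product is then
+accumulated by a regular nn.dense in int32.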
+ +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.QDenseAttrs") +.set_num_inputs(2) +.add_argument("data", "quantized nD Tensor", "Input data.") +.add_argument("weight", "quantized 2D Tensor", "Weight matrix.") +.set_support_level(10) +.add_type_rel("QDense", DenseRel); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 7d4e0f017050..33b230bd3a36 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -252,6 +252,39 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); +Expr QuantizedDenseForwardRewrite(const Call& ref_call, + const Array& new_args, const NodeRef& ctx) { + CHECK_EQ(new_args.size(), 2); + Expr quantized_data = new_args[0]; + Expr quantized_kernel = new_args[1]; + const auto* param = ref_call->attrs.as(); + + Array out_shape; + auto ref_call_t = ref_call->checked_type(); + auto output_tt = ref_call_t.as(); + CHECK(output_tt != nullptr) << "Type information missing." + << " Please run infer_type pass."; + //TODO: need to benchmark the performance of this lowering. + Expr quantized_data_int32 = Cast(quantized_data, Int(32)); + if(param->input_zero_point != 0) { + quantized_data_int32 = Add(quantized_data_int32, MakeConstantScalar(Int(32), + param->input_zero_point)); + } + Expr quantized_kernel_int32 = Cast(quantized_kernel, Int(32)); + if(param->kernel_zero_point != 0) { + quantized_kernel_int32 = Add(quantized_kernel_int32, MakeConstantScalar(Int(32), + param->kernel_zero_point)); + } + Expr int32_dense = Dense(quantized_data_int32, + quantized_kernel_int32, + param->units, + param->out_dtype); + return int32_dense; +} + +RELAY_REGISTER_OP("qnn.dense") +.set_attr("FQuantizeForwardRewrite", QuantizedDenseForwardRewrite); + TVM_REGISTER_API("relay._qnn.rewrite") .set_body_typed([](const Expr& e) { Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 61663b0da85e..dd723b30165c 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -68,15 +68,17 @@ inline bool IsQuantizedType(const DataType& dtype) { enum class QuantizeOpType : uint8_t { Quantize, Dequantize, - Requantize + Requantize, + QuantizedDense }; inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsFloat32(in_dtype) || IsQuantizedType(in_dtype); - case QuantizeOpType ::Dequantize: + return IsFloat32(in_dtype); + case QuantizeOpType::Dequantize: + case QuantizeOpType::QuantizedDense: return IsQuantizedType(in_dtype); case QuantizeOpType ::Requantize: return IsInt16(in_dtype) || IsInt32(in_dtype); @@ -86,12 +88,14 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type, } inline bool IsValidOpOutputType(const QuantizeOpType& op_type, - const DataType& in_dtype) { + const DataType& out_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsQuantizedType(in_dtype); + return IsQuantizedType(out_dtype); case QuantizeOpType::Dequantize: - return IsFloat32(in_dtype); + return IsFloat32(out_dtype); + case QuantizeOpType::QuantizedDense: + return IsInt32(out_dtype) || IsInt16(out_dtype); default: return false; } diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 092e695cf533..a278ab76d2fa 100644 --- 
a/tests/python/unittest/test_quantized_ops.py
+++ b/tests/python/unittest/test_quantized_ops.py
@@ -252,5 +252,79 @@ def zero_point_test():
     run_tests()
 
+def test_quantized_dense():
+
+    def test_uint():
+        quantized_data = relay.var("quantized_data", shape=(2, 10),
+                                   dtype="uint8")
+        quantized_kernel = relay.var("quantized_kernel", shape=(3, 10),
+                                     dtype="uint8")
+
+        func = relay.qnn.op.quantized_dense(
+            quantized_data,
+            quantized_kernel,
+            -127,
+            -127,
+            3,
+        )
+
+        quantized_data_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 109, 107,
+                                      129, 131, 133, 135, 137, 139, 141, 111, 145, 107]).astype('uint8').reshape((2, 10))
+        quantized_kernel_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 145, 147,
+                                        129, 131, 133, 135, 137, 139, 141, 143, 145, 147,
+                                        129, 131, 133, 135, 137, 139, 141, 143, 145, 147]).astype('uint8').reshape((3, 10))
+
+        func = relay.Function(relay.analysis.free_vars(func),
+                              func)
+        func = run_infer_type(func)
+        func = relay.qnn.ir_pass.rewrite(func)
+        with relay.build_config(opt_level=0):
+            graph, lib, params = relay.build(func, "llvm", params=None)
+        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod.set_input("quantized_data", quantized_data_np)
+        mod.set_input("quantized_kernel", quantized_kernel_np)
+        mod.set_input(**params)
+        mod.run()
+        res = mod.get_output(0).asnumpy()
+        print(res)
+
+    def test_int():
+        quantized_data = relay.var("quantized_data", shape=(2, 10),
+                                   dtype="int8")
+        quantized_kernel = relay.var("quantized_kernel", shape=(3, 10),
+                                     dtype="int8")
+
+        func = relay.qnn.op.quantized_dense(
+            quantized_data,
+            quantized_kernel,
+            1,
+            1,
+            3,
+        )
+
+        quantized_data_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21,
+                                      1, 3, 5, 7, 9, 11, 13, -17, 17, -21]).astype('int8').reshape((2, 10))
+        quantized_kernel_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+                                        1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+                                        1, 3, 5, 7, 9, 11, 13, 15, 17, 19]).astype('int8').reshape((3, 10))
+
+        func = relay.Function(relay.analysis.free_vars(func),
+                              func)
+        func = run_infer_type(func)
+        func = relay.qnn.ir_pass.rewrite(func)
+        with relay.build_config(opt_level=0):
+            graph, lib, params = relay.build(func, "llvm", params=None)
+        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod.set_input("quantized_data", quantized_data_np)
+        mod.set_input("quantized_kernel", quantized_kernel_np)
+        mod.set_input(**params)
+        mod.run()
+        res = mod.get_output(0).asnumpy()
+        print(res)
+
+    test_uint()
+    test_int()
+
+
 if __name__ == "__main__":
-    test_requantize()
+    # test_requantize()
+    test_quantized_dense()

From 4958495213df67ec8eb211758ad1c2408f113aa8 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Fri, 19 Jul 2019 00:05:03 +0000
Subject: [PATCH 33/51] Incorporating review comments.
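
Among the review comments addressed here, the rounding modes lose their FE_
prefix and become UPWARD and AWAY_FROM_ZERO, and the requantize arguments are
reordered so that rounding precedes out_dtype. The two modes differ only at
exact midpoints; a small NumPy sketch of the intended semantics (illustrative,
not the TVM implementation):

    import numpy as np

    def round_upward(x):
        # Ties go toward +inf: -1.5 -> -1 and -0.5 -> 0.
        return np.floor(x + 0.5)

    def round_away_from_zero(x):
        # Ties go away from zero: -1.5 -> -2 and -0.5 -> -1.
        return np.sign(x) * np.floor(np.abs(x) + 0.5)

    assert round_upward(-0.5) == 0
    assert round_away_from_zero(-0.5) == -1

This is why the UPWARD golden outputs in the tests use
np.repeat([0, -1, -2], [9, 16, 7]) for negative inputs while AWAY_FROM_ZERO
uses [8, 16, 8].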
--- include/tvm/relay/qnn/attrs.h | 29 ++++----- python/tvm/relay/qnn/__init__.py | 2 +- python/tvm/relay/qnn/op/qnn.py | 16 ++--- .../relay/qnn/{ir_pass.py => transform.py} | 0 src/relay/qnn/op/requantize.cc | 8 +-- .../qnn/{pass => transform}/qnn_lower.cc | 4 +- src/relay/qnn/util.h | 45 +++++++------- tests/python/unittest/test_qnn_ops.py | 60 +++++++++---------- 8 files changed, 77 insertions(+), 87 deletions(-) rename python/tvm/relay/qnn/{ir_pass.py => transform.py} (100%) rename src/relay/qnn/{pass => transform}/qnn_lower.cc (98%) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 1cd7deb4393f..7b8bc28ddcdb 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -36,36 +36,31 @@ struct RequantizeAttrs : public tvm::AttrsNode { int32_t input_zero_point; double output_scale; int32_t output_zero_point; - bool use_int_domain; std::string rounding; DataType out_dtype; TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") { - TVM_ATTR_FIELD(input_zero_point) - .describe("The zero point of the input tensor."); - TVM_ATTR_FIELD(output_zero_point) - .describe("The zero point of the output tensor."); TVM_ATTR_FIELD(input_scale) .describe("The scale of the input tensor."); + TVM_ATTR_FIELD(input_zero_point) + .describe("The zero point of the input tensor."); TVM_ATTR_FIELD(output_scale) .describe("The scale of the output tensor."); - TVM_ATTR_FIELD(use_int_domain).set_default(true) - .describe("When true, the integer computation is used to handle output scale." - "The float compuation can be used as reference implementation or in" - "cases where FP32 computation for requantize is not expensive"); - TVM_ATTR_FIELD(out_dtype) - .set_default(NullValue()) - .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(rounding).set_default("FE_AWAY_FROM_ZERO") + TVM_ATTR_FIELD(output_zero_point) + .describe("The zero point of the output tensor."); + TVM_ATTR_FIELD(rounding).set_default("AWAY_FROM_ZERO") .describe("Defines the rounding direction when the value is midway between" - "two representable values. There are two supported modes - FE_UPWARD" - "or FE_AWAY_FROM_ZERO. Both modes behave exactly same except at the" - "midpoints between the two representable values. At midpoint, FE_UPWARD" + "two representable values. There are two supported modes - UPWARD" + "or AWAY_FROM_ZERO. Both modes behave exactly same except at the" + "midpoints between the two representable values. At midpoint, UPWARD" "rounds towards positive infinity (for example -1.5 will be rounded" - "to -1). FE_AWAY_FROM_ZERO is the standard rounding where the value" + "to -1). AWAY_FROM_ZERO is the standard rounding where the value" "is rounded away from zero at midpoints (for example, -1.5 rounds to" "-2). More context can be found at" "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); } }; diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py index 0836c5770ce4..409e088156b8 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -18,4 +18,4 @@ """Neural network related operators.""" from __future__ import absolute_import as _abs from . import op -from . import ir_pass +from . 
import transform diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 65369c840b67..208985036640 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -24,8 +24,8 @@ def requantize(data, input_zero_point, output_scale, output_zero_point, - out_dtype="int32", - rounding="FE_AWAY_FROM_ZERO"): + rounding="AWAY_FROM_ZERO", + out_dtype="int32"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -51,19 +51,19 @@ def requantize(data, output_zero_point: int The zero point of the quantized_output distribution. - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - rounding : string, optional Defines the rounding direction when the value is midway between two representable values. + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + Returns ------- result : tvm.relay.Expr The computed result. """ - assert rounding in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + assert rounding in ("UPWARD", "AWAY_FROM_ZERO"),\ "Unsupported rounding mode" return _make.requantize(data, @@ -71,5 +71,5 @@ def requantize(data, input_zero_point, output_scale, output_zero_point, - out_dtype, - rounding) + rounding, + out_dtype) diff --git a/python/tvm/relay/qnn/ir_pass.py b/python/tvm/relay/qnn/transform.py similarity index 100% rename from python/tvm/relay/qnn/ir_pass.py rename to python/tvm/relay/qnn/transform.py diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 62688147b06e..699e0e883a7e 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -58,15 +58,15 @@ Expr MakeRequantize(Expr data, int32_t input_zero_point, double output_scale, int32_t output_zero_point, - DataType out_dtype, - std::string rounding) { + std::string rounding, + DataType out_dtype) { auto attrs = make_node(); attrs->input_scale = std::move(input_scale); attrs->input_zero_point = std::move(input_zero_point); attrs->output_scale = std::move(output_scale); attrs->output_zero_point = std::move(output_zero_point); - attrs->out_dtype = std::move(out_dtype); attrs->rounding = std::move(rounding); + attrs->out_dtype = std::move(out_dtype); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } @@ -83,7 +83,7 @@ Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) .set_attrs_type_key("relay.attrs.RequantizeAttrs") .set_num_inputs(1) .add_argument("data", "Tensor", "The quantized input tensor.") -.set_support_level(10) +.set_support_level(11) .add_type_rel("Requantize", RequantizeRel); TVM_REGISTER_API("relay.qnn.op._make.requantize") diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/transform/qnn_lower.cc similarity index 98% rename from src/relay/qnn/pass/qnn_lower.cc rename to src/relay/qnn/transform/qnn_lower.cc index 621b8aee2ac7..51d167e13ce9 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/transform/qnn_lower.cc @@ -137,10 +137,10 @@ Expr RequantizeLower(const Expr& input_tensor, tensor = multiplied_t; Expr round_scalar; - if (param->rounding == "FE_UPWARD") { + if (param->rounding == "UPWARD") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); round_scalar = pos_rounder; - } else if (param->rounding == "FE_AWAY_FROM_ZERO") { + } else if (param->rounding == "AWAY_FROM_ZERO") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); auto neg_rounder = 
MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 63e7938c93d8..24f03b2a6d84 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -70,37 +70,32 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, } inline const int32_t GetQmin(const DataType& dtype) { - if (dtype == Int(8)) { - return std::numeric_limits::min(); - } else if (dtype == UInt(8)) { - return std::numeric_limits::min(); - } else if (dtype == Int(16)) { - return std::numeric_limits::min(); - } else if (dtype == UInt(16)) { - return std::numeric_limits::min(); - } else if (dtype == Int(32)) { - return std::numeric_limits::min(); - } else if (dtype == UInt(32)) { - return std::numeric_limits::min(); + CHECK_LE(dtype.bits(), 32) + << "QNN ops support less than 32-bit integer values"; + if (dtype.is_int()) { + auto* min_value = as_const_int(dtype.min()); + CHECK(min_value != nullptr); + return static_cast(min_value[0]); + } else if (dtype.is_uint()) { + auto* min_value = as_const_uint(dtype.min()); + CHECK(min_value != nullptr); + return static_cast(min_value[0]); } LOG(FATAL) << "Type not supported " << dtype; return -1; } - inline const int32_t GetQmax(const DataType& dtype) { - if (dtype == Int(8)) { - return std::numeric_limits::max(); - } else if (dtype == UInt(8)) { - return std::numeric_limits::max(); - } else if (dtype == Int(16)) { - return std::numeric_limits::max(); - } else if (dtype == UInt(16)) { - return std::numeric_limits::max(); - } else if (dtype == Int(32)) { - return std::numeric_limits::max(); - } else if (dtype == UInt(32)) { - return std::numeric_limits::max(); + CHECK_LE(dtype.bits(), 32) + << "QNN ops support less than 32-bit integer values"; + if (dtype.is_int()) { + auto* max_value = as_const_int(dtype.max()); + CHECK(max_value != nullptr); + return static_cast(max_value[0]); + } else if (dtype.is_uint()) { + auto* max_value = as_const_uint(dtype.max()); + CHECK(max_value != nullptr); + return static_cast(max_value[0]); } LOG(FATAL) << "Type not supported " << dtype; return -1; diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 342e1ce09d99..8015e6a4d71f 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -21,7 +21,7 @@ from tvm.relay.testing import create_workload from tvm.contrib import graph_runtime -roundings = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] +roundings = ["UPWARD", "AWAY_FROM_ZERO"] def run_infer_type(expr): mod = relay.Module.from_expr(expr) @@ -42,23 +42,23 @@ def verify(func, goldens): res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, rounding, input_scale, - output_scale, input_zero_point=0, output_zero_point=0): + def get_func(data_shape, data_dtype, out_dtype, input_scale, output_scale, + input_zero_point=0, output_zero_point=0, rounding="AWAY_FROM_ZERO"): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) func = relay.qnn.op.requantize( quantized_data, - input_zero_point=input_zero_point, - output_zero_point=output_zero_point, input_scale=input_scale, + input_zero_point=input_zero_point, output_scale=output_scale, + output_zero_point=output_zero_point, rounding=rounding, out_dtype=out_dtype) func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.qnn.ir_pass.qnn_lower(func) + func = 
relay.qnn.transform.qnn_lower(func) return func @@ -71,9 +71,9 @@ def same_scale_test(): func = get_func(data_shape=(200, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=0.5, - output_scale=0.5) + output_scale=0.5, + rounding=rounding) verify(func, (golden_data, golden_output)) def downscale_test(): @@ -81,9 +81,9 @@ def downscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='int8', - rounding=rounding, input_scale=1, - output_scale=16) + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -92,9 +92,9 @@ def downscale_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([0, -1, -2], [9, 16, 7]) else: golden_output = np.repeat([0, -1, -2], [8, 16, 8]) @@ -104,9 +104,9 @@ def downscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=1, - output_scale=4) + output_scale=4, + rounding=rounding) # Try positive values # 2I corresponds to 0.5, resulting in 1 @@ -116,9 +116,9 @@ def downscale_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [3, 4, 4, 4, 4, 4, 4, 4, 1]) else: @@ -130,9 +130,9 @@ def downscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='uint8', - rounding=rounding, input_scale=1, - output_scale=16) + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -145,9 +145,9 @@ def upscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=2, - output_scale=1) + output_scale=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -156,7 +156,7 @@ def upscale_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') golden_output = np.multiply(2, golden_data) verify(func, (golden_data, golden_output)) @@ -166,9 +166,9 @@ def saturation_test(): func = get_func(data_shape=(16, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=0.5, - output_scale=0.5) + output_scale=0.5, + rounding=rounding) golden_data = np.arange(0, 16, 1).astype('int32') golden_data = np.add(120, golden_data) output = np.array([120, 121, 122, 123, 124, 125, 126, 127, @@ -190,10 +190,10 @@ def zero_point_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='int8', - rounding=rounding, input_scale=1, output_scale=16, - output_zero_point=1) + output_zero_point=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -203,9 +203,9 @@ def zero_point_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. 
For UPWARD, this is 0 golden_data = np.arange(-32, -64, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) @@ -217,10 +217,10 @@ def zero_point_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='int8', - rounding=rounding, input_scale=1, output_scale=16, - input_zero_point=16) + input_zero_point=16, + rounding=rounding) # Try positive values golden_data = np.arange(32, 64, 1).astype('int32') @@ -230,7 +230,7 @@ def zero_point_test(): # Try negative values golden_data = np.arange(-32, -64, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) From f858a83554c560f9c3bdd587aeb7cfbedf39466a Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 00:23:50 +0000 Subject: [PATCH 34/51] Adding API doc for QNN dialect. --- docs/langref/relay_op.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index ccdb3e8af8fa..5bb0b54b129c 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -196,6 +196,16 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.contrib.adaptive_avg_pool2d +**Level 11: QNN Dialect Operators** + +This level supports quantized operators present in the QNN dialect. + +.. autosummary:: + :nosignatures: + + tvm.relay.qnn.op.requantize + + Level 1 Definitions ------------------- .. autofunction:: tvm.relay.log From 823cc9465c61e6f839e836cfd0be6ca26934740a Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 00:35:17 +0000 Subject: [PATCH 35/51] Move the qnn_lower pass to transform namespace. --- python/tvm/relay/qnn/{_qnn.py => _transform.py} | 2 +- python/tvm/relay/qnn/transform.py | 4 ++-- src/relay/qnn/{transform => pass}/qnn_lower.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename python/tvm/relay/qnn/{_qnn.py => _transform.py} (95%) rename src/relay/qnn/{transform => pass}/qnn_lower.cc (99%) diff --git a/python/tvm/relay/qnn/_qnn.py b/python/tvm/relay/qnn/_transform.py similarity index 95% rename from python/tvm/relay/qnn/_qnn.py rename to python/tvm/relay/qnn/_transform.py index bd3cdbb976d6..e2ff6f9ed652 100644 --- a/python/tvm/relay/qnn/_qnn.py +++ b/python/tvm/relay/qnn/_transform.py @@ -19,4 +19,4 @@ from __future__ import absolute_import from tvm._ffi.function import _init_api -_init_api("relay._qnn", __name__) +_init_api("relay.qnn._transform", __name__) diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py index edeecd9a0e6c..1e0952faeb61 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -18,7 +18,7 @@ """Automatic quantization toolkit.""" from __future__ import absolute_import -from . import _qnn +from . import _transform def qnn_lower(expr): """ @@ -34,4 +34,4 @@ def qnn_lower(expr): expr : tvm.relay.Expr The output expression. 
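    Example
    -------
    A hypothetical call site (sketch only):

    .. code-block:: python

        lowered_func = relay.qnn.transform.qnn_lower(func)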
""" - return _qnn.qnn_lower(expr) + return _transform.qnn_lower(expr) diff --git a/src/relay/qnn/transform/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc similarity index 99% rename from src/relay/qnn/transform/qnn_lower.cc rename to src/relay/qnn/pass/qnn_lower.cc index 51d167e13ce9..017d7c8908d9 100644 --- a/src/relay/qnn/transform/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -210,7 +210,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQnnForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._qnn.qnn_lower") +TVM_REGISTER_API("relay.qnn._transform.qnn_lower") .set_body_typed([](const Expr& e) { Expr ret = ForwardRewrite(e, "FQnnForwardRewrite", nullptr, nullptr); return ret; From 28a9587ec1aa0d94eba495f5f3ba349f9e14cd79 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 01:05:23 +0000 Subject: [PATCH 36/51] Moving from expr to module. Adding namespace in C++. --- include/tvm/relay/qnn/attrs.h | 3 +- python/tvm/relay/qnn/transform.py | 15 +-- src/relay/qnn/op/requantize.cc | 2 + src/relay/qnn/pass/qnn_lower.cc | 35 +++++- tests/python/unittest/test_qnn_ops.py | 155 +++++++++++++------------- 5 files changed, 116 insertions(+), 94 deletions(-) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 7b8bc28ddcdb..b82416604618 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -29,6 +29,7 @@ namespace tvm { namespace relay { +namespace qnn { /*! \brief Attribute for requantize operator */ struct RequantizeAttrs : public tvm::AttrsNode { @@ -64,7 +65,7 @@ struct RequantizeAttrs : public tvm::AttrsNode { } }; - +} // namespace qnn } // namespace relay } // namespace tvm #endif // TVM_RELAY_QNN_ATTRS_H_ diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py index 1e0952faeb61..406e23fc0fbc 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -14,24 +14,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=unused-argument +# pylint: disable=invalid-name """Automatic quantization toolkit.""" from __future__ import absolute_import from . import _transform -def qnn_lower(expr): +def QnnLower(): """ Rewrites the high-level quantized ops into low-level exisiting Relay ops. - Parameters - ---------- - expr : tvm.relay.Expr - The input expression. - Returns ------- - expr : tvm.relay.Expr - The output expression. + Pass : tvm.relay.transform.Pass + The optmized pas. """ - return _transform.qnn_lower(expr) + return _transform.QnnLower() diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 699e0e883a7e..bce26355baf5 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -30,6 +30,7 @@ namespace tvm { namespace relay { +namespace qnn { TVM_REGISTER_NODE_TYPE(RequantizeAttrs); @@ -89,5 +90,6 @@ Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) TVM_REGISTER_API("relay.qnn.op._make.requantize") .set_body_typed(MakeRequantize); +} // namespace qnn } // namespace relay } // namespace tvm diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 017d7c8908d9..ea46504c6748 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -32,6 +32,15 @@ namespace tvm { namespace relay { +namespace qnn { +/*! + * \brief namespace of qnn lower pass. 
+ * + * Use namespace to reduce potential naming conflict. + */ +namespace qnn_lower { + +using runtime::TypedPackedFunc; // Lowering of qnn.requantize op @@ -210,11 +219,27 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQnnForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay.qnn._transform.qnn_lower") -.set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQnnForwardRewrite", nullptr, nullptr); - return ret; -}); +Expr QnnLower(const Expr& expr) { + return ForwardRewrite(expr, "FQnnForwardRewrite", nullptr, nullptr); +} +} // namespace qnn_lower + +namespace transform { +using namespace tvm::relay::transform; +Pass QnnLower() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast( + relay::qnn::qnn_lower::QnnLower(f)); + }; + return CreateFunctionPass(pass_func, 0, "QnnLower", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay.qnn._transform.QnnLower") +.set_body_typed(QnnLower); +} // namespace transform +} // namespace qnn } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 8015e6a4d71f..1ef868f797c9 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -31,22 +31,22 @@ def run_infer_type(expr): def test_requantize(): - def verify(func, goldens): + def verify(mod, goldens): with relay.build_config(opt_level=3): - graph, lib, params = relay.build(func, "llvm", params=None) + graph, lib, params = relay.build(mod, "llvm", params=None) golden_data, golden_output = goldens - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) - mod.set_input("quantized_data",golden_data) - mod.set_input(**params) - mod.run() - res = mod.get_output(0).asnumpy() + rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod.set_input("quantized_data",golden_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, input_scale, output_scale, + def get_mod(data_shape, data_dtype, out_dtype, input_scale, output_scale, input_zero_point=0, output_zero_point=0, rounding="AWAY_FROM_ZERO"): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) - func = relay.qnn.op.requantize( + mod = relay.qnn.op.requantize( quantized_data, input_scale=input_scale, input_zero_point=input_zero_point, @@ -55,11 +55,10 @@ def get_func(data_shape, data_dtype, out_dtype, input_scale, output_scale, rounding=rounding, out_dtype=out_dtype) - func = relay.Function(relay.analysis.free_vars(func), - func) - func = run_infer_type(func) - func = relay.qnn.transform.qnn_lower(func) - return func + mod = relay.Function(relay.analysis.free_vars(mod), mod) + mod = relay.Module.from_expr(mod) + mod = relay.qnn.transform.QnnLower()(mod) + return mod def same_scale_test(): @@ -68,28 +67,28 @@ def same_scale_test(): golden_output = golden_data for rounding in roundings: - func = get_func(data_shape=(200, ), - data_dtype='int32', - out_dtype="int8", - input_scale=0.5, - output_scale=0.5, - rounding=rounding) - verify(func, (golden_data, golden_output)) + mod = get_mod(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + input_scale=0.5, + output_scale=0.5, + rounding=rounding) + verify(mod, (golden_data, golden_output)) def downscale_test(): for rounding in roundings: - func = get_func(data_shape=(32, ), - 
data_dtype='int32', - out_dtype='int8', - input_scale=1, - output_scale=16, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + input_scale=1, + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. For UPWARD, this is 0 @@ -98,22 +97,22 @@ def downscale_test(): golden_output = np.repeat([0, -1, -2], [9, 16, 7]) else: golden_output = np.repeat([0, -1, -2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try a different scale - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - input_scale=1, - output_scale=4, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + input_scale=1, + output_scale=4, + rounding=rounding) # Try positive values # 2I corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. For UPWARD, this is 0 @@ -124,57 +123,57 @@ def downscale_test(): else: golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try uint8 out_dtype - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='uint8', - input_scale=1, - output_scale=16, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='uint8', + input_scale=1, + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) def upscale_test(): for rounding in roundings: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - input_scale=2, - output_scale=1, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + input_scale=2, + output_scale=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. 
For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) def saturation_test(): for rounding in roundings: - func = get_func(data_shape=(16, ), - data_dtype='int32', - out_dtype="int8", - input_scale=0.5, - output_scale=0.5, - rounding=rounding) + mod = get_mod(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + input_scale=0.5, + output_scale=0.5, + rounding=rounding) golden_data = np.arange(0, 16, 1).astype('int32') golden_data = np.add(120, golden_data) output = np.array([120, 121, 122, 123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127]) golden_output = output - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative numbers golden_data = np.arange(0, -16, -1).astype('int32') @@ -182,25 +181,25 @@ def saturation_test(): output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, -128, -128, -128, -128, -128, -128, -128, -128]) golden_output = output - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) def zero_point_test(): # Output zero point for rounding in roundings: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - input_scale=1, - output_scale=16, - output_zero_point=1, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + input_scale=1, + output_scale=16, + output_zero_point=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2], [8, 16, 8]) golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. 
For UPWARD, this is 0 @@ -210,23 +209,23 @@ def zero_point_test(): else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Input zero point for rounding in roundings: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - input_scale=1, - output_scale=16, - input_zero_point=16, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + input_scale=1, + output_scale=16, + input_zero_point=16, + rounding=rounding) # Try positive values golden_data = np.arange(32, 64, 1).astype('int32') golden_output = np.repeat([2, 3, 4], [8, 16, 8]) golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values golden_data = np.arange(-32, -64, -1).astype('int32') @@ -235,7 +234,7 @@ def zero_point_test(): else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) same_scale_test() downscale_test() From 76476dc61757246a65ccd083dc83ae708d82d744 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Thu, 18 Jul 2019 18:44:26 -0700 Subject: [PATCH 37/51] Working test case for int/uint with bias_add --- tests/python/unittest/test_qnn_ops.py | 177 +++++++++++++++++--------- 1 file changed, 114 insertions(+), 63 deletions(-) diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index cc7b5da5f5de..a65c7f2e2656 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -245,80 +245,131 @@ def zero_point_test(): def test_quantized_dense(): - def test_uint(): - quantized_data = relay.var("quantized_data", shape=(2,10), - dtype="uint8") - quantized_kernel = relay.var("quantized_kernel", shape=(3, 10), - dtype="uint8") - - func = relay.qnn.op.quantized_dense( - quantized_data, - quantized_kernel, - -127, - -127, - 3, - ) - - quantized_data_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 109, 107, 129, 131, 133, 135, 137, 139, - - 141, 111, 145, 107]).astype('uint8').reshape((2, 10)) - quantized_kernel_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 129, 131, 133, 135, 137, 139, 141, - 143, 145, 147, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147]).astype('uint8').reshape((3, 10)) - - func = relay.Function(relay.analysis.free_vars(func), - func) - func = run_infer_type(func) - print('*'*20) - print(func) - print('*'*20) - func = relay.qnn.ir_pass.qnn_lower(func) - print(func) - with relay.build_config(opt_level=0): - graph, lib, params = relay.build(func, "llvm", params=None) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) - mod.set_input("quantized_data",quantized_data_np) - mod.set_input("quantized_kernel",quantized_kernel_np) - mod.set_input(**params) - mod.run() - res = mod.get_output(0).asnumpy() - print(res) - - def test_int(): - quantized_data = relay.var("quantized_data", shape=(2,10), - dtype="int8") - quantized_kernel = relay.var("quantized_kernel", shape=(3, 10), - dtype="int8") - + def make_test_configuration(quantized_data, quantized_kernel, dtype, input_shape, kernel_shape, input_zero_point, + kernel_zero_point, units, output, out_dtype='int32', bias=None): + config = { + 'quantized_data': quantized_data, + 'quantized_kernel': quantized_kernel, + 'dtype': dtype, + 
'input_shape': input_shape, + 'kernel_shape': kernel_shape, + 'input_zero_point': input_zero_point, + 'kernel_zero_point': kernel_zero_point, + 'units': units, + 'output': output, + 'out_dtype': out_dtype, + 'bias': bias + } + return config + + def make_uint_configuration(use_bias=False): + input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3) + input_zero_point, kernel_zero_point = -127, -127 + dtype, out_dtype = 'uint8', 'int32' + units = 3 + quantized_data_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 109, 107, + 129, 131, 133, 135, 137, 139, 141, 111, 145, 107])\ + .astype(dtype)\ + .reshape(input_shape) + quantized_kernel_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 145, 147, + 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, + 129, 131, 133, 135, 137, 139, 141, 143, 145, 147])\ + .astype(dtype)\ + .reshape(kernel_shape) + bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None + + if use_bias: + output = np.array([96, 100, 104, 232, 236, 240 ]).astype(out_dtype).reshape(output_shape) + else: + output = np.array([92, 92, 92, 228, 228, 228 ]).astype(out_dtype).reshape(output_shape) + return make_test_configuration(quantized_data=quantized_data_np, + quantized_kernel=quantized_kernel_np, + dtype=dtype, + input_shape=input_shape, + kernel_shape=kernel_shape, + input_zero_point=input_zero_point, + kernel_zero_point=kernel_zero_point, + units=units, + output=output, + bias=bias) + + def make_int_configuration(use_bias=False): + input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3) + input_zero_point, kernel_zero_point = 1, 1 + dtype, out_dtype = 'int8', 'int32' + units = 3 + quantized_data_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21, + 1, 3, 5, 7, 9, 11, 13, -17, 17, -21]) \ + .astype(dtype) \ + .reshape(input_shape) + quantized_kernel_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) \ + .astype(dtype) \ + .reshape(kernel_shape) + bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None + if use_bias: + output = np.array([96, 100, 104, 232, 236, 240 ]).astype(out_dtype).reshape(output_shape) + else: + output = np.array([92, 92, 92, 228, 228, 228 ]).astype(out_dtype).reshape(output_shape) + return make_test_configuration(quantized_data=quantized_data_np, + quantized_kernel=quantized_kernel_np, + dtype=dtype, + input_shape=input_shape, + kernel_shape=kernel_shape, + input_zero_point=input_zero_point, + kernel_zero_point=kernel_zero_point, + units=units, + output=output, + bias=bias) + + def test_quantized_convolution(test_configuration): + in_dtype = test_configuration['dtype'] + out_dtype = test_configuration['out_dtype'] + quantized_data_name = "quantized_data" + quantized_kernel_name = "quantized_kernel" + bias_name = 'bias' + quantized_data = relay.var(quantized_data_name, shape=test_configuration['input_shape'], + dtype=in_dtype) + quantized_kernel = relay.var(quantized_kernel_name, shape=test_configuration['kernel_shape'], + dtype=in_dtype) func = relay.qnn.op.quantized_dense( quantized_data, quantized_kernel, - 1, - 1, - 3, - ) - - quantized_data_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21, 1, 3, 5, 7, 9, 11, 13, -17, 17, -21]).astype('int8').reshape((2, 10)) - quantized_kernel_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]).astype('int8').reshape((3, 10)) - - func = relay.Function(relay.analysis.free_vars(func), - func) 
+ test_configuration['input_zero_point'], + test_configuration['kernel_zero_point'], + test_configuration['units']) + if test_configuration[bias_name] is not None: + bias = relay.var(bias_name, shape=test_configuration['bias'].shape, dtype=out_dtype) + func = relay.nn.bias_add(func, bias) + func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - print('*'*20) - print(func) - print('*'*20) func = relay.qnn.ir_pass.qnn_lower(func) - print(func) with relay.build_config(opt_level=0): graph, lib, params = relay.build(func, "llvm", params=None) mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) - mod.set_input("quantized_data",quantized_data_np) - mod.set_input("quantized_kernel",quantized_kernel_np) + mod.set_input(quantized_data_name,test_configuration[quantized_data_name]) + mod.set_input(quantized_kernel_name,test_configuration[quantized_kernel_name]) + if test_configuration[bias_name] is not None: + mod.set_input(bias_name, test_configuration[bias_name]) mod.set_input(**params) mod.run() res = mod.get_output(0).asnumpy() - print(res) - - test_int() + np.testing.assert_equal(res, test_configuration['output']) + assert res.dtype == test_configuration['out_dtype'] + + def test_configurations(): + test_params = [{'use_bias': False}, {'use_bias': True}] + tests = [test_quantized_convolution] + configurations = [] + for test_param in test_params: + configurations.append(make_uint_configuration(**test_param)) + configurations.append(make_int_configuration(**test_param)) + for configuration in configurations: + for test in tests: + test(configuration) + + test_configurations() if __name__ == "__main__": # test_requantize() From 732d6ceebd05c29b88e573e00aa10767f22bf1f0 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 06:12:55 +0000 Subject: [PATCH 38/51] Minor sentence rewrites. Added qnn namespace. --- include/tvm/relay/qnn/attrs.h | 4 ++-- python/tvm/relay/qnn/op/qnn.py | 14 +++++++------- python/tvm/relay/qnn/transform.py | 2 +- src/relay/qnn/op/requantize.cc | 1 - src/relay/qnn/util.h | 6 ++++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/tvm/relay/qnn/attrs.h index b82416604618..be5d3ac08abd 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -18,8 +18,8 @@ */ /*! - * \file tvm/relay/attrs/nn.h - * \brief Auxiliary attributes for nn operators. + * \file tvm/relay/qnn/attrs.h + * \brief Auxiliary attributes for qnn operators. */ #ifndef TVM_RELAY_QNN_ATTRS_H_ #define TVM_RELAY_QNN_ATTRS_H_ diff --git a/python/tvm/relay/qnn/op/qnn.py index 208985036640..ebebcbc0bb66 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -28,9 +28,9 @@ def requantize(data, out_dtype="int32"): r"""Requantized operator. - The requantize operator converts one quantized tensor to another quantized - tensor. For the output tensor, we are provided with output scale and zero - point. The computation looks like this + The requantize operator converts one quantized tensor representation to + another quantized tensor representation. For the output tensor, we are + provided with output scale and zero point. The computation is as follows Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) Parameters ---------- data : tvm.relay.Expr The input data to the operator. input_scale: float - The float scalar to scale the data int8 values back to FP32. + The quantization scale for the input tensor.
input_zero_point: int - The zero point of the data distribution. + The zero point of the input tensor. output_scale: float - The float scalar to scale the quantized_output int8 values back to FP32. + The quantization scale for the output tensor. output_zero_point: int - The zero point of the quantized_output distribution. + The zero point of the output tensor. rounding : string, optional Defines the rounding direction when the value is midway between two diff --git a/python/tvm/relay/qnn/transform.py index 406e23fc0fbc..576631b67e7d 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name -"""Automatic quantization toolkit.""" +"""QNN Dialect transformation passes.""" from __future__ import absolute_import from . import _transform diff --git a/src/relay/qnn/op/requantize.cc index bce26355baf5..cc38b7fbeed8 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -34,7 +34,6 @@ namespace qnn { TVM_REGISTER_NODE_TYPE(RequantizeAttrs); - bool RequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, diff --git a/src/relay/qnn/util.h index 24f03b2a6d84..c1b8ae3371cd 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -31,6 +31,7 @@ namespace tvm { namespace relay { +namespace qnn { inline bool IsQNNDataType(const DataType& dtype) { return dtype == Int(8) || dtype == UInt(8) || dtype == Int(16) || dtype == UInt(16); } @@ -71,7 +72,7 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, inline const int32_t GetQmin(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) - << "QNN ops support less than 32-bit integer values"; + << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { auto* min_value = as_const_int(dtype.min()); CHECK(min_value != nullptr); @@ -87,7 +88,7 @@ inline const int32_t GetQmin(const DataType& dtype) { inline const int32_t GetQmax(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) - << "QNN ops support less than 32-bit integer values"; + << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { auto* max_value = as_const_int(dtype.max()); CHECK(max_value != nullptr); @@ -101,6 +102,7 @@ inline const int32_t GetQmax(const DataType& dtype) { return -1; } +} // namespace qnn } // namespace relay } // namespace tvm #endif // TVM_RELAY_QNN_UTIL_H_ From fadc573275fd8bdf5fb12804bc434ab79ee58ccf Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 06:19:50 +0000 Subject: [PATCH 39/51] Added the API doc. --- docs/langref/relay_op.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/langref/relay_op.rst index 5bb0b54b129c..f896e6d905f2 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -338,3 +338,8 @@ Level 10 Definitions .. autofunction:: tvm.relay.nn.batch_matmul .. autofunction:: tvm.relay.contrib.adaptive_max_pool2d .. autofunction:: tvm.relay.contrib.adaptive_avg_pool2d + + +Level 11 Definitions +-------------------- +.. autofunction:: tvm.relay.qnn.op.requantize From 956d3de2f53ed031366e5be24195e63ee49cfeca Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 15:25:31 +0000 Subject: [PATCH 40/51] Changing default out_dtype to int8. Adding a test with in/out_dtype as uint8.
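For reviewers, a minimal NumPy sketch of the arithmetic requantize implements. This is a float reference only, since the lowered path uses fixed-point integer math, and requantize_ref is an illustrative name rather than a TVM API:

    import numpy as np

    def requantize_ref(q_in, input_scale, input_zero_point,
                       output_scale, output_zero_point, out_dtype='int8'):
        # Q_out = zp_out + (s_in / s_out) * (Q_in - zp_in), then saturate.
        real = input_scale * (q_in.astype('float64') - input_zero_point)
        # UPWARD rounding: ties between representable values go towards +inf.
        q_out = np.floor(real / output_scale + 0.5) + output_zero_point
        info = np.iinfo(out_dtype)
        return np.clip(q_out, info.min, info.max).astype(out_dtype)

    # Mirrors the downscale tests: a 1/16 scale ratio maps 0..31 to 0, 1, 2.
    expected = np.repeat([0, 1, 2], [8, 16, 8])
    assert (requantize_ref(np.arange(32), 1.0, 0, 16.0, 0) == expected).all()

The new uint8 test below exercises exactly this mapping with out_dtype='uint8'.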
--- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/qnn/util.h | 2 +- tests/python/unittest/test_qnn_ops.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py index ebebcbc0bb66..e347d0616511 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -25,7 +25,7 @@ def requantize(data, output_scale, output_zero_point, rounding="AWAY_FROM_ZERO", - out_dtype="int32"): + out_dtype="int8"): r"""Requantized operator. The requantize operator converts one quantized tensor representation to diff --git a/src/relay/qnn/util.h index c1b8ae3371cd..5bbfbd11fa79 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -52,7 +52,7 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type, case QuantizeOpType::Dequantize: return IsQNNDataType(in_dtype); case QuantizeOpType::Requantize: - return in_dtype == Int(16) || in_dtype == Int(32); + return in_dtype.is_int() || in_dtype.is_uint(); default: return false; } diff --git a/tests/python/unittest/test_qnn_ops.py index 1ef868f797c9..cd4b048719ea 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -139,6 +139,20 @@ def downscale_test(): golden_output = np.repeat([0, 1, 2], [8, 16, 8]) verify(mod, (golden_data, golden_output)) + # Try uint8 in_dtype and uint8 out_dtype + mod = get_mod(data_shape=(32, ), + data_dtype='uint8', + out_dtype='uint8', + input_scale=1, + output_scale=16, + rounding=rounding) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(mod, (golden_data, golden_output)) + def upscale_test(): for rounding in roundings: mod = get_mod(data_shape=(32, ), From 7a63597a2119d22a1697710daf4e6174d07b0ac7 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Fri, 19 Jul 2019 09:11:04 -0700 Subject: [PATCH 41/51] merge from upstream/requantize --- 3rdparty/dlpack | 2 +- 3rdparty/dmlc-core | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/dlpack b/3rdparty/dlpack index 0acb731e0e43..b90e93907206 160000 --- a/3rdparty/dlpack +++ b/3rdparty/dlpack @@ -1 +1 @@ -Subproject commit 0acb731e0e43d15deee27b66f10e4c5b4e667913 +Subproject commit b90e939072066c160b18ea1e7156537b8d3710f6 diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 3943914eed66..7245c9d42e0b 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f +Subproject commit 7245c9d42e0baf3b030ae580c125f403dbe57c9f From d7009450daff5ad7e3ffef4d5be69c637f25a0cc Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 18:52:36 +0000 Subject: [PATCH 42/51] Style fixes. Better error messages. --- python/tvm/relay/qnn/op/qnn.py | 2 -- python/tvm/relay/qnn/transform.py | 1 + src/relay/qnn/pass/qnn_lower.cc | 4 ++++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py index e347d0616511..ef73ddbead8d 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -63,8 +63,6 @@ def requantize(data, result : tvm.relay.Expr The computed result.
""" - assert rounding in ("UPWARD", "AWAY_FROM_ZERO"),\ - "Unsupported rounding mode" return _make.requantize(data, input_scale, diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py index 576631b67e7d..6ca456b4fb81 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name + """QNN Dialect transformation passes.""" from __future__ import absolute_import diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index ea46504c6748..321c475d48c5 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -213,6 +213,10 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; + // Check rounding validity. + CHECK(param->rounding == "UPWARD" || param->rounding == "AWAY_FROM_ZERO") + << "QNN requantize supports two rounding modes - UPWARD and " + << "AWAY_FROM_ZERO"; return RequantizeLower(quantized_data, param, input_dtype, out_shape); } From 21963dce27f1b92ca111fcd4a64f19caa03d0d57 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Mon, 22 Jul 2019 10:39:18 -0700 Subject: [PATCH 43/51] Removing extra code. --- tests/python/unittest/test_qnn_ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 1fc6448a644a..70edc577ec6c 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -356,9 +356,7 @@ def test_quantized_convolution(test_configuration): bias = relay.var(bias_name, shape=test_configuration['bias'].shape, dtype=out_dtype) mod = relay.nn.bias_add(mod, bias) mod = relay.Function(relay.analysis.free_vars(mod), mod) - # func = run_infer_type(func) mod = relay.Module.from_expr(mod) - # func = relay.qnn.ir_pass.qnn_lower(func) mod = relay.qnn.transform.QnnLower()(mod) with relay.build_config(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) From d0fdd1c4d3f094f5296cc20a79e7c7539d6e962c Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 19:54:42 +0000 Subject: [PATCH 44/51] Adding documentation. --- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/qnn/op/requantize.cc | 12 ++++- src/relay/qnn/pass/qnn_lower.cc | 79 ++++++++++++++++++++------------- src/relay/qnn/util.h | 10 ++--- 4 files changed, 63 insertions(+), 40 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index ef73ddbead8d..88e13f2c358e 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -32,7 +32,7 @@ def requantize(data, another quantized tensor representation. For the output tensor, we are provided with output scale and zero point. The computation is as follows - Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) + Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input) Parameters ---------- diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index cc38b7fbeed8..e8978aa03147 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file requantize.cc - * \brief Quantized convolution operators + * \brief QNN requantize operator. 
*/ #include @@ -34,6 +34,14 @@ namespace qnn { TVM_REGISTER_NODE_TYPE(RequantizeAttrs); +/* + * \brief Infer shape function of Requantize op. + * \param types The types of input args. + * \param num_inputs The number of inputs. + * \param attrs The op attributes. + * \param reporter The type reporter that sets the dtype and shapes. + * \return True if the infer shape succeeded. + */ bool RequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -51,7 +59,7 @@ bool RequantizeRel(const Array& types, return true; } -// Positional relay function to create quantized conv2d operator +// Positional relay function to create qnn requantize operator // used by frontend FFI. Expr MakeRequantize(Expr data, double input_scale, diff --git a/src/relay/qnn/pass/qnn_lower.cc index 321c475d48c5..db380354162d 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file qnn_lower.cc - * \brief Lower quantized ops to exisiting Relay ops. + * \brief Lower qnn ops to a sequence of exisiting Relay ops. */ #include @@ -45,20 +45,24 @@ using runtime::TypedPackedFunc; // Lowering of qnn.requantize op /* - * Converts a floating point number so that it can be represented by integers. - * The representation is - * float_number = (significand) * 2^(exponent) + * \brief Convert FP32 representation into fixed point representation. + * \param double_multiplier The input FP32 number. + * \param idtype The input datatype. + * \return The pair of multiplier and shift for fixed point representation. + * \note Converts a floating point number so that it can be represented by + * integers. The representation is + * float_number = (significand) * 2^(exponent) * - * The significand is a number between 0.5 and 1. This is represented - * by an integer number. For example, if it is int32, then the decimal point - * exists between bit 31 and 30 from LSB (or between first and second bit from - * the left). + * The significand is a number between 0.5 and 1. This is represented by + * an integer number. For example, if it is int32, then the decimal point + * exists between bit 31 and 30 from LSB (or between first and second bit + * from the left). * - * Some examples are + * Some examples are * 0.25 = (0.5) * 2^(-1) * 0.125 = (0.5) * 2^(-2) * - * Credit to TFLite reference implementation. + * Credit to TFLite reference implementation. */ std::pair GetFixedPointMultiplierShift(double double_multiplier, const DataType& idtype) { @@ -82,25 +86,31 @@ std::pair GetFixedPointMultiplierShift(double double_multiplier, } /* - * Requantization using only integer computation. Here, the computation is - * converted to a fixed point computation by computing output multiplier and - * shift. This is useful, if the target device does not support/have very - * expensive floating point computations. + * \brief Lower requantize to a sequence of ops. + * \param input_tensor The input tensor to requantize op. + * \param param The requantize op attrs. + * \param idtype The dtype of the input tensor. + * \param out_shape The output shape of the requantize op. + * \return The sequence of existing Relay ops. + * \note Requantization using only integer computation. Here, the computation is + * converted to a fixed point computation by computing output multiplier + * and shift. This is useful, if the target device does not support/have + * very expensive floating point computations.
* - * Original compuation is scale_fp32 * quantized_tensor. To convert into - * integer computation, the multiplication with fp32 scalar can be replaced by - * multiplication with an int value and then right shifting the result. This - * approximates the floating point computation with a fixed point computation. - * - * The whole computation this can be broken down into following steps - * 1) Calculate the integer multiplier and integer shift. - * 2) Subtract the input integer point. - * 3) Multiply the integer fixed point multiplier with quantized tensor. - * 4) Round the result. - * 5) Right shift the result. - * 6) Add the output_zero_point. - * 7) Cast to the out_dtype. + * Original compuation is scale_fp32 * quantized_tensor. To convert into + * integer computation, the multiplication with fp32 scalar can be + * replaced by multiplication with an int value and then right shifting + * the result. This approximates the floating point computation with a + * fixed point computation. * + * The whole computation this can be broken down into following steps + * 1) Calculate the integer multiplier and integer shift. + * 2) Subtract the input integer point. + * 3) Multiply the integer fixed point multiplier with quantized tensor. + * 4) Round the result. + * 5) Right shift the result. + * 6) Add the output_zero_point. + * 7) Cast to the out_dtype. */ Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, const DataType& idtype, @@ -134,7 +144,7 @@ Expr RequantizeLower(const Expr& input_tensor, // Perform the multiplication in higher precision. // If idtype is Int(32), the scalar is a fixed point value of int32 where the // decimal point is between bits 31 and 30. After multiplying with - // input_tensor, the result in int64 where the decimal point is sitting + // input_tensor, the result is in int64 where the decimal point is sitting // between bits 31 and 30 (from the right, rightmost bit is bit 0). Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); @@ -184,12 +194,17 @@ Expr RequantizeLower(const Expr& input_tensor, } /* - * Lowering of the requantize operation. The requantize operator converts one - * quantized tensor to another quantized tensor. For the output tensor, we are - * provided with output scale and zero point. The computation looks like this + * \brief Forward rewrite the requantize op. + * \param ref_call The original call that will be lowered. + * \param new_args The new mutated args to the call node. + * \param ctx The node context. + * \return The sequence of Relay ops for requantize op. + * \note Lowering of the requantize operation. The requantize operator converts + * one quantized tensor to another quantized tensor. For the output + * tensor, we are provided with output scale and zero point. 
The + * computation looks like this * * Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - * */ Expr RequantizeForwardRewrite(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 5bbfbd11fa79..c3d0367a81e7 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -33,7 +33,7 @@ namespace tvm { namespace relay { namespace qnn { -inline bool IsQNNDataType(const DataType& dtype) { +static inline bool IsQNNDataType(const DataType& dtype) { return dtype == Int(8) || dtype == UInt(8) || dtype == Int(16) || dtype == UInt(16); } @@ -44,7 +44,7 @@ enum class QuantizeOpType { Requantize }; -inline bool IsValidOpInputType(const QuantizeOpType& op_type, +static inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: @@ -58,7 +58,7 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type, } } -inline bool IsValidOpOutputType(const QuantizeOpType& op_type, +static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: @@ -70,7 +70,7 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, } } -inline const int32_t GetQmin(const DataType& dtype) { +static inline const int32_t GetQmin(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { @@ -86,7 +86,7 @@ inline const int32_t GetQmin(const DataType& dtype) { return -1; } -inline const int32_t GetQmax(const DataType& dtype) { +static inline const int32_t GetQmax(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { From 33cc07541a80ef811f568dd987d45ad4bbc2aa6f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 20:49:53 +0000 Subject: [PATCH 45/51] More documentation fixes. --- include/tvm/relay/qnn/attrs.h | 14 ++++++++------ python/tvm/relay/qnn/__init__.py | 2 +- python/tvm/relay/qnn/op/qnn.py | 4 ++-- src/relay/qnn/pass/qnn_lower.cc | 6 ++++-- .../test_qnn_requantize.py} | 1 - 5 files changed, 15 insertions(+), 12 deletions(-) rename tests/python/{unittest/test_qnn_ops.py => relay/test_qnn_requantize.py} (99%) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index be5d3ac08abd..e98357291dff 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -53,12 +53,14 @@ struct RequantizeAttrs : public tvm::AttrsNode { .describe("Defines the rounding direction when the value is midway between" "two representable values. There are two supported modes - UPWARD" "or AWAY_FROM_ZERO. Both modes behave exactly same except at the" - "midpoints between the two representable values. At midpoint, UPWARD" - "rounds towards positive infinity (for example -1.5 will be rounded" - "to -1). AWAY_FROM_ZERO is the standard rounding where the value" - "is rounded away from zero at midpoints (for example, -1.5 rounds to" - "-2). More context can be found at" - "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); + "midpoints between the two representable values. At the midpoint," + "UPWARD rounds towards positive infinity (for example -1.5 will be" + "rounded to -1). AWAY_FROM_ZERO is the standard rounding where the" + "value is rounded away from zero at midpoints (for example, -1.5" + "rounds to -2). 
More context can be found at the following glibc manual" "https://www.gnu.org/software/libc/manual/html_node/Rounding.html." "FE_UPWARD corresponds to UPWARD here and FE_TONEAREST corresponds" "to AWAY_FROM_ZERO rounding mode."); TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); diff --git a/python/tvm/relay/qnn/__init__.py index 409e088156b8..de932c71c67d 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=wildcard-import -"""Neural network related operators.""" +"""QNN dialect operators and ir passes.""" from __future__ import absolute_import as _abs from . import op from . import transform diff --git a/python/tvm/relay/qnn/op/qnn.py index 88e13f2c358e..f0d120cc1901 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. #pylint: disable=invalid-name, too-many-lines -"""Neural network operations.""" +"""QNN dialect operators.""" from __future__ import absolute_import as _abs from . import _make @@ -56,7 +56,7 @@ def requantize(data, representable values. out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. + Specifies the output data type. Returns ------- diff --git a/src/relay/qnn/pass/qnn_lower.cc index db380354162d..867d5502e3a3 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file qnn_lower.cc - * \brief Lower qnn ops to a sequence of exisiting Relay ops. + * \brief Lower qnn ops to a sequence of existing Relay ops. */ #include @@ -145,7 +145,9 @@ Expr RequantizeLower(const Expr& input_tensor, // If idtype is Int(32), the scalar is a fixed point value of int32 where the // decimal point is between bits 31 and 30. After multiplying with // input_tensor, the result is in int64 where the decimal point is sitting - // between bits 31 and 30 (from the right, rightmost bit is bit 0). + // between bits 31 and 30 (from the right, rightmost bit is bit 0). The + // computation is performed in higher precision to avoid overflow in + // multiplying two int32 values. Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/relay/test_qnn_requantize.py similarity index 99% rename from tests/python/unittest/test_qnn_ops.py rename to tests/python/relay/test_qnn_requantize.py index cd4b048719ea..e901fd7ac0e1 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/relay/test_qnn_requantize.py @@ -60,7 +60,6 @@ def get_mod(data_shape, data_dtype, out_dtype, input_scale, output_scale, mod = relay.qnn.transform.QnnLower()(mod) return mod - def same_scale_test(): # Have same scales, everything within range golden_data = np.arange(-100, 100, 1).astype('int32') From bb388559c84460f007a5d3b9ec862bb029ea7889 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:00:29 +0000 Subject: [PATCH 46/51] Adding out dtype check for requantize.
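As a side note on the rounding attribute documented in the previous patch, a small NumPy sketch of how the two modes differ at midpoints (the helper names here are illustrative, not part of the op):

    import numpy as np

    def round_upward(x):
        # Ties go towards positive infinity: -1.5 -> -1, 1.5 -> 2.
        return np.floor(x + 0.5)

    def round_away_from_zero(x):
        # Ties go away from zero: -1.5 -> -2, 1.5 -> 2.
        return np.sign(x) * np.floor(np.abs(x) + 0.5)

    x = np.array([-2.5, -1.5, -0.5, 0.5, 1.5])
    print(round_upward(x))          # [-2. -1.  0.  1.  2.]
    print(round_away_from_zero(x))  # [-3. -2. -1.  1.  2.]

Away from the midpoints the two modes agree; only exact .5 fractions differ.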
--- src/relay/qnn/op/requantize.cc | 4 +++- src/relay/qnn/util.h | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index e8978aa03147..ae9c874dae19 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -50,11 +50,13 @@ bool RequantizeRel(const Array& types, const auto* data = types[0].as(); const auto input_dtype = data->dtype; CHECK(IsValidOpInputType(QuantizeOpType::Requantize, input_dtype)) - << "Input type should be a quantized type (u)int8 or (u)int16 but was " << input_dtype; + << "Input type should be an integer but was " << input_dtype; const Array oshape = data->shape; // assign output type const RequantizeAttrs* param = attrs.as(); + CHECK(IsValidOpOutputType(QuantizeOpType::Requantize, param->out_dtype)) + << "Output type should be an integer but was " << param->out_dtype; reporter->Assign(types[1], TensorTypeNode::make(oshape, param->out_dtype)); return true; } diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index c3d0367a81e7..09f1c543d2dc 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -59,12 +59,14 @@ static inline bool IsValidOpInputType(const QuantizeOpType& op_type, } static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, - const DataType& in_dtype) { + const DataType& out_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsQNNDataType(in_dtype); + return IsQNNDataType(out_dtype); case QuantizeOpType::Dequantize: - return in_dtype == Float(32); + return out_dtype == Float(32); + case QuantizeOpType::Requantize: + return out_dtype.is_int() || out_dtype.is_uint(); default: return false; } From 7aac28d936770de192e429c946dd73551ca97fe3 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:05:18 +0000 Subject: [PATCH 47/51] Adding corner case for FP32 to fixed point conversion. --- src/relay/qnn/pass/qnn_lower.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 867d5502e3a3..7d5969054a36 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -67,6 +67,11 @@ using runtime::TypedPackedFunc; std::pair GetFixedPointMultiplierShift(double double_multiplier, const DataType& idtype) { int significand, exponent; + if (double_multiplier == 0.) { + significand = 0; + exponent = 0; + return std::pair(significand, exponent); + } int idtype_bits = idtype.bits(); // Get the significand (significand) and exponent (exponent) From 635b0533267b7666be2e5736776bec074aecf406 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:07:41 +0000 Subject: [PATCH 48/51] Adding extra line. --- python/tvm/relay/qnn/op/qnn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index f0d120cc1901..b961dc12c8e3 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=invalid-name, too-many-lines +#pylint: disable=invalid-name + """QNN dialect operators.""" from __future__ import absolute_import as _abs from . import _make From 222e189d8ad410d8bfe781adf95a7bff5bd69c43 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:22:46 +0000 Subject: [PATCH 49/51] Documentation fix. 
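Related context for the docs being touched here: the FP32 to fixed point split documented in qnn_lower.cc, including the double_multiplier == 0 corner case added earlier in this series, can be approximated in Python as below. This is a rough analogue for intuition, not the C++ implementation itself:

    import math

    def fixed_point_multiplier_shift(double_multiplier, idtype_bits=32):
        # Corner case: 0.0 has no normalized significand, so return (0, 0).
        if double_multiplier == 0.0:
            return 0, 0
        # float_number = significand * 2^exponent, significand in [0.5, 1).
        significand, exponent = math.frexp(double_multiplier)
        # Store the significand with the binary point between bits 31 and 30.
        significand_q = int(round(significand * (1 << (idtype_bits - 1))))
        if significand_q == 1 << (idtype_bits - 1):  # rounding reached 1.0
            significand_q //= 2
            exponent += 1
        return significand_q, exponent

    # 0.25 = (0.5) * 2^(-1); 0.5 in this fixed point form is 2^30.
    print(fixed_point_multiplier_shift(0.25))  # (1073741824, -1)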
--- python/tvm/relay/qnn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py index de932c71c67d..fa888d7ce7dd 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=wildcard-import -"""QNN dialect operators and ir passes.""" +"""QNN dialect operators and IR passes.""" from __future__ import absolute_import as _abs from . import op from . import transform From 6c833d550e1a28e8839ae1807cf4c3f9a2233550 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Mon, 22 Jul 2019 15:39:19 -0700 Subject: [PATCH 50/51] quantized fully connected working with requantize. --- src/relay/qnn/pass/qnn_lower.cc | 2 +- tests/python/unittest/test_qnn_ops.py | 84 +++++++++++++++++++-------- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index c06dea9ef4b8..2067e435a543 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -87,7 +87,7 @@ std::pair GetFixedPointMultiplierShift(double double_multiplier, * shift. This is useful, if the target device does not support/have very * expensive floating point computations. * - * Original compuation is scale_fp32 * quantized_tensor. To convert into + * Original computation is scale_fp32 * quantized_tensor. To convert into * integer computation, the multiplication with fp32 scalar can be replaced by * multiplication with an int value and then right shifting the result. This * approximates the floating point computation with a fixed point computation. diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 70edc577ec6c..7408d4fbf6d6 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -258,8 +258,19 @@ def zero_point_test(): def test_quantized_dense(): + def make_requantize_params(input_scale, output_scale, output_zero_point, out_dtype): + config = { + 'input_scale': input_scale, + 'output_scale': output_scale, + 'output_zero_point': output_zero_point, + 'out_dtype': out_dtype + } + return config + def make_test_configuration(quantized_data, quantized_kernel, dtype, input_shape, kernel_shape, input_zero_point, - kernel_zero_point, units, output, out_dtype='int32', bias=None): + kernel_zero_point, units, output, out_dtype='int32', bias=None, requantize=None): + if requantize is not None: + assert bias is not None config = { 'quantized_data': quantized_data, 'quantized_kernel': quantized_kernel, @@ -271,76 +282,93 @@ def make_test_configuration(quantized_data, quantized_kernel, dtype, input_shape 'units': units, 'output': output, 'out_dtype': out_dtype, - 'bias': bias + 'bias': bias, + 'requantize': requantize } return config - def make_uint_configuration(use_bias=False): + def make_uint_configuration(use_bias=False, requantize_output=False): input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3) input_zero_point, kernel_zero_point = -127, -127 - dtype, out_dtype = 'uint8', 'int32' + in_dtype = 'uint8' + out_dtype = 'int32' if not requantize_output else 'uint8' units = 3 quantized_data_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 109, 107, 129, 131, 133, 135, 137, 139, 141, 111, 145, 107])\ - .astype(dtype)\ + .astype(in_dtype)\ .reshape(input_shape) quantized_kernel_np = np.array([129, 131, 133, 135, 137, 139, 141, 
143, 145, 147, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147])\ - .astype(dtype)\ + .astype(in_dtype)\ .reshape(kernel_shape) bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None + requant_params = make_requantize_params(0.25, 1.0, 127, 'uint8') if requantize_output else None - if use_bias: - output = np.array([96, 100, 104, 232, 236, 240 ]).astype(out_dtype).reshape(output_shape) + if requantize_output: + assert use_bias + output = np.array([151, 152, 153, 185, 186, 187]) + elif use_bias: + output = np.array([96, 100, 104, 232, 236, 240 ]) else: - output = np.array([92, 92, 92, 228, 228, 228 ]).astype(out_dtype).reshape(output_shape) + output = np.array([92, 92, 92, 228, 228, 228 ]) + output = output.astype(out_dtype).reshape(output_shape) return make_test_configuration(quantized_data=quantized_data_np, quantized_kernel=quantized_kernel_np, - dtype=dtype, + dtype=in_dtype, input_shape=input_shape, kernel_shape=kernel_shape, input_zero_point=input_zero_point, kernel_zero_point=kernel_zero_point, units=units, output=output, - bias=bias) + bias=bias, + requantize=requant_params) - def make_int_configuration(use_bias=False): + def make_int_configuration(use_bias=False, requantize_output=False): input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3) input_zero_point, kernel_zero_point = 1, 1 - dtype, out_dtype = 'int8', 'int32' + in_dtype = 'int8' + out_dtype = 'int32' if not requantize_output else 'int8' units = 3 quantized_data_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, -19, -21, 1, 3, 5, 7, 9, 11, 13, -17, 17, -21]) \ - .astype(dtype) \ + .astype(in_dtype) \ .reshape(input_shape) quantized_kernel_np = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) \ - .astype(dtype) \ + .astype(in_dtype) \ .reshape(kernel_shape) bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None - if use_bias: - output = np.array([96, 100, 104, 232, 236, 240 ]).astype(out_dtype).reshape(output_shape) + requant_params = make_requantize_params(0.25, 1.0, -1, 'int8') if requantize_output else None + + if requantize_output: + assert use_bias + output = np.array([23, 24, 25, 57, 58, 59]) + elif use_bias: + output = np.array([96, 100, 104, 232, 236, 240 ]) else: - output = np.array([92, 92, 92, 228, 228, 228 ]).astype(out_dtype).reshape(output_shape) + output = np.array([92, 92, 92, 228, 228, 228 ]) + output = output.astype(out_dtype).reshape(output_shape) return make_test_configuration(quantized_data=quantized_data_np, quantized_kernel=quantized_kernel_np, - dtype=dtype, + dtype=in_dtype, input_shape=input_shape, kernel_shape=kernel_shape, input_zero_point=input_zero_point, kernel_zero_point=kernel_zero_point, units=units, output=output, - bias=bias) + bias=bias, + requantize=requant_params) def test_quantized_convolution(test_configuration): in_dtype = test_configuration['dtype'] out_dtype = test_configuration['out_dtype'] quantized_data_name = "quantized_data" quantized_kernel_name = "quantized_kernel" + expected_out_dtype = test_configuration['out_dtype'] bias_name = 'bias' quantized_data = relay.var(quantized_data_name, shape=test_configuration['input_shape'], dtype=in_dtype) @@ -355,6 +383,16 @@ def test_quantized_convolution(test_configuration): if test_configuration[bias_name] is not None: bias = relay.var(bias_name, shape=test_configuration['bias'].shape, dtype=out_dtype) mod = relay.nn.bias_add(mod, bias) + if 
test_configuration['requantize'] is not None: + requantize_config = test_configuration['requantize'] + mod = relay.qnn.op.requantize( + mod, + input_scale=requantize_config['input_scale'], + input_zero_point=0, + output_scale=requantize_config['output_scale'], + output_zero_point=requantize_config['output_zero_point'], + out_dtype=requantize_config['out_dtype']) + expected_out_dtype = requantize_config['out_dtype'] mod = relay.Function(relay.analysis.free_vars(mod), mod) mod = relay.Module.from_expr(mod) mod = relay.qnn.transform.QnnLower()(mod) with relay.build_config(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) @@ -369,10 +407,10 @@ def test_quantized_convolution(test_configuration): mod.run() res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, test_configuration['output']) - assert res.dtype == test_configuration['out_dtype'] + assert res.dtype == expected_out_dtype def test_configurations(): - test_params = [{'use_bias': False}, {'use_bias': True}] + test_params = [{'use_bias': False}, {'use_bias': True}, {'use_bias': True, 'requantize_output': True}, ] tests = [test_quantized_convolution] configurations = [] for test_param in test_params: @@ -385,5 +423,5 @@ def test_configurations(): test_configurations() if __name__ == "__main__": - # test_requantize() + test_requantize() test_quantized_dense() From a115c963148653dfb6996ce25a258c8566339d62 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 23 Jul 2019 02:42:04 +0000 Subject: [PATCH 51/51] Adding static inline. --- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/pass/pattern_util.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py index b961dc12c8e3..65106c770862 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. #pylint: disable=invalid-name - """QNN dialect operators.""" + from __future__ import absolute_import as _abs from . import _make diff --git a/src/relay/pass/pattern_util.h index 5d6a9cf3e68f..2f43da3dc1d3 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -374,17 +374,17 @@ inline Expr Copy(Expr data) { } -inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) { +static inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) { static const Op& op = Op::Get("where"); return CallNode::make(op, {condition, x, y}); } -inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { +static inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { static const Op& op = Op::Get("greater_equal"); return CallNode::make(op, {lhs, rhs}, Attrs(), {}); } -inline Expr Full(Expr fill_value, +static inline Expr Full(Expr fill_value, Array shape, DataType dtype) { auto attrs = make_node();
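For the record, the int8 expectations in make_int_configuration above can be reproduced outside Relay with plain NumPy. This is a sanity sketch only: the zero points happen to enter with a plus sign for these particular constants to reproduce the expected accumulators, which is an observation about the test data rather than a statement of the op's contract:

    import numpy as np

    data = np.array([[1, 3, 5, 7, 9, 11, 13, 15, -19, -21],
                     [1, 3, 5, 7, 9, 11, 13, -17, 17, -21]], dtype='int32')
    kernel = np.array([[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]] * 3, dtype='int32')
    bias = np.array([4, 8, 12], dtype='int32')

    # Accumulate with the zero point folded in, then add the per-unit bias.
    acc = (data + 1).dot((kernel + 1).T) + bias   # [[96 100 104] [232 236 240]]

    # Requantize stage: input_scale=0.25, output_scale=1.0, zp_out=-1.
    requantized = (np.floor(acc * 0.25 + 0.5) - 1).astype('int8')
    print(requantized)  # [[23 24 25] [57 58 59]], the expected test output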