From 847dd52c17b5e85f7650f6a9c928a7b974d668e9 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Mon, 8 Jul 2019 12:12:40 -0700 Subject: [PATCH 01/37] [Relay] [Quantization] WIP - Common files for the qauntization work. --- include/tvm/relay/attrs/qnn.h | 37 +++++++ include/tvm/relay/quantize_util.h | 139 ++++++++++++++++++++++++++ python/tvm/relay/op/__init__.py | 1 + python/tvm/relay/op/qnn/__init__.py | 20 ++++ python/tvm/relay/op/qnn/_make.py | 20 ++++ python/tvm/relay/op/qnn/qnn.py | 21 ++++ python/tvm/relay/quantize/__init__.py | 1 + python/tvm/relay/quantize/rewrite.py | 38 +++++++ src/relay/pass/pattern_util.h | 20 ++++ src/relay/pass/quantize_rewrite.cc | 38 +++++++ 10 files changed, 335 insertions(+) create mode 100644 include/tvm/relay/attrs/qnn.h create mode 100644 include/tvm/relay/quantize_util.h create mode 100644 python/tvm/relay/op/qnn/__init__.py create mode 100644 python/tvm/relay/op/qnn/_make.py create mode 100644 python/tvm/relay/op/qnn/qnn.py create mode 100644 python/tvm/relay/quantize/rewrite.py create mode 100644 src/relay/pass/quantize_rewrite.cc diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h new file mode 100644 index 000000000000..c45a33c786f7 --- /dev/null +++ b/include/tvm/relay/attrs/qnn.h @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/attrs/nn.h + * \brief Auxiliary attributes for nn operators. + */ +#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_ + +#include +#include + +namespace tvm { +namespace relay { + + + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h new file mode 100644 index 000000000000..bb054fb8fb65 --- /dev/null +++ b/include/tvm/relay/quantize_util.h @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file nnvm/compiler/quantize_util.h + * \brief Utility methods needs for quantized ops that can be shared + */ + +#ifndef TVM_QUANTIZE_UTIL_H +#define TVM_QUANTIZE_UTIL_H + +#include +#include "./base.h" + +namespace tvm { +namespace relay { + +inline bool is_Int8(const DataType& dtype) { + return dtype == Int(8); +} + +inline bool is_UInt8(const DataType& dtype) { + return dtype == UInt(8); +} + + +inline bool is_Int16(const DataType& dtype) { + return dtype == Int(16); +} + +inline bool is_UInt16(const DataType& dtype) { + return dtype == UInt(16); +} + +inline bool is_Int32(const DataType& dtype) { + return dtype == Int(32); +} + +inline bool is_UInt32(const DataType& dtype) { + return dtype == UInt(32); +} + + + +inline bool is_Float32(const DataType& dtype) { + return dtype == Float(32); +} + +inline bool is_quantized_type(const DataType& dtype) { + return is_Int8(dtype) || is_UInt8(dtype) + || is_Int16(dtype) || is_UInt16(dtype); +} + +enum class QuantizeOpType : uint8_t { + Quantize_Requantize, + Dequantize, + Requantize +}; + +inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) { + switch(op_type) { + case QuantizeOpType::Quantize_Requantize: + return is_Float32(in_dtype) || is_quantized_type(in_dtype); + case QuantizeOpType ::Dequantize: + return is_quantized_type(in_dtype); + case QuantizeOpType ::Requantize: + return is_Int16(in_dtype) || is_Int32(in_dtype); + default: + return false; + } +} + +inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) { + switch(op_type) { + case QuantizeOpType::Quantize_Requantize: + return is_quantized_type(in_dtype); + case QuantizeOpType::Dequantize: + return is_Float32(in_dtype); + default: + return false; + } +} + +inline const int32_t get_qmin(const DataType& dtype) { + if (is_Int8(dtype)) { + return std::numeric_limits::min(); + } else if (is_UInt8(dtype)) { + return std::numeric_limits::min(); + } else if (is_Int16(dtype)) { + return std::numeric_limits::min(); + } else if (is_UInt16(dtype)) { + return std::numeric_limits::min(); + } else if (is_Int32(dtype)) { + return std::numeric_limits::min(); + } else if (is_UInt32(dtype)) { + return std::numeric_limits::min(); + } + LOG(FATAL) << "Type not supported\n"; + return -1; +} + + +inline const int32_t get_qmax(const DataType& dtype) { + if (is_Int8(dtype)) { + return std::numeric_limits::max(); + } else if (is_UInt8(dtype)) { + return std::numeric_limits::max(); + } else if (is_Int16(dtype)) { + return std::numeric_limits::max(); + } else if (is_UInt16(dtype)) { + return std::numeric_limits::max(); + } else if (is_Int32(dtype)) { + return std::numeric_limits::max(); + } else if (is_UInt32(dtype)) { + return std::numeric_limits::max(); + } + LOG(FATAL) << "Type not supported\n"; + return -1; +} + +} // namespace relay +} // namespace tvm +#endif //TVM_QUANTIZE_UTIL_H diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index b8ef4df5cdc8..fa27641a8d07 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -27,6 +27,7 @@ from .transform import * from .algorithm import * from . import nn +from . import qnn from . import annotation from . import image from . 
import vision diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/op/qnn/__init__.py new file mode 100644 index 000000000000..aef02300ab63 --- /dev/null +++ b/python/tvm/relay/op/qnn/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .qnn import * \ No newline at end of file diff --git a/python/tvm/relay/op/qnn/_make.py b/python/tvm/relay/op/qnn/_make.py new file mode 100644 index 000000000000..b1695629b8f9 --- /dev/null +++ b/python/tvm/relay/op/qnn/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Constructor APIs""" +from ...._ffi.function import _init_api + +_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py new file mode 100644 index 000000000000..008e6cbb7f80 --- /dev/null +++ b/python/tvm/relay/op/qnn/qnn.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=invalid-name, too-many-lines +"""Neural network operations.""" +from __future__ import absolute_import as _abs +from . 
import _make + diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py index a9e7b40b039e..4d3aad123a6b 100644 --- a/python/tvm/relay/quantize/__init__.py +++ b/python/tvm/relay/quantize/__init__.py @@ -19,5 +19,6 @@ from __future__ import absolute_import as _abs from .quantize import * +from .rewrite import * from ._annotate import register_annotate_function from .kl_divergence import kl_divergence_scale diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/quantize/rewrite.py new file mode 100644 index 000000000000..89429e522115 --- /dev/null +++ b/python/tvm/relay/quantize/rewrite.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=unused-argument +"""Automatic quantization toolkit.""" +from __future__ import absolute_import + +from . import _quantize +from .. import expr as _expr + +def rewrite(expr): + """ + Rewrites the high-level quantized ops into low-level exisiting Relay ops. + + Parameters + ---------- + expr : tvm.relay.Expr + The input expression. + + Returns + ------- + expr : tvm.relay.Expr + The output expression. + """ + return _quantize.rewrite(expr) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 7dcfd5cb4b7f..4bd203949136 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -394,6 +394,26 @@ inline Expr Variance(Expr data, Expr mean, Array axis, bool keepdims, b } +inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) { + static const Op& op = Op::Get("where"); + return CallNode::make(op, {condition, x, y}); +} + +inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { + static const Op& op = Op::Get("greater_equal"); + return CallNode::make(op, {lhs, rhs}, Attrs(), {}); +} + +inline Expr Full(Expr fill_value, + Array shape, + DataType dtype) { + auto attrs = make_node(); + attrs->shape = std::move(shape); + attrs->dtype = std::move(dtype); + static const Op& op = Op::Get("full"); + return CallNode::make(op, {fill_value}, Attrs(attrs), {}); +} + Expr MakeConcatenate(Expr data, int axis); Expr MakeStridedSlice(Expr data, Array begin, Array end, Array strides); diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc new file mode 100644 index 000000000000..925c516b41ed --- /dev/null +++ b/src/relay/pass/quantize_rewrite.cc @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file quantize_rewrite.cc + * \brief Lower quantized ops to exisiting Relay ops. + */ + +#include +#include +#include +#include +#include +#include "pattern_util.h" + +namespace tvm { +namespace relay { + + +} // namespace relay +} // namespace tvm From ed11cd7a17ea58cb5c752bf5d9e43a0efae81af5 Mon Sep 17 00:00:00 2001 From: "shoubhikbhatti@gmail.com" Date: Mon, 8 Jul 2019 12:20:54 -0700 Subject: [PATCH 02/37] [Relay] [Quantization] WIP - Prototyping requantize op. --- include/tvm/relay/attrs/qnn.h | 24 +++ python/tvm/relay/op/qnn/qnn.py | 46 ++++++ src/relay/op/nn/requantize.cc | 89 +++++++++++ src/relay/pass/quantize_rewrite.cc | 237 +++++++++++++++++++++++++++++ 4 files changed, 396 insertions(+) create mode 100644 src/relay/op/nn/requantize.cc diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h index c45a33c786f7..12afe19d26b3 100644 --- a/include/tvm/relay/attrs/qnn.h +++ b/include/tvm/relay/attrs/qnn.h @@ -30,7 +30,31 @@ namespace tvm { namespace relay { +/*! \brief Attribute for requantize operator */ +struct RequantizeAttrs : public tvm::AttrsNode { + double input_scale; + int32_t input_zero_point; + double output_scale; + int32_t output_zero_point; + bool use_int_compute; + DataType out_dtype; + TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") { + TVM_ATTR_FIELD(input_zero_point) + .describe("The zero point of the input tensor."); + TVM_ATTR_FIELD(output_zero_point) + .describe("The zero point of the output tensor."); + TVM_ATTR_FIELD(input_scale) + .describe("The scale of the input tensor."); + TVM_ATTR_FIELD(output_scale) + .describe("The scale of the output tensor."); + TVM_ATTR_FIELD(use_int_compute).set_default(false) + .describe("When true, the integer computation is used to handle output scale"); + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); + } +}; } // namespace relay } // namespace tvm diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py index 008e6cbb7f80..18be68cd9cfc 100644 --- a/python/tvm/relay/op/qnn/qnn.py +++ b/python/tvm/relay/op/qnn/qnn.py @@ -19,3 +19,49 @@ from __future__ import absolute_import as _abs from . import _make + +def requantize(input_data, input_zero_point, input_scale, output_zero_point, + output_scale, out_dtype="int32", use_int_compute=False): + r"""Requantized operator. + + The requantize operator converts one quantized tensor to another quantized + tensor. For the output tensor, we are provided with output scale and zero + point. The computation looks like this + + Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) + + The above computation can be done in floating point as the scales are in + FP32. Alternatively, we can approximate floating point with fixed point + computation. This is controlled by use_int_compute. + + Parameters + ---------- + quantized_data : tvm.relay.Expr + The input quantized_data to the operator. 
+ + input_scale: float + The float scalar to scale the quantized_data int8 values back to FP32. + + output_scale: float + The float scalar to scale the quantized_output int8 values back to FP32. + + input_zero_point: int + The zero point of the quantized_data distribution. + + output_zero_point: int + The zero point of the quantized_output distribution. + + out_dtype : str, optional + Specifies the output quantized_data type for mixed precision conv2d. + + use_int_compute : bool, optional + Use fully integer computation for requantizing. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.requantize(input_data, input_zero_point, input_scale, + output_zero_point, output_scale, out_dtype, + use_int_compute) \ No newline at end of file diff --git a/src/relay/op/nn/requantize.cc b/src/relay/op/nn/requantize.cc new file mode 100644 index 000000000000..80f2bde4ad47 --- /dev/null +++ b/src/relay/op/nn/requantize.cc @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file requantize.cc + * \brief Quantized convolution operators + */ + +#include +#include +#include +#include + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(RequantizeAttrs); + + +bool RequantizeRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + const auto input_dtype = data->dtype; + CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Requantize, input_dtype)) + << "Input type should be a quantized type (u)int8 or (u)int16 but was " << input_dtype; + + const Array oshape = data->shape; + // assign output type + const RequantizeAttrs* param = attrs.as(); + reporter->Assign(types[1], TensorTypeNode::make(oshape, param->out_dtype)); + return true; +} + +// Positional relay function to create quantized conv2d operator +// used by frontend FFI. +Expr MakeRequantize(Expr data, + int32_t input_zero_point, + double input_scale, + int32_t output_zero_point, + double output_scale, + DataType out_dtype, + bool use_int_compute) { + auto attrs = make_node(); + attrs->out_dtype = std::move(out_dtype); + attrs->input_zero_point = std::move(input_zero_point); + attrs->output_zero_point = std::move(output_zero_point); + attrs->input_scale = std::move(input_scale); + attrs->output_scale = std::move(output_scale); + attrs->use_int_compute = std::move(use_int_compute); + static const Op& op = Op::Get("qnn.requantize"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.requantize") +.describe(R"code(Requantize operator. 
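+
+The requantize operator converts one quantized tensor representation to
+another quantized representation. Given the input and output scales and
+zero points, each element is transformed as
+
+    Q_output = zp_output + (scale_input/scale_output) * (Q_input - zp_input)
+
+either in FP32, or with a fixed point approximation of the scale ratio when
+use_int_compute is true.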
+ +FIXME +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.RequantizeAttrs") +.set_num_inputs(1) +.add_argument("data", "Tensor", "The quantized input tensor.") +.set_support_level(10) +.add_type_rel("Requantize", RequantizeRel); + +TVM_REGISTER_API("relay.op.qnn._make.requantize") +.set_body_typed(MakeRequantize); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc index 925c516b41ed..55f8c43fd49f 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/pass/quantize_rewrite.cc @@ -34,5 +34,242 @@ namespace tvm { namespace relay { +// Lowering of qnn.requantize op +void GetFixedPointMultiplierShift(double double_multiplier, + int32_t* fixed_point_multiplier, int* shift, + const DataType& idtype) { + + int acc_dtype_bits = idtype.bits(); + + if (double_multiplier == 0.) { + *fixed_point_multiplier = 0; + *shift = 0; + return; + } + const double q = std::frexp(double_multiplier, shift); + auto q_fixed = static_cast(std::round(q * (1ll << (acc_dtype_bits - 1)))); + CHECK_LE(q_fixed, (1ll << (acc_dtype_bits - 1))); + if (q_fixed == (1ll << (acc_dtype_bits - 1))) { + q_fixed /= 2; + ++*shift; + } + CHECK_LE(q_fixed, std::numeric_limits::max()); + *fixed_point_multiplier = static_cast(q_fixed); +} + +Expr MultiplyByIntegerMuliplier(const Expr& convolved_tensor, + const int32_t fixed_point_multiplier, const int left_shift, + const RequantizeAttrs*& param, const DataType& idtype, + const Array& out_shape) { + // TODO (janimesh) - How to add the overflow checks here. TFLite code snippet is + // bool overflow = a == b && a == std::numeric_limits::min(); + // return overflow ? std::numeric_limits::max() : .....;/ + + // The calculations are done in upcast of idtype to retain precision. + int acc_dtype_bits = idtype.bits(); + DataType up_idtype = Int(2 * acc_dtype_bits); + + auto tensor = convolved_tensor; + // Typically the left_shift will be 0 if the original scale is > 0.5. + if (left_shift != 0) { + tensor = Multiply(tensor, MakeConstantScalar(idtype, 1 << left_shift)); + } + + // Upcast the computation to Int64 and multiply the multiplier. + Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); + auto multiplied_t = Multiply(Cast(tensor, up_idtype), scalar); + + // Since, we are performing fixed point computation. We are only interested in + // higher 16/32 bits. But before that, we also need to perform rounding. + // This is fixed point rounding. So, the rounder add scalar depends if the + // input is positive. + auto zero = MakeConstantScalar(up_idtype, 0); + auto pos_threshold = MakeConstantScalar(up_idtype, + 1ll << (acc_dtype_bits - 2)); + auto neg_threshold = MakeConstantScalar(up_idtype, + (1 - (1ll << (acc_dtype_bits - 2)))); + auto pos_rounder = Full(pos_threshold, out_shape, up_idtype); + auto neg_rounder = Full(neg_threshold, out_shape, up_idtype); + auto rounding_scalar = Where(GreaterEqual(multiplied_t, zero), pos_rounder, neg_rounder); + auto rounded_tensor = Add(multiplied_t, rounding_scalar); + + // Perform right shift to get the first 16/32 bits. + // The result is first doubled and the first 15/31 bits are obtained. This is + // done by just right shifting the result by 15/31 bits. 
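+  // As an illustration, for int32 inputs (acc_dtype_bits = 32), a multiplier
+  // of 0.5 is stored as q_fixed = 2^30. A tensor value of 6 becomes
+  // 6 * 2^30 in int64; adding the positive rounder 2^30 and right shifting
+  // by 31 bits yields round(6 * 0.5) = 3.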
+ auto right_shift_scalar = MakeConstantScalar(up_idtype, (acc_dtype_bits - 1)); + auto scaled_t = RightShift(rounded_tensor, right_shift_scalar); + auto q_imin = get_qmin(idtype); + auto q_imax = get_qmax(idtype); + auto integer_multiplied_t = Cast(Clip(scaled_t, q_imin, q_imax), + idtype); + return integer_multiplied_t; +} + +Expr ShiftByIntegerShift(const Expr& multiplied_t, + const int& exponent, const RequantizeAttrs*& param, + const DataType& idtype, const Array& out_shape) { + CHECK_GE(exponent, 0); + int acc_dtype_bits = idtype.bits(); + CHECK_LE(exponent, (acc_dtype_bits - 1)); + + // We need to perform rounding. The rounding here is closest to the power + // of 2. The exponent basically represents the decimal point. We need to round + // at the decimal point. + auto tensor = multiplied_t; + if (exponent != 0) { + auto pos_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1))); + auto neg_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)) - 1); + auto pos_rounder_t = Full(pos_rounder, out_shape, idtype); + auto neg_rounder_t = Full(neg_rounder, out_shape, idtype); + + auto zero = MakeConstantScalar(idtype, 0); + auto zero_t = Full(zero, out_shape, idtype); + auto round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, + neg_rounder_t); + tensor = Add(tensor, round_scalar); + } + + // Right shift by exponent to approximate the division. + auto scaled_t = RightShift(tensor, + MakeConstantScalar(idtype, exponent)); + return scaled_t; +} + + +/* + * Requantization using only integer computation. Here, the computation is + * converted to a fixed point computation by computing output multiplier and + * shift. This is useful, if the target device does not support/have very + * expensive floating point computations. + * + * Original compuation is scale_fp32 * quantized_tensor. To convert into + * integer computation, the multiplication with fp32 scalar can be replaced by + * multiplication with an int value and then right shifting the result. This + * approximates the floating point computation with a fixed point computation. + * + * The whole computaition this can be broken down into following steps + * 1) Calculate the integer multiplier and integer shift. + * 2) Multiply the integer multiplier with quantized tensor. + * 3) Right shift the result. + * + * The only thing complicating the above computations is the tedious approach of + * handling rounding. + */ +Expr RequantizeInt(const Expr& convolved_tensor, + const RequantizeAttrs*& param, const DataType& idtype, + const Array& out_shape) { + + double double_multiplier = param->input_scale/param->output_scale; + // 1) Calculating the integer multiplier and integer shift + int32_t fixed_point_multiplier; + int shift; + GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier, + &shift, idtype); + + // 2) Multiply the integer multiplier + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 0 : -shift; + auto multiplied_t = MultiplyByIntegerMuliplier(convolved_tensor, + fixed_point_multiplier, left_shift, param, idtype, out_shape); + + // 3) Divide by the denominator or right shift the result. + auto scaled_int32_t = ShiftByIntegerShift(multiplied_t, + right_shift, param, idtype, out_shape); + + // 4) Clip to the out_dtype min/max. 
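+  // The clip limits are the intersection of the out_dtype range and the
+  // idtype range, e.g. for idtype int32 and out_dtype int8 the values are
+  // saturated to [-128, 127] before the final cast.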
+ auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); + auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto clipped_t = Clip(scaled_int32_t, q_min, q_max); + auto requantized_output = Cast(clipped_t, param->out_dtype); + return requantized_output; +} + +/* + * Requantization using floating computation. Here we can multiply the scale to + * the convolved_tensor, round to nearest integer and then cast back to int32. + */ +Expr RequantizeFloat(const Expr& convolved_tensor, + const RequantizeAttrs*& param, const DataType& idtype, + const Array& out_shape) { + double double_multiplier = param->input_scale/param->output_scale; + auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); + + // Multiply the convolved tensor with the new scale. + auto casted_t = Cast(convolved_tensor, Float(32)); + auto multiplied_t = Round(Multiply(casted_t, scalar_multiplier)); + auto q_imin = get_qmin(idtype); + auto q_imax = get_qmax(idtype); + auto scaled_int32_t = Cast(Clip(multiplied_t, q_imin, q_imax), + idtype); + + // Clip to the out_dtype min/max. + // Clip limits must be smaller than the dtype of the input tensor. + auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); + auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto clipped_t = Clip(scaled_int32_t, q_min, q_max); + auto requantized_output = Cast(clipped_t, param->out_dtype); + return requantized_output; +} + +/* + * Lowering of the requantize operation. The requantize operator converts one + * quantized tensor to another quantized tensor. For the output tensor, we are + * provided with output scale and zero point. The computation looks like this + * + * Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) + * + * The above computation can be done in floating point as the scales are in + * FP32. Alternatively, we can approximate floating point with fixed point + * computation. This is controlled by use_int_compute. + */ +Expr RequantizeForwardRewrite(const Call& ref_call, + const Array& new_args, const NodeRef& ctx) { + CHECK_EQ(new_args.size(), 1); + Expr quantized_data = new_args[0]; + const auto* param = ref_call->attrs.as(); + + // Find output shape. + Array out_shape; + auto ref_call_t = ref_call->checked_type(); + auto output_tt = ref_call_t.as(); + CHECK(output_tt != nullptr) << "Type information missing." + << " Please run infer_type pass."; + out_shape = output_tt->shape; + + // Find input dtype. + auto ref_input_t = ref_call->args[0]->checked_type(); + auto input_tt = ref_input_t.as(); + CHECK(input_tt != nullptr) << "Type information missing." + << " Please run infer_type pass."; + const auto input_dtype = input_tt->dtype; + + // Check for current quantization support. + CHECK_EQ(param->input_zero_point, 0) + << "Encountered non-zero zero point." + << " Only symmetric quantization supported for now."; + CHECK_EQ(param->output_zero_point, 0) + << "Encountered non-zero zero point." 
+ << " Only symmetric quantization supported for now."; + + if (param->use_int_compute) { + return RequantizeInt(quantized_data, param, input_dtype, out_shape); + } else { + return RequantizeFloat(quantized_data, param, input_dtype, out_shape); + } +} + + +RELAY_REGISTER_OP("qnn.requantize") +.set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); + + + +TVM_REGISTER_API("relay._quantize.rewrite") +.set_body_typed([](const Expr& e) { + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; +}); + + } // namespace relay } // namespace tvm From 91b58a587246d6b1228c825c3c25a671ab570f6a Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 21:51:02 +0000 Subject: [PATCH 03/37] Requantize operator implementation. Requantize converts one quantized tensor representation to another quantized representation. The PR has following implementation features - Requantize operator defined in qnn namespace - relay.qnn.requantize - Lowering of the requantize to exisiting Relay operators - Integer fixed point implementation of requantize - Two rounding modes - FE_UPWARDS (round towards infinity) and FE_AWAY_FROM_ZERO (std::round behavior) - Floating point implementation as well, that can act as reference or can be used for devices when FP32 computation is not used. - Unit test cases Relevant Issue - https://github.com/dmlc/tvm/issues/2351 Credit to TFLite and GemmLowp to provide reference implementations. --- include/tvm/relay/attrs/qnn.h | 13 +- python/tvm/relay/op/qnn/qnn.py | 13 +- src/relay/op/nn/requantize.cc | 4 +- src/relay/pass/quantize_rewrite.cc | 231 +++++++++--------- tests/python/unittest/test_quantized_ops.py | 257 ++++++++++++++++++++ 5 files changed, 390 insertions(+), 128 deletions(-) create mode 100644 tests/python/unittest/test_quantized_ops.py diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h index 12afe19d26b3..cf69fa759c1c 100644 --- a/include/tvm/relay/attrs/qnn.h +++ b/include/tvm/relay/attrs/qnn.h @@ -37,6 +37,7 @@ struct RequantizeAttrs : public tvm::AttrsNode { double output_scale; int32_t output_zero_point; bool use_int_compute; + std::string rounding_mode; DataType out_dtype; TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") { @@ -48,14 +49,22 @@ struct RequantizeAttrs : public tvm::AttrsNode { .describe("The scale of the input tensor."); TVM_ATTR_FIELD(output_scale) .describe("The scale of the output tensor."); - TVM_ATTR_FIELD(use_int_compute).set_default(false) - .describe("When true, the integer computation is used to handle output scale"); + TVM_ATTR_FIELD(use_int_compute).set_default(true) + .describe("When true, the integer computation is used to handle output scale." + "The float compuation can be used as reference implementation or in" + "cases where FP32 computation for requantize is not expensive"); TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); + TVM_ATTR_FIELD(rounding_mode).set_default("FE_UPWARD") + .describe("Defines the rounding direction when the value is midway between" + "two representable values. There are two supported modes - FE_UPWARD" + "or FE_AWAY_FROM_ZERO. 
More context can be found at" + "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); } }; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py index 18be68cd9cfc..484b3864f22f 100644 --- a/python/tvm/relay/op/qnn/qnn.py +++ b/python/tvm/relay/op/qnn/qnn.py @@ -19,9 +19,9 @@ from __future__ import absolute_import as _abs from . import _make - def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False): + output_scale, out_dtype="int32", use_int_compute=False, + rounding_mode="FE_UPWARD"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -57,11 +57,18 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point, use_int_compute : bool, optional Use fully integer computation for requantizing. + rounding_mode : string, optional + Defines the rounding direction when the value is midway between two + representable values. + Returns ------- result : tvm.relay.Expr The computed result. """ + assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + "Unsupported rounding mode" + return _make.requantize(input_data, input_zero_point, input_scale, output_zero_point, output_scale, out_dtype, - use_int_compute) \ No newline at end of file + use_int_compute, rounding_mode) diff --git a/src/relay/op/nn/requantize.cc b/src/relay/op/nn/requantize.cc index 80f2bde4ad47..285528993f6f 100644 --- a/src/relay/op/nn/requantize.cc +++ b/src/relay/op/nn/requantize.cc @@ -59,7 +59,8 @@ Expr MakeRequantize(Expr data, int32_t output_zero_point, double output_scale, DataType out_dtype, - bool use_int_compute) { + bool use_int_compute, + std::string rounding_mode) { auto attrs = make_node(); attrs->out_dtype = std::move(out_dtype); attrs->input_zero_point = std::move(input_zero_point); @@ -67,6 +68,7 @@ Expr MakeRequantize(Expr data, attrs->input_scale = std::move(input_scale); attrs->output_scale = std::move(output_scale); attrs->use_int_compute = std::move(use_int_compute); + attrs->rounding_mode = std::move(rounding_mode); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc index 55f8c43fd49f..645b20c0730e 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/pass/quantize_rewrite.cc @@ -33,13 +33,27 @@ namespace tvm { namespace relay { - // Lowering of qnn.requantize op + +/* + * Converts a floating point number so that it can be represented by integers. + * The representation is + * float_number = (fixed_point_multiplier) * 2^(shift) + * + * The fixed_point_multiplier is a number between 0.5 and 1. This is represented + * by an integer number. For example, if it is int32, then the decimal point + * exists between bit 31 and 30 from LSB (or between first and second bit from + * the left). + * + * Some examples are + * 0.25 = (0.5) * 2^(-1) + * 0.125 = (0.5) * 2^(-2) + */ void GetFixedPointMultiplierShift(double double_multiplier, int32_t* fixed_point_multiplier, int* shift, const DataType& idtype) { - int acc_dtype_bits = idtype.bits(); + int idtype_bits = idtype.bits(); if (double_multiplier == 0.) 
{ *fixed_point_multiplier = 0; @@ -47,9 +61,9 @@ void GetFixedPointMultiplierShift(double double_multiplier, return; } const double q = std::frexp(double_multiplier, shift); - auto q_fixed = static_cast(std::round(q * (1ll << (acc_dtype_bits - 1)))); - CHECK_LE(q_fixed, (1ll << (acc_dtype_bits - 1))); - if (q_fixed == (1ll << (acc_dtype_bits - 1))) { + auto q_fixed = static_cast(std::round(q * (1ll << (idtype_bits - 1)))); + CHECK_LE(q_fixed, (1ll << (idtype_bits - 1))); + if (q_fixed == (1ll << (idtype_bits - 1))) { q_fixed /= 2; ++*shift; } @@ -57,85 +71,6 @@ void GetFixedPointMultiplierShift(double double_multiplier, *fixed_point_multiplier = static_cast(q_fixed); } -Expr MultiplyByIntegerMuliplier(const Expr& convolved_tensor, - const int32_t fixed_point_multiplier, const int left_shift, - const RequantizeAttrs*& param, const DataType& idtype, - const Array& out_shape) { - // TODO (janimesh) - How to add the overflow checks here. TFLite code snippet is - // bool overflow = a == b && a == std::numeric_limits::min(); - // return overflow ? std::numeric_limits::max() : .....;/ - - // The calculations are done in upcast of idtype to retain precision. - int acc_dtype_bits = idtype.bits(); - DataType up_idtype = Int(2 * acc_dtype_bits); - - auto tensor = convolved_tensor; - // Typically the left_shift will be 0 if the original scale is > 0.5. - if (left_shift != 0) { - tensor = Multiply(tensor, MakeConstantScalar(idtype, 1 << left_shift)); - } - - // Upcast the computation to Int64 and multiply the multiplier. - Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); - auto multiplied_t = Multiply(Cast(tensor, up_idtype), scalar); - - // Since, we are performing fixed point computation. We are only interested in - // higher 16/32 bits. But before that, we also need to perform rounding. - // This is fixed point rounding. So, the rounder add scalar depends if the - // input is positive. - auto zero = MakeConstantScalar(up_idtype, 0); - auto pos_threshold = MakeConstantScalar(up_idtype, - 1ll << (acc_dtype_bits - 2)); - auto neg_threshold = MakeConstantScalar(up_idtype, - (1 - (1ll << (acc_dtype_bits - 2)))); - auto pos_rounder = Full(pos_threshold, out_shape, up_idtype); - auto neg_rounder = Full(neg_threshold, out_shape, up_idtype); - auto rounding_scalar = Where(GreaterEqual(multiplied_t, zero), pos_rounder, neg_rounder); - auto rounded_tensor = Add(multiplied_t, rounding_scalar); - - // Perform right shift to get the first 16/32 bits. - // The result is first doubled and the first 15/31 bits are obtained. This is - // done by just right shifting the result by 15/31 bits. - auto right_shift_scalar = MakeConstantScalar(up_idtype, (acc_dtype_bits - 1)); - auto scaled_t = RightShift(rounded_tensor, right_shift_scalar); - auto q_imin = get_qmin(idtype); - auto q_imax = get_qmax(idtype); - auto integer_multiplied_t = Cast(Clip(scaled_t, q_imin, q_imax), - idtype); - return integer_multiplied_t; -} - -Expr ShiftByIntegerShift(const Expr& multiplied_t, - const int& exponent, const RequantizeAttrs*& param, - const DataType& idtype, const Array& out_shape) { - CHECK_GE(exponent, 0); - int acc_dtype_bits = idtype.bits(); - CHECK_LE(exponent, (acc_dtype_bits - 1)); - - // We need to perform rounding. The rounding here is closest to the power - // of 2. The exponent basically represents the decimal point. We need to round - // at the decimal point. 
- auto tensor = multiplied_t; - if (exponent != 0) { - auto pos_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1))); - auto neg_rounder = MakeConstantScalar(idtype, (1ll << (exponent - 1)) - 1); - auto pos_rounder_t = Full(pos_rounder, out_shape, idtype); - auto neg_rounder_t = Full(neg_rounder, out_shape, idtype); - - auto zero = MakeConstantScalar(idtype, 0); - auto zero_t = Full(zero, out_shape, idtype); - auto round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, - neg_rounder_t); - tensor = Add(tensor, round_scalar); - } - - // Right shift by exponent to approximate the division. - auto scaled_t = RightShift(tensor, - MakeConstantScalar(idtype, exponent)); - return scaled_t; -} - - /* * Requantization using only integer computation. Here, the computation is * converted to a fixed point computation by computing output multiplier and @@ -147,59 +82,123 @@ Expr ShiftByIntegerShift(const Expr& multiplied_t, * multiplication with an int value and then right shifting the result. This * approximates the floating point computation with a fixed point computation. * - * The whole computaition this can be broken down into following steps + * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. - * 2) Multiply the integer multiplier with quantized tensor. - * 3) Right shift the result. + * 2) Subtract the input integer point. + * 2) Multiply the integer fixed point multiplier with quantized tensor. + * 3) Round the result. + * 4) Right shift the result. + * 5) Add the output_zero_point. + * 6) Cast to the out_dtype. * - * The only thing complicating the above computations is the tedious approach of - * handling rounding. */ -Expr RequantizeInt(const Expr& convolved_tensor, +Expr RequantizeInt(const Expr& input_tensor, const RequantizeAttrs*& param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; + + // The multiplication will be performed in higher precision. Find the dtype. + int idtype_bits = idtype.bits(); + DataType up_idtype = Int(2 * idtype_bits); + // 1) Calculating the integer multiplier and integer shift int32_t fixed_point_multiplier; int shift; GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier, &shift, idtype); - - // 2) Multiply the integer multiplier int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 0 : -shift; - auto multiplied_t = MultiplyByIntegerMuliplier(convolved_tensor, - fixed_point_multiplier, left_shift, param, idtype, out_shape); - // 3) Divide by the denominator or right shift the result. - auto scaled_int32_t = ShiftByIntegerShift(multiplied_t, - right_shift, param, idtype, out_shape); + // 2) Subtract the input_zero_point + auto tensor = input_tensor; + tensor = Cast(tensor, up_idtype); + if (param->input_zero_point != 0) { + auto input_zp = MakeConstantScalar(up_idtype, param->input_zero_point); + tensor = Subtract(tensor, input_zp); + } - // 4) Clip to the out_dtype min/max. + + + // 3) Multiply the integer multiplier + if (left_shift != 0) { + tensor = Multiply(tensor, MakeConstantScalar(up_idtype, 1 << left_shift)); + } + // Perform the multiplication in higher precision. + // If idtype is Int(32), the scalar is a fixed point value of int32 where the + // decimal point is between bits 31 and 30. After multiplying with + // input_tensor, the result in int64 where the decimal point is sitting + // between bits 31 and 30 (from the right, rightmost bit is bit 0). 
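+  // As an illustration, a scale ratio of 0.75 decomposes as 0.75 * 2^0, so
+  // fixed_point_multiplier = round(0.75 * 2^31) = 1610612736 with shift = 0,
+  // while a ratio of 3.0 decomposes as 0.75 * 2^2, giving the same
+  // multiplier with left_shift = 2 (applied above).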
+ Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); + auto multiplied_t = Multiply(tensor, scalar); + + + // 4) Find the rounding scalar. This depends on where the final decimal point + // sits. As we will be right shifting the multiplied_t, we need to first + // calculate the totol_right_shift. + int total_right_shift = right_shift + idtype_bits - 1; + + tensor = multiplied_t; + Expr round_scalar; + if (param->rounding_mode == "FE_UPWARD") { + auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); + round_scalar = pos_rounder; + } else if (param->rounding_mode == "FE_AWAY_FROM_ZERO") { + auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); + auto neg_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); + auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); + auto neg_rounder_t = Full(neg_rounder, out_shape, up_idtype); + + auto zero = MakeConstantScalar(up_idtype, 0); + auto zero_t = Full(zero, out_shape, up_idtype); + round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, + neg_rounder_t); + } + // Add the rounding scalar. + tensor = Add(tensor, round_scalar); + + // 5) Simply right shift the result to get the final output. + auto scaled_int64_t = RightShift(tensor, + MakeConstantScalar(up_idtype, total_right_shift)); + + // 6) Add the output zero point. + auto output_zp = MakeConstantScalar(up_idtype, param->output_zero_point); + auto shifted_int64_t = Add(output_zp, scaled_int64_t); + + // 7) Clip to the out_dtype min/max. + // Find the right clip min/maxes. While clipping, it is necessary that + // clip_min and clip_max are within the dtype range of the input tensor to the + // clip operator. For example, if the input to clip operator is int8, but the + // out_dtype is uint8, we will get incorrect results, if we set max as 255. auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); - auto clipped_t = Clip(scaled_int32_t, q_min, q_max); + auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; } -/* + +/* * Requantization using floating computation. Here we can multiply the scale to - * the convolved_tensor, round to nearest integer and then cast back to int32. + * the input_tensor, round to nearest integer and then cast back to int32. */ -Expr RequantizeFloat(const Expr& convolved_tensor, +Expr RequantizeFloat(const Expr& input_tensor, const RequantizeAttrs*& param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); - - // Multiply the convolved tensor with the new scale. - auto casted_t = Cast(convolved_tensor, Float(32)); - auto multiplied_t = Round(Multiply(casted_t, scalar_multiplier)); + auto input_zp = MakeConstantScalar(idtype, param->input_zero_point); + auto output_zp = MakeConstantScalar(Float(32), param->output_zero_point); + + // Multiply the tensor with the new scale. 
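+  // The float path directly evaluates
+  //   Q_output = round(zp_output + (scale_input/scale_output) * (Q_input - zp_input))
+  // in FP32 before clipping to the quantized range and casting back.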
+ auto shifted_input_t = Subtract(input_tensor, input_zp); + auto casted_t = Cast(shifted_input_t, Float(32)); + auto multiplied_t = Multiply(casted_t, scalar_multiplier); + auto shifted_multiplied_t = Add(output_zp, multiplied_t); + auto rounded_t = Round(shifted_multiplied_t); auto q_imin = get_qmin(idtype); auto q_imax = get_qmax(idtype); - auto scaled_int32_t = Cast(Clip(multiplied_t, q_imin, q_imax), + auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), idtype); // Clip to the out_dtype min/max. @@ -243,14 +242,6 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - // Check for current quantization support. - CHECK_EQ(param->input_zero_point, 0) - << "Encountered non-zero zero point." - << " Only symmetric quantization supported for now."; - CHECK_EQ(param->output_zero_point, 0) - << "Encountered non-zero zero point." - << " Only symmetric quantization supported for now."; - if (param->use_int_compute) { return RequantizeInt(quantized_data, param, input_dtype, out_shape); } else { @@ -258,18 +249,14 @@ Expr RequantizeForwardRewrite(const Call& ref_call, } } - RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); - - TVM_REGISTER_API("relay._quantize.rewrite") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); - return ret; -}); - + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; + }); } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py new file mode 100644 index 000000000000..e70ea0925231 --- /dev/null +++ b/tests/python/unittest/test_quantized_ops.py @@ -0,0 +1,257 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
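+# A minimal NumPy reference for requantize, included purely as an
+# illustrative sketch (the tests below construct their golden outputs by
+# hand; note that np.round rounds half to even, so exact midpoints can
+# differ from the operator's FE_UPWARD / FE_AWAY_FROM_ZERO modes):
+#
+#   def requantize_ref(data, input_scale, output_scale, input_zp=0,
+#                      output_zp=0, qmin=-128, qmax=127):
+#       real = (data - input_zp) * (float(input_scale) / float(output_scale))
+#       return np.clip(np.round(real) + output_zp, qmin, qmax).astype('int32')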
+ +import tvm +import numpy as np +from tvm import relay +from tvm.relay.testing import create_workload +from tvm.contrib import graph_runtime + +rounding_modes = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] + +def run_infer_type(expr): + mod = relay.Module.from_expr(expr) + mod = relay.transform.InferType()(mod) + entry = mod["main"] + return entry if isinstance(expr, relay.Function) else entry.body + + +def test_requantize(): + def verify(func, goldens): + with relay.build_config(opt_level=0): + graph, lib, params = relay.build(func, "llvm", params=None) + golden_data, golden_output = goldens + mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod.set_input("quantized_data",golden_data) + mod.set_input(**params) + mod.run() + res = mod.get_output(0).asnumpy() + np.testing.assert_equal(res, golden_output) + + def get_func(data_shape, data_dtype, out_dtype, use_int_compute, + rounding_mode, input_scale, output_scale, input_zero_point=0, + output_zero_point=0): + quantized_data = relay.var("quantized_data", shape=data_shape, + dtype=data_dtype) + func = relay.op.qnn.requantize( + quantized_data, + input_zero_point=input_zero_point, + output_zero_point=output_zero_point, + input_scale=input_scale, + output_scale=output_scale, + rounding_mode=rounding_mode, + out_dtype=out_dtype, + use_int_compute=use_int_compute) + + func = relay.Function(relay.analysis.free_vars(func), + func) + func = run_infer_type(func) + func = relay.quantize.rewrite(func) + print(func) + return func + + + def run_tests(): + def same_scale_test(): + # Have same scales, everything within range + golden_data = np.arange(-100, 100, 1).astype('int32') + golden_output = golden_data + + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=0.5, + output_scale=0.5) + verify(func, (golden_data, golden_output)) + + def downscale_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2], [9, 16, 7]) + else: + golden_output = np.repeat([0, -1, -2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try a different scale + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=4) + + # Try positive values + # 2I corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [3, 4, 4, 4, 4, 4, 4, 4, 1]) + else: + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + def upscale_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=2, + output_scale=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + def saturation_test(): + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) + + def zero_point_test(): + # Output zero point + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16, + output_zero_point=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Input zero point + for rounding_mode in rounding_modes: + for use_int_compute in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int32", + use_int_compute=use_int_compute, + rounding_mode=rounding_mode, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_compute == True and rounding_mode == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + + + + if __name__ == "__main__": + same_scale_test() + downscale_test() + upscale_test() + saturation_test() + zero_point_test() + + run_tests() + +if __name__ == "__main__": + test_requantize() From 13fcc7076f6604895307831c873d343c0c241e64 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 22:23:29 +0000 Subject: [PATCH 04/37] Typo and lint fixes. --- include/tvm/relay/attrs/qnn.h | 6 ++--- include/tvm/relay/quantize_util.h | 27 ++++++++++++--------- python/tvm/relay/op/qnn/__init__.py | 2 +- python/tvm/relay/op/qnn/qnn.py | 4 +-- python/tvm/relay/quantize/rewrite.py | 1 - src/relay/pass/quantize_rewrite.cc | 4 +-- tests/python/unittest/test_quantized_ops.py | 1 - 7 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/tvm/relay/attrs/qnn.h b/include/tvm/relay/attrs/qnn.h index cf69fa759c1c..6bcd77a81f8a 100644 --- a/include/tvm/relay/attrs/qnn.h +++ b/include/tvm/relay/attrs/qnn.h @@ -21,8 +21,8 @@ * \file tvm/relay/attrs/nn.h * \brief Auxiliary attributes for nn operators. 
*/ -#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_ -#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#ifndef TVM_RELAY_ATTRS_QNN_H_ +#define TVM_RELAY_ATTRS_QNN_H_ #include #include @@ -67,4 +67,4 @@ struct RequantizeAttrs : public tvm::AttrsNode { } // namespace relay } // namespace tvm -#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_ +#endif // TVM_RELAY_ATTRS_QNN_H_ diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h index bb054fb8fb65..6a8c2e520098 100644 --- a/include/tvm/relay/quantize_util.h +++ b/include/tvm/relay/quantize_util.h @@ -22,10 +22,11 @@ * \brief Utility methods needs for quantized ops that can be shared */ -#ifndef TVM_QUANTIZE_UTIL_H -#define TVM_QUANTIZE_UTIL_H +#ifndef TVM_RELAY_QUANTIZE_UTIL_H_ +#define TVM_RELAY_QUANTIZE_UTIL_H_ #include +#include #include "./base.h" namespace tvm { @@ -68,14 +69,15 @@ inline bool is_quantized_type(const DataType& dtype) { } enum class QuantizeOpType : uint8_t { - Quantize_Requantize, + Quantize, Dequantize, Requantize }; -inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, const DataType &in_dtype) { - switch(op_type) { - case QuantizeOpType::Quantize_Requantize: +inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, + const DataType &in_dtype) { + switch (op_type) { + case QuantizeOpType::Quantize: return is_Float32(in_dtype) || is_quantized_type(in_dtype); case QuantizeOpType ::Dequantize: return is_quantized_type(in_dtype); @@ -86,9 +88,10 @@ inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, cons } } -inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, const DataType &in_dtype) { - switch(op_type) { - case QuantizeOpType::Quantize_Requantize: +inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, + const DataType &in_dtype) { + switch (op_type) { + case QuantizeOpType::Quantize: return is_quantized_type(in_dtype); case QuantizeOpType::Dequantize: return is_Float32(in_dtype); @@ -134,6 +137,6 @@ inline const int32_t get_qmax(const DataType& dtype) { return -1; } -} // namespace relay -} // namespace tvm -#endif //TVM_QUANTIZE_UTIL_H +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_QUANTIZE_UTIL_H_ diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/op/qnn/__init__.py index aef02300ab63..e9adfa783f93 100644 --- a/python/tvm/relay/op/qnn/__init__.py +++ b/python/tvm/relay/op/qnn/__init__.py @@ -17,4 +17,4 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .qnn import * \ No newline at end of file +from .qnn import * diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py index 484b3864f22f..10477e22ac04 100644 --- a/python/tvm/relay/op/qnn/qnn.py +++ b/python/tvm/relay/op/qnn/qnn.py @@ -20,8 +20,8 @@ from . import _make def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False, - rounding_mode="FE_UPWARD"): + output_scale, out_dtype="int32", use_int_compute=False, + rounding_mode="FE_UPWARD"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/quantize/rewrite.py index 89429e522115..c8860775b77f 100644 --- a/python/tvm/relay/quantize/rewrite.py +++ b/python/tvm/relay/quantize/rewrite.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from . import _quantize -from .. 
import expr as _expr def rewrite(expr): """ diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc index 645b20c0730e..92bd51ad7e15 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/pass/quantize_rewrite.cc @@ -93,7 +93,7 @@ void GetFixedPointMultiplierShift(double double_multiplier, * */ Expr RequantizeInt(const Expr& input_tensor, - const RequantizeAttrs*& param, const DataType& idtype, + const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; @@ -183,7 +183,7 @@ Expr RequantizeInt(const Expr& input_tensor, * the input_tensor, round to nearest integer and then cast back to int32. */ Expr RequantizeFloat(const Expr& input_tensor, - const RequantizeAttrs*& param, const DataType& idtype, + const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index e70ea0925231..8a039edd12b6 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -61,7 +61,6 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, func) func = run_infer_type(func) func = relay.quantize.rewrite(func) - print(func) return func From ac4dfdc70bb0552a3ac616f3ca30a9194d9f5f71 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:08:18 +0000 Subject: [PATCH 05/37] Doc fix. --- include/tvm/relay/quantize_util.h | 2 +- tests/scripts/task_lint.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/tvm/relay/quantize_util.h b/include/tvm/relay/quantize_util.h index 6a8c2e520098..5b5215dc4459 100644 --- a/include/tvm/relay/quantize_util.h +++ b/include/tvm/relay/quantize_util.h @@ -18,7 +18,7 @@ */ /*! - * \file nnvm/compiler/quantize_util.h + * \file tvm/relay/quantize_util.h * \brief Utility methods needs for quantized ops that can be shared */ diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 544ef7224770..896cc4c65c22 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,18 +31,18 @@ echo "Check file types..." python3 tests/lint/check_file_type.py echo "Check ASF license header..." -java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) -if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then - echo "Need to add ASF header to the following files." - echo "----------------File List----------------" - cat /tmp/$$.apache-rat.txt - echo "-----------------------------------------" - echo "Use the following steps to add the headers:" - echo "- Create file_list.txt in your text editor" - echo "- Copy paste the above content in file-list into file_list.txt" - echo "- python3 tests/lint/add_asf_header.py file_list.txt" - exit 1 -fi +# java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) +# if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then +# echo "Need to add ASF header to the following files." 
+# echo "----------------File List----------------" +# cat /tmp/$$.apache-rat.txt +# echo "-----------------------------------------" +# echo "Use the following steps to add the headers:" +# echo "- Create file_list.txt in your text editor" +# echo "- Copy paste the above content in file-list into file_list.txt" +# echo "- python3 tests/lint/add_asf_header.py file_list.txt" +# exit 1 +# fi echo "Check codestyle of c++ code..." make cpplint From 01cad3a9af08bb64bef6be6656c703b690535513 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:10:34 +0000 Subject: [PATCH 06/37] Uncommenting the lint script (fixing mistake). --- tests/scripts/task_lint.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 896cc4c65c22..544ef7224770 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,18 +31,18 @@ echo "Check file types..." python3 tests/lint/check_file_type.py echo "Check ASF license header..." -# java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) -# if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then -# echo "Need to add ASF header to the following files." -# echo "----------------File List----------------" -# cat /tmp/$$.apache-rat.txt -# echo "-----------------------------------------" -# echo "Use the following steps to add the headers:" -# echo "- Create file_list.txt in your text editor" -# echo "- Copy paste the above content in file-list into file_list.txt" -# echo "- python3 tests/lint/add_asf_header.py file_list.txt" -# exit 1 -# fi +java -jar /bin/apache-rat.jar -E tests/lint/rat-excludes -d . | (grep "== File" > /tmp/$$.apache-rat.txt || true) +if grep --quiet -E "File" /tmp/$$.apache-rat.txt; then + echo "Need to add ASF header to the following files." + echo "----------------File List----------------" + cat /tmp/$$.apache-rat.txt + echo "-----------------------------------------" + echo "Use the following steps to add the headers:" + echo "- Create file_list.txt in your text editor" + echo "- Copy paste the above content in file-list into file_list.txt" + echo "- python3 tests/lint/add_asf_header.py file_list.txt" + exit 1 +fi echo "Check codestyle of c++ code..." make cpplint From 6405755070bce1bebabb3d478ec9393fde5ce8a9 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 10 Jul 2019 23:13:54 +0000 Subject: [PATCH 07/37] Modifying the unit tests. 
--- tests/python/unittest/test_quantized_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 8a039edd12b6..6dc35d801543 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -86,7 +86,7 @@ def downscale_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, @@ -189,7 +189,7 @@ def zero_point_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, @@ -218,7 +218,7 @@ def zero_point_test(): for use_int_compute in [True, False]: func = get_func(data_shape=(32, ), data_dtype='int32', - out_dtype="int32", + out_dtype='int8', use_int_compute=use_int_compute, rounding_mode=rounding_mode, input_scale=1, From 7a49beeeb9d5a35794adf7162be0fed647ed537b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 06:05:00 +0000 Subject: [PATCH 08/37] Moving C++ files into src/relay/qnn --- .../qnn.h => src/relay/qnn/include/attrs.h | 0 .../relay/qnn/include/util.h | 73 +++++++++---------- src/relay/{op/nn => qnn/op}/requantize.cc | 6 +- src/relay/{ => qnn}/pass/quantize_rewrite.cc | 18 ++--- 4 files changed, 47 insertions(+), 50 deletions(-) rename include/tvm/relay/attrs/qnn.h => src/relay/qnn/include/attrs.h (100%) rename include/tvm/relay/quantize_util.h => src/relay/qnn/include/util.h (62%) rename src/relay/{op/nn => qnn/op}/requantize.cc (95%) rename src/relay/{ => qnn}/pass/quantize_rewrite.cc (95%) diff --git a/include/tvm/relay/attrs/qnn.h b/src/relay/qnn/include/attrs.h similarity index 100% rename from include/tvm/relay/attrs/qnn.h rename to src/relay/qnn/include/attrs.h diff --git a/include/tvm/relay/quantize_util.h b/src/relay/qnn/include/util.h similarity index 62% rename from include/tvm/relay/quantize_util.h rename to src/relay/qnn/include/util.h index 5b5215dc4459..61663b0da85e 100644 --- a/include/tvm/relay/quantize_util.h +++ b/src/relay/qnn/include/util.h @@ -26,46 +26,43 @@ #define TVM_RELAY_QUANTIZE_UTIL_H_ #include -#include -#include "./base.h" +#include +#include namespace tvm { namespace relay { -inline bool is_Int8(const DataType& dtype) { +inline bool IsInt8(const DataType& dtype) { return dtype == Int(8); } -inline bool is_UInt8(const DataType& dtype) { +inline bool IsUint8(const DataType& dtype) { return dtype == UInt(8); } - -inline bool is_Int16(const DataType& dtype) { +inline bool IsInt16(const DataType& dtype) { return dtype == Int(16); } -inline bool is_UInt16(const DataType& dtype) { +inline bool IsUint16(const DataType& dtype) { return dtype == UInt(16); } -inline bool is_Int32(const DataType& dtype) { +inline bool IsInt32(const DataType& dtype) { return dtype == Int(32); } -inline bool is_UInt32(const DataType& dtype) { +inline bool IsUint32(const DataType& dtype) { return dtype == UInt(32); } - - -inline bool is_Float32(const DataType& dtype) { +inline bool IsFloat32(const DataType& dtype) { return dtype == Float(32); } -inline bool is_quantized_type(const DataType& dtype) { - return is_Int8(dtype) || is_UInt8(dtype) - || is_Int16(dtype) || is_UInt16(dtype); +inline bool IsQuantizedType(const DataType& dtype) { + return IsInt8(dtype) || IsUint8(dtype) + || 
IsInt16(dtype) || IsUint16(dtype); } enum class QuantizeOpType : uint8_t { @@ -74,44 +71,44 @@ enum class QuantizeOpType : uint8_t { Requantize }; -inline bool is_valid_quantized_op_input_type(const QuantizeOpType &op_type, - const DataType &in_dtype) { +inline bool IsValidOpInputType(const QuantizeOpType& op_type, + const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return is_Float32(in_dtype) || is_quantized_type(in_dtype); + return IsFloat32(in_dtype) || IsQuantizedType(in_dtype); case QuantizeOpType ::Dequantize: - return is_quantized_type(in_dtype); + return IsQuantizedType(in_dtype); case QuantizeOpType ::Requantize: - return is_Int16(in_dtype) || is_Int32(in_dtype); + return IsInt16(in_dtype) || IsInt32(in_dtype); default: return false; } } -inline bool is_valid_quantized_op_output_type(const QuantizeOpType &op_type, - const DataType &in_dtype) { +inline bool IsValidOpOutputType(const QuantizeOpType& op_type, + const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return is_quantized_type(in_dtype); + return IsQuantizedType(in_dtype); case QuantizeOpType::Dequantize: - return is_Float32(in_dtype); + return IsFloat32(in_dtype); default: return false; } } -inline const int32_t get_qmin(const DataType& dtype) { - if (is_Int8(dtype)) { +inline const int32_t GetQmin(const DataType& dtype) { + if (IsInt8(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt8(dtype)) { + } else if (IsUint8(dtype)) { return std::numeric_limits::min(); - } else if (is_Int16(dtype)) { + } else if (IsInt16(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt16(dtype)) { + } else if (IsUint16(dtype)) { return std::numeric_limits::min(); - } else if (is_Int32(dtype)) { + } else if (IsInt32(dtype)) { return std::numeric_limits::min(); - } else if (is_UInt32(dtype)) { + } else if (IsUint32(dtype)) { return std::numeric_limits::min(); } LOG(FATAL) << "Type not supported\n"; @@ -119,18 +116,18 @@ inline const int32_t get_qmin(const DataType& dtype) { } -inline const int32_t get_qmax(const DataType& dtype) { - if (is_Int8(dtype)) { +inline const int32_t GetQmax(const DataType& dtype) { + if (IsInt8(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt8(dtype)) { + } else if (IsUint8(dtype)) { return std::numeric_limits::max(); - } else if (is_Int16(dtype)) { + } else if (IsInt16(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt16(dtype)) { + } else if (IsUint16(dtype)) { return std::numeric_limits::max(); - } else if (is_Int32(dtype)) { + } else if (IsInt32(dtype)) { return std::numeric_limits::max(); - } else if (is_UInt32(dtype)) { + } else if (IsUint32(dtype)) { return std::numeric_limits::max(); } LOG(FATAL) << "Type not supported\n"; diff --git a/src/relay/op/nn/requantize.cc b/src/relay/qnn/op/requantize.cc similarity index 95% rename from src/relay/op/nn/requantize.cc rename to src/relay/qnn/op/requantize.cc index 285528993f6f..9e4ddc97467f 100644 --- a/src/relay/op/nn/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -25,8 +25,8 @@ #include #include -#include -#include +#include "../include/attrs.h" +#include "../include/util.h" namespace tvm { namespace relay { @@ -41,7 +41,7 @@ bool RequantizeRel(const Array& types, CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); const auto input_dtype = data->dtype; - CHECK(is_valid_quantized_op_input_type(QuantizeOpType::Requantize, input_dtype)) + CHECK(IsValidOpInputType(QuantizeOpType::Requantize, input_dtype)) << "Input type should be a 
quantized type (u)int8 or (u)int16 but was " << input_dtype; const Array oshape = data->shape; diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc similarity index 95% rename from src/relay/pass/quantize_rewrite.cc rename to src/relay/qnn/pass/quantize_rewrite.cc index 92bd51ad7e15..30265ca1dc32 100644 --- a/src/relay/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -26,9 +26,9 @@ #include #include #include -#include -#include -#include "pattern_util.h" +#include "../include/attrs.h" +#include "../include/util.h" +#include "../../pass/pattern_util.h" namespace tvm { namespace relay { @@ -170,8 +170,8 @@ Expr RequantizeInt(const Expr& input_tensor, // clip_min and clip_max are within the dtype range of the input tensor to the // clip operator. For example, if the input to clip operator is int8, but the // out_dtype is uint8, we will get incorrect results, if we set max as 255. - auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); - auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); + auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; @@ -196,15 +196,15 @@ Expr RequantizeFloat(const Expr& input_tensor, auto multiplied_t = Multiply(casted_t, scalar_multiplier); auto shifted_multiplied_t = Add(output_zp, multiplied_t); auto rounded_t = Round(shifted_multiplied_t); - auto q_imin = get_qmin(idtype); - auto q_imax = get_qmax(idtype); + auto q_imin = GetQmin(idtype); + auto q_imax = GetQmax(idtype); auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), idtype); // Clip to the out_dtype min/max. // Clip limits must be smaller than the dtype of the input tensor. - auto q_min = std::max(get_qmin(param->out_dtype), get_qmin(idtype)); - auto q_max = std::min(get_qmax(param->out_dtype), get_qmax(idtype)); + auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); + auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); auto clipped_t = Clip(scaled_int32_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; From 154e64f60be0cd9527a52e2d979907128e407a6b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 16:15:26 +0000 Subject: [PATCH 09/37] Moving python files to python/tvm/relay/qnn. Some minor fixes. --- python/tvm/relay/__init__.py | 3 + python/tvm/relay/op/__init__.py | 1 - python/tvm/relay/op/qnn/_make.py | 20 ----- python/tvm/relay/op/qnn/qnn.py | 74 ------------------- python/tvm/relay/{op => }/qnn/__init__.py | 3 +- .../{quantize/rewrite.py => qnn/ir_pass.py} | 4 +- python/tvm/relay/quantize/__init__.py | 1 - src/relay/qnn/pass/quantize_rewrite.cc | 18 ++--- tests/python/unittest/test_quantized_ops.py | 4 +- 9 files changed, 18 insertions(+), 110 deletions(-) delete mode 100644 python/tvm/relay/op/qnn/_make.py delete mode 100644 python/tvm/relay/op/qnn/qnn.py rename python/tvm/relay/{op => }/qnn/__init__.py (95%) rename python/tvm/relay/{quantize/rewrite.py => qnn/ir_pass.py} (95%) diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index da14c80b33b4..01baa00c9f7e 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -53,6 +53,9 @@ from . import backend from . import quantize +# Dialects +from . 
import qnn + from .scope_builder import ScopeBuilder # Span diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index fa27641a8d07..b8ef4df5cdc8 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -27,7 +27,6 @@ from .transform import * from .algorithm import * from . import nn -from . import qnn from . import annotation from . import image from . import vision diff --git a/python/tvm/relay/op/qnn/_make.py b/python/tvm/relay/op/qnn/_make.py deleted file mode 100644 index b1695629b8f9..000000000000 --- a/python/tvm/relay/op/qnn/_make.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Constructor APIs""" -from ...._ffi.function import _init_api - -_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/op/qnn/qnn.py b/python/tvm/relay/op/qnn/qnn.py deleted file mode 100644 index 10477e22ac04..000000000000 --- a/python/tvm/relay/op/qnn/qnn.py +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#pylint: disable=invalid-name, too-many-lines -"""Neural network operations.""" -from __future__ import absolute_import as _abs -from . import _make - -def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=False, - rounding_mode="FE_UPWARD"): - r"""Requantized operator. - - The requantize operator converts one quantized tensor to another quantized - tensor. For the output tensor, we are provided with output scale and zero - point. The computation looks like this - - Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - - The above computation can be done in floating point as the scales are in - FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_compute. - - Parameters - ---------- - quantized_data : tvm.relay.Expr - The input quantized_data to the operator. 
- - input_scale: float - The float scalar to scale the quantized_data int8 values back to FP32. - - output_scale: float - The float scalar to scale the quantized_output int8 values back to FP32. - - input_zero_point: int - The zero point of the quantized_data distribution. - - output_zero_point: int - The zero point of the quantized_output distribution. - - out_dtype : str, optional - Specifies the output quantized_data type for mixed precision conv2d. - - use_int_compute : bool, optional - Use fully integer computation for requantizing. - - rounding_mode : string, optional - Defines the rounding direction when the value is midway between two - representable values. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ - "Unsupported rounding mode" - - return _make.requantize(input_data, input_zero_point, input_scale, - output_zero_point, output_scale, out_dtype, - use_int_compute, rounding_mode) diff --git a/python/tvm/relay/op/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py similarity index 95% rename from python/tvm/relay/op/qnn/__init__.py rename to python/tvm/relay/qnn/__init__.py index e9adfa783f93..0836c5770ce4 100644 --- a/python/tvm/relay/op/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -17,4 +17,5 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .qnn import * +from . import op +from . import ir_pass diff --git a/python/tvm/relay/quantize/rewrite.py b/python/tvm/relay/qnn/ir_pass.py similarity index 95% rename from python/tvm/relay/quantize/rewrite.py rename to python/tvm/relay/qnn/ir_pass.py index c8860775b77f..24e3329e961c 100644 --- a/python/tvm/relay/quantize/rewrite.py +++ b/python/tvm/relay/qnn/ir_pass.py @@ -18,7 +18,7 @@ """Automatic quantization toolkit.""" from __future__ import absolute_import -from . import _quantize +from . import _qnn def rewrite(expr): """ @@ -34,4 +34,4 @@ def rewrite(expr): expr : tvm.relay.Expr The output expression. """ - return _quantize.rewrite(expr) + return _qnn.rewrite(expr) diff --git a/python/tvm/relay/quantize/__init__.py b/python/tvm/relay/quantize/__init__.py index 4d3aad123a6b..a9e7b40b039e 100644 --- a/python/tvm/relay/quantize/__init__.py +++ b/python/tvm/relay/quantize/__init__.py @@ -19,6 +19,5 @@ from __future__ import absolute_import as _abs from .quantize import * -from .rewrite import * from ._annotate import register_annotate_function from .kl_divergence import kl_divergence_scale diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 30265ca1dc32..9d10b5a47ba9 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -85,11 +85,11 @@ void GetFixedPointMultiplierShift(double double_multiplier, * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. * 2) Subtract the input integer point. - * 2) Multiply the integer fixed point multiplier with quantized tensor. - * 3) Round the result. - * 4) Right shift the result. - * 5) Add the output_zero_point. - * 6) Cast to the out_dtype. + * 3) Multiply the integer fixed point multiplier with quantized tensor. + * 4) Round the result. + * 5) Right shift the result. + * 6) Add the output_zero_point. + * 7) Cast to the out_dtype. 
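A short Python sketch of the seven steps above may help; it assumes GetFixedPointMultiplierShift does the usual frexp-style decomposition (its body is outside this hunk), so treat it as an illustration rather than the patch's C++:

    import math

    def requantize_int(q_input, input_scale, output_scale,
                       input_zp=0, output_zp=0, rounding="FE_UPWARD"):
        # 1) Decompose input_scale / output_scale into a Q31 fixed point
        #    multiplier and a power-of-two shift (assumed frexp semantics).
        significand, exponent = math.frexp(input_scale / output_scale)
        fixed_point_multiplier = round(significand * (1 << 31))
        left_shift = exponent if exponent > 0 else 0
        right_shift = -exponent if exponent < 0 else 0
        # 2) Subtract the input zero point.
        x = q_input - input_zp
        # 3) Multiply by the fixed point multiplier.
        x = (x << left_shift) * fixed_point_multiplier
        # 4) Add the rounding constant; FE_UPWARD and FE_AWAY_FROM_ZERO
        #    differ only for negative values sitting exactly at a midpoint.
        total_right_shift = right_shift + 31
        if rounding == "FE_UPWARD" or x >= 0:
            x += 1 << (total_right_shift - 1)
        else:
            x += (1 << (total_right_shift - 1)) - 1
        # 5) Shift back down; Python's >> floors, like the arithmetic
        #    shift applied to the upcast int64 tensor.
        x >>= total_right_shift
        # 6) Add the output zero point (step 7, clip and cast, omitted).
        return x + output_zp

    # requantize_int(24, 1.0, 16.0) == 2: 24/16 = 1.5 rounds up to 2.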
* */ Expr RequantizeInt(const Expr& input_tensor, @@ -252,11 +252,11 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._quantize.rewrite") +TVM_REGISTER_API("relay._qnn.rewrite") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); - return ret; - }); + Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + return ret; +}); } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 6dc35d801543..092e695cf533 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -47,7 +47,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) - func = relay.op.qnn.requantize( + func = relay.qnn.op.requantize( quantized_data, input_zero_point=input_zero_point, output_zero_point=output_zero_point, @@ -60,7 +60,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.quantize.rewrite(func) + func = relay.qnn.ir_pass.rewrite(func) return func From 324e75c5921bce9e642bcde82ad41e76ed27a344 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 16:58:44 +0000 Subject: [PATCH 10/37] Moving the attrs.h inside the include directory. --- {src/relay/qnn/include => include/tvm/relay/qnn}/attrs.h | 0 src/relay/qnn/op/requantize.cc | 2 +- src/relay/qnn/pass/quantize_rewrite.cc | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename {src/relay/qnn/include => include/tvm/relay/qnn}/attrs.h (100%) diff --git a/src/relay/qnn/include/attrs.h b/include/tvm/relay/qnn/attrs.h similarity index 100% rename from src/relay/qnn/include/attrs.h rename to include/tvm/relay/qnn/attrs.h diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 9e4ddc97467f..c389e82fba80 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -25,7 +25,7 @@ #include #include -#include "../include/attrs.h" +#include #include "../include/util.h" namespace tvm { diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 9d10b5a47ba9..5d4942c80a7c 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -26,7 +26,7 @@ #include #include #include -#include "../include/attrs.h" +#include #include "../include/util.h" #include "../../pass/pattern_util.h" From ffec47f1bd6e0ae534b59e46d1fc4ffb23f63093 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 11 Jul 2019 19:05:03 +0000 Subject: [PATCH 11/37] Pushing files that I forgot earlier. Changing util location. 
--- python/tvm/relay/qnn/_qnn.py | 22 ++++++++ python/tvm/relay/qnn/op/__init__.py | 20 +++++++ python/tvm/relay/qnn/op/_make.py | 20 +++++++ python/tvm/relay/qnn/op/qnn.py | 74 ++++++++++++++++++++++++++ src/relay/qnn/op/requantize.cc | 2 +- src/relay/qnn/pass/quantize_rewrite.cc | 2 +- src/relay/qnn/{include => }/util.h | 0 7 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 python/tvm/relay/qnn/_qnn.py create mode 100644 python/tvm/relay/qnn/op/__init__.py create mode 100644 python/tvm/relay/qnn/op/_make.py create mode 100644 python/tvm/relay/qnn/op/qnn.py rename src/relay/qnn/{include => }/util.h (100%) diff --git a/python/tvm/relay/qnn/_qnn.py b/python/tvm/relay/qnn/_qnn.py new file mode 100644 index 000000000000..bd3cdbb976d6 --- /dev/null +++ b/python/tvm/relay/qnn/_qnn.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=unused-argument +"""Internal module for quantization.""" +from __future__ import absolute_import +from tvm._ffi.function import _init_api + +_init_api("relay._qnn", __name__) diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py new file mode 100644 index 000000000000..e9adfa783f93 --- /dev/null +++ b/python/tvm/relay/qnn/op/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .qnn import * diff --git a/python/tvm/relay/qnn/op/_make.py b/python/tvm/relay/qnn/op/_make.py new file mode 100644 index 000000000000..b1695629b8f9 --- /dev/null +++ b/python/tvm/relay/qnn/op/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Constructor APIs""" +from ...._ffi.function import _init_api + +_init_api("relay.op.qnn._make", __name__) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py new file mode 100644 index 000000000000..8db431eebe23 --- /dev/null +++ b/python/tvm/relay/qnn/op/qnn.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +#pylint: disable=invalid-name, too-many-lines +"""Neural network operations.""" +from __future__ import absolute_import as _abs +from . import _make + +def requantize(input_data, input_zero_point, input_scale, output_zero_point, + output_scale, out_dtype="int32", use_int_compute=True, + rounding_mode="FE_AWAY_FROM_ZERO"): + r"""Requantized operator. + + The requantize operator converts one quantized tensor to another quantized + tensor. For the output tensor, we are provided with output scale and zero + point. The computation looks like this + + Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) + + The above computation can be done in floating point as the scales are in + FP32. Alternatively, we can approximate floating point with fixed point + computation. This is controlled by use_int_compute. + + Parameters + ---------- + quantized_data : tvm.relay.Expr + The input quantized_data to the operator. + + input_scale: float + The float scalar to scale the quantized_data int8 values back to FP32. + + output_scale: float + The float scalar to scale the quantized_output int8 values back to FP32. + + input_zero_point: int + The zero point of the quantized_data distribution. + + output_zero_point: int + The zero point of the quantized_output distribution. + + out_dtype : str, optional + Specifies the output quantized_data type for mixed precision conv2d. + + use_int_compute : bool, optional + Use fully integer computation for requantizing. + + rounding_mode : string, optional + Defines the rounding direction when the value is midway between two + representable values. + + Returns + ------- + result : tvm.relay.Expr + The computed result. 
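One concrete instance of the computation above, with illustrative values of input_scale = 1, input_zero_point = 16, output_scale = 16, output_zero_point = 0 and Q_input = 40:

    # Q_output = 0 + (1/16) * (40 - 16) = 1.5
    # -> 2 under both FE_UPWARD and FE_AWAY_FROM_ZERO; the two modes
    #    only disagree on negative midpoints such as -1.5.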
+ """ + assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + "Unsupported rounding mode" + + return _make.requantize(input_data, input_zero_point, input_scale, + output_zero_point, output_scale, out_dtype, + use_int_compute, rounding_mode) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index c389e82fba80..df4a224fc2ba 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -26,7 +26,7 @@ #include #include #include -#include "../include/util.h" +#include "../util.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/quantize_rewrite.cc index 5d4942c80a7c..7d4e0f017050 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/quantize_rewrite.cc @@ -27,7 +27,7 @@ #include #include #include -#include "../include/util.h" +#include "../util.h" #include "../../pass/pattern_util.h" namespace tvm { diff --git a/src/relay/qnn/include/util.h b/src/relay/qnn/util.h similarity index 100% rename from src/relay/qnn/include/util.h rename to src/relay/qnn/util.h From 72436ffee002e00f55fc2357065ae25bb1dfe5b4 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 17:03:12 +0000 Subject: [PATCH 12/37] Incorporating comments. API change. Lint fixes. --- include/tvm/relay/qnn/attrs.h | 21 +- python/tvm/relay/qnn/op/qnn.py | 44 +- src/relay/pass/pattern_util.h | 6 +- src/relay/qnn/op/requantize.cc | 29 +- .../{quantize_rewrite.cc => qnn_lower.cc} | 21 +- src/relay/qnn/util.h | 88 ++-- tests/python/unittest/test_quantized_ops.py | 393 +++++++++--------- 7 files changed, 301 insertions(+), 301 deletions(-) rename src/relay/qnn/pass/{quantize_rewrite.cc => qnn_lower.cc} (96%) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 6bcd77a81f8a..1cd7deb4393f 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -21,8 +21,8 @@ * \file tvm/relay/attrs/nn.h * \brief Auxiliary attributes for nn operators. */ -#ifndef TVM_RELAY_ATTRS_QNN_H_ -#define TVM_RELAY_ATTRS_QNN_H_ +#ifndef TVM_RELAY_QNN_ATTRS_H_ +#define TVM_RELAY_QNN_ATTRS_H_ #include #include @@ -36,8 +36,8 @@ struct RequantizeAttrs : public tvm::AttrsNode { int32_t input_zero_point; double output_scale; int32_t output_zero_point; - bool use_int_compute; - std::string rounding_mode; + bool use_int_domain; + std::string rounding; DataType out_dtype; TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") { @@ -49,17 +49,22 @@ struct RequantizeAttrs : public tvm::AttrsNode { .describe("The scale of the input tensor."); TVM_ATTR_FIELD(output_scale) .describe("The scale of the output tensor."); - TVM_ATTR_FIELD(use_int_compute).set_default(true) + TVM_ATTR_FIELD(use_int_domain).set_default(true) .describe("When true, the integer computation is used to handle output scale." "The float compuation can be used as reference implementation or in" "cases where FP32 computation for requantize is not expensive"); TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(rounding_mode).set_default("FE_UPWARD") + TVM_ATTR_FIELD(rounding).set_default("FE_AWAY_FROM_ZERO") .describe("Defines the rounding direction when the value is midway between" "two representable values. There are two supported modes - FE_UPWARD" - "or FE_AWAY_FROM_ZERO. More context can be found at" + "or FE_AWAY_FROM_ZERO. 
Both modes behave exactly same except at the" + "midpoints between the two representable values. At midpoint, FE_UPWARD" + "rounds towards positive infinity (for example -1.5 will be rounded" + "to -1). FE_AWAY_FROM_ZERO is the standard rounding where the value" + "is rounded away from zero at midpoints (for example, -1.5 rounds to" + "-2). More context can be found at" "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); } }; @@ -67,4 +72,4 @@ struct RequantizeAttrs : public tvm::AttrsNode { } // namespace relay } // namespace tvm -#endif // TVM_RELAY_ATTRS_QNN_H_ +#endif // TVM_RELAY_QNN_ATTRS_H_ diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 8db431eebe23..b0e06e41ed13 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -19,9 +19,14 @@ from __future__ import absolute_import as _abs from . import _make -def requantize(input_data, input_zero_point, input_scale, output_zero_point, - output_scale, out_dtype="int32", use_int_compute=True, - rounding_mode="FE_AWAY_FROM_ZERO"): +def requantize(data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + out_dtype="int32", + rounding="FE_AWAY_FROM_ZERO", + use_int_domain=True): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -32,32 +37,32 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point, The above computation can be done in floating point as the scales are in FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_compute. + computation. This is controlled by use_int_domain. Parameters ---------- - quantized_data : tvm.relay.Expr - The input quantized_data to the operator. + data : tvm.relay.Expr + The input data to the operator. input_scale: float - The float scalar to scale the quantized_data int8 values back to FP32. + The float scalar to scale the data int8 values back to FP32. + + input_zero_point: int + The zero point of the data distribution. output_scale: float The float scalar to scale the quantized_output int8 values back to FP32. - input_zero_point: int - The zero point of the quantized_data distribution. - output_zero_point: int The zero point of the quantized_output distribution. out_dtype : str, optional - Specifies the output quantized_data type for mixed precision conv2d. + Specifies the output data type for mixed precision conv2d. - use_int_compute : bool, optional + use_int_domain : bool, optional Use fully integer computation for requantizing. - rounding_mode : string, optional + rounding : string, optional Defines the rounding direction when the value is midway between two representable values. @@ -66,9 +71,14 @@ def requantize(input_data, input_zero_point, input_scale, output_zero_point, result : tvm.relay.Expr The computed result. """ - assert rounding_mode in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + assert rounding in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ "Unsupported rounding mode" - return _make.requantize(input_data, input_zero_point, input_scale, - output_zero_point, output_scale, out_dtype, - use_int_compute, rounding_mode) + return _make.requantize(data, + input_scale, + input_zero_point, + output_scale, + output_zero_point, + out_dtype, + rounding, + use_int_domain) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 4bd203949136..2ea1f8ecd6f6 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -18,7 +18,7 @@ */ /*! 
- * Copyright (c) 2018 by Contributors. + * Copyright (c) 2019 by Contributors. * * \file tvm/relay/pass/pattern_util.h * \brief Header of internal operator functions @@ -405,8 +405,8 @@ inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { } inline Expr Full(Expr fill_value, - Array shape, - DataType dtype) { + Array shape, + DataType dtype) { auto attrs = make_node(); attrs->shape = std::move(shape); attrs->dtype = std::move(dtype); diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index df4a224fc2ba..9d0504631893 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -18,7 +18,7 @@ */ /*! - * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \file requantize.cc * \brief Quantized convolution operators */ @@ -54,29 +54,36 @@ bool RequantizeRel(const Array& types, // Positional relay function to create quantized conv2d operator // used by frontend FFI. Expr MakeRequantize(Expr data, - int32_t input_zero_point, double input_scale, - int32_t output_zero_point, + int32_t input_zero_point, double output_scale, + int32_t output_zero_point, DataType out_dtype, - bool use_int_compute, - std::string rounding_mode) { + std::string rounding, + bool use_int_domain) { auto attrs = make_node(); - attrs->out_dtype = std::move(out_dtype); - attrs->input_zero_point = std::move(input_zero_point); - attrs->output_zero_point = std::move(output_zero_point); attrs->input_scale = std::move(input_scale); + attrs->input_zero_point = std::move(input_zero_point); attrs->output_scale = std::move(output_scale); - attrs->use_int_compute = std::move(use_int_compute); - attrs->rounding_mode = std::move(rounding_mode); + attrs->output_zero_point = std::move(output_zero_point); + attrs->out_dtype = std::move(out_dtype); + attrs->rounding = std::move(rounding); + attrs->use_int_domain = std::move(use_int_domain); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } RELAY_REGISTER_OP("qnn.requantize") .describe(R"code(Requantize operator. +The requantize operator converts one quantized tensor to another quantized +tensor. For the output tensor, we are provided with output scale and zero +point. The computation looks like this + +Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) -FIXME +The above computation can be done in floating point as the scales are in +FP32. Alternatively, we can approximate floating point with fixed point +computation. This is controlled by use_int_domain. )code" TVM_ADD_FILELINE) .set_attrs_type_key("relay.attrs.RequantizeAttrs") .set_num_inputs(1) diff --git a/src/relay/qnn/pass/quantize_rewrite.cc b/src/relay/qnn/pass/qnn_lower.cc similarity index 96% rename from src/relay/qnn/pass/quantize_rewrite.cc rename to src/relay/qnn/pass/qnn_lower.cc index 7d4e0f017050..d491e2a817d3 100644 --- a/src/relay/qnn/pass/quantize_rewrite.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -18,8 +18,8 @@ */ /*! - * Copyright (c) 2018 by Contributors - * \file quantize_rewrite.cc + * Copyright (c) 2019 by Contributors + * \file qnn_lower.cc * \brief Lower quantized ops to exisiting Relay ops. */ @@ -111,15 +111,12 @@ Expr RequantizeInt(const Expr& input_tensor, int right_shift = shift > 0 ? 
0 : -shift; // 2) Subtract the input_zero_point - auto tensor = input_tensor; - tensor = Cast(tensor, up_idtype); + auto tensor = Cast(input_tensor, up_idtype); if (param->input_zero_point != 0) { auto input_zp = MakeConstantScalar(up_idtype, param->input_zero_point); tensor = Subtract(tensor, input_zp); } - - // 3) Multiply the integer multiplier if (left_shift != 0) { tensor = Multiply(tensor, MakeConstantScalar(up_idtype, 1 << left_shift)); @@ -132,18 +129,17 @@ Expr RequantizeInt(const Expr& input_tensor, Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); - // 4) Find the rounding scalar. This depends on where the final decimal point // sits. As we will be right shifting the multiplied_t, we need to first - // calculate the totol_right_shift. + // calculate the total_right_shift. int total_right_shift = right_shift + idtype_bits - 1; tensor = multiplied_t; Expr round_scalar; - if (param->rounding_mode == "FE_UPWARD") { + if (param->rounding == "FE_UPWARD") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); round_scalar = pos_rounder; - } else if (param->rounding_mode == "FE_AWAY_FROM_ZERO") { + } else if (param->rounding == "FE_AWAY_FROM_ZERO") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); auto neg_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); @@ -219,13 +215,14 @@ Expr RequantizeFloat(const Expr& input_tensor, * * The above computation can be done in floating point as the scales are in * FP32. Alternatively, we can approximate floating point with fixed point - * computation. This is controlled by use_int_compute. + * computation. This is controlled by use_int_domain. */ Expr RequantizeForwardRewrite(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { CHECK_EQ(new_args.size(), 1); Expr quantized_data = new_args[0]; const auto* param = ref_call->attrs.as(); + CHECK(param != nullptr); // Find output shape. Array out_shape; @@ -242,7 +239,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - if (param->use_int_compute) { + if (param->use_int_domain) { return RequantizeInt(quantized_data, param, input_dtype, out_shape); } else { return RequantizeFloat(quantized_data, param, input_dtype, out_shape); diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 61663b0da85e..63e7938c93d8 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -18,54 +18,26 @@ */ /*! 
- * \file tvm/relay/quantize_util.h + * \file src/relay/qnn/util.h * \brief Utility methods needs for quantized ops that can be shared */ -#ifndef TVM_RELAY_QUANTIZE_UTIL_H_ -#define TVM_RELAY_QUANTIZE_UTIL_H_ +#ifndef TVM_RELAY_QNN_UTIL_H_ +#define TVM_RELAY_QNN_UTIL_H_ #include -#include #include +#include namespace tvm { namespace relay { -inline bool IsInt8(const DataType& dtype) { - return dtype == Int(8); +inline bool IsQNNDataType(const DataType& dtype) { + return dtype == Int(8) || dtype == UInt(8) + || dtype == Int(16) || dtype == UInt(16); } -inline bool IsUint8(const DataType& dtype) { - return dtype == UInt(8); -} - -inline bool IsInt16(const DataType& dtype) { - return dtype == Int(16); -} - -inline bool IsUint16(const DataType& dtype) { - return dtype == UInt(16); -} - -inline bool IsInt32(const DataType& dtype) { - return dtype == Int(32); -} - -inline bool IsUint32(const DataType& dtype) { - return dtype == UInt(32); -} - -inline bool IsFloat32(const DataType& dtype) { - return dtype == Float(32); -} - -inline bool IsQuantizedType(const DataType& dtype) { - return IsInt8(dtype) || IsUint8(dtype) - || IsInt16(dtype) || IsUint16(dtype); -} - -enum class QuantizeOpType : uint8_t { +enum class QuantizeOpType { Quantize, Dequantize, Requantize @@ -75,11 +47,11 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsFloat32(in_dtype) || IsQuantizedType(in_dtype); - case QuantizeOpType ::Dequantize: - return IsQuantizedType(in_dtype); - case QuantizeOpType ::Requantize: - return IsInt16(in_dtype) || IsInt32(in_dtype); + return in_dtype == Float(32) || IsQNNDataType(in_dtype); + case QuantizeOpType::Dequantize: + return IsQNNDataType(in_dtype); + case QuantizeOpType::Requantize: + return in_dtype == Int(16) || in_dtype == Int(32); default: return false; } @@ -89,51 +61,51 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsQuantizedType(in_dtype); + return IsQNNDataType(in_dtype); case QuantizeOpType::Dequantize: - return IsFloat32(in_dtype); + return in_dtype == Float(32); default: return false; } } inline const int32_t GetQmin(const DataType& dtype) { - if (IsInt8(dtype)) { + if (dtype == Int(8)) { return std::numeric_limits::min(); - } else if (IsUint8(dtype)) { + } else if (dtype == UInt(8)) { return std::numeric_limits::min(); - } else if (IsInt16(dtype)) { + } else if (dtype == Int(16)) { return std::numeric_limits::min(); - } else if (IsUint16(dtype)) { + } else if (dtype == UInt(16)) { return std::numeric_limits::min(); - } else if (IsInt32(dtype)) { + } else if (dtype == Int(32)) { return std::numeric_limits::min(); - } else if (IsUint32(dtype)) { + } else if (dtype == UInt(32)) { return std::numeric_limits::min(); } - LOG(FATAL) << "Type not supported\n"; + LOG(FATAL) << "Type not supported " << dtype; return -1; } inline const int32_t GetQmax(const DataType& dtype) { - if (IsInt8(dtype)) { + if (dtype == Int(8)) { return std::numeric_limits::max(); - } else if (IsUint8(dtype)) { + } else if (dtype == UInt(8)) { return std::numeric_limits::max(); - } else if (IsInt16(dtype)) { + } else if (dtype == Int(16)) { return std::numeric_limits::max(); - } else if (IsUint16(dtype)) { + } else if (dtype == UInt(16)) { return std::numeric_limits::max(); - } else if (IsInt32(dtype)) { + } else if (dtype == Int(32)) { return std::numeric_limits::max(); - } else if (IsUint32(dtype)) { + } 
else if (dtype == UInt(32)) { return std::numeric_limits::max(); } - LOG(FATAL) << "Type not supported\n"; + LOG(FATAL) << "Type not supported " << dtype; return -1; } } // namespace relay } // namespace tvm -#endif // TVM_RELAY_QUANTIZE_UTIL_H_ +#endif // TVM_RELAY_QNN_UTIL_H_ diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_quantized_ops.py index 092e695cf533..17790294b8bc 100644 --- a/tests/python/unittest/test_quantized_ops.py +++ b/tests/python/unittest/test_quantized_ops.py @@ -21,7 +21,7 @@ from tvm.relay.testing import create_workload from tvm.contrib import graph_runtime -rounding_modes = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] +roundings = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] def run_infer_type(expr): mod = relay.Module.from_expr(expr) @@ -32,7 +32,7 @@ def run_infer_type(expr): def test_requantize(): def verify(func, goldens): - with relay.build_config(opt_level=0): + with relay.build_config(opt_level=3): graph, lib, params = relay.build(func, "llvm", params=None) golden_data, golden_output = goldens mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) @@ -42,8 +42,8 @@ def verify(func, goldens): res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, use_int_compute, - rounding_mode, input_scale, output_scale, input_zero_point=0, + def get_func(data_shape, data_dtype, out_dtype, use_int_domain, + rounding, input_scale, output_scale, input_zero_point=0, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) @@ -53,9 +53,9 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, output_zero_point=output_zero_point, input_scale=input_scale, output_scale=output_scale, - rounding_mode=rounding_mode, + rounding=rounding, out_dtype=out_dtype, - use_int_compute=use_int_compute) + use_int_domain=use_int_domain) func = relay.Function(relay.analysis.free_vars(func), func) @@ -64,193 +64,202 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_compute, return func - def run_tests(): - def same_scale_test(): - # Have same scales, everything within range - golden_data = np.arange(-100, 100, 1).astype('int32') - golden_output = golden_data - - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(200, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=0.5, - output_scale=0.5) - verify(func, (golden_data, golden_output)) - - def downscale_test(): - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=16) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2], [9, 16, 7]) - else: - golden_output = np.repeat([0, -1, -2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) - - # Try a different scale - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=4) - - # Try positive values - # 2I corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + def same_scale_test(): + # Have same scales, everything within range + golden_data = np.arange(-100, 100, 1).astype('int32') + golden_output = golden_data + + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + verify(func, (golden_data, golden_output)) + + def downscale_test(): + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2], [9, 16, 7]) + else: + golden_output = np.repeat([0, -1, -2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try a different scale + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=4) + + # Try positive values + # 2I corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [3, 4, 4, 4, 4, 4, 4, 4, 1]) + else: + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [3, 4, 4, 4, 4, 4, 4, 4, 1]) - else: - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) - - def upscale_test(): - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=2, - output_scale=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) - - def saturation_test(): - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(16, ), - data_dtype='int32', - out_dtype="int8", - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=0.5, - output_scale=0.5) - golden_data = np.arange(0, 16, 1).astype('int32') - golden_data = np.add(120, golden_data) - output = np.array([120, 121, 122, 123, 124, 125, 126, 127, - 127, 127, 127, 127, 127, 127, 127, 127]) - golden_output = output - verify(func, (golden_data, golden_output)) - - # Try negative numbers - golden_data = np.arange(0, -16, -1).astype('int32') - golden_data = np.add(-120, golden_data) - output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, - -128, -128, -128, -128, -128, -128, -128, -128]) - golden_output = output - verify(func, (golden_data, golden_output)) - - def zero_point_test(): - # Output zero point - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=16, - output_zero_point=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) - - # Input zero point - for rounding_mode in rounding_modes: - for use_int_compute in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_compute=use_int_compute, - rounding_mode=rounding_mode, - input_scale=1, - output_scale=16, - input_zero_point=16) - - # Try positive values - golden_data = np.arange(32, 64, 1).astype('int32') - golden_output = np.repeat([2, 3, 4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) - - # Try negative values - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_compute == True and rounding_mode == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) - - - - - if __name__ == "__main__": - same_scale_test() - downscale_test() - upscale_test() - saturation_test() - zero_point_test() - - run_tests() + verify(func, (golden_data, golden_output)) + + # Try uint8 out_dtype + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='uint8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + def upscale_test(): + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=2, + output_scale=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + def saturation_test(): + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) + + def zero_point_test(): + # Output zero point + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16, + output_zero_point=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) + + # Input zero point + for rounding in roundings: + for use_int_domain in [True, False]: + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + use_int_domain=use_int_domain, + rounding=rounding, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if use_int_domain == True and rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + same_scale_test() + downscale_test() + upscale_test() + saturation_test() + zero_point_test() if __name__ == "__main__": test_requantize() From 9a721ad95f69ebdd3a9f5a4dace4a7c0c99d334c Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 17:29:14 +0000 Subject: [PATCH 13/37] Modifying the GetFixedPointMultiplierShift API as per comments. 
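The patch below changes GetFixedPointMultiplierShift to return the (significand, exponent) pair instead of writing through output pointers. The idea, borrowed from the TFLite reference implementation, is to express the float multiplier as significand * 2^exponent, with the significand held as a fixed-point integer. A minimal Python sketch of the same decomposition, assuming a 32-bit integer type (the helper name here is illustrative; the C++ below is the actual implementation):

    import math

    def get_fixed_point_multiplier_shift(double_multiplier, idtype_bits=32):
        # Decompose into a significand in [0.5, 1) and a power-of-two exponent.
        significand_d, exponent = math.frexp(double_multiplier)
        # Store the significand as a fixed-point integer whose binary point
        # sits between the two most significant bits (Q1.31 for 32 bits).
        significand = int(round(significand_d * (1 << (idtype_bits - 1))))
        # Rounding can push the significand up to 2^(bits - 1); renormalize.
        if significand == 1 << (idtype_bits - 1):
            significand //= 2
            exponent += 1
        return significand, exponent

    # Example: 0.25 = 0.5 * 2^(-1), so the significand is 2^30 and the shift is -1.
    assert get_fixed_point_multiplier_shift(0.25) == (1 << 30, -1)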
--- src/relay/pass/pattern_util.h | 2 +- src/relay/qnn/pass/qnn_lower.cc | 45 ++++++++++--------- ...{test_quantized_ops.py => test_qnn_ops.py} | 0 3 files changed, 24 insertions(+), 23 deletions(-) rename tests/python/unittest/{test_quantized_ops.py => test_qnn_ops.py} (100%) diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 2ea1f8ecd6f6..4f9b11eb925c 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -18,7 +18,7 @@ */ /*! - * Copyright (c) 2019 by Contributors. + * Copyright (c) 2018 by Contributors. * * \file tvm/relay/pass/pattern_util.h * \brief Header of internal operator functions diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index d491e2a817d3..d0bc2d430961 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -38,9 +38,9 @@ namespace relay { /* * Converts a floating point number so that it can be represented by integers. * The representation is - * float_number = (fixed_point_multiplier) * 2^(shift) + * float_number = (significand) * 2^(exponent) * - * The fixed_point_multiplier is a number between 0.5 and 1. This is represented + * The significand is a number between 0.5 and 1. This is represented * by an integer number. For example, if it is int32, then the decimal point * exists between bit 31 and 30 from LSB (or between first and second bit from * the left). @@ -48,27 +48,28 @@ namespace relay { * Some examples are * 0.25 = (0.5) * 2^(-1) * 0.125 = (0.5) * 2^(-2) + * + * Credit to TFLite reference implementation. */ -void GetFixedPointMultiplierShift(double double_multiplier, - int32_t* fixed_point_multiplier, int* shift, +std::pair GetFixedPointMultiplierShift(double double_multiplier, const DataType& idtype) { - + int significand, exponent; int idtype_bits = idtype.bits(); - if (double_multiplier == 0.) { - *fixed_point_multiplier = 0; - *shift = 0; - return; - } - const double q = std::frexp(double_multiplier, shift); - auto q_fixed = static_cast(std::round(q * (1ll << (idtype_bits - 1)))); - CHECK_LE(q_fixed, (1ll << (idtype_bits - 1))); - if (q_fixed == (1ll << (idtype_bits - 1))) { - q_fixed /= 2; - ++*shift; + // Get the significand (significand) and exponent (exponent) + double significand_d = std::frexp(double_multiplier, &exponent); + + // Convert the double significand to int significand. + significand_d = std::round(significand_d * (1ll << (idtype_bits - 1))); + auto significand_int64 = static_cast(significand_d); + CHECK_LE(significand_int64, (1ll << (idtype_bits - 1))); + if (significand_int64 == (1ll << (idtype_bits - 1))) { + significand_int64 /= 2; + ++exponent; } - CHECK_LE(q_fixed, std::numeric_limits::max()); - *fixed_point_multiplier = static_cast(q_fixed); + CHECK_LE(significand_int64, std::numeric_limits::max()); + significand = static_cast(significand_int64); + return std::pair(significand, exponent); } /* @@ -103,10 +104,10 @@ Expr RequantizeInt(const Expr& input_tensor, DataType up_idtype = Int(2 * idtype_bits); // 1) Calculating the integer multiplier and integer shift - int32_t fixed_point_multiplier; - int shift; - GetFixedPointMultiplierShift(double_multiplier, &fixed_point_multiplier, - &shift, idtype); + std::pair fixed_point_params = + GetFixedPointMultiplierShift(double_multiplier, idtype); + int fixed_point_multiplier = fixed_point_params.first; + int shift = fixed_point_params.second; int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 
0 : -shift; diff --git a/tests/python/unittest/test_quantized_ops.py b/tests/python/unittest/test_qnn_ops.py similarity index 100% rename from tests/python/unittest/test_quantized_ops.py rename to tests/python/unittest/test_qnn_ops.py From fb9cece8d6624b61bb341532be07af4fe4696872 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 19:00:39 +0000 Subject: [PATCH 14/37] Forgot the dialect change. --- python/tvm/relay/qnn/op/_make.py | 2 +- src/relay/qnn/op/requantize.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/qnn/op/_make.py b/python/tvm/relay/qnn/op/_make.py index b1695629b8f9..07b3dd154760 100644 --- a/python/tvm/relay/qnn/op/_make.py +++ b/python/tvm/relay/qnn/op/_make.py @@ -17,4 +17,4 @@ """Constructor APIs""" from ...._ffi.function import _init_api -_init_api("relay.op.qnn._make", __name__) +_init_api("relay.qnn.op._make", __name__) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 9d0504631893..13179f15f22a 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -91,7 +91,7 @@ computation. This is controlled by use_int_domain. .set_support_level(10) .add_type_rel("Requantize", RequantizeRel); -TVM_REGISTER_API("relay.op.qnn._make.requantize") +TVM_REGISTER_API("relay.qnn.op._make.requantize") .set_body_typed(MakeRequantize); } // namespace relay From be7101fc2d34a20bd98b3ce18cac33703ed82384 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 20:19:20 +0000 Subject: [PATCH 15/37] Changing rewrite to qnn_lower. --- python/tvm/relay/qnn/ir_pass.py | 4 ++-- src/relay/qnn/pass/qnn_lower.cc | 2 +- tests/python/unittest/test_qnn_ops.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/qnn/ir_pass.py b/python/tvm/relay/qnn/ir_pass.py index 24e3329e961c..edeecd9a0e6c 100644 --- a/python/tvm/relay/qnn/ir_pass.py +++ b/python/tvm/relay/qnn/ir_pass.py @@ -20,7 +20,7 @@ from . import _qnn -def rewrite(expr): +def qnn_lower(expr): """ Rewrites the high-level quantized ops into low-level exisiting Relay ops. @@ -34,4 +34,4 @@ def rewrite(expr): expr : tvm.relay.Expr The output expression. """ - return _qnn.rewrite(expr) + return _qnn.qnn_lower(expr) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index d0bc2d430961..5048d8686a61 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -250,7 +250,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._qnn.rewrite") +TVM_REGISTER_API("relay._qnn.qnn_lower") .set_body_typed([](const Expr& e) { Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); return ret; diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 17790294b8bc..5c84ef19f1c7 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -60,7 +60,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_domain, func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.qnn.ir_pass.rewrite(func) + func = relay.qnn.ir_pass.qnn_lower(func) return func From 0a5642a371788e135a235d73f855c41fffb4dc8d Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 15 Jul 2019 20:21:23 +0000 Subject: [PATCH 16/37] Renaming Quantize to Qnn for clarity. 
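Since the rewrite hooks now carry the Qnn prefix, it is worth restating what the requantize lowering they drive computes. Per the op documentation, Q_output = zp_output + (scale_input/scale_output) * (Q_input - zp_input), saturated to the output type. A hedged NumPy reference of that formula, using the default AWAY_FROM_ZERO rounding (the function name is hypothetical, and this float-domain model is only a golden reference, not the integer-domain lowering):

    import numpy as np

    def requantize_ref(q_input, input_scale, input_zero_point,
                       output_scale, output_zero_point, out_dtype=np.int8):
        # Q_output = zp_output + (scale_input / scale_output) * (Q_input - zp_input)
        scaled = (q_input.astype(np.float64) - input_zero_point) \
                 * input_scale / output_scale
        # Round halfway cases away from zero, the default rounding mode.
        rounded = np.sign(scaled) * np.floor(np.abs(scaled) + 0.5)
        q_output = rounded + output_zero_point
        # Saturate to the representable range of the output type.
        info = np.iinfo(out_dtype)
        return np.clip(q_output, info.min, info.max).astype(out_dtype)

    # With input_scale=1 and output_scale=16, input 8 represents 0.5 and
    # requantizes to 1, matching the golden values used in the tests.
    data = np.arange(0, 32, 1).astype('int32')
    assert (requantize_ref(data, 1.0, 0, 16.0, 0)
            == np.repeat([0, 1, 2], [8, 16, 8])).all()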
--- src/relay/qnn/pass/qnn_lower.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 5048d8686a61..b05ea8fbded9 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -248,11 +248,11 @@ Expr RequantizeForwardRewrite(const Call& ref_call, } RELAY_REGISTER_OP("qnn.requantize") -.set_attr("FQuantizeForwardRewrite", RequantizeForwardRewrite); +.set_attr("FQnnForwardRewrite", RequantizeForwardRewrite); TVM_REGISTER_API("relay._qnn.qnn_lower") .set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQuantizeForwardRewrite", nullptr, nullptr); + Expr ret = ForwardRewrite(e, "FQnnForwardRewrite", nullptr, nullptr); return ret; }); From a9c1ce00f48667deb2c9d3acbab48835c0cad9c5 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 17 Jul 2019 06:02:41 +0000 Subject: [PATCH 17/37] Remove use_int_domain. --- python/tvm/relay/qnn/op/qnn.py | 13 +- src/relay/qnn/op/requantize.cc | 7 +- src/relay/qnn/pass/qnn_lower.cc | 44 +--- tests/python/unittest/test_qnn_ops.py | 318 ++++++++++++-------------- 4 files changed, 155 insertions(+), 227 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index b0e06e41ed13..65369c840b67 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -25,8 +25,7 @@ def requantize(data, output_scale, output_zero_point, out_dtype="int32", - rounding="FE_AWAY_FROM_ZERO", - use_int_domain=True): + rounding="FE_AWAY_FROM_ZERO"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -35,10 +34,6 @@ def requantize(data, Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - The above computation can be done in floating point as the scales are in - FP32. Alternatively, we can approximate floating point with fixed point - computation. This is controlled by use_int_domain. - Parameters ---------- data : tvm.relay.Expr @@ -59,9 +54,6 @@ def requantize(data, out_dtype : str, optional Specifies the output data type for mixed precision conv2d. - use_int_domain : bool, optional - Use fully integer computation for requantizing. - rounding : string, optional Defines the rounding direction when the value is midway between two representable values. @@ -80,5 +72,4 @@ def requantize(data, output_scale, output_zero_point, out_dtype, - rounding, - use_int_domain) + rounding) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 13179f15f22a..62688147b06e 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -59,8 +59,7 @@ Expr MakeRequantize(Expr data, double output_scale, int32_t output_zero_point, DataType out_dtype, - std::string rounding, - bool use_int_domain) { + std::string rounding) { auto attrs = make_node(); attrs->input_scale = std::move(input_scale); attrs->input_zero_point = std::move(input_zero_point); @@ -68,7 +67,6 @@ Expr MakeRequantize(Expr data, attrs->output_zero_point = std::move(output_zero_point); attrs->out_dtype = std::move(out_dtype); attrs->rounding = std::move(rounding); - attrs->use_int_domain = std::move(use_int_domain); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } @@ -81,9 +79,6 @@ point. The computation looks like this Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) -The above computation can be done in floating point as the scales are in -FP32. 
Alternatively, we can approximate floating point with fixed point -computation. This is controlled by use_int_domain. )code" TVM_ADD_FILELINE) .set_attrs_type_key("relay.attrs.RequantizeAttrs") .set_num_inputs(1) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index b05ea8fbded9..621b8aee2ac7 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -93,7 +93,7 @@ std::pair GetFixedPointMultiplierShift(double double_multiplier, * 7) Cast to the out_dtype. * */ -Expr RequantizeInt(const Expr& input_tensor, +Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, const DataType& idtype, const Array& out_shape) { @@ -174,39 +174,6 @@ Expr RequantizeInt(const Expr& input_tensor, return requantized_output; } - -/* - * Requantization using floating computation. Here we can multiply the scale to - * the input_tensor, round to nearest integer and then cast back to int32. - */ -Expr RequantizeFloat(const Expr& input_tensor, - const RequantizeAttrs* param, const DataType& idtype, - const Array& out_shape) { - double double_multiplier = param->input_scale/param->output_scale; - auto scalar_multiplier = MakeConstantScalar(Float(32), double_multiplier); - auto input_zp = MakeConstantScalar(idtype, param->input_zero_point); - auto output_zp = MakeConstantScalar(Float(32), param->output_zero_point); - - // Multiply the tensor with the new scale. - auto shifted_input_t = Subtract(input_tensor, input_zp); - auto casted_t = Cast(shifted_input_t, Float(32)); - auto multiplied_t = Multiply(casted_t, scalar_multiplier); - auto shifted_multiplied_t = Add(output_zp, multiplied_t); - auto rounded_t = Round(shifted_multiplied_t); - auto q_imin = GetQmin(idtype); - auto q_imax = GetQmax(idtype); - auto scaled_int32_t = Cast(Clip(rounded_t, q_imin, q_imax), - idtype); - - // Clip to the out_dtype min/max. - // Clip limits must be smaller than the dtype of the input tensor. - auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); - auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); - auto clipped_t = Clip(scaled_int32_t, q_min, q_max); - auto requantized_output = Cast(clipped_t, param->out_dtype); - return requantized_output; -} - /* * Lowering of the requantize operation. The requantize operator converts one * quantized tensor to another quantized tensor. For the output tensor, we are @@ -214,9 +181,6 @@ Expr RequantizeFloat(const Expr& input_tensor, * * Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) * - * The above computation can be done in floating point as the scales are in - * FP32. Alternatively, we can approximate floating point with fixed point - * computation. This is controlled by use_int_domain. 
*/ Expr RequantizeForwardRewrite(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { @@ -240,11 +204,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, << " Please run infer_type pass."; const auto input_dtype = input_tt->dtype; - if (param->use_int_domain) { - return RequantizeInt(quantized_data, param, input_dtype, out_shape); - } else { - return RequantizeFloat(quantized_data, param, input_dtype, out_shape); - } + return RequantizeLower(quantized_data, param, input_dtype, out_shape); } RELAY_REGISTER_OP("qnn.requantize") diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 5c84ef19f1c7..342e1ce09d99 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -42,9 +42,8 @@ def verify(func, goldens): res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, use_int_domain, - rounding, input_scale, output_scale, input_zero_point=0, - output_zero_point=0): + def get_func(data_shape, data_dtype, out_dtype, rounding, input_scale, + output_scale, input_zero_point=0, output_zero_point=0): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) func = relay.qnn.op.requantize( @@ -54,8 +53,7 @@ def get_func(data_shape, data_dtype, out_dtype, use_int_domain, input_scale=input_scale, output_scale=output_scale, rounding=rounding, - out_dtype=out_dtype, - use_int_domain=use_int_domain) + out_dtype=out_dtype) func = relay.Function(relay.analysis.free_vars(func), func) @@ -70,190 +68,174 @@ def same_scale_test(): golden_output = golden_data for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(200, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=0.5, - output_scale=0.5) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + verify(func, (golden_data, golden_output)) def downscale_test(): for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2], [9, 16, 7]) - else: - golden_output = np.repeat([0, -1, -2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2], [9, 16, 7]) + else: + golden_output = np.repeat([0, -1, -2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) # Try a different scale - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=4) - - # Try positive values - # 2I corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=1, + output_scale=4) + + # Try positive values + # 2I corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], + [2, 4, 4, 4, 4, 4, 4, 4, 2]) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], + [3, 4, 4, 4, 4, 4, 4, 4, 1]) + else: + golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [3, 4, 4, 4, 4, 4, 4, 4, 1]) - else: - golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], - [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) + verify(func, (golden_data, golden_output)) # Try uint8 out_dtype - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='uint8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='uint8', + rounding=rounding, + input_scale=1, + output_scale=16) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.repeat([0, 1, 2], [8, 16, 8]) + verify(func, (golden_data, golden_output)) def upscale_test(): for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=2, - output_scale=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 - golden_data = np.arange(0, -32, -1).astype('int32') - golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=2, + output_scale=1) + + # Try positive values + # 8 corresponds to 0.5, resulting in 1 + golden_data = np.arange(0, 32, 1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) + + # Try negative values + # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + golden_data = np.arange(0, -32, -1).astype('int32') + golden_output = np.multiply(2, golden_data) + verify(func, (golden_data, golden_output)) def saturation_test(): for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(16, ), - data_dtype='int32', - out_dtype="int8", - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=0.5, - output_scale=0.5) - golden_data = np.arange(0, 16, 1).astype('int32') - golden_data = np.add(120, golden_data) - output = np.array([120, 121, 122, 123, 124, 125, 126, 127, - 127, 127, 127, 127, 127, 127, 127, 127]) - golden_output = output - verify(func, (golden_data, golden_output)) - - # Try negative numbers - golden_data = np.arange(0, -16, -1).astype('int32') - golden_data = np.add(-120, golden_data) - output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, - -128, -128, -128, -128, -128, -128, -128, -128]) - golden_output = output - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + rounding=rounding, + input_scale=0.5, + output_scale=0.5) + golden_data = np.arange(0, 16, 1).astype('int32') + golden_data = np.add(120, golden_data) + output = np.array([120, 121, 122, 123, 124, 125, 126, 127, + 127, 127, 127, 127, 127, 127, 127, 127]) + golden_output = output + verify(func, (golden_data, golden_output)) + + # Try negative numbers + golden_data = np.arange(0, -16, -1).astype('int32') + golden_data = np.add(-120, golden_data) + output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, + -128, -128, -128, -128, -128, -128, -128, -128]) + golden_output = output + verify(func, (golden_data, golden_output)) def zero_point_test(): # Output zero point for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16, - output_zero_point=1) - - # Try positive values - # 8 corresponds to 0.5, resulting in 1 - golden_data = np.arange(0, 32, 1).astype('int32') - golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) - - # Try negative values - # -8 corresponds to -0.5. 
For FE_UPWARD, this is 0 + golden_data = np.arange(-32, -64, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.add(1, golden_output) + verify(func, (golden_data, golden_output)) # Input zero point for rounding in roundings: - for use_int_domain in [True, False]: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - use_int_domain=use_int_domain, - rounding=rounding, - input_scale=1, - output_scale=16, - input_zero_point=16) - - # Try positive values - golden_data = np.arange(32, 64, 1).astype('int32') - golden_output = np.repeat([2, 3, 4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) - - # Try negative values - golden_data = np.arange(-32, -64, -1).astype('int32') - if use_int_domain == True and rounding == "FE_UPWARD": - golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) - else: - golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) - golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) + func = get_func(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + rounding=rounding, + input_scale=1, + output_scale=16, + input_zero_point=16) + + # Try positive values + golden_data = np.arange(32, 64, 1).astype('int32') + golden_output = np.repeat([2, 3, 4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(-32, -64, -1).astype('int32') + if rounding == "FE_UPWARD": + golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) + else: + golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) + golden_output = np.subtract(golden_output, 1) + verify(func, (golden_data, golden_output)) same_scale_test() downscale_test() upscale_test() saturation_test() zero_point_test() From a0d0324eb0e334901015a3e0b1606c4a543f8edd Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 00:05:03 +0000 Subject: [PATCH 18/37] Incorporating review comments.
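One of the review changes below renames the rounding modes from FE_UPWARD and FE_AWAY_FROM_ZERO to UPWARD and AWAY_FROM_ZERO. The two modes differ only at midpoints between representable values: UPWARD sends them towards positive infinity, while AWAY_FROM_ZERO sends them away from zero. An illustrative sketch (these helper names are hypothetical, not part of the QNN API):

    import numpy as np

    def round_upward(x):
        # Midpoints go towards +inf: -1.5 -> -1 and 1.5 -> 2.
        return np.floor(x + 0.5)

    def round_away_from_zero(x):
        # Midpoints go away from zero: -1.5 -> -2 and 1.5 -> 2.
        return np.sign(x) * np.floor(np.abs(x) + 0.5)

    midpoints = np.array([-1.5, -0.5, 0.5, 1.5])
    assert (round_upward(midpoints) == [-1.0, 0.0, 1.0, 2.0]).all()
    assert (round_away_from_zero(midpoints) == [-2.0, -1.0, 1.0, 2.0]).all()

This is why the downscale tests expect an input of -8 (that is, -0.5 at output scale 16) to map to 0 under UPWARD but to -1 under AWAY_FROM_ZERO.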
--- include/tvm/relay/qnn/attrs.h | 29 ++++----- python/tvm/relay/qnn/__init__.py | 2 +- python/tvm/relay/qnn/op/qnn.py | 16 ++--- .../relay/qnn/{ir_pass.py => transform.py} | 0 src/relay/qnn/op/requantize.cc | 8 +-- .../qnn/{pass => transform}/qnn_lower.cc | 4 +- src/relay/qnn/util.h | 45 +++++++------- tests/python/unittest/test_qnn_ops.py | 60 +++++++++---------- 8 files changed, 77 insertions(+), 87 deletions(-) rename python/tvm/relay/qnn/{ir_pass.py => transform.py} (100%) rename src/relay/qnn/{pass => transform}/qnn_lower.cc (98%) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 1cd7deb4393f..7b8bc28ddcdb 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -36,36 +36,31 @@ struct RequantizeAttrs : public tvm::AttrsNode { int32_t input_zero_point; double output_scale; int32_t output_zero_point; - bool use_int_domain; std::string rounding; DataType out_dtype; TVM_DECLARE_ATTRS(RequantizeAttrs, "relay.attrs.RequantizeAttrs") { - TVM_ATTR_FIELD(input_zero_point) - .describe("The zero point of the input tensor."); - TVM_ATTR_FIELD(output_zero_point) - .describe("The zero point of the output tensor."); TVM_ATTR_FIELD(input_scale) .describe("The scale of the input tensor."); + TVM_ATTR_FIELD(input_zero_point) + .describe("The zero point of the input tensor."); TVM_ATTR_FIELD(output_scale) .describe("The scale of the output tensor."); - TVM_ATTR_FIELD(use_int_domain).set_default(true) - .describe("When true, the integer computation is used to handle output scale." - "The float compuation can be used as reference implementation or in" - "cases where FP32 computation for requantize is not expensive"); - TVM_ATTR_FIELD(out_dtype) - .set_default(NullValue()) - .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(rounding).set_default("FE_AWAY_FROM_ZERO") + TVM_ATTR_FIELD(output_zero_point) + .describe("The zero point of the output tensor."); + TVM_ATTR_FIELD(rounding).set_default("AWAY_FROM_ZERO") .describe("Defines the rounding direction when the value is midway between" - "two representable values. There are two supported modes - FE_UPWARD" - "or FE_AWAY_FROM_ZERO. Both modes behave exactly same except at the" - "midpoints between the two representable values. At midpoint, FE_UPWARD" + "two representable values. There are two supported modes - UPWARD" + "or AWAY_FROM_ZERO. Both modes behave exactly same except at the" + "midpoints between the two representable values. At midpoint, UPWARD" "rounds towards positive infinity (for example -1.5 will be rounded" - "to -1). FE_AWAY_FROM_ZERO is the standard rounding where the value" + "to -1). AWAY_FROM_ZERO is the standard rounding where the value" "is rounded away from zero at midpoints (for example, -1.5 rounds to" "-2). More context can be found at" "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); } }; diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py index 0836c5770ce4..409e088156b8 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -18,4 +18,4 @@ """Neural network related operators.""" from __future__ import absolute_import as _abs from . import op -from . import ir_pass +from . 
import transform diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 65369c840b67..208985036640 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -24,8 +24,8 @@ def requantize(data, input_zero_point, output_scale, output_zero_point, - out_dtype="int32", - rounding="FE_AWAY_FROM_ZERO"): + rounding="AWAY_FROM_ZERO", + out_dtype="int32"): r"""Requantized operator. The requantize operator converts one quantized tensor to another quantized @@ -51,19 +51,19 @@ def requantize(data, output_zero_point: int The zero point of the quantized_output distribution. - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - rounding : string, optional Defines the rounding direction when the value is midway between two representable values. + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + Returns ------- result : tvm.relay.Expr The computed result. """ - assert rounding in ("FE_UPWARD", "FE_AWAY_FROM_ZERO"),\ + assert rounding in ("UPWARD", "AWAY_FROM_ZERO"),\ "Unsupported rounding mode" return _make.requantize(data, @@ -71,5 +71,5 @@ def requantize(data, input_zero_point, output_scale, output_zero_point, - out_dtype, - rounding) + rounding, + out_dtype) diff --git a/python/tvm/relay/qnn/ir_pass.py b/python/tvm/relay/qnn/transform.py similarity index 100% rename from python/tvm/relay/qnn/ir_pass.py rename to python/tvm/relay/qnn/transform.py diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 62688147b06e..699e0e883a7e 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -58,15 +58,15 @@ Expr MakeRequantize(Expr data, int32_t input_zero_point, double output_scale, int32_t output_zero_point, - DataType out_dtype, - std::string rounding) { + std::string rounding, + DataType out_dtype) { auto attrs = make_node(); attrs->input_scale = std::move(input_scale); attrs->input_zero_point = std::move(input_zero_point); attrs->output_scale = std::move(output_scale); attrs->output_zero_point = std::move(output_zero_point); - attrs->out_dtype = std::move(out_dtype); attrs->rounding = std::move(rounding); + attrs->out_dtype = std::move(out_dtype); static const Op& op = Op::Get("qnn.requantize"); return CallNode::make(op, {data}, Attrs(attrs), {}); } @@ -83,7 +83,7 @@ Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) .set_attrs_type_key("relay.attrs.RequantizeAttrs") .set_num_inputs(1) .add_argument("data", "Tensor", "The quantized input tensor.") -.set_support_level(10) +.set_support_level(11) .add_type_rel("Requantize", RequantizeRel); TVM_REGISTER_API("relay.qnn.op._make.requantize") diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/transform/qnn_lower.cc similarity index 98% rename from src/relay/qnn/pass/qnn_lower.cc rename to src/relay/qnn/transform/qnn_lower.cc index 621b8aee2ac7..51d167e13ce9 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/transform/qnn_lower.cc @@ -137,10 +137,10 @@ Expr RequantizeLower(const Expr& input_tensor, tensor = multiplied_t; Expr round_scalar; - if (param->rounding == "FE_UPWARD") { + if (param->rounding == "UPWARD") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); round_scalar = pos_rounder; - } else if (param->rounding == "FE_AWAY_FROM_ZERO") { + } else if (param->rounding == "AWAY_FROM_ZERO") { auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); auto neg_rounder = 
MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 63e7938c93d8..24f03b2a6d84 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -70,37 +70,32 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, } inline const int32_t GetQmin(const DataType& dtype) { - if (dtype == Int(8)) { - return std::numeric_limits::min(); - } else if (dtype == UInt(8)) { - return std::numeric_limits::min(); - } else if (dtype == Int(16)) { - return std::numeric_limits::min(); - } else if (dtype == UInt(16)) { - return std::numeric_limits::min(); - } else if (dtype == Int(32)) { - return std::numeric_limits::min(); - } else if (dtype == UInt(32)) { - return std::numeric_limits::min(); + CHECK_LE(dtype.bits(), 32) + << "QNN ops support less than 32-bit integer values"; + if (dtype.is_int()) { + auto* min_value = as_const_int(dtype.min()); + CHECK(min_value != nullptr); + return static_cast(min_value[0]); + } else if (dtype.is_uint()) { + auto* min_value = as_const_uint(dtype.min()); + CHECK(min_value != nullptr); + return static_cast(min_value[0]); } LOG(FATAL) << "Type not supported " << dtype; return -1; } - inline const int32_t GetQmax(const DataType& dtype) { - if (dtype == Int(8)) { - return std::numeric_limits::max(); - } else if (dtype == UInt(8)) { - return std::numeric_limits::max(); - } else if (dtype == Int(16)) { - return std::numeric_limits::max(); - } else if (dtype == UInt(16)) { - return std::numeric_limits::max(); - } else if (dtype == Int(32)) { - return std::numeric_limits::max(); - } else if (dtype == UInt(32)) { - return std::numeric_limits::max(); + CHECK_LE(dtype.bits(), 32) + << "QNN ops support less than 32-bit integer values"; + if (dtype.is_int()) { + auto* max_value = as_const_int(dtype.max()); + CHECK(max_value != nullptr); + return static_cast(max_value[0]); + } else if (dtype.is_uint()) { + auto* max_value = as_const_uint(dtype.max()); + CHECK(max_value != nullptr); + return static_cast(max_value[0]); } LOG(FATAL) << "Type not supported " << dtype; return -1; diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 342e1ce09d99..8015e6a4d71f 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -21,7 +21,7 @@ from tvm.relay.testing import create_workload from tvm.contrib import graph_runtime -roundings = ["FE_UPWARD", "FE_AWAY_FROM_ZERO"] +roundings = ["UPWARD", "AWAY_FROM_ZERO"] def run_infer_type(expr): mod = relay.Module.from_expr(expr) @@ -42,23 +42,23 @@ def verify(func, goldens): res = mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, rounding, input_scale, - output_scale, input_zero_point=0, output_zero_point=0): + def get_func(data_shape, data_dtype, out_dtype, input_scale, output_scale, + input_zero_point=0, output_zero_point=0, rounding="AWAY_FROM_ZERO"): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) func = relay.qnn.op.requantize( quantized_data, - input_zero_point=input_zero_point, - output_zero_point=output_zero_point, input_scale=input_scale, + input_zero_point=input_zero_point, output_scale=output_scale, + output_zero_point=output_zero_point, rounding=rounding, out_dtype=out_dtype) func = relay.Function(relay.analysis.free_vars(func), func) func = run_infer_type(func) - func = relay.qnn.ir_pass.qnn_lower(func) + func = 
relay.qnn.transform.qnn_lower(func) return func @@ -71,9 +71,9 @@ def same_scale_test(): func = get_func(data_shape=(200, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=0.5, - output_scale=0.5) + output_scale=0.5, + rounding=rounding) verify(func, (golden_data, golden_output)) def downscale_test(): @@ -81,9 +81,9 @@ def downscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='int8', - rounding=rounding, input_scale=1, - output_scale=16) + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -92,9 +92,9 @@ def downscale_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([0, -1, -2], [9, 16, 7]) else: golden_output = np.repeat([0, -1, -2], [8, 16, 8]) @@ -104,9 +104,9 @@ def downscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=1, - output_scale=4) + output_scale=4, + rounding=rounding) # Try positive values # 2I corresponds to 0.5, resulting in 1 @@ -116,9 +116,9 @@ def downscale_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [3, 4, 4, 4, 4, 4, 4, 4, 1]) else: @@ -130,9 +130,9 @@ def downscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='uint8', - rounding=rounding, input_scale=1, - output_scale=16) + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -145,9 +145,9 @@ def upscale_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=2, - output_scale=1) + output_scale=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -156,7 +156,7 @@ def upscale_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') golden_output = np.multiply(2, golden_data) verify(func, (golden_data, golden_output)) @@ -166,9 +166,9 @@ def saturation_test(): func = get_func(data_shape=(16, ), data_dtype='int32', out_dtype="int8", - rounding=rounding, input_scale=0.5, - output_scale=0.5) + output_scale=0.5, + rounding=rounding) golden_data = np.arange(0, 16, 1).astype('int32') golden_data = np.add(120, golden_data) output = np.array([120, 121, 122, 123, 124, 125, 126, 127, @@ -190,10 +190,10 @@ def zero_point_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='int8', - rounding=rounding, input_scale=1, output_scale=16, - output_zero_point=1) + output_zero_point=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 @@ -203,9 +203,9 @@ def zero_point_test(): verify(func, (golden_data, golden_output)) # Try negative values - # -8 corresponds to -0.5. For FE_UPWARD, this is 0 + # -8 corresponds to -0.5. 
For UPWARD, this is 0 golden_data = np.arange(-32, -64, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) @@ -217,10 +217,10 @@ def zero_point_test(): func = get_func(data_shape=(32, ), data_dtype='int32', out_dtype='int8', - rounding=rounding, input_scale=1, output_scale=16, - input_zero_point=16) + input_zero_point=16, + rounding=rounding) # Try positive values golden_data = np.arange(32, 64, 1).astype('int32') @@ -230,7 +230,7 @@ def zero_point_test(): # Try negative values golden_data = np.arange(-32, -64, -1).astype('int32') - if rounding == "FE_UPWARD": + if rounding == "UPWARD": golden_output = np.repeat([-2, -3, -4], [9, 16, 7]) else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) From 513b54417b2284355f25bcd09f589f4f6a051192 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 00:23:50 +0000 Subject: [PATCH 19/37] Adding API doc for QNN dialect. --- docs/langref/relay_op.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 757fdac32b81..52ee03ab134a 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -202,6 +202,16 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.contrib.adaptive_avg_pool2d +**Level 11: QNN Dialect Operators** + +This level supports quantized operators present in the QNN dialect. + +.. autosummary:: + :nosignatures: + + tvm.relay.qnn.op.requantize + + Level 1 Definitions ------------------- .. autofunction:: tvm.relay.log From 435ca2759cc487db072f2475aa8c9179949cf6e5 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 00:35:17 +0000 Subject: [PATCH 20/37] Move the qnn_lower pass to transform namespace. --- python/tvm/relay/qnn/{_qnn.py => _transform.py} | 2 +- python/tvm/relay/qnn/transform.py | 4 ++-- src/relay/qnn/{transform => pass}/qnn_lower.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename python/tvm/relay/qnn/{_qnn.py => _transform.py} (95%) rename src/relay/qnn/{transform => pass}/qnn_lower.cc (99%) diff --git a/python/tvm/relay/qnn/_qnn.py b/python/tvm/relay/qnn/_transform.py similarity index 95% rename from python/tvm/relay/qnn/_qnn.py rename to python/tvm/relay/qnn/_transform.py index bd3cdbb976d6..e2ff6f9ed652 100644 --- a/python/tvm/relay/qnn/_qnn.py +++ b/python/tvm/relay/qnn/_transform.py @@ -19,4 +19,4 @@ from __future__ import absolute_import from tvm._ffi.function import _init_api -_init_api("relay._qnn", __name__) +_init_api("relay.qnn._transform", __name__) diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py index edeecd9a0e6c..1e0952faeb61 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -18,7 +18,7 @@ """Automatic quantization toolkit.""" from __future__ import absolute_import -from . import _qnn +from . import _transform def qnn_lower(expr): """ @@ -34,4 +34,4 @@ def qnn_lower(expr): expr : tvm.relay.Expr The output expression. 
""" - return _qnn.qnn_lower(expr) + return _transform.qnn_lower(expr) diff --git a/src/relay/qnn/transform/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc similarity index 99% rename from src/relay/qnn/transform/qnn_lower.cc rename to src/relay/qnn/pass/qnn_lower.cc index 51d167e13ce9..017d7c8908d9 100644 --- a/src/relay/qnn/transform/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -210,7 +210,7 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQnnForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay._qnn.qnn_lower") +TVM_REGISTER_API("relay.qnn._transform.qnn_lower") .set_body_typed([](const Expr& e) { Expr ret = ForwardRewrite(e, "FQnnForwardRewrite", nullptr, nullptr); return ret; From e4f6a4e74501dccd8b0b9759a49a527b4c059b5c Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 01:05:23 +0000 Subject: [PATCH 21/37] Moving from expr to module. Adding namespace in C++. --- include/tvm/relay/qnn/attrs.h | 3 +- python/tvm/relay/qnn/transform.py | 15 +-- src/relay/qnn/op/requantize.cc | 2 + src/relay/qnn/pass/qnn_lower.cc | 35 +++++- tests/python/unittest/test_qnn_ops.py | 155 +++++++++++++------------- 5 files changed, 116 insertions(+), 94 deletions(-) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 7b8bc28ddcdb..b82416604618 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -29,6 +29,7 @@ namespace tvm { namespace relay { +namespace qnn { /*! \brief Attribute for requantize operator */ struct RequantizeAttrs : public tvm::AttrsNode { @@ -64,7 +65,7 @@ struct RequantizeAttrs : public tvm::AttrsNode { } }; - +} // namespace qnn } // namespace relay } // namespace tvm #endif // TVM_RELAY_QNN_ATTRS_H_ diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py index 1e0952faeb61..406e23fc0fbc 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -14,24 +14,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=unused-argument +# pylint: disable=invalid-name """Automatic quantization toolkit.""" from __future__ import absolute_import from . import _transform -def qnn_lower(expr): +def QnnLower(): """ Rewrites the high-level quantized ops into low-level exisiting Relay ops. - Parameters - ---------- - expr : tvm.relay.Expr - The input expression. - Returns ------- - expr : tvm.relay.Expr - The output expression. + Pass : tvm.relay.transform.Pass + The optmized pas. """ - return _transform.qnn_lower(expr) + return _transform.QnnLower() diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 699e0e883a7e..bce26355baf5 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -30,6 +30,7 @@ namespace tvm { namespace relay { +namespace qnn { TVM_REGISTER_NODE_TYPE(RequantizeAttrs); @@ -89,5 +90,6 @@ Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) TVM_REGISTER_API("relay.qnn.op._make.requantize") .set_body_typed(MakeRequantize); +} // namespace qnn } // namespace relay } // namespace tvm diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 017d7c8908d9..ea46504c6748 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -32,6 +32,15 @@ namespace tvm { namespace relay { +namespace qnn { +/*! + * \brief namespace of qnn lower pass. 
+ * + * Use namespace to reduce potential naming conflict. + */ +namespace qnn_lower { + +using runtime::TypedPackedFunc; // Lowering of qnn.requantize op @@ -210,11 +219,27 @@ Expr RequantizeForwardRewrite(const Call& ref_call, RELAY_REGISTER_OP("qnn.requantize") .set_attr("FQnnForwardRewrite", RequantizeForwardRewrite); -TVM_REGISTER_API("relay.qnn._transform.qnn_lower") -.set_body_typed([](const Expr& e) { - Expr ret = ForwardRewrite(e, "FQnnForwardRewrite", nullptr, nullptr); - return ret; -}); +Expr QnnLower(const Expr& expr) { + return ForwardRewrite(expr, "FQnnForwardRewrite", nullptr, nullptr); +} +} // namespace qnn_lower + +namespace transform { +using namespace tvm::relay::transform; +Pass QnnLower() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast( + relay::qnn::qnn_lower::QnnLower(f)); + }; + return CreateFunctionPass(pass_func, 0, "QnnLower", + {ir::StringImm::make("InferType")}); +} + +TVM_REGISTER_API("relay.qnn._transform.QnnLower") +.set_body_typed(QnnLower); +} // namespace transform +} // namespace qnn } // namespace relay } // namespace tvm diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py index 8015e6a4d71f..1ef868f797c9 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/unittest/test_qnn_ops.py @@ -31,22 +31,22 @@ def run_infer_type(expr): def test_requantize(): - def verify(func, goldens): + def verify(mod, goldens): with relay.build_config(opt_level=3): - graph, lib, params = relay.build(func, "llvm", params=None) + graph, lib, params = relay.build(mod, "llvm", params=None) golden_data, golden_output = goldens - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) - mod.set_input("quantized_data",golden_data) - mod.set_input(**params) - mod.run() - res = mod.get_output(0).asnumpy() + rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod.set_input("quantized_data",golden_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() np.testing.assert_equal(res, golden_output) - def get_func(data_shape, data_dtype, out_dtype, input_scale, output_scale, + def get_mod(data_shape, data_dtype, out_dtype, input_scale, output_scale, input_zero_point=0, output_zero_point=0, rounding="AWAY_FROM_ZERO"): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) - func = relay.qnn.op.requantize( + mod = relay.qnn.op.requantize( quantized_data, input_scale=input_scale, input_zero_point=input_zero_point, @@ -55,11 +55,10 @@ def get_func(data_shape, data_dtype, out_dtype, input_scale, output_scale, rounding=rounding, out_dtype=out_dtype) - func = relay.Function(relay.analysis.free_vars(func), - func) - func = run_infer_type(func) - func = relay.qnn.transform.qnn_lower(func) - return func + mod = relay.Function(relay.analysis.free_vars(mod), mod) + mod = relay.Module.from_expr(mod) + mod = relay.qnn.transform.QnnLower()(mod) + return mod def same_scale_test(): @@ -68,28 +67,28 @@ def same_scale_test(): golden_output = golden_data for rounding in roundings: - func = get_func(data_shape=(200, ), - data_dtype='int32', - out_dtype="int8", - input_scale=0.5, - output_scale=0.5, - rounding=rounding) - verify(func, (golden_data, golden_output)) + mod = get_mod(data_shape=(200, ), + data_dtype='int32', + out_dtype="int8", + input_scale=0.5, + output_scale=0.5, + rounding=rounding) + verify(mod, (golden_data, golden_output)) def downscale_test(): for rounding in roundings: - func = get_func(data_shape=(32, ), - 
data_dtype='int32', - out_dtype='int8', - input_scale=1, - output_scale=16, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + input_scale=1, + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. For UPWARD, this is 0 @@ -98,22 +97,22 @@ def downscale_test(): golden_output = np.repeat([0, -1, -2], [9, 16, 7]) else: golden_output = np.repeat([0, -1, -2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try a different scale - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - input_scale=1, - output_scale=4, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + input_scale=1, + output_scale=4, + rounding=rounding) # Try positive values # 2I corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2, 3, 4, 5, 6, 7, 8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. For UPWARD, this is 0 @@ -124,57 +123,57 @@ def downscale_test(): else: golden_output = np.repeat([0, -1, -2, -3, -4, -5, -6, -7, -8], [2, 4, 4, 4, 4, 4, 4, 4, 2]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try uint8 out_dtype - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='uint8', - input_scale=1, - output_scale=16, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='uint8', + input_scale=1, + output_scale=16, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2], [8, 16, 8]) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) def upscale_test(): for rounding in roundings: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype="int8", - input_scale=2, - output_scale=1, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype="int8", + input_scale=2, + output_scale=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. 
For UPWARD, this is 0 golden_data = np.arange(0, -32, -1).astype('int32') golden_output = np.multiply(2, golden_data) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) def saturation_test(): for rounding in roundings: - func = get_func(data_shape=(16, ), - data_dtype='int32', - out_dtype="int8", - input_scale=0.5, - output_scale=0.5, - rounding=rounding) + mod = get_mod(data_shape=(16, ), + data_dtype='int32', + out_dtype="int8", + input_scale=0.5, + output_scale=0.5, + rounding=rounding) golden_data = np.arange(0, 16, 1).astype('int32') golden_data = np.add(120, golden_data) output = np.array([120, 121, 122, 123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127]) golden_output = output - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative numbers golden_data = np.arange(0, -16, -1).astype('int32') @@ -182,25 +181,25 @@ def saturation_test(): output = np.array([-120, -121, -122, -123, -124, -125, -126, -127, -128, -128, -128, -128, -128, -128, -128, -128]) golden_output = output - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) def zero_point_test(): # Output zero point for rounding in roundings: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - input_scale=1, - output_scale=16, - output_zero_point=1, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + input_scale=1, + output_scale=16, + output_zero_point=1, + rounding=rounding) # Try positive values # 8 corresponds to 0.5, resulting in 1 golden_data = np.arange(0, 32, 1).astype('int32') golden_output = np.repeat([0, 1, 2], [8, 16, 8]) golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values # -8 corresponds to -0.5. For UPWARD, this is 0 @@ -210,23 +209,23 @@ def zero_point_test(): else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) golden_output = np.add(1, golden_output) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Input zero point for rounding in roundings: - func = get_func(data_shape=(32, ), - data_dtype='int32', - out_dtype='int8', - input_scale=1, - output_scale=16, - input_zero_point=16, - rounding=rounding) + mod = get_mod(data_shape=(32, ), + data_dtype='int32', + out_dtype='int8', + input_scale=1, + output_scale=16, + input_zero_point=16, + rounding=rounding) # Try positive values golden_data = np.arange(32, 64, 1).astype('int32') golden_output = np.repeat([2, 3, 4], [8, 16, 8]) golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) # Try negative values golden_data = np.arange(-32, -64, -1).astype('int32') @@ -235,7 +234,7 @@ def zero_point_test(): else: golden_output = np.repeat([-2, -3, -4], [8, 16, 8]) golden_output = np.subtract(golden_output, 1) - verify(func, (golden_data, golden_output)) + verify(mod, (golden_data, golden_output)) same_scale_test() downscale_test() From 10a20d3a9550bd3a6d7876552c75312e80152d05 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 06:12:55 +0000 Subject: [PATCH 22/37] Minor sentence rewrites. Added qnn namespace. 
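
For reference, the golden outputs in the tests above follow this float
reference model of requantize (a NumPy sketch; requantize_ref and round_away
are names invented for this illustration, not part of the patch):

    import numpy as np

    def round_away(x):
        # Round halfway cases away from zero (the AWAY_FROM_ZERO mode).
        return np.sign(x) * np.floor(np.abs(x) + 0.5)

    def requantize_ref(q_in, in_scale, in_zp, out_scale, out_zp,
                       out_dtype=np.int8):
        # Q_out = zp_out + (scale_in / scale_out) * (Q_in - zp_in)
        real = (q_in.astype(np.float64) - in_zp) * in_scale
        q_out = round_away(real / out_scale) + out_zp
        info = np.iinfo(out_dtype)
        return np.clip(q_out, info.min, info.max).astype(out_dtype)

    # downscale_test: input_scale=1, output_scale=16, so 8 -> 0.5 -> 1,
    # matching np.repeat([0, 1, 2], [8, 16, 8]).
    print(requantize_ref(np.arange(0, 32), 1.0, 0, 16.0, 0))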
--- include/tvm/relay/qnn/attrs.h | 4 ++-- python/tvm/relay/qnn/op/qnn.py | 14 +++++++------- python/tvm/relay/qnn/transform.py | 2 +- src/relay/qnn/op/requantize.cc | 1 - src/relay/qnn/util.h | 6 ++++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index b82416604618..be5d3ac08abd 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -18,8 +18,8 @@ */ /*! - * \file tvm/relay/attrs/nn.h - * \brief Auxiliary attributes for nn operators. + * \file tvm/relay/qnn/attrs.h + * \brief Auxiliary attributes for qnn operators. */ #ifndef TVM_RELAY_QNN_ATTRS_H_ #define TVM_RELAY_QNN_ATTRS_H_ diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 208985036640..ebebcbc0bb66 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -28,9 +28,9 @@ def requantize(data, out_dtype="int32"): r"""Requantized operator. - The requantize operator converts one quantized tensor to another quantized - tensor. For the output tensor, we are provided with output scale and zero - point. The computation looks like this + The requantize operator converts one quantized tensor representation to + another quantized tensor representation. For the output tensor, we are + provided with output scale and zero point. The computation is as follows Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) @@ -40,16 +40,16 @@ def requantize(data, The input data to the operator. input_scale: float - The float scalar to scale the data int8 values back to FP32. + The quantization scale for the input tensor. input_zero_point: int - The zero point of the data distribution. + The zero point of the input tensor. output_scale: float - The float scalar to scale the quantized_output int8 values back to FP32. + The quantization scale for the output tensor. output_zero_point: int - The zero point of the quantized_output distribution. + The zero point of the output tensor. rounding : string, optional Defines the rounding direction when the value is midway between two diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py index 406e23fc0fbc..576631b67e7d 100644 --- a/python/tvm/relay/qnn/transform.py +++ b/python/tvm/relay/qnn/transform.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name -"""Automatic quantization toolkit.""" +"""QNN Dialect transformation passes.""" from __future__ import absolute_import from . 
import _transform diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index bce26355baf5..cc38b7fbeed8 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -34,7 +34,6 @@ namespace qnn { TVM_REGISTER_NODE_TYPE(RequantizeAttrs); - bool RequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 24f03b2a6d84..c1b8ae3371cd 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -31,6 +31,7 @@ namespace tvm { namespace relay { +namespace qnn { inline bool IsQNNDataType(const DataType& dtype) { return dtype == Int(8) || dtype == UInt(8) @@ -71,7 +72,7 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, inline const int32_t GetQmin(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) - << "QNN ops support less than 32-bit integer values"; + << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { auto* min_value = as_const_int(dtype.min()); CHECK(min_value != nullptr); @@ -87,7 +88,7 @@ inline const int32_t GetQmin(const DataType& dtype) { inline const int32_t GetQmax(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) - << "QNN ops support less than 32-bit integer values"; + << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { auto* max_value = as_const_int(dtype.max()); CHECK(max_value != nullptr); @@ -101,6 +102,7 @@ inline const int32_t GetQmax(const DataType& dtype) { return -1; } +} // namespace qnn } // namespace relay } // namespace tvm #endif // TVM_RELAY_QNN_UTIL_H_ From 927825d633cf498404446c3713a9ee6db6149a2f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 06:19:50 +0000 Subject: [PATCH 23/37] Added the API doc. --- docs/langref/relay_op.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 52ee03ab134a..269895293f98 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -350,3 +350,8 @@ Level 10 Definitions .. autofunction:: tvm.relay.nn.batch_matmul .. autofunction:: tvm.relay.contrib.adaptive_max_pool2d .. autofunction:: tvm.relay.contrib.adaptive_avg_pool2d + + +Level 11 Definitions +-------------------- +.. autofunction:: tvm.relay.qnn.op.requantize From 48f5a52b5af880ca1d2e7e311e1775aadf2e0794 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 19 Jul 2019 15:25:31 +0000 Subject: [PATCH 24/37] Chanding default out_dtype to int8. Adding a test with in/out_dtype as uint8. --- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/qnn/util.h | 2 +- tests/python/unittest/test_qnn_ops.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index ebebcbc0bb66..e347d0616511 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -25,7 +25,7 @@ def requantize(data, output_scale, output_zero_point, rounding="AWAY_FROM_ZERO", - out_dtype="int32"): + out_dtype="int8"): r"""Requantized operator. 
The requantize operator converts one quantized tensor representation to
diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h
index c1b8ae3371cd..5bbfbd11fa79 100644
--- a/src/relay/qnn/util.h
+++ b/src/relay/qnn/util.h
@@ -52,7 +52,7 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type,
     case QuantizeOpType::Dequantize:
       return IsQNNDataType(in_dtype);
     case QuantizeOpType::Requantize:
-      return in_dtype == Int(16) || in_dtype == Int(32);
+      return in_dtype.is_int() || in_dtype.is_uint();
     default:
       return false;
   }
diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/unittest/test_qnn_ops.py
index 1ef868f797c9..cd4b048719ea 100644
--- a/tests/python/unittest/test_qnn_ops.py
+++ b/tests/python/unittest/test_qnn_ops.py
@@ -139,6 +139,20 @@ def downscale_test():
         golden_output = np.repeat([0, 1, 2], [8, 16, 8])
         verify(mod, (golden_data, golden_output))
 
+        # Try uint8 in_dtype and uint8 out_dtype
+        mod = get_mod(data_shape=(32, ),
+                      data_dtype='uint8',
+                      out_dtype='uint8',
+                      input_scale=1,
+                      output_scale=16,
+                      rounding=rounding)
+
+        # Try positive values
+        # 8 corresponds to 0.5, resulting in 1
+        golden_data = np.arange(0, 32, 1).astype('int32')
+        golden_output = np.repeat([0, 1, 2], [8, 16, 8])
+        verify(mod, (golden_data, golden_output))
+
 def upscale_test():
     for rounding in roundings:
         mod = get_mod(data_shape=(32, ),

From 1422f6d876cd6cb28077d96595158ca11ac43195 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Fri, 19 Jul 2019 18:52:36 +0000
Subject: [PATCH 25/37] Style fixes. Better error messages.

---
 python/tvm/relay/qnn/op/qnn.py    | 2 --
 python/tvm/relay/qnn/transform.py | 1 +
 src/relay/qnn/pass/qnn_lower.cc   | 4 ++++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index e347d0616511..ef73ddbead8d 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -63,8 +63,6 @@ def requantize(data,
     result : tvm.relay.Expr
         The computed result.
     """
-    assert rounding in ("UPWARD", "AWAY_FROM_ZERO"),\
-        "Unsupported rounding mode"
 
     return _make.requantize(data,
                             input_scale,
diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py
index 576631b67e7d..6ca456b4fb81 100644
--- a/python/tvm/relay/qnn/transform.py
+++ b/python/tvm/relay/qnn/transform.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name
+
 """QNN Dialect transformation passes."""
 from __future__ import absolute_import
 
diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc
index ea46504c6748..321c475d48c5 100644
--- a/src/relay/qnn/pass/qnn_lower.cc
+++ b/src/relay/qnn/pass/qnn_lower.cc
@@ -213,6 +213,10 @@ Expr RequantizeForwardRewrite(const Call& ref_call,
                              << " Please run infer_type pass.";
   const auto input_dtype = input_tt->dtype;
 
+  // Check rounding validity.
+  CHECK(param->rounding == "UPWARD" || param->rounding == "AWAY_FROM_ZERO")
+      << "QNN requantize supports two rounding modes - UPWARD and "
+      << "AWAY_FROM_ZERO";
   return RequantizeLower(quantized_data, param, input_dtype, out_shape);
 }
 
From 66a4d76013d4f01aea2aa70468b49fe74d803d38 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Mon, 22 Jul 2019 19:54:42 +0000
Subject: [PATCH 26/37] Adding documentation.
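
For reference, the float-to-fixed-point conversion documented here can be
sketched in Python (an illustrative model of GetFixedPointMultiplierShift,
assuming a 32-bit input dtype so the significand is stored as a Q31 integer):

    import math

    def fixed_point_multiplier_shift(multiplier):
        # multiplier == significand * 2^exponent, significand in [0.5, 1).
        if multiplier == 0.0:
            return 0, 0
        significand, exponent = math.frexp(multiplier)
        # Store the significand as Q31, i.e. with the decimal point between
        # bits 31 and 30.
        significand_q31 = int(round(significand * (1 << 31)))
        if significand_q31 == (1 << 31):  # rounding pushed it up to 1.0
            significand_q31 //= 2
            exponent += 1
        return significand_q31, exponent

    # 1/16 = 0.5 * 2^(-3)  ->  (2**30, -3)
    print(fixed_point_multiplier_shift(1.0 / 16.0))  # (1073741824, -3)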
--- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/qnn/op/requantize.cc | 12 ++++- src/relay/qnn/pass/qnn_lower.cc | 79 ++++++++++++++++++++------------- src/relay/qnn/util.h | 10 ++--- 4 files changed, 63 insertions(+), 40 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index ef73ddbead8d..88e13f2c358e 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -32,7 +32,7 @@ def requantize(data, another quantized tensor representation. For the output tensor, we are provided with output scale and zero point. The computation is as follows - Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) + Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input) Parameters ---------- diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index cc38b7fbeed8..e8978aa03147 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file requantize.cc - * \brief Quantized convolution operators + * \brief QNN requantize operator. */ #include @@ -34,6 +34,14 @@ namespace qnn { TVM_REGISTER_NODE_TYPE(RequantizeAttrs); +/* + * \brief Infer shape function of Requantize op. + * \param types The types of input args. + * \param num_inputs The number of inputs. + * \param attrs The op attributes. + * \param reporter The type reporter that sets the dtype and shapes. + * \return True if the infer shape succeeded. + */ bool RequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -51,7 +59,7 @@ bool RequantizeRel(const Array& types, return true; } -// Positional relay function to create quantized conv2d operator +// Positional relay function to create qnn requantize operator // used by frontend FFI. Expr MakeRequantize(Expr data, double input_scale, diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 321c475d48c5..db380354162d 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file qnn_lower.cc - * \brief Lower quantized ops to exisiting Relay ops. + * \brief Lower qnn ops to a sequence of exisiting Relay ops. */ #include @@ -45,20 +45,24 @@ using runtime::TypedPackedFunc; // Lowering of qnn.requantize op /* - * Converts a floating point number so that it can be represented by integers. - * The representation is - * float_number = (significand) * 2^(exponent) + * \brief Convert FP32 representation into fixed point representation. + * \param double_multplier The input FP32 number. + * \param idtype The input datatype. + * \return The pair of multiplier and shift for fixed point representation. + * \note Converts a floating point number so that it can be represented by + * integers. The representation is + * float_number = (significand) * 2^(exponent) * - * The significand is a number between 0.5 and 1. This is represented - * by an integer number. For example, if it is int32, then the decimal point - * exists between bit 31 and 30 from LSB (or between first and second bit from - * the left). + * The significand is a number between 0.5 and 1. This is represented by + * an integer number. For example, if it is int32, then the decimal point + * exists between bit 31 and 30 from LSB (or between first and second bit + * from the left). 
* - * Some examples are + * Some examples are * 0.25 = (0.5) * 2^(-1) * 0.125 = (0.5) * 2^(-2) * - * Credit to TFLite reference implementation. + * Credit to TFLite reference implementation. */ std::pair GetFixedPointMultiplierShift(double double_multiplier, const DataType& idtype) { @@ -82,25 +86,31 @@ std::pair GetFixedPointMultiplierShift(double double_multiplier, } /* - * Requantization using only integer computation. Here, the computation is - * converted to a fixed point computation by computing output multiplier and - * shift. This is useful, if the target device does not support/have very - * expensive floating point computations. + * \brief Lower requantize to a sequence of ops. + * \param input_tensor The input tensor to requantize op. + * \param param The requantize op attrs. + * \param idtype The dtype of the input tensor. + * \param out_shape The output shape of the requantize op. + * \return The sequence of existing Relay ops. + * \note Requantization using only integer computation. Here, the computation is + * converted to a fixed point computation by computing output multiplier + * and shift. This is useful, if the target device does not support/have + * very expensive floating point computations. * - * Original compuation is scale_fp32 * quantized_tensor. To convert into - * integer computation, the multiplication with fp32 scalar can be replaced by - * multiplication with an int value and then right shifting the result. This - * approximates the floating point computation with a fixed point computation. - * - * The whole computation this can be broken down into following steps - * 1) Calculate the integer multiplier and integer shift. - * 2) Subtract the input integer point. - * 3) Multiply the integer fixed point multiplier with quantized tensor. - * 4) Round the result. - * 5) Right shift the result. - * 6) Add the output_zero_point. - * 7) Cast to the out_dtype. + * Original compuation is scale_fp32 * quantized_tensor. To convert into + * integer computation, the multiplication with fp32 scalar can be + * replaced by multiplication with an int value and then right shifting + * the result. This approximates the floating point computation with a + * fixed point computation. * + * The whole computation this can be broken down into following steps + * 1) Calculate the integer multiplier and integer shift. + * 2) Subtract the input integer point. + * 3) Multiply the integer fixed point multiplier with quantized tensor. + * 4) Round the result. + * 5) Right shift the result. + * 6) Add the output_zero_point. + * 7) Cast to the out_dtype. */ Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, const DataType& idtype, @@ -134,7 +144,7 @@ Expr RequantizeLower(const Expr& input_tensor, // Perform the multiplication in higher precision. // If idtype is Int(32), the scalar is a fixed point value of int32 where the // decimal point is between bits 31 and 30. After multiplying with - // input_tensor, the result in int64 where the decimal point is sitting + // input_tensor, the result is in int64 where the decimal point is sitting // between bits 31 and 30 (from the right, rightmost bit is bit 0). Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); @@ -184,12 +194,17 @@ Expr RequantizeLower(const Expr& input_tensor, } /* - * Lowering of the requantize operation. The requantize operator converts one - * quantized tensor to another quantized tensor. 
For the output tensor, we are - * provided with output scale and zero point. The computation looks like this + * \brief Forward rewrite the requantize op. + * \param ref_call The original call that will be lowered. + * \param new_args The new mutated args to the call node. + * \param ctx The node context. + * \return The sequence of Relay ops for requantize op. + * \note Lowering of the requantize operation. The requantize operator converts + * one quantized tensor to another quantized tensor. For the output + * tensor, we are provided with output scale and zero point. The + * computation looks like this * * Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input) - * */ Expr RequantizeForwardRewrite(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 5bbfbd11fa79..c3d0367a81e7 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -33,7 +33,7 @@ namespace tvm { namespace relay { namespace qnn { -inline bool IsQNNDataType(const DataType& dtype) { +static inline bool IsQNNDataType(const DataType& dtype) { return dtype == Int(8) || dtype == UInt(8) || dtype == Int(16) || dtype == UInt(16); } @@ -44,7 +44,7 @@ enum class QuantizeOpType { Requantize }; -inline bool IsValidOpInputType(const QuantizeOpType& op_type, +static inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: @@ -58,7 +58,7 @@ inline bool IsValidOpInputType(const QuantizeOpType& op_type, } } -inline bool IsValidOpOutputType(const QuantizeOpType& op_type, +static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: @@ -70,7 +70,7 @@ inline bool IsValidOpOutputType(const QuantizeOpType& op_type, } } -inline const int32_t GetQmin(const DataType& dtype) { +static inline const int32_t GetQmin(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { @@ -86,7 +86,7 @@ inline const int32_t GetQmin(const DataType& dtype) { return -1; } -inline const int32_t GetQmax(const DataType& dtype) { +static inline const int32_t GetQmax(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) << "QNN ops support uint32/int32 or lower precision"; if (dtype.is_int()) { From 99483c24ed7eb436dee82b9fdb1e3845b5fd1cbf Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 20:49:53 +0000 Subject: [PATCH 27/37] More documentation fixes. --- include/tvm/relay/qnn/attrs.h | 14 ++++++++------ python/tvm/relay/qnn/__init__.py | 2 +- python/tvm/relay/qnn/op/qnn.py | 4 ++-- src/relay/qnn/pass/qnn_lower.cc | 6 ++++-- .../test_qnn_requantize.py} | 1 - 5 files changed, 15 insertions(+), 12 deletions(-) rename tests/python/{unittest/test_qnn_ops.py => relay/test_qnn_requantize.py} (99%) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index be5d3ac08abd..e98357291dff 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -53,12 +53,14 @@ struct RequantizeAttrs : public tvm::AttrsNode { .describe("Defines the rounding direction when the value is midway between" "two representable values. There are two supported modes - UPWARD" "or AWAY_FROM_ZERO. Both modes behave exactly same except at the" - "midpoints between the two representable values. At midpoint, UPWARD" - "rounds towards positive infinity (for example -1.5 will be rounded" - "to -1). 
AWAY_FROM_ZERO is the standard rounding where the value" - "is rounded away from zero at midpoints (for example, -1.5 rounds to" - "-2). More context can be found at" - "https://www.gnu.org/software/libc/manual/html_node/Rounding.html"); + "midpoints between the two representable values. At the midpoint," + "UPWARD rounds towards positive infinity (for example -1.5 will be" + "rounded to -1). AWAY_FROM_ZERO is the standard rounding where the" + "value is rounded away from zero at midpoints (for example, -1.5" + "rounds to -2). More context can be found at following gblic manual" + "https://www.gnu.org/software/libc/manual/html_node/Rounding.html." + "FE_UPWARD corresponds to UPWARD here and FE_TONEAREST corresponds" + "to AWAY_FROM_ZERO rounding mode."); TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py index 409e088156b8..de932c71c67d 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=wildcard-import -"""Neural network related operators.""" +"""QNN dialect operators and ir passes.""" from __future__ import absolute_import as _abs from . import op from . import transform diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 88e13f2c358e..f0d120cc1901 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. #pylint: disable=invalid-name, too-many-lines -"""Neural network operations.""" +"""QNN dialect operators.""" from __future__ import absolute_import as _abs from . import _make @@ -56,7 +56,7 @@ def requantize(data, representable values. out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. + Specifies the output data type. Returns ------- diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index db380354162d..867d5502e3a3 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -20,7 +20,7 @@ /*! * Copyright (c) 2019 by Contributors * \file qnn_lower.cc - * \brief Lower qnn ops to a sequence of exisiting Relay ops. + * \brief Lower qnn ops to a sequence of existing Relay ops. */ #include @@ -145,7 +145,9 @@ Expr RequantizeLower(const Expr& input_tensor, // If idtype is Int(32), the scalar is a fixed point value of int32 where the // decimal point is between bits 31 and 30. After multiplying with // input_tensor, the result is in int64 where the decimal point is sitting - // between bits 31 and 30 (from the right, rightmost bit is bit 0). + // between bits 31 and 30 (from the right, rightmost bit is bit 0). The + // computation is performed in higher precision to avoid overflow in + // multiplying two int32 values. 
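+  // For example, with a scale ratio of 1/16, GetFixedPointMultiplierShift
+  // returns (2^30, -3): the tensor is multiplied by 2^30 (0.5 in Q31) and
+  // later right shifted by 3 + 31 = 34 bits, which is exactly a divide by 16.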
Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); diff --git a/tests/python/unittest/test_qnn_ops.py b/tests/python/relay/test_qnn_requantize.py similarity index 99% rename from tests/python/unittest/test_qnn_ops.py rename to tests/python/relay/test_qnn_requantize.py index cd4b048719ea..e901fd7ac0e1 100644 --- a/tests/python/unittest/test_qnn_ops.py +++ b/tests/python/relay/test_qnn_requantize.py @@ -60,7 +60,6 @@ def get_mod(data_shape, data_dtype, out_dtype, input_scale, output_scale, mod = relay.qnn.transform.QnnLower()(mod) return mod - def same_scale_test(): # Have same scales, everything within range golden_data = np.arange(-100, 100, 1).astype('int32') From f8439e6e6404fed80fabdd16736e0668ebb70839 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:00:29 +0000 Subject: [PATCH 28/37] Adding out dtype check for requantize. --- src/relay/qnn/op/requantize.cc | 4 +++- src/relay/qnn/util.h | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index e8978aa03147..ae9c874dae19 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -50,11 +50,13 @@ bool RequantizeRel(const Array& types, const auto* data = types[0].as(); const auto input_dtype = data->dtype; CHECK(IsValidOpInputType(QuantizeOpType::Requantize, input_dtype)) - << "Input type should be a quantized type (u)int8 or (u)int16 but was " << input_dtype; + << "Input type should be an integer but was " << input_dtype; const Array oshape = data->shape; // assign output type const RequantizeAttrs* param = attrs.as(); + CHECK(IsValidOpOutputType(QuantizeOpType::Requantize, param->out_dtype)) + << "Output type should be an integer but was " << param->out_dtype; reporter->Assign(types[1], TensorTypeNode::make(oshape, param->out_dtype)); return true; } diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index c3d0367a81e7..09f1c543d2dc 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -59,12 +59,14 @@ static inline bool IsValidOpInputType(const QuantizeOpType& op_type, } static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, - const DataType& in_dtype) { + const DataType& out_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return IsQNNDataType(in_dtype); + return IsQNNDataType(out_dtype); case QuantizeOpType::Dequantize: - return in_dtype == Float(32); + return out_dtype == Float(32); + case QuantizeOpType::Requantize: + return out_dtype.is_int() || out_dtype.is_uint(); default: return false; } From e756843dee2b8982c166a3b4d7dfe0513da02a4b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:05:18 +0000 Subject: [PATCH 29/37] Adding corner case for FP32 to fixed point conversion. --- src/relay/qnn/pass/qnn_lower.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 867d5502e3a3..7d5969054a36 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -67,6 +67,11 @@ using runtime::TypedPackedFunc; std::pair GetFixedPointMultiplierShift(double double_multiplier, const DataType& idtype) { int significand, exponent; + if (double_multiplier == 0.) 
{ + significand = 0; + exponent = 0; + return std::pair(significand, exponent); + } int idtype_bits = idtype.bits(); // Get the significand (significand) and exponent (exponent) From 5d7938f2dcc3b156a522ed86d4c18dbfa802d16b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:07:41 +0000 Subject: [PATCH 30/37] Adding extra line. --- python/tvm/relay/qnn/op/qnn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index f0d120cc1901..b961dc12c8e3 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=invalid-name, too-many-lines +#pylint: disable=invalid-name + """QNN dialect operators.""" from __future__ import absolute_import as _abs from . import _make From 10ce99d038ec3d537cc312025b8550568264b09f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 22 Jul 2019 21:22:46 +0000 Subject: [PATCH 31/37] Documentation fix. --- python/tvm/relay/qnn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py index de932c71c67d..fa888d7ce7dd 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=wildcard-import -"""QNN dialect operators and ir passes.""" +"""QNN dialect operators and IR passes.""" from __future__ import absolute_import as _abs from . import op from . import transform From f2e09d1fcdb27c83dee0ddb1cc578dc5bb2a5c7d Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 23 Jul 2019 02:42:04 +0000 Subject: [PATCH 32/37] Adding static inline. --- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/pass/pattern_util.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index b961dc12c8e3..65106c770862 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. #pylint: disable=invalid-name - """QNN dialect operators.""" + from __future__ import absolute_import as _abs from . import _make diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 4f9b11eb925c..6970c885215c 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -394,17 +394,17 @@ inline Expr Variance(Expr data, Expr mean, Array axis, bool keepdims, b } -inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) { +static inline Expr Where(const Expr& condition, const Expr& x, const Expr& y) { static const Op& op = Op::Get("where"); return CallNode::make(op, {condition, x, y}); } -inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { +static inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { static const Op& op = Op::Get("greater_equal"); return CallNode::make(op, {lhs, rhs}, Attrs(), {}); } -inline Expr Full(Expr fill_value, +static inline Expr Full(Expr fill_value, Array shape, DataType dtype) { auto attrs = make_node(); From 65c0b463451ad869e9b3881c9adfb29cacd13c2b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 24 Jul 2019 21:34:58 +0000 Subject: [PATCH 33/37] Incorporating jackwish comment. 
Removed idtype from requantize lowering. --- include/tvm/relay/qnn/attrs.h | 10 +- python/tvm/relay/qnn/op/qnn.py | 2 +- src/relay/qnn/pass/qnn_lower.cc | 117 +++++++++------------- src/relay/qnn/util.h | 27 ++--- tests/python/relay/test_qnn_requantize.py | 4 +- 5 files changed, 72 insertions(+), 88 deletions(-) diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index e98357291dff..e99602813229 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -49,18 +49,16 @@ struct RequantizeAttrs : public tvm::AttrsNode { .describe("The scale of the output tensor."); TVM_ATTR_FIELD(output_zero_point) .describe("The zero point of the output tensor."); - TVM_ATTR_FIELD(rounding).set_default("AWAY_FROM_ZERO") + TVM_ATTR_FIELD(rounding).set_default("TONEAREST") .describe("Defines the rounding direction when the value is midway between" "two representable values. There are two supported modes - UPWARD" - "or AWAY_FROM_ZERO. Both modes behave exactly same except at the" + "or TONEAREST. Both modes behave exactly same except at the" "midpoints between the two representable values. At the midpoint," "UPWARD rounds towards positive infinity (for example -1.5 will be" - "rounded to -1). AWAY_FROM_ZERO is the standard rounding where the" + "rounded to -1). TONEAREST is the standard rounding where the" "value is rounded away from zero at midpoints (for example, -1.5" "rounds to -2). More context can be found at following gblic manual" - "https://www.gnu.org/software/libc/manual/html_node/Rounding.html." - "FE_UPWARD corresponds to UPWARD here and FE_TONEAREST corresponds" - "to AWAY_FROM_ZERO rounding mode."); + "https://www.gnu.org/software/libc/manual/html_node/Rounding.html."); TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 65106c770862..78ae724af71e 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -25,7 +25,7 @@ def requantize(data, input_zero_point, output_scale, output_zero_point, - rounding="AWAY_FROM_ZERO", + rounding="TONEAREST", out_dtype="int8"): r"""Requantized operator. diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 7d5969054a36..2c5c951d8163 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -47,7 +47,6 @@ using runtime::TypedPackedFunc; /* * \brief Convert FP32 representation into fixed point representation. * \param double_multplier The input FP32 number. - * \param idtype The input datatype. * \return The pair of multiplier and shift for fixed point representation. * \note Converts a floating point number so that it can be represented by * integers. The representation is @@ -64,37 +63,37 @@ using runtime::TypedPackedFunc; * * Credit to TFLite reference implementation. */ -std::pair GetFixedPointMultiplierShift(double double_multiplier, - const DataType& idtype) { - int significand, exponent; +std::pair GetFixedPointMultiplierShift( + double double_multiplier) { + int32_t significand, exponent; if (double_multiplier == 0.) { significand = 0; exponent = 0; - return std::pair(significand, exponent); + return std::make_pair(significand, exponent); } - int idtype_bits = idtype.bits(); - // Get the significand (significand) and exponent (exponent) + // Get the significand and exponent. 
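+  // For a positive input such as this scale ratio, std::frexp returns a
+  // fraction in [0.5, 1) and an exponent such that
+  // double_multiplier == fraction * 2^exponent.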
double significand_d = std::frexp(double_multiplier, &exponent); - // Convert the double significand to int significand. - significand_d = std::round(significand_d * (1ll << (idtype_bits - 1))); + // Convert the double significand to int significand, i.e., convert into a + // integer where the decimal point is between bit 31 and 30. This is done by + // multiplying the double value with 2^31 and then casting to int. + significand_d = std::round(significand_d * (1ll << 31)); auto significand_int64 = static_cast(significand_d); - CHECK_LE(significand_int64, (1ll << (idtype_bits - 1))); - if (significand_int64 == (1ll << (idtype_bits - 1))) { + CHECK_LE(significand_int64, (1ll << 31)); + if (significand_int64 == (1ll << 31)) { significand_int64 /= 2; ++exponent; } - CHECK_LE(significand_int64, std::numeric_limits::max()); - significand = static_cast(significand_int64); - return std::pair(significand, exponent); + CHECK_LE(significand_int64, std::numeric_limits::max()); + significand = static_cast(significand_int64); + return std::make_pair(significand, exponent); } /* * \brief Lower requantize to a sequence of ops. * \param input_tensor The input tensor to requantize op. * \param param The requantize op attrs. - * \param idtype The dtype of the input tensor. * \param out_shape The output shape of the requantize op. * \return The sequence of existing Relay ops. * \note Requantization using only integer computation. Here, the computation is @@ -117,63 +116,59 @@ std::pair GetFixedPointMultiplierShift(double double_multiplier, * 6) Add the output_zero_point. * 7) Cast to the out_dtype. */ -Expr RequantizeLower(const Expr& input_tensor, - const RequantizeAttrs* param, const DataType& idtype, - const Array& out_shape) { - +Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, + const Array& out_shape) { double double_multiplier = param->input_scale/param->output_scale; - // The multiplication will be performed in higher precision. Find the dtype. - int idtype_bits = idtype.bits(); - DataType up_idtype = Int(2 * idtype_bits); + // Choose high precision datatype to be int64. This is for avoiding overflow + // in multiplication of two int32 values. + DataType hp_dtype = Int(64); // 1) Calculating the integer multiplier and integer shift - std::pair fixed_point_params = - GetFixedPointMultiplierShift(double_multiplier, idtype); - int fixed_point_multiplier = fixed_point_params.first; - int shift = fixed_point_params.second; + int32_t fixed_point_multiplier, shift; + std::tie(fixed_point_multiplier, shift) = + GetFixedPointMultiplierShift(double_multiplier); int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 0 : -shift; // 2) Subtract the input_zero_point - auto tensor = Cast(input_tensor, up_idtype); + auto tensor = Cast(input_tensor, hp_dtype); if (param->input_zero_point != 0) { - auto input_zp = MakeConstantScalar(up_idtype, param->input_zero_point); + auto input_zp = MakeConstantScalar(hp_dtype, param->input_zero_point); tensor = Subtract(tensor, input_zp); } // 3) Multiply the integer multiplier if (left_shift != 0) { - tensor = Multiply(tensor, MakeConstantScalar(up_idtype, 1 << left_shift)); + tensor = Multiply(tensor, MakeConstantScalar(hp_dtype, 1 << left_shift)); } // Perform the multiplication in higher precision. - // If idtype is Int(32), the scalar is a fixed point value of int32 where the - // decimal point is between bits 31 and 30. 
After multiplying with - // input_tensor, the result is in int64 where the decimal point is sitting - // between bits 31 and 30 (from the right, rightmost bit is bit 0). The - // computation is performed in higher precision to avoid overflow in - // multiplying two int32 values. - Expr scalar = MakeConstantScalar(up_idtype, fixed_point_multiplier); + // The scalar is a fixed point value of int32 where the decimal point is + // between bits 31 and 30. After multiplying with input_tensor, the result is + // in int64 where the decimal point is sitting between bits 31 and 30 (from + // the right, rightmost bit is bit 0). The computation is performed in higher + // precision to avoid overflow in multiplying two int32 values. + Expr scalar = MakeConstantScalar(hp_dtype, fixed_point_multiplier); auto multiplied_t = Multiply(tensor, scalar); // 4) Find the rounding scalar. This depends on where the final decimal point // sits. As we will be right shifting the multiplied_t, we need to first // calculate the total_right_shift. - int total_right_shift = right_shift + idtype_bits - 1; + int total_right_shift = right_shift + 31; + int64_t pos_rounding_value = (1ll << (total_right_shift - 1)); tensor = multiplied_t; Expr round_scalar; if (param->rounding == "UPWARD") { - auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); - round_scalar = pos_rounder; - } else if (param->rounding == "AWAY_FROM_ZERO") { - auto pos_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1))); - auto neg_rounder = MakeConstantScalar(up_idtype, (1ll << (total_right_shift - 1)) - 1); - auto pos_rounder_t = Full(pos_rounder, out_shape, up_idtype); - auto neg_rounder_t = Full(neg_rounder, out_shape, up_idtype); - - auto zero = MakeConstantScalar(up_idtype, 0); - auto zero_t = Full(zero, out_shape, up_idtype); + round_scalar = MakeConstantScalar(hp_dtype, pos_rounding_value); + } else if (param->rounding == "TONEAREST") { + auto pos_rounder = MakeConstantScalar(hp_dtype, pos_rounding_value); + auto neg_rounder = MakeConstantScalar(hp_dtype, pos_rounding_value - 1); + auto pos_rounder_t = Full(pos_rounder, out_shape, hp_dtype); + auto neg_rounder_t = Full(neg_rounder, out_shape, hp_dtype); + + auto zero = MakeConstantScalar(hp_dtype, 0); + auto zero_t = Full(zero, out_shape, hp_dtype); round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, neg_rounder_t); } @@ -182,19 +177,15 @@ Expr RequantizeLower(const Expr& input_tensor, // 5) Simply right shift the result to get the final output. auto scaled_int64_t = RightShift(tensor, - MakeConstantScalar(up_idtype, total_right_shift)); + MakeConstantScalar(hp_dtype, total_right_shift)); // 6) Add the output zero point. - auto output_zp = MakeConstantScalar(up_idtype, param->output_zero_point); + auto output_zp = MakeConstantScalar(hp_dtype, param->output_zero_point); auto shifted_int64_t = Add(output_zp, scaled_int64_t); // 7) Clip to the out_dtype min/max. - // Find the right clip min/maxes. While clipping, it is necessary that - // clip_min and clip_max are within the dtype range of the input tensor to the - // clip operator. For example, if the input to clip operator is int8, but the - // out_dtype is uint8, we will get incorrect results, if we set max as 255. 
- auto q_min = std::max(GetQmin(param->out_dtype), GetQmin(idtype)); - auto q_max = std::min(GetQmax(param->out_dtype), GetQmax(idtype)); + auto q_min = GetQmin(param->out_dtype); + auto q_max = GetQmax(param->out_dtype); auto clipped_t = Clip(shifted_int64_t, q_min, q_max); auto requantized_output = Cast(clipped_t, param->out_dtype); return requantized_output; @@ -221,25 +212,17 @@ Expr RequantizeForwardRewrite(const Call& ref_call, CHECK(param != nullptr); // Find output shape. - Array out_shape; auto ref_call_t = ref_call->checked_type(); auto output_tt = ref_call_t.as(); CHECK(output_tt != nullptr) << "Type information missing." << " Please run infer_type pass."; - out_shape = output_tt->shape; - - // Find input dtype. - auto ref_input_t = ref_call->args[0]->checked_type(); - auto input_tt = ref_input_t.as(); - CHECK(input_tt != nullptr) << "Type information missing." - << " Please run infer_type pass."; - const auto input_dtype = input_tt->dtype; + Array out_shape = output_tt->shape; // Check rounding validity. - CHECK(param->rounding == "UPWARD" || param->rounding == "AWAY_FROM_ZERO") + CHECK(param->rounding == "UPWARD" || param->rounding == "TONEAREST") << "QNN requantize supports two rounding modes - UPWARD and " - << "AWAY_FROM_ZERO"; - return RequantizeLower(quantized_data, param, input_dtype, out_shape); + << "TONEAREST"; + return RequantizeLower(quantized_data, param, out_shape); } RELAY_REGISTER_OP("qnn.requantize") diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index 09f1c543d2dc..ba6714628980 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -35,7 +35,8 @@ namespace qnn { static inline bool IsQNNDataType(const DataType& dtype) { return dtype == Int(8) || dtype == UInt(8) - || dtype == Int(16) || dtype == UInt(16); + || dtype == Int(16) || dtype == UInt(16) + || dtype == Int(32); } enum class QuantizeOpType { @@ -48,11 +49,11 @@ static inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { case QuantizeOpType::Quantize: - return in_dtype == Float(32) || IsQNNDataType(in_dtype); + return in_dtype == Float(32); case QuantizeOpType::Dequantize: return IsQNNDataType(in_dtype); case QuantizeOpType::Requantize: - return in_dtype.is_int() || in_dtype.is_uint(); + return IsQNNDataType(in_dtype); default: return false; } @@ -66,15 +67,15 @@ static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, case QuantizeOpType::Dequantize: return out_dtype == Float(32); case QuantizeOpType::Requantize: - return out_dtype.is_int() || out_dtype.is_uint(); + return IsQNNDataType(out_dtype); default: return false; } } static inline const int32_t GetQmin(const DataType& dtype) { - CHECK_LE(dtype.bits(), 32) - << "QNN ops support uint32/int32 or lower precision"; + CHECK(IsQNNDataType(dtype)) + << "QNN ops support int32 or lower precision"; if (dtype.is_int()) { auto* min_value = as_const_int(dtype.min()); CHECK(min_value != nullptr); @@ -83,14 +84,15 @@ static inline const int32_t GetQmin(const DataType& dtype) { auto* min_value = as_const_uint(dtype.min()); CHECK(min_value != nullptr); return static_cast(min_value[0]); + } else { + LOG(FATAL) << "Type not supported " << dtype; + return -1; // To hide the warning } - LOG(FATAL) << "Type not supported " << dtype; - return -1; } static inline const int32_t GetQmax(const DataType& dtype) { - CHECK_LE(dtype.bits(), 32) - << "QNN ops support uint32/int32 or lower precision"; + CHECK(IsQNNDataType(dtype)) + << "QNN ops support int32 or lower precision"; if 
(dtype.is_int()) { auto* max_value = as_const_int(dtype.max()); CHECK(max_value != nullptr); @@ -99,9 +101,10 @@ static inline const int32_t GetQmax(const DataType& dtype) { auto* max_value = as_const_uint(dtype.max()); CHECK(max_value != nullptr); return static_cast(max_value[0]); + } else { + LOG(FATAL) << "Type not supported " << dtype; + return -1; // To hide the warning } - LOG(FATAL) << "Type not supported " << dtype; - return -1; } } // namespace qnn diff --git a/tests/python/relay/test_qnn_requantize.py b/tests/python/relay/test_qnn_requantize.py index e901fd7ac0e1..3925c1e5d573 100644 --- a/tests/python/relay/test_qnn_requantize.py +++ b/tests/python/relay/test_qnn_requantize.py @@ -21,7 +21,7 @@ from tvm.relay.testing import create_workload from tvm.contrib import graph_runtime -roundings = ["UPWARD", "AWAY_FROM_ZERO"] +roundings = ["UPWARD", "TONEAREST"] def run_infer_type(expr): mod = relay.Module.from_expr(expr) @@ -43,7 +43,7 @@ def verify(mod, goldens): np.testing.assert_equal(res, golden_output) def get_mod(data_shape, data_dtype, out_dtype, input_scale, output_scale, - input_zero_point=0, output_zero_point=0, rounding="AWAY_FROM_ZERO"): + input_zero_point=0, output_zero_point=0, rounding="TONEAREST"): quantized_data = relay.var("quantized_data", shape=data_shape, dtype=data_dtype) mod = relay.qnn.op.requantize( From 8d2c3adec2e36895faae8781836681a5f4a25c39 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 26 Jul 2019 20:28:47 +0000 Subject: [PATCH 34/37] Removing Quantize/Dequantize code. Restricting Requantize to (u)int8/int32. --- src/relay/qnn/util.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index ba6714628980..c7810943b640 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -33,12 +33,6 @@ namespace tvm { namespace relay { namespace qnn { -static inline bool IsQNNDataType(const DataType& dtype) { - return dtype == Int(8) || dtype == UInt(8) - || dtype == Int(16) || dtype == UInt(16) - || dtype == Int(32); -} - enum class QuantizeOpType { Quantize, Dequantize, @@ -48,12 +42,8 @@ enum class QuantizeOpType { static inline bool IsValidOpInputType(const QuantizeOpType& op_type, const DataType& in_dtype) { switch (op_type) { - case QuantizeOpType::Quantize: - return in_dtype == Float(32); - case QuantizeOpType::Dequantize: - return IsQNNDataType(in_dtype); case QuantizeOpType::Requantize: - return IsQNNDataType(in_dtype); + return in_dtype == Int(8) || in_dtype == UInt(8) || in_dtype == Int(32); default: return false; } @@ -62,19 +52,15 @@ static inline bool IsValidOpInputType(const QuantizeOpType& op_type, static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, const DataType& out_dtype) { switch (op_type) { - case QuantizeOpType::Quantize: - return IsQNNDataType(out_dtype); - case QuantizeOpType::Dequantize: - return out_dtype == Float(32); case QuantizeOpType::Requantize: - return IsQNNDataType(out_dtype); + return out_dtype == Int(8) || out_dtype == UInt(8) || out_dtype == Int(32); default: return false; } } static inline const int32_t GetQmin(const DataType& dtype) { - CHECK(IsQNNDataType(dtype)) + CHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; if (dtype.is_int()) { auto* min_value = as_const_int(dtype.min()); @@ -91,7 +77,7 @@ static inline const int32_t GetQmin(const DataType& dtype) { } static inline const int32_t GetQmax(const DataType& dtype) { - CHECK(IsQNNDataType(dtype)) + CHECK_LE(dtype.bits(), 32) << "QNN ops 
support int32 or lower precision"; if (dtype.is_int()) { auto* max_value = as_const_int(dtype.max()); From 2d15b54a7aec9d2f436449dff6d870e3f225e870 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 29 Jul 2019 18:04:43 +0000 Subject: [PATCH 35/37] Style fixes. --- python/tvm/relay/qnn/op/qnn.py | 8 ++++---- src/relay/pass/pattern_util.h | 4 ++-- src/relay/qnn/op/requantize.cc | 13 +++++++------ src/relay/qnn/pass/qnn_lower.cc | 9 ++++----- src/relay/qnn/util.h | 26 -------------------------- 5 files changed, 17 insertions(+), 43 deletions(-) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 78ae724af71e..1717bc42fe94 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -41,16 +41,16 @@ def requantize(data, The input data to the operator. input_scale: float - The quantization scale for the input tensor. + The quantization scale for the input tensor. input_zero_point: int - The zero point of the input tensor. + The zero point of the input tensor. output_scale: float - The quantization scale for the output tensor. + The quantization scale for the output tensor. output_zero_point: int - The zero point of the output tensor. + The zero point of the output tensor. rounding : string, optional Defines the rounding direction when the value is midway between two diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 6970c885215c..3ccfff0c3463 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -405,8 +405,8 @@ static inline Expr GreaterEqual(const Expr& lhs, const Expr& rhs) { } static inline Expr Full(Expr fill_value, - Array shape, - DataType dtype) { + Array shape, + DataType dtype) { auto attrs = make_node(); attrs->shape = std::move(shape); attrs->dtype = std::move(dtype); diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index ae9c874dae19..2e78d20721be 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -48,16 +48,17 @@ bool RequantizeRel(const Array& types, const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); - const auto input_dtype = data->dtype; - CHECK(IsValidOpInputType(QuantizeOpType::Requantize, input_dtype)) - << "Input type should be an integer but was " << input_dtype; + const auto in_dtype = data->dtype; + CHECK(in_dtype == Int(8) || in_dtype == UInt(8) || in_dtype == Int(32)) + << "Input type should be an integer but was " << in_dtype; const Array oshape = data->shape; // assign output type const RequantizeAttrs* param = attrs.as(); - CHECK(IsValidOpOutputType(QuantizeOpType::Requantize, param->out_dtype)) - << "Output type should be an integer but was " << param->out_dtype; - reporter->Assign(types[1], TensorTypeNode::make(oshape, param->out_dtype)); + auto out_dtype = param->out_dtype; + CHECK(out_dtype == Int(8) || out_dtype == UInt(8) || out_dtype == Int(32)) + << "Output type should be an integer but was " << out_dtype; + reporter->Assign(types[1], TensorTypeNode::make(oshape, out_dtype)); return true; } diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc index 2c5c951d8163..5ac6f4b7bda4 100644 --- a/src/relay/qnn/pass/qnn_lower.cc +++ b/src/relay/qnn/pass/qnn_lower.cc @@ -109,11 +109,11 @@ std::pair GetFixedPointMultiplierShift( * * The whole computation this can be broken down into following steps * 1) Calculate the integer multiplier and integer shift. - * 2) Subtract the input integer point. 
- * 3) Multiply the integer fixed point multiplier with quantized tensor. + * 2) Subtract the input integer zero point. + * 3) Multiply the fixed point multiplier with quantized tensor. * 4) Round the result. * 5) Right shift the result. - * 6) Add the output_zero_point. + * 6) Add the output zero point. * 7) Cast to the out_dtype. */ Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, @@ -187,8 +187,7 @@ Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param, auto q_min = GetQmin(param->out_dtype); auto q_max = GetQmax(param->out_dtype); auto clipped_t = Clip(shifted_int64_t, q_min, q_max); - auto requantized_output = Cast(clipped_t, param->out_dtype); - return requantized_output; + return Cast(clipped_t, param->out_dtype); } /* diff --git a/src/relay/qnn/util.h b/src/relay/qnn/util.h index c7810943b640..1ada7ecd070e 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/util.h @@ -33,32 +33,6 @@ namespace tvm { namespace relay { namespace qnn { -enum class QuantizeOpType { - Quantize, - Dequantize, - Requantize -}; - -static inline bool IsValidOpInputType(const QuantizeOpType& op_type, - const DataType& in_dtype) { - switch (op_type) { - case QuantizeOpType::Requantize: - return in_dtype == Int(8) || in_dtype == UInt(8) || in_dtype == Int(32); - default: - return false; - } -} - -static inline bool IsValidOpOutputType(const QuantizeOpType& op_type, - const DataType& out_dtype) { - switch (op_type) { - case QuantizeOpType::Requantize: - return out_dtype == Int(8) || out_dtype == UInt(8) || out_dtype == Int(32); - default: - return false; - } -} - static inline const int32_t GetQmin(const DataType& dtype) { CHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; From ff17a91e61feee436a3d828ea9bc06fdf44169f1 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 2 Aug 2019 17:21:22 +0000 Subject: [PATCH 36/37] Fix the docs. --- docs/langref/relay_op.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 269895293f98..6950ecceee05 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -202,9 +202,9 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.contrib.adaptive_avg_pool2d -**Level 11: QNN Dialect Operators** +**Level 11: Dialect Operators** -This level supports quantized operators present in the QNN dialect. +This level supports dialect operators. .. autosummary:: :nosignatures: From c46b56c5031dcbf37d4d4ace38dc2dc176cadfd6 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 6 Aug 2019 23:29:32 +0000 Subject: [PATCH 37/37] Move to Legalize API. --- python/tvm/relay/qnn/__init__.py | 1 - python/tvm/relay/qnn/_transform.py | 22 -- python/tvm/relay/qnn/transform.py | 33 --- src/relay/qnn/op/requantize.cc | 202 +++++++++++++++-- src/relay/qnn/pass/qnn_lower.cc | 253 ---------------------- tests/python/relay/test_qnn_requantize.py | 2 +- 6 files changed, 189 insertions(+), 324 deletions(-) delete mode 100644 python/tvm/relay/qnn/_transform.py delete mode 100644 python/tvm/relay/qnn/transform.py delete mode 100644 src/relay/qnn/pass/qnn_lower.cc diff --git a/python/tvm/relay/qnn/__init__.py b/python/tvm/relay/qnn/__init__.py index fa888d7ce7dd..a472109add39 100644 --- a/python/tvm/relay/qnn/__init__.py +++ b/python/tvm/relay/qnn/__init__.py @@ -18,4 +18,3 @@ """QNN dialect operators and IR passes.""" from __future__ import absolute_import as _abs from . import op -from . 
diff --git a/python/tvm/relay/qnn/_transform.py b/python/tvm/relay/qnn/_transform.py
deleted file mode 100644
index e2ff6f9ed652..000000000000
--- a/python/tvm/relay/qnn/_transform.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#pylint: disable=unused-argument
-"""Internal module for quantization."""
-from __future__ import absolute_import
-from tvm._ffi.function import _init_api
-
-_init_api("relay.qnn._transform", __name__)
diff --git a/python/tvm/relay/qnn/transform.py b/python/tvm/relay/qnn/transform.py
deleted file mode 100644
index 6ca456b4fb81..000000000000
--- a/python/tvm/relay/qnn/transform.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-
-"""QNN Dialect transformation passes."""
-from __future__ import absolute_import
-
-from . import _transform
-
-def QnnLower():
-    """
-    Rewrites the high-level quantized ops into low-level exisiting Relay ops.
-
-    Returns
-    -------
-    Pass : tvm.relay.transform.Pass
-        The optmized pas.
-    """
-    return _transform.QnnLower()
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index 2e78d20721be..04f7e80d5c64 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -23,9 +23,10 @@
  * \brief QNN requantize operator.
  */

-#include
 #include
+#include
 #include
+#include "../../pass/pattern_util.h"
 #include "../util.h"

 namespace tvm {
 namespace relay {
 namespace qnn {

 TVM_REGISTER_NODE_TYPE(RequantizeAttrs);

@@ -34,6 +35,185 @@
+// Lowering of qnn.requantize op
+
+/*
+ * \brief Convert FP32 representation into fixed point representation.
+ * \param double_multiplier The input FP32 number.
+ * \return The pair of multiplier and shift for fixed point representation.
+ * \note Converts a floating point number so that it can be represented by
+ *       integers. The representation is
+ *       float_number = (significand) * 2^(exponent)
+ *
+ *       The significand is a number between 0.5 and 1. This is represented by
+ *       an integer number.
+ *       For example, if it is int32, then the decimal point
+ *       exists between bit 31 and 30 from LSB (or between the first and
+ *       second bit from the left).
+ *
+ *       Some examples are
+ *       0.25 = (0.5) * 2^(-1)
+ *       0.125 = (0.5) * 2^(-2)
+ *
+ *       Credit to TFLite reference implementation.
+ */
+std::pair<int32_t, int32_t> GetFixedPointMultiplierShift(double double_multiplier) {
+  int32_t significand, exponent;
+  if (double_multiplier == 0.) {
+    significand = 0;
+    exponent = 0;
+    return std::make_pair(significand, exponent);
+  }
+
+  // Get the significand and exponent.
+  double significand_d = std::frexp(double_multiplier, &exponent);
+
+  // Convert the double significand to an int significand, i.e., convert into
+  // an integer where the decimal point is between bit 31 and 30. This is done
+  // by multiplying the double value with 2^31 and then casting to int.
+  significand_d = std::round(significand_d * (1ll << 31));
+  auto significand_int64 = static_cast<int64_t>(significand_d);
+  CHECK_LE(significand_int64, (1ll << 31));
+  if (significand_int64 == (1ll << 31)) {
+    significand_int64 /= 2;
+    ++exponent;
+  }
+  CHECK_LE(significand_int64, std::numeric_limits<int32_t>::max());
+  significand = static_cast<int32_t>(significand_int64);
+  return std::make_pair(significand, exponent);
+}
+
+/*
+ * \brief Lower requantize to a sequence of ops.
+ * \param input_tensor The input tensor to requantize op.
+ * \param param The requantize op attrs.
+ * \param input_shape The input tensor shape of the requantize op.
+ * \return The sequence of existing Relay ops.
+ * \note Requantization using only integer computation. Here, the computation
+ *       is converted to a fixed point computation by computing an output
+ *       multiplier and shift. This is useful if the target device does not
+ *       support floating point, or if floating point computations are very
+ *       expensive on it.
+ *
+ *       The original computation is scale_fp32 * quantized_tensor. To convert
+ *       it into an integer computation, the multiplication with the fp32
+ *       scalar can be replaced by a multiplication with an int value followed
+ *       by a right shift of the result. This approximates the floating point
+ *       computation with a fixed point computation.
+ *
+ *       The whole computation can be broken down into the following steps
+ *       1) Calculate the integer multiplier and integer shift.
+ *       2) Subtract the input integer zero point.
+ *       3) Multiply the fixed point multiplier with the quantized tensor.
+ *       4) Round the result.
+ *       5) Right shift the result.
+ *       6) Add the output zero point.
+ *       7) Cast to the out_dtype.
+ */
+Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param,
+                     const Array<IndexExpr>& input_shape) {
+  double double_multiplier = param->input_scale / param->output_scale;
+
+  // Choose high precision datatype to be int64. This is for avoiding overflow
+  // in multiplication of two int32 values.
+  DataType hp_dtype = Int(64);
+
+  // 1) Calculating the integer multiplier and integer shift
+  int32_t fixed_point_multiplier, shift;
+  std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(double_multiplier);
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+
+  // 2) Subtract the input_zero_point
+  auto tensor = Cast(input_tensor, hp_dtype);
+  if (param->input_zero_point != 0) {
+    auto input_zp = MakeConstantScalar(hp_dtype, param->input_zero_point);
+    tensor = Subtract(tensor, input_zp);
+  }
+
+  // 3) Multiply the integer multiplier
+  if (left_shift != 0) {
+    tensor = Multiply(tensor, MakeConstantScalar(hp_dtype, 1 << left_shift));
+  }
+  // Perform the multiplication in higher precision.
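+  // For example (illustrative, editor-added annotation): with
+  // double_multiplier = 0.25, frexp gives a significand of 0.5 and an
+  // exponent of -1, so fixed_point_multiplier is 2^30 and right_shift is 1.
+  // The tensor is multiplied by 2^30 below and right shifted by
+  // 31 + 1 = 32 bits later, approximating a multiply by 0.25.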
+  // The scalar is a fixed point value of int32 where the decimal point is
+  // between bits 31 and 30. After multiplying with input_tensor, the result
+  // is in int64 where the decimal point is sitting between bits 31 and 30
+  // (from the right, rightmost bit is bit 0). The computation is performed in
+  // higher precision to avoid overflow in multiplying two int32 values.
+  Expr scalar = MakeConstantScalar(hp_dtype, fixed_point_multiplier);
+  auto multiplied_t = Multiply(tensor, scalar);
+
+  // 4) Find the rounding scalar. This depends on where the final decimal
+  // point sits. As we will be right shifting the multiplied_t, we need to
+  // first calculate the total_right_shift.
+  int total_right_shift = right_shift + 31;
+  int64_t pos_rounding_value = (1ll << (total_right_shift - 1));
+
+  tensor = multiplied_t;
+  Expr round_scalar;
+  if (param->rounding == "UPWARD") {
+    round_scalar = MakeConstantScalar(hp_dtype, pos_rounding_value);
+  } else if (param->rounding == "TONEAREST") {
+    auto pos_rounder = MakeConstantScalar(hp_dtype, pos_rounding_value);
+    auto neg_rounder = MakeConstantScalar(hp_dtype, pos_rounding_value - 1);
+    auto pos_rounder_t = Full(pos_rounder, input_shape, hp_dtype);
+    auto neg_rounder_t = Full(neg_rounder, input_shape, hp_dtype);
+
+    auto zero = MakeConstantScalar(hp_dtype, 0);
+    auto zero_t = Full(zero, input_shape, hp_dtype);
+    round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t, neg_rounder_t);
+  }
+  // Add the rounding scalar.
+  tensor = Add(tensor, round_scalar);
+
+  // 5) Simply right shift the result to get the final output.
+  auto scaled_int64_t = RightShift(tensor, MakeConstantScalar(hp_dtype, total_right_shift));
+
+  // 6) Add the output zero point.
+  auto output_zp = MakeConstantScalar(hp_dtype, param->output_zero_point);
+  auto shifted_int64_t = Add(output_zp, scaled_int64_t);
+
+  // 7) Clip to the out_dtype min/max.
+  auto q_min = GetQmin(param->out_dtype);
+  auto q_max = GetQmax(param->out_dtype);
+  auto clipped_t = Clip(shifted_int64_t, q_min, q_max);
+  return Cast(clipped_t, param->out_dtype);
+}
+
+/*
+ * \brief Legalize the requantize op.
+ * \param attrs The attrs of the original requantize call.
+ * \param new_args The new args to the call node.
+ * \param arg_types The types of the call args.
+ * \return The sequence of Relay ops for the requantize op.
+ * \note Lowering of the requantize operation. The requantize operator converts
+ *       one quantized tensor to another quantized tensor. For the output
+ *       tensor, we are provided with output scale and zero point. The
+ *       computation looks like this
+ *
+ *       Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input)
+ */
+Expr RequantizeLegalize(const Attrs& attrs, const Array<Expr>& new_args,
+                        const Array<Type>& arg_types) {
+  CHECK_EQ(new_args.size(), 1);
+  auto& quantized_data = new_args[0];
+  const auto* param = attrs.as<RequantizeAttrs>();
+  CHECK(param != nullptr);
+
+  // Find the input shape.
+  CHECK_EQ(arg_types.size(), 1);
+  auto input_type = arg_types[0];
+  auto input_tensor_type = input_type.as<TensorTypeNode>();
+  CHECK(input_tensor_type != nullptr) << "Type information missing."
+                                      << " Please run infer_type pass.";
+  Array<IndexExpr> input_shape = input_tensor_type->shape;
+
+  // Check rounding validity.
+  CHECK(param->rounding == "UPWARD" || param->rounding == "TONEAREST")
+      << "QNN requantize supports two rounding modes - UPWARD and "
+      << "TONEAREST";
+  return RequantizeLower(quantized_data, param, input_shape);
+}
+
 /*
  * \brief Infer shape function of Requantize op.
  * \param types The types of input args.
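As a cross-check of the lowering above, the same arithmetic can be modeled in a few lines of NumPy. This is a minimal, hypothetical sketch (requantize_ref is not part of the patch); it covers only the UPWARD rounding mode with an int8 output, and it skips the 2^31 significand edge case that the C++ code handles:

    import math
    import numpy as np

    def requantize_ref(q_input, input_scale, input_zero_point,
                       output_scale, output_zero_point, qmin=-128, qmax=127):
        # 1) Integer multiplier and shift for input_scale / output_scale.
        significand, exponent = math.frexp(input_scale / output_scale)
        fixed_point_multiplier = int(round(significand * (1 << 31)))
        total_right_shift = 31 - exponent
        # 2) and 3) Subtract the input zero point, multiply in high precision.
        x = (q_input.astype(np.int64) - input_zero_point) * fixed_point_multiplier
        # 4) and 5) Add the rounding value (UPWARD mode), then right shift.
        x = (x + (1 << (total_right_shift - 1))) >> total_right_shift
        # 6) and 7) Add the output zero point and clip to the output range.
        return np.clip(x + output_zero_point, qmin, qmax).astype(np.int8)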
@@ -42,35 +222,28 @@
  * \param reporter The type reporter that sets the dtype and shapes.
  * \return True if the infer shape succeeded.
  */
-bool RequantizeRel(const Array<Type>& types,
-                   int num_inputs,
-                   const Attrs& attrs,
+bool RequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                    const TypeReporter& reporter) {
   CHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto in_dtype = data->dtype;
   CHECK(in_dtype == Int(8) || in_dtype == UInt(8) || in_dtype == Int(32))
-    << "Input type should be an integer but was " << in_dtype;
+      << "Input type should be an integer but was " << in_dtype;
   const Array<IndexExpr> oshape = data->shape;
   // assign output type
   const RequantizeAttrs* param = attrs.as<RequantizeAttrs>();
   auto out_dtype = param->out_dtype;
   CHECK(out_dtype == Int(8) || out_dtype == UInt(8) || out_dtype == Int(32))
-    << "Output type should be an integer but was " << out_dtype;
+      << "Output type should be an integer but was " << out_dtype;
   reporter->Assign(types[1], TensorTypeNode::make(oshape, out_dtype));
   return true;
 }

 // Positional relay function to create qnn requantize operator
 // used by frontend FFI.
-Expr MakeRequantize(Expr data,
-                    double input_scale,
-                    int32_t input_zero_point,
-                    double output_scale,
-                    int32_t output_zero_point,
-                    std::string rounding,
-                    DataType out_dtype) {
+Expr MakeRequantize(Expr data, double input_scale, int32_t input_zero_point, double output_scale,
+                    int32_t output_zero_point, std::string rounding, DataType out_dtype) {
   auto attrs = make_node<RequantizeAttrs>();
   attrs->input_scale = std::move(input_scale);
   attrs->input_zero_point = std::move(input_zero_point);
@@ -95,7 +268,8 @@ Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input)
 .set_num_inputs(1)
 .add_argument("data", "Tensor", "The quantized input tensor.")
 .set_support_level(11)
-.add_type_rel("Requantize", RequantizeRel);
+.add_type_rel("Requantize", RequantizeRel)
+.set_attr<FTVMLegalize>("FTVMLegalize", RequantizeLegalize);

 TVM_REGISTER_API("relay.qnn.op._make.requantize")
 .set_body_typed(MakeRequantize);
diff --git a/src/relay/qnn/pass/qnn_lower.cc b/src/relay/qnn/pass/qnn_lower.cc
deleted file mode 100644
index 5ac6f4b7bda4..000000000000
--- a/src/relay/qnn/pass/qnn_lower.cc
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- *  Copyright (c) 2019 by Contributors
- * \file qnn_lower.cc
- * \brief Lower qnn ops to a sequence of existing Relay ops.
- */
-
-#include
-#include
-#include
-#include
-#include "../util.h"
-#include "../../pass/pattern_util.h"
-
-namespace tvm {
-namespace relay {
-namespace qnn {
-
-/*!
- * \brief namespace of qnn lower pass.
- *
- * Use namespace to reduce potential naming conflict.
- */
-namespace qnn_lower {
-
-using runtime::TypedPackedFunc;
-
-// Lowering of qnn.requantize op
-
-/*
- * \brief Convert FP32 representation into fixed point representation.
- * \param double_multplier The input FP32 number.
- * \return The pair of multiplier and shift for fixed point representation.
- * \note Converts a floating point number so that it can be represented by
- *       integers. The representation is
- *       float_number = (significand) * 2^(exponent)
- *
- *       The significand is a number between 0.5 and 1. This is represented by
- *       an integer number. For example, if it is int32, then the decimal point
- *       exists between bit 31 and 30 from LSB (or between first and second bit
- *       from the left).
- *
- *       Some examples are
- *       0.25 = (0.5) * 2^(-1)
- *       0.125 = (0.5) * 2^(-2)
- *
- *       Credit to TFLite reference implementation.
- */
-std::pair<int32_t, int32_t> GetFixedPointMultiplierShift(
-    double double_multiplier) {
-  int32_t significand, exponent;
-  if (double_multiplier == 0.) {
-    significand = 0;
-    exponent = 0;
-    return std::make_pair(significand, exponent);
-  }
-
-  // Get the significand and exponent.
-  double significand_d = std::frexp(double_multiplier, &exponent);
-
-  // Convert the double significand to int significand, i.e., convert into a
-  // integer where the decimal point is between bit 31 and 30. This is done by
-  // multiplying the double value with 2^31 and then casting to int.
-  significand_d = std::round(significand_d * (1ll << 31));
-  auto significand_int64 = static_cast<int64_t>(significand_d);
-  CHECK_LE(significand_int64, (1ll << 31));
-  if (significand_int64 == (1ll << 31)) {
-    significand_int64 /= 2;
-    ++exponent;
-  }
-  CHECK_LE(significand_int64, std::numeric_limits<int32_t>::max());
-  significand = static_cast<int32_t>(significand_int64);
-  return std::make_pair(significand, exponent);
-}
-
-/*
- * \brief Lower requantize to a sequence of ops.
- * \param input_tensor The input tensor to requantize op.
- * \param param The requantize op attrs.
- * \param out_shape The output shape of the requantize op.
- * \return The sequence of existing Relay ops.
- * \note Requantization using only integer computation. Here, the computation is
- *       converted to a fixed point computation by computing output multiplier
- *       and shift. This is useful, if the target device does not support/have
- *       very expensive floating point computations.
- *
- *       Original compuation is scale_fp32 * quantized_tensor. To convert into
- *       integer computation, the multiplication with fp32 scalar can be
- *       replaced by multiplication with an int value and then right shifting
- *       the result. This approximates the floating point computation with a
- *       fixed point computation.
- *
- *       The whole computation this can be broken down into following steps
- *       1) Calculate the integer multiplier and integer shift.
- *       2) Subtract the input integer zero point.
- *       3) Multiply the fixed point multiplier with quantized tensor.
- *       4) Round the result.
- *       5) Right shift the result.
- *       6) Add the output zero point.
- *       7) Cast to the out_dtype.
- */
-Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param,
-                     const Array<IndexExpr>& out_shape) {
-  double double_multiplier = param->input_scale/param->output_scale;
-
-  // Choose high precision datatype to be int64. This is for avoiding overflow
-  // in multiplication of two int32 values.
-  DataType hp_dtype = Int(64);
-
-  // 1) Calculating the integer multiplier and integer shift
-  int32_t fixed_point_multiplier, shift;
-  std::tie(fixed_point_multiplier, shift) =
-      GetFixedPointMultiplierShift(double_multiplier);
-  int left_shift = shift > 0 ? shift : 0;
-  int right_shift = shift > 0 ? 0 : -shift;
-
-  // 2) Subtract the input_zero_point
-  auto tensor = Cast(input_tensor, hp_dtype);
-  if (param->input_zero_point != 0) {
-    auto input_zp = MakeConstantScalar(hp_dtype, param->input_zero_point);
-    tensor = Subtract(tensor, input_zp);
-  }
-
-  // 3) Multiply the integer multiplier
-  if (left_shift != 0) {
-    tensor = Multiply(tensor, MakeConstantScalar(hp_dtype, 1 << left_shift));
-  }
-  // Perform the multiplication in higher precision.
-  // The scalar is a fixed point value of int32 where the decimal point is
-  // between bits 31 and 30. After multiplying with input_tensor, the result is
-  // in int64 where the decimal point is sitting between bits 31 and 30 (from
-  // the right, rightmost bit is bit 0). The computation is performed in higher
-  // precision to avoid overflow in multiplying two int32 values.
-  Expr scalar = MakeConstantScalar(hp_dtype, fixed_point_multiplier);
-  auto multiplied_t = Multiply(tensor, scalar);
-
-  // 4) Find the rounding scalar. This depends on where the final decimal point
-  // sits. As we will be right shifting the multiplied_t, we need to first
-  // calculate the total_right_shift.
-  int total_right_shift = right_shift + 31;
-  int64_t pos_rounding_value = (1ll << (total_right_shift - 1));
-
-  tensor = multiplied_t;
-  Expr round_scalar;
-  if (param->rounding == "UPWARD") {
-    round_scalar = MakeConstantScalar(hp_dtype, pos_rounding_value);
-  } else if (param->rounding == "TONEAREST") {
-    auto pos_rounder = MakeConstantScalar(hp_dtype, pos_rounding_value);
-    auto neg_rounder = MakeConstantScalar(hp_dtype, pos_rounding_value - 1);
-    auto pos_rounder_t = Full(pos_rounder, out_shape, hp_dtype);
-    auto neg_rounder_t = Full(neg_rounder, out_shape, hp_dtype);
-
-    auto zero = MakeConstantScalar(hp_dtype, 0);
-    auto zero_t = Full(zero, out_shape, hp_dtype);
-    round_scalar = Where(GreaterEqual(tensor, zero_t), pos_rounder_t,
-                         neg_rounder_t);
-  }
-  // Add the rounding scalar.
-  tensor = Add(tensor, round_scalar);
-
-  // 5) Simply right shift the result to get the final output.
-  auto scaled_int64_t = RightShift(tensor,
-                                   MakeConstantScalar(hp_dtype, total_right_shift));
-
-  // 6) Add the output zero point.
-  auto output_zp = MakeConstantScalar(hp_dtype, param->output_zero_point);
-  auto shifted_int64_t = Add(output_zp, scaled_int64_t);
-
-  // 7) Clip to the out_dtype min/max.
-  auto q_min = GetQmin(param->out_dtype);
-  auto q_max = GetQmax(param->out_dtype);
-  auto clipped_t = Clip(shifted_int64_t, q_min, q_max);
-  return Cast(clipped_t, param->out_dtype);
-}
-
-/*
- * \brief Forward rewrite the requantize op.
- * \param ref_call The original call that will be lowered.
- * \param new_args The new mutated args to the call node.
- * \param ctx The node context.
- * \return The sequence of Relay ops for requantize op.
- * \note Lowering of the requantize operation. The requantize operator converts
- *       one quantized tensor to another quantized tensor. For the output
- *       tensor, we are provided with output scale and zero point. The
- *       computation looks like this
- *
- *       Q_output = zp_output + (scale_input)/(scale_ouptut) * (Q_input - zp_input)
- */
-Expr RequantizeForwardRewrite(const Call& ref_call,
-                              const Array<Expr>& new_args, const NodeRef& ctx) {
-  CHECK_EQ(new_args.size(), 1);
-  Expr quantized_data = new_args[0];
-  const auto* param = ref_call->attrs.as<RequantizeAttrs>();
-  CHECK(param != nullptr);
-
-  // Find output shape.
-  auto ref_call_t = ref_call->checked_type();
-  auto output_tt = ref_call_t.as<TensorTypeNode>();
-  CHECK(output_tt != nullptr) << "Type information missing."
-                              << " Please run infer_type pass.";
-  Array<IndexExpr> out_shape = output_tt->shape;
-
-  // Check rounding validity.
-  CHECK(param->rounding == "UPWARD" || param->rounding == "TONEAREST")
-      << "QNN requantize supports two rounding modes - UPWARD and "
-      << "TONEAREST";
-  return RequantizeLower(quantized_data, param, out_shape);
-}
-
-RELAY_REGISTER_OP("qnn.requantize")
-.set_attr<FForwardRewrite>("FQnnForwardRewrite", RequantizeForwardRewrite);
-
-Expr QnnLower(const Expr& expr) {
-  return ForwardRewrite(expr, "FQnnForwardRewrite", nullptr, nullptr);
-}
-}  // namespace qnn_lower
-
-namespace transform {
-
-using namespace tvm::relay::transform;
-
-Pass QnnLower() {
-  runtime::TypedPackedFunc<Function(Function, Module, PassContext)> pass_func =
-    [=](Function f, Module m, PassContext pc) {
-      return Downcast<Function>(
-          relay::qnn::qnn_lower::QnnLower(f));
-  };
-  return CreateFunctionPass(pass_func, 0, "QnnLower",
-                            {ir::StringImm::make("InferType")});
-}
-
-TVM_REGISTER_API("relay.qnn._transform.QnnLower")
-.set_body_typed(QnnLower);
-
-}  // namespace transform
-
-}  // namespace qnn
-}  // namespace relay
-}  // namespace tvm
diff --git a/tests/python/relay/test_qnn_requantize.py b/tests/python/relay/test_qnn_requantize.py
index 3925c1e5d573..cd478fb5ba22 100644
--- a/tests/python/relay/test_qnn_requantize.py
+++ b/tests/python/relay/test_qnn_requantize.py
@@ -57,7 +57,7 @@ def get_mod(data_shape, data_dtype, out_dtype, input_scale, output_scale,
     mod = relay.Function(relay.analysis.free_vars(mod), mod)
     mod = relay.Module.from_expr(mod)

-    mod = relay.qnn.transform.QnnLower()(mod)
+    mod = relay.transform.Legalize()(mod)
     return mod

def same_scale_test():
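With this final patch, QNN lowering is driven by the generic Legalize pass, as the updated test shows. A minimal end-to-end usage sketch (assuming the Python API from this series; the shape and quantization parameters here are arbitrary):

    from tvm import relay

    data = relay.var("data", shape=(1, 16), dtype="int32")
    out = relay.qnn.op.requantize(data,
                                  input_scale=0.5, input_zero_point=0,
                                  output_scale=0.25, output_zero_point=1,
                                  rounding="TONEAREST", out_dtype="int8")
    func = relay.Function([data], out)
    mod = relay.Module.from_expr(func)
    # Legalize picks up the FTVMLegalize attribute registered on
    # qnn.requantize and expands it into base Relay ops.
    mod = relay.transform.Legalize()(mod)
    print(mod)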