From 275ea3b11ca835bb3378ae1d4899ba291c9275af Mon Sep 17 00:00:00 2001
From: Matthew
Date: Tue, 22 Jun 2021 11:30:44 -0600
Subject: [PATCH 1/3] support QLinearAdd

---
 python/tvm/relay/frontend/onnx.py          | 43 +++++++++++++++-
 tests/python/frontend/onnx/test_forward.py | 60 +++++++++++++++++++++-
 2 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index e8f0fbffc0dc..ff386bd49b30 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2973,6 +2973,8 @@ def _impl_v13(cls, inputs, attr, params):
         data, scale, zp = inputs
         out_dtype = infer_type(zp).checked_type.dtype
         axis = attr.get("axis", 1)
+        if len(infer_shape(data)) < 2:
+            axis = 0
         return _qnn.op.quantize(data, scale, _op.cast(zp, "int32"), axis, out_dtype)


@@ -3033,10 +3035,11 @@ def get_scalar(x, dtype="float32"):
         weight = inputs[3]
         w_scale = get_scalar(inputs[4])
         w_zero_point = get_scalar(inputs[5], "int32")
-        y_scale = get_scalar(inputs[6])
+        y_scale = fold_constant(get_scalar(inputs[6]))
         y_zero_point = get_scalar(inputs[7], "int32")

         input_shape = infer_shape(data)
+        ndim = len(input_shape)
         kernel_type = infer_type(weight)
         kernel_shapes = [get_const_tuple(kernel_type.checked_type.shape)]

@@ -3116,6 +3119,43 @@ def get_scalar(x, dtype="float32"):
         return out


+class QLinearAdd(OnnxOpConverter):
+    """Operator converter for QLinearAdd from Microsoft onnxruntime contrib opset."""
+
+    @classmethod
+    def _impl_v10(cls, inputs, attr, params):
+        def get_scalar(x, dtype="float32"):
+            if isinstance(x, _expr.Var) and x.name_hint in params:
+                return _op.const(params[x.name_hint].numpy(), dtype)
+            rank = len(infer_shape(x))
+            assert rank <= 1, "QLinearAdd scale and zero_point inputs must be scalars"
+            if rank == 1:
+                x = _op.squeeze(x, [0])
+            return _op.cast(x, dtype)
+
+        a = inputs[0]
+        a_scale = get_scalar(inputs[1])
+        a_zero_point = get_scalar(inputs[2], "int32")
+        b = inputs[3]
+        b_scale = get_scalar(inputs[4])
+        b_zero_point = get_scalar(inputs[5], "int32")
+        c_scale = get_scalar(inputs[6])
+        c_zero_point = get_scalar(inputs[7], "int32")
+
+        dtype = infer_type(a).checked_type.dtype
+
+        ## Onnxruntime doesn't actually do this op in integer, they dequantize to fp32 and then requantize after
+        ## see https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qladd.cpp
+        a = _qnn.op.dequantize(
+            inputs[0], a_scale, a_zero_point
+        )  # , c_scale, c_zero_point, out_dtype = dtype)
+        b = _qnn.op.dequantize(
+            inputs[3], b_scale, b_zero_point
+        )  # , c_scale, c_zero_point, out_dtype = dtype)
+        out = _op.add(a, b)
+        return _qnn.op.quantize(out, c_scale, c_zero_point, out_dtype=dtype)
+
+
 class BitShift(OnnxOpConverter):
     """Operator converter for NonZero"""

@@ -3343,6 +3383,7 @@ def _get_convert_map(opset):
         "DynamicQuantizeLinear": DynamicQuantizeLinear.get_converter(opset),
         "ReverseSequence": ReverseSequence.get_converter(opset),
         "QLinearConv": QLinearConv.get_converter(opset),
+        "QLinearAdd": QLinearAdd.get_converter(opset),
     }


diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index db71855fd80f..b878822a6dbf 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -120,7 +120,7 @@ def get_tvm_output(
 def get_onnxruntime_output(model, inputs):
     import onnxruntime.backend

-    rep = onnxruntime.backend.prepare(model, "CPU")
+    rep = onnxruntime.backend.prepare(model.SerializeToString(), "CPU")
     if isinstance(inputs, list) and len(inputs) == 1:
         inp = inputs[0]
     else:
@@ -149,6 +149,7 @@ def verify_with_ort_with_inputs(
 ):
     if opset is not None:
         model.opset_import[0].version = opset
+
     ort_out = get_onnxruntime_output(model, inputs)

     if targets is None:
@@ -4755,6 +4756,63 @@ def repeat(N, D):
     )


+def verify_qlinearadd(a_shape, b_shape, c_shape):
+
+    a_array = np.random.random(a_shape).astype("float32")
+    b_array = np.random.random(b_shape).astype("float32")
+
+    input_nodes = [
+        helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
+        helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)),
+    ]
+    input_names = [
+        "a",
+        "b",
+    ]
+    input_values = [a_array, b_array]
+
+    node = helper.make_node("QLinearAdd", inputs=input_names, outputs=["C"])
+
+    node = helper.make_node("Add", ["a", "b"], ["C"])
+    graph = helper.make_graph(
+        [node],
+        "qlinearadd_test",
+        inputs=input_nodes,
+        outputs=[helper.make_tensor_value_info("C", TensorProto.FLOAT, list(c_shape))],
+    )
+    model = helper.make_model(graph, producer_name="qlinearadd_test")
+    from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType
+
+    class RandomDataReader(CalibrationDataReader):
+        def __init__(self, n=10):
+            self.data = iter(
+                [
+                    {
+                        "a": np.random.random(a_shape).astype("float32"),
+                        "b": np.random.random(b_shape).astype("float32"),
+                    }
+                    for _ in range(n)
+                ]
+            )
+
+        def get_next(self):
+            return next(self.data, None)
+
+    model_fp32 = "/tmp/model.onnx"
+    onnx.save_model(model, model_fp32)
+    model_quant = "/tmp/model.quant.onnx"
+    quantized_model = quantize_static(model_fp32, model_quant, RandomDataReader())
+    # opt_level=1 will cause error with qnn lowering
+    model = onnx.load(model_quant)
+    verify_with_ort_with_inputs(model, input_values, opt_level=2)
+
+
+def test_qlinearadd():
+    verify_qlinearadd([4, 2], [4, 2], [4, 2])
+    verify_qlinearadd([4, 2], [2], [4, 2])
+    verify_qlinearadd([5, 1, 7], [2, 7], [5, 2, 7])
+
+
 if __name__ == "__main__":
     test_flatten()
     test_reshape()

From d7f841fc7747d2d3a15a56ae93da1f5a688f0d9b Mon Sep 17 00:00:00 2001
From: Matthew
Date: Tue, 22 Jun 2021 11:42:50 -0600
Subject: [PATCH 2/3] fix comment line length

---
 python/tvm/relay/frontend/onnx.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index ff386bd49b30..7135fccdf43b 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -3144,8 +3144,9 @@ def get_scalar(x, dtype="float32"):

         dtype = infer_type(a).checked_type.dtype

-        ## Onnxruntime doesn't actually do this op in integer, they dequantize to fp32 and then requantize after
-        ## see https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qladd.cpp
+        ## Onnxruntime doesn't actually do this op in integer, they dequantize to fp32
+        ## and then requantize after
+        ## https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qladd.cpp
         a = _qnn.op.dequantize(
             inputs[0], a_scale, a_zero_point
         )  # , c_scale, c_zero_point, out_dtype = dtype)

From 3dae290535bd80862a2ba25b46936f23f4f16590 Mon Sep 17 00:00:00 2001
From: Matthew
Date: Wed, 30 Jun 2021 12:16:58 -0600
Subject: [PATCH 3/3] use platform-independent temp directory

---
 tests/python/frontend/onnx/test_forward.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index b878822a6dbf..52c3346e5807 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import os
 import re

 import numpy as np
@@ -4798,9 +4799,10 @@ def __init__(self, n=10):
         def get_next(self):
             return next(self.data, None)

-    model_fp32 = "/tmp/model.onnx"
+    d = tvm.contrib.utils.tempdir()
+    model_fp32 = os.path.join(d.temp_dir, "model.onnx")
     onnx.save_model(model, model_fp32)
-    model_quant = "/tmp/model.quant.onnx"
+    model_quant = os.path.join(d.temp_dir, "model.quant.onnx")
     quantized_model = quantize_static(model_fp32, model_quant, RandomDataReader())
     # opt_level=1 will cause error with qnn lowering
     model = onnx.load(model_quant)
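For reference, the QLinearAdd converter added in this series mirrors what onnxruntime itself does: dequantize both quantized inputs to float32, add them, and requantize the result with the output scale and zero point. Below is a minimal NumPy sketch of that reference arithmetic, for illustration only; the helper name qlinear_add_ref is hypothetical and not part of the patch.

import numpy as np

def qlinear_add_ref(a, a_scale, a_zero_point, b, b_scale, b_zero_point, c_scale, c_zero_point):
    # Dequantize both uint8 inputs to float32.
    a_fp32 = (a.astype("float32") - a_zero_point) * a_scale
    b_fp32 = (b.astype("float32") - b_zero_point) * b_scale
    # Add in float32, then requantize against the output scale and zero point.
    c_fp32 = a_fp32 + b_fp32
    c = np.round(c_fp32 / c_scale) + c_zero_point
    return np.clip(c, 0, 255).astype("uint8")

# Example: with input scales 0.1, output scale 0.2, and all zero points 0, adding
# [10, 20] and [30, 40] (i.e. 1.0 + 3.0 and 2.0 + 4.0 in float) requantizes to [20, 30].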