From 230facd864723b37c12484cb8a40d406614dbcab Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Thu, 23 Apr 2020 01:04:08 +0000 Subject: [PATCH 1/2] Fixes for quantizing a BERT from HuggingFace --- .../providers/nuphar/scripts/model_quantizer.py | 4 ++-- .../nuphar/scripts/symbolic_shape_infer.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py index eec594c58b0e7..bf6140bca33f3 100644 --- a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py +++ b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py @@ -158,7 +158,7 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)]) Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)]) Q_Xf = nf.make_node('Floor', Q_Xf) - Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()}) + Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)]) Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8, np.int8 : onnx.TensorProto.INT8, np.uint16 : onnx.TensorProto.UINT16, @@ -294,4 +294,4 @@ def parse_arguments(): print('Quantize MatMul to MatMulInteger...') assert not args.export_qcfg_json or args.qcfg_json, "--qcfg_json must be specified when --export_qcfg_json is used" convert_matmul_model(args.input, args.output, args.only_for_scan, args.share_input_quantization, args.default_qcfg, args.qcfg_json, args.export_qcfg_json) - print('Done!') \ No newline at end of file + print('Done!') diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py index 5ef2ac4c2aa18..17a0d6ed4abce 100755 --- 
a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py +++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py @@ -230,12 +230,16 @@ def _merge_symbols(self, dims): if self.auto_merge_: assert len(dims) == 2 # only allow symbol->int merge in binary ops for now is_int = [is_literal(d) for d in dims] - assert sum(is_int) == 1 - int_dim = is_int.index(1) - if self.verbose_ > 0: - print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim])) - self._check_merged_dims(dims, allow_broadcast=False) - return dims[int_dim] + if sum(is_int) == 1: + int_dim = is_int.index(1) + if self.verbose_ > 0: + print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim])) + self._check_merged_dims(dims, allow_broadcast=False) + return dims[int_dim] + else: + if self.verbose_ > 0: + print('dim {} has been merged with dim {}'.format(dims[0], dims[1])) + return dims[0] else: return None if all([d == dims[0] for d in dims]): From c864c8e50d25d669dfc1700b87db0a38ab29c294 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Fri, 15 May 2020 18:44:33 -0700 Subject: [PATCH 2/2] Address CR and some other minor fixes --- .../nuphar/scripts/model_quantizer.py | 16 +++++---- .../nuphar/scripts/symbolic_shape_infer.py | 36 +++++++++---------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py index bf6140bca33f3..c2bda5b52352e 100644 --- a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py +++ b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py @@ -53,7 +53,7 @@ def __iter__(self): # need this to make dict for json ('QuantizationType', 'Signed' if self.sign_bit_ else 'Unsigned'), ('ReservedBit', self.reserved_bits_)]) -def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg): +def 
quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg, onnx_opset_ver): assert in_node.op_type == 'MatMul' # quantize weight @@ -158,7 +158,11 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)]) Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)]) Q_Xf = nf.make_node('Floor', Q_Xf) - Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)]) + if onnx_opset_ver < 11: + Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()}) + else: + # Clip changed min max to inputs in opset 11 + Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)]) Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8, np.int8 : onnx.TensorProto.INT8, np.uint16 : onnx.TensorProto.UINT16, @@ -238,7 +242,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i out_mp = onnx.ModelProto() out_mp.CopyFrom(in_mp) out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input - ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger + onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger ensure_opset(out_mp, 1, 'com.microsoft') # add MS domain for MatMulInteger16 out_mp.graph.ClearField('node') nf = NodeFactory(out_mp.graph) @@ -249,7 +253,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i continue if in_n.op_type == 'MatMul' and not only_for_scan: - if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg): + if quantize_matmul_2d_with_weight(in_n, 
in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver): continue out_n = out_mp.graph.node.add() @@ -262,7 +266,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i subgraph_quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls for in_sn in in_subgraph.node: if in_sn.op_type == 'MatMul': - if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg): + if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver): continue if upgrade_op(scan_nf, in_sn): @@ -294,4 +298,4 @@ def parse_arguments(): print('Quantize MatMul to MatMulInteger...') assert not args.export_qcfg_json or args.qcfg_json, "--qcfg_json must be specified when --export_qcfg_json is used" convert_matmul_model(args.input, args.output, args.only_for_scan, args.share_input_quantization, args.default_qcfg, args.qcfg_json, args.export_qcfg_json) - print('Done!') + print('Done!') \ No newline at end of file diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py index 17a0d6ed4abce..9fac020e4adee 100755 --- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py +++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py @@ -649,12 +649,10 @@ def _infer_ConstantOfShape(self, node): def _infer_Expand(self, node): expand_to_shape = self._try_get_value(node, 1) if expand_to_shape is not None: - sympy_shape = self._get_sympy_shape(node, 0) - new_sympy_shape = self._broadcast_shapes(sympy_shape, expand_to_shape) - - # new_shape's dim can come from 'Expand' computation - self._update_computed_dims(new_sympy_shape) - new_shape = 
get_shape_from_sympy_shape(new_sympy_shape) + # new_shape's dim can come from shape value + self._update_computed_dims(expand_to_shape) + shape = self._get_shape(node, 0) + new_shape = self._broadcast_shapes(shape, get_shape_from_sympy_shape(expand_to_shape)) vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, new_shape)) @@ -784,13 +782,13 @@ def _infer_Pad(self, node): rank = len(sympy_shape) if pads is not None: assert len(pads) == 2*rank - new_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])] - self._update_computed_dims(new_shape) + new_sympy_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])] + self._update_computed_dims(new_sympy_shape) else: # dynamic pads, create new symbolic dimensions - new_shape = self._new_symbolic_shape(rank, node) + new_sympy_shape = self._new_symbolic_shape(rank, node) output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_shape))) + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape))) def _infer_Pool(self, node): sympy_shape = self._compute_conv_pool_shape(node) @@ -808,12 +806,12 @@ def _infer_Range(self, node): start = as_scalar(input_data[0]) limit = as_scalar(input_data[1]) delta = as_scalar(input_data[2]) - new_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)] + new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)] else: new_dim = self._new_symbolic_dim_from_output(node) - new_shape = [self.symbolic_dims_[new_dim]] - self._update_computed_dims(new_shape) - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_shape))) + new_sympy_shape 
= [self.symbolic_dims_[new_dim]] + self._update_computed_dims(new_sympy_shape) + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_sympy_shape))) def _infer_ReduceProd(self, node): axes = get_attribute(node, 'axes') @@ -1046,15 +1044,15 @@ def _infer_Squeeze(self, node): def _infer_Tile(self, node): repeats_value = self._get_value(node, 1) input_sympy_shape = self._get_sympy_shape(node, 0) - new_shape = [] + new_sympy_shape = [] for i,d in enumerate(input_sympy_shape): new_dim = d * repeats_value[i] - new_shape.append(new_dim) - self._update_computed_dims(new_shape) + new_sympy_shape.append(new_dim) + self._update_computed_dims(new_sympy_shape) vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_shape))) + get_shape_from_sympy_shape(new_sympy_shape))) def _infer_TopK(self, node): rank = self._get_shape_rank(node, 0) @@ -1272,4 +1270,4 @@ def parse_arguments(): print('output model ' + args.output) print('Doing symbolic shape inference...') out_mp = SymbolicShapeInference.infer_shapes(args.input, args.output, args.int_max, args.auto_merge, args.guess_output_rank, args.verbose) - print('Done!') + print('Done!') \ No newline at end of file