From 230facd864723b37c12484cb8a40d406614dbcab Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Thu, 23 Apr 2020 01:04:08 +0000 Subject: [PATCH 1/2] Fixes for quantizing a BERT from HuggingFace --- .../providers/nuphar/scripts/model_quantizer.py | 4 ++-- .../nuphar/scripts/symbolic_shape_infer.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py index eec594c58b0e7..bf6140bca33f3 100644 --- a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py +++ b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py @@ -158,7 +158,7 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)]) Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)]) Q_Xf = nf.make_node('Floor', Q_Xf) - Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()}) + Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)]) Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8, np.int8 : onnx.TensorProto.INT8, np.uint16 : onnx.TensorProto.UINT16, @@ -294,4 +294,4 @@ def parse_arguments(): print('Quantize MatMul to MatMulInteger...') assert not args.export_qcfg_json or args.qcfg_json, "--qcfg_json must be specified when --export_qcfg_json is used" convert_matmul_model(args.input, args.output, args.only_for_scan, args.share_input_quantization, args.default_qcfg, args.qcfg_json, args.export_qcfg_json) - print('Done!') \ No newline at end of file + print('Done!') diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py index 5ef2ac4c2aa18..17a0d6ed4abce 100755 --- 
a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py +++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py @@ -230,12 +230,16 @@ def _merge_symbols(self, dims): if self.auto_merge_: assert len(dims) == 2 # only allow symbol->int merge in binary ops for now is_int = [is_literal(d) for d in dims] - assert sum(is_int) == 1 - int_dim = is_int.index(1) - if self.verbose_ > 0: - print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim])) - self._check_merged_dims(dims, allow_broadcast=False) - return dims[int_dim] + if sum(is_int) == 1: + int_dim = is_int.index(1) + if self.verbose_ > 0: + print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim])) + self._check_merged_dims(dims, allow_broadcast=False) + return dims[int_dim] + else: + if self.verbose_ > 0: + print('dim {} has been merged with dim {}'.format(dims[0], dims[1])) + return dims[0] else: return None if all([d == dims[0] for d in dims]): From c864c8e50d25d669dfc1700b87db0a38ab29c294 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Fri, 15 May 2020 18:44:33 -0700 Subject: [PATCH 2/2] Address CR and some other minor fixes --- .../nuphar/scripts/model_quantizer.py | 16 +++++---- .../nuphar/scripts/symbolic_shape_infer.py | 36 +++++++++---------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py index bf6140bca33f3..c2bda5b52352e 100644 --- a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py +++ b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py @@ -53,7 +53,7 @@ def __iter__(self): # need this to make dict for json ('QuantizationType', 'Signed' if self.sign_bit_ else 'Unsigned'), ('ReservedBit', self.reserved_bits_)]) -def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg): +def 
quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg, onnx_opset_ver): assert in_node.op_type == 'MatMul' # quantize weight @@ -158,7 +158,11 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)]) Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)]) Q_Xf = nf.make_node('Floor', Q_Xf) - Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)]) + if onnx_opset_ver < 11: + Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()}) + else: + # Clip changed min max to inputs in opset 11 + Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)]) Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8, np.int8 : onnx.TensorProto.INT8, np.uint16 : onnx.TensorProto.UINT16, @@ -238,7 +242,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i out_mp = onnx.ModelProto() out_mp.CopyFrom(in_mp) out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input - ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger + onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger ensure_opset(out_mp, 1, 'com.microsoft') # add MS domain for MatMulInteger16 out_mp.graph.ClearField('node') nf = NodeFactory(out_mp.graph) @@ -249,7 +253,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i continue if in_n.op_type == 'MatMul' and not only_for_scan: - if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg): + if quantize_matmul_2d_with_weight(in_n, 
in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver): continue out_n = out_mp.graph.node.add() @@ -262,7 +266,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i subgraph_quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls for in_sn in in_subgraph.node: if in_sn.op_type == 'MatMul': - if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg): + if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver): continue if upgrade_op(scan_nf, in_sn): @@ -294,4 +298,4 @@ def parse_arguments(): print('Quantize MatMul to MatMulInteger...') assert not args.export_qcfg_json or args.qcfg_json, "--qcfg_json must be specified when --export_qcfg_json is used" convert_matmul_model(args.input, args.output, args.only_for_scan, args.share_input_quantization, args.default_qcfg, args.qcfg_json, args.export_qcfg_json) - print('Done!') + print('Done!') \ No newline at end of file diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py index 17a0d6ed4abce..9fac020e4adee 100755 --- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py +++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py @@ -649,12 +649,10 @@ def _infer_ConstantOfShape(self, node): def _infer_Expand(self, node): expand_to_shape = self._try_get_value(node, 1) if expand_to_shape is not None: - sympy_shape = self._get_sympy_shape(node, 0) - new_sympy_shape = self._broadcast_shapes(sympy_shape, expand_to_shape) - - # new_shape's dim can come from 'Expand' computation - self._update_computed_dims(new_sympy_shape) - new_shape = 
get_shape_from_sympy_shape(new_sympy_shape) + # new_shape's dim can come from shape value + self._update_computed_dims(expand_to_shape) + shape = self._get_shape(node, 0) + new_shape = self._broadcast_shapes(shape, get_shape_from_sympy_shape(expand_to_shape)) vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, new_shape)) @@ -784,13 +782,13 @@ def _infer_Pad(self, node): rank = len(sympy_shape) if pads is not None: assert len(pads) == 2*rank - new_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])] - self._update_computed_dims(new_shape) + new_sympy_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])] + self._update_computed_dims(new_sympy_shape) else: # dynamic pads, create new symbolic dimensions - new_shape = self._new_symbolic_shape(rank, node) + new_sympy_shape = self._new_symbolic_shape(rank, node) output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_shape))) + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape))) def _infer_Pool(self, node): sympy_shape = self._compute_conv_pool_shape(node) @@ -808,12 +806,12 @@ def _infer_Range(self, node): start = as_scalar(input_data[0]) limit = as_scalar(input_data[1]) delta = as_scalar(input_data[2]) - new_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)] + new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)] else: new_dim = self._new_symbolic_dim_from_output(node) - new_shape = [self.symbolic_dims_[new_dim]] - self._update_computed_dims(new_shape) - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_shape))) + new_sympy_shape 
= [self.symbolic_dims_[new_dim]] + self._update_computed_dims(new_sympy_shape) + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_sympy_shape))) def _infer_ReduceProd(self, node): axes = get_attribute(node, 'axes') @@ -1046,15 +1044,15 @@ def _infer_Squeeze(self, node): def _infer_Tile(self, node): repeats_value = self._get_value(node, 1) input_sympy_shape = self._get_sympy_shape(node, 0) - new_shape = [] + new_sympy_shape = [] for i,d in enumerate(input_sympy_shape): new_dim = d * repeats_value[i] - new_shape.append(new_dim) - self._update_computed_dims(new_shape) + new_sympy_shape.append(new_dim) + self._update_computed_dims(new_sympy_shape) vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], vi.type.tensor_type.elem_type, - get_shape_from_sympy_shape(new_shape))) + get_shape_from_sympy_shape(new_sympy_shape))) def _infer_TopK(self, node): rank = self._get_shape_rank(node, 0) @@ -1272,4 +1270,4 @@ def parse_arguments(): print('output model ' + args.output) print('Doing symbolic shape inference...') out_mp = SymbolicShapeInference.infer_shapes(args.input, args.output, args.int_max, args.auto_merge, args.guess_output_rank, args.verbose) - print('Done!') + print('Done!') \ No newline at end of file