From eb917d8b122c985b794581f8586e307b4578157a Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 1 Mar 2022 16:56:51 -0800 Subject: [PATCH 01/10] implement matmulinteger --- python/tvm/relay/frontend/onnx.py | 59 +++++++++++++++++++++- tests/python/frontend/onnx/test_forward.py | 2 +- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 474b688e2ad8..4ff5b034194f 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -3916,7 +3916,7 @@ class QLinearMatMul(OnnxOpConverter): """ @classmethod - def _impl_v10(cls, inputs, attr, params): + def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=["int8", "uint8"]): # Some of the ops used below take scalar-like inputs, and may require either # of the following: @@ -3966,7 +3966,7 @@ def try_resolve_to_const(x, dtype_override=None): assert b_zp_type.dtype == b_type.dtype assert y_scale_type.dtype == "float32" - assert y_zp_type.dtype in ["int8", "uint8"] + assert y_zp_type.dtype in expected_out_dtypes # TODO: relax this limitation in a future version of this importer. a_rank = len(a_shape) @@ -4053,6 +4053,60 @@ def try_resolve_to_const(x, dtype_override=None): return y +class MatMulInteger(OnnxOpConverter): + """Operator converter for MatMulInteger.""" + + @classmethod + def _impl_v10(cls, inputs, attr, params): + # The production MUST never overflow. The accumulation may overflow if and only if in 32 bits + a = inputs[0] + b = inputs[1] + + a_dtype = infer_type(a).checked_type.dtype + b_dtype = infer_type(b).checked_type.dtype + + assert a_dtype in ("int8", "uint8"), "MatMulInteger: invalid dtype for first input" + assert b_dtype in ("int8", "uint8"), "MatMulInteger: invalid dtype for second input" + + assert a_dtype == b_dtype, "MatMulInteger: input dtypes must match" + + a_scale = _op.const(1.0, dtype="float32") + b_scale = _op.const(1.0, dtype="float32") + out_scale = _op.const(1.0, dtype="float32") + + a_zero_point = _op.const(0.0, dtype=a_dtype) + b_zero_point = _op.const(0.0, dtype=b_dtype) + # We use a_dtype here because a_dtype and b_dtype are equivalent + out_zero_point = _op.const(0.0, dtype="int32") + + if len(inputs) == 4: + a_zero_point = inputs[2] + b_zero_point = inputs[3] + + a_zp_dtype = infer_type(a_zero_point).checked_type.dtype + b_zp_dtype = infer_type(b_zero_point).checked_type.dtype + assert ( + a_zp_dtype == a_dtype and b_zp_dtype == b_dtype + ), "MatMulInteger: input dtype doesn't match zero point dtype" + elif len(inputs) != 2: + raise AssertionError( + "MatMulInteger op takes 2 or 4 inputs, {} given".format(len(inputs)) + ) + + inputs = [ + a, + a_scale, + a_zero_point, + b, + b_scale, + b_zero_point, + out_scale, + out_zero_point, + ] + + return QLinearMatMul.get_converter(10)(inputs, attr, params, expected_out_dtypes=["int32"]) + + class QLinearMul(OnnxOpConverter): """Operator converter for QLinearMul from Microsoft onnxruntime contrib opset.""" @@ -4781,6 +4835,7 @@ def _get_convert_map(opset): "Softsign": Softsign.get_converter(opset), "Gemm": Gemm.get_converter(opset), "MatMul": MatMul.get_converter(opset), + "MatMulInteger": MatMulInteger.get_converter(opset), "MatMulInteger16": MatMulInteger16.get_converter(opset), "Mod": Mod.get_converter(opset), "Xor": Renamer("logical_xor"), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 0751f4a2e293..7a5b5d190a3a 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ 
b/tests/python/frontend/onnx/test_forward.py @@ -5053,7 +5053,7 @@ def verify_eyelike(indata): "test_loop11", "test_loop13_seq", "test_lstm_batchwise", - "test_matmulinteger", + # "test_matmulinteger", "test_maxpool_with_argmax_2d_precomputed_pads", "test_maxpool_with_argmax_2d_precomputed_strides", "test_maxunpool_export_with_output_shape", From 8cde209cfa5b2d647fc4c18346bced7868f29386 Mon Sep 17 00:00:00 2001 From: An Wang Date: Wed, 2 Mar 2022 08:43:34 -0800 Subject: [PATCH 02/10] rm test --- tests/python/frontend/onnx/test_forward.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 7a5b5d190a3a..94fd0a5de40b 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5053,7 +5053,6 @@ def verify_eyelike(indata): "test_loop11", "test_loop13_seq", "test_lstm_batchwise", - # "test_matmulinteger", "test_maxpool_with_argmax_2d_precomputed_pads", "test_maxpool_with_argmax_2d_precomputed_strides", "test_maxunpool_export_with_output_shape", From 696399f62868c570d51b7c39ca06e83ab8602619 Mon Sep 17 00:00:00 2001 From: An Wang Date: Wed, 2 Mar 2022 08:46:56 -0800 Subject: [PATCH 03/10] rm outdated comments --- python/tvm/relay/frontend/onnx.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 4ff5b034194f..f70d3522bfa1 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -4058,7 +4058,6 @@ class MatMulInteger(OnnxOpConverter): @classmethod def _impl_v10(cls, inputs, attr, params): - # The production MUST never overflow. The accumulation may overflow if and only if in 32 bits a = inputs[0] b = inputs[1] @@ -4076,7 +4075,6 @@ def _impl_v10(cls, inputs, attr, params): a_zero_point = _op.const(0.0, dtype=a_dtype) b_zero_point = _op.const(0.0, dtype=b_dtype) - # We use a_dtype here because a_dtype and b_dtype are equivalent out_zero_point = _op.const(0.0, dtype="int32") if len(inputs) == 4: From 7f4588d1b38106eba1181dba74d24f24ccdb8017 Mon Sep 17 00:00:00 2001 From: An Wang Date: Thu, 3 Mar 2022 00:29:46 -0800 Subject: [PATCH 04/10] fix lint and review --- python/tvm/relay/frontend/onnx.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index f70d3522bfa1..ab0eeb091043 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -3913,10 +3913,17 @@ class QLinearMatMul(OnnxOpConverter): - Only supports 2D input tensors. - Not guaranteed to meet the integer-overflow behavior stipulated in the ONNX documentation for this operator. + + The QLinearMatMul converter is re-used for MatMulInteger and is adapted for + the latter with the optional `expected_out_dtypes` argument. """ @classmethod - def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=["int8", "uint8"]): + def _impl_v10(cls, inputs, attr, params, expected_out_dtypes=None): + if expected_out_dtypes is None: + # The default QLinearMatMul converter is expected to have one of + # these output dtypes. 
+ expected_out_dtypes = ["int8", "uint8"] # Some of the ops used below take scalar-like inputs, and may require either # of the following: @@ -4028,6 +4035,11 @@ def try_resolve_to_const(x, dtype_override=None): matmul_result_scale_scalar = fold_constant(_op.multiply(a_scale_scalar, b_scale_scalar)) matmul_result_zp_scalar = _op.const(0, dtype="int32") + if "int32" in expected_out_dtypes: + # This is the adaptation of the QLinearMatMul converter for MatMulInteger, + # in the MatMulInteger case we skip the unnecessary requantization step. + return matmul_result + # requantize requires y_scale to be constant, # if y_scale is not constant, doing dequantize -> quantize if isinstance(y_scale_scalar, _expr.Constant): From f80198cbcea46e685f8bc56b25f847fe4eee5570 Mon Sep 17 00:00:00 2001 From: An Wang Date: Fri, 4 Mar 2022 12:38:51 -0800 Subject: [PATCH 05/10] wip --- tests/python/frontend/onnx/test_forward.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 94fd0a5de40b..b15b87550615 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5150,6 +5150,9 @@ def verify_eyelike(indata): @pytest.mark.parametrize("onnx_test", onnx_test_folders) @tvm.testing.parametrize_targets def test_onnx_nodes(target, dev, onnx_test): + # breakpoint() + target = "cuda" + dev = tvm.cuda(0) target_kind = tvm.target.Target(target).kind.name if onnx_test in unsupported_onnx_tests: @@ -5189,6 +5192,8 @@ def test_onnx_nodes(target, dev, onnx_test): else: raise ImportError(str(tensor) + " not labeled as an import or an output") + breakpoint() + print(inputs) tvm_val = get_tvm_output_with_vm(onnx_model, inputs, target, dev) if len(outputs) == 1: tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=rtol, atol=atol) From 132c31464f9ff9846b72d2dca0f9ba4b883429ea Mon Sep 17 00:00:00 2001 From: An Wang Date: Mon, 28 Mar 2022 15:16:03 -0700 Subject: [PATCH 06/10] fixes --- python/tvm/topi/cuda/tensorcore_alter_op.py | 6 ++++-- tests/python/frontend/onnx/test_forward.py | 3 --- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index 080ddf28b7c2..d1d27587d128 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -148,16 +148,18 @@ def _dense_legalize(attrs, inputs, arg_types): # Pad input and output channels to use tensorcore schedule. 
if dtype in ["float16", "int8", "uint8"]: - # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) + # The shape of (M, K, N) must be multiple of + # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) or (4, 4, 4) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + or (M % 4 == 0 and K % 4 == 0 and N % 4 == 0) ): # no need to pad return None - candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] + candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32), (4, 4, 4)] elif dtype in ["int4", "uint4"]: if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: # no need to pad diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index b15b87550615..bce89fb38a21 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5150,7 +5150,6 @@ def verify_eyelike(indata): @pytest.mark.parametrize("onnx_test", onnx_test_folders) @tvm.testing.parametrize_targets def test_onnx_nodes(target, dev, onnx_test): - # breakpoint() target = "cuda" dev = tvm.cuda(0) target_kind = tvm.target.Target(target).kind.name @@ -5192,8 +5191,6 @@ def test_onnx_nodes(target, dev, onnx_test): else: raise ImportError(str(tensor) + " not labeled as an import or an output") - breakpoint() - print(inputs) tvm_val = get_tvm_output_with_vm(onnx_model, inputs, target, dev) if len(outputs) == 1: tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=rtol, atol=atol) From 36c2690b6e1852b667775f4d67d328fed3cbf849 Mon Sep 17 00:00:00 2001 From: An Wang Date: Mon, 28 Mar 2022 15:16:56 -0700 Subject: [PATCH 07/10] fix --- tests/python/frontend/onnx/test_forward.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index bce89fb38a21..94fd0a5de40b 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5150,8 +5150,6 @@ def verify_eyelike(indata): @pytest.mark.parametrize("onnx_test", onnx_test_folders) @tvm.testing.parametrize_targets def test_onnx_nodes(target, dev, onnx_test): - target = "cuda" - dev = tvm.cuda(0) target_kind = tvm.target.Target(target).kind.name if onnx_test in unsupported_onnx_tests: From a3ba9ad7754eb9b89f5f21133e581a291a8b49dd Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 5 Apr 2022 17:42:30 -0700 Subject: [PATCH 08/10] alter tests --- tests/python/relay/test_pass_legalize_tensorcore.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 97860630dea5..4933520ac3fe 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -249,6 +249,7 @@ def expected(): a = before() a = run_opt_pass(a, transform.Legalize()) b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) # dense @@ -258,8 +259,8 @@ def expected(): _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype) _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype) _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype) - _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype) - _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), dtype, False) + _test_legalize_dense((3, 16), (32, 16), (1, 0, 0), dtype) + _test_legalize_dense((1, 16), (32, 
16), (0, 0, 0), dtype, False) # Test if units parameter is correctly updated _test_legalize_dense((8, 16), (30, 16), (0, 0, 2), "float16", units=30) From 749cc1642679ad8ca614ca3261a44dc409fe107c Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 12 Apr 2022 16:08:15 -0700 Subject: [PATCH 09/10] extra 4x4x4 step --- python/tvm/topi/cuda/tensorcore_alter_op.py | 26 +++++++++++++++---- .../relay/test_pass_legalize_tensorcore.py | 4 +-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index d1d27587d128..972f77cf6f36 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -149,17 +149,17 @@ def _dense_legalize(attrs, inputs, arg_types): # Pad input and output channels to use tensorcore schedule. if dtype in ["float16", "int8", "uint8"]: # The shape of (M, K, N) must be multiple of - # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) or (4, 4, 4) + # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) + # from https://arxiv.org/pdf/1811.09736.pdf if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) - or (M % 4 == 0 and K % 4 == 0 and N % 4 == 0) ): # no need to pad return None - candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32), (4, 4, 4)] + candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] elif dtype in ["int4", "uint4"]: if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: # no need to pad @@ -172,7 +172,19 @@ def _dense_legalize(attrs, inputs, arg_types): if extra_flops_ratio > 2: logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) - return None + + # If tensorcore schedule padding fails, pad to nearest upward 4x4x4 as long as + # the additional flops ratio isn't double or more. + # Note that 4x4x4 is invalid for tensorcore scheduling, but padding upwards to 4x4x4 + # doesn't hurt if tensorcore padding has already failed. 
+ if M % 4 == 0 and K % 4 == 0 and N % 4 == 0: + # No need to pad + return None + (dm, dk, dn) = _pad_to(M, K, N, (4, 4, 4)) + extra_flops_ratio = _extra_flops(M, K, N, dm, dk, dn) / (M * K * N) + + if extra_flops_ratio > 2: + return None logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) @@ -200,7 +212,7 @@ def pad_to_tensorcore(M, K, N, candidates): best_pad = (0, 0, 0) for padding in candidates: dm, dk, dn = _pad_to(M, K, N, padding) - e = (M + dm) * (N + dn) * (K + dk) - M * N * K + e = _extra_flops(M, K, N, dm, dk, dn) # print(dm, dk, dn, e, flops) if e < extra_flops: extra_flops = e @@ -208,6 +220,10 @@ def pad_to_tensorcore(M, K, N, candidates): return best_pad, extra_flops / flops +def _extra_flops(M, K, N, dm, dk, dn): + return (M + dm) * (N + dn) * (K + dk) - M * N * K + + def _pad_to(M, K, N, PADDING): dm, dk, dn = 0, 0, 0 diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 4933520ac3fe..89ec57c3f8c8 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -259,7 +259,7 @@ def expected(): _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype) _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype) _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype) - _test_legalize_dense((3, 16), (32, 16), (1, 0, 0), dtype) + _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype) _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), dtype, False) # Test if units parameter is correctly updated @@ -272,7 +272,7 @@ def expected(): _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4") _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4") _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4") - _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False) + _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), "int4", False) @tvm.testing.uses_gpu From bbdf02c9e62f123bd37b0c78e0bb9967cce8b5e3 Mon Sep 17 00:00:00 2001 From: An Wang Date: Tue, 12 Apr 2022 17:06:01 -0700 Subject: [PATCH 10/10] comments --- python/tvm/topi/cuda/tensorcore_alter_op.py | 16 ++++++++-------- .../relay/test_pass_legalize_tensorcore.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index 972f77cf6f36..0ba428014548 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -148,9 +148,7 @@ def _dense_legalize(attrs, inputs, arg_types): # Pad input and output channels to use tensorcore schedule. if dtype in ["float16", "int8", "uint8"]: - # The shape of (M, K, N) must be multiple of - # (16, 16, 16) or (32, 16, 8) or (8, 16, 32) - # from https://arxiv.org/pdf/1811.09736.pdf + # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) @@ -169,10 +167,10 @@ def _dense_legalize(attrs, inputs, arg_types): return None (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates) + skip_pad = extra_flops_ratio > 2 - if extra_flops_ratio > 2: - logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) - + if skip_pad and dtype in ["int8", "uint8"]: + skip_pad = False # If tensorcore schedule padding fails, pad to nearest upward 4x4x4 as long as # the additional flops ratio isn't double or more. 
# Note that 4x4x4 is invalid for tensorcore scheduling, but padding upwards to 4x4x4 @@ -182,9 +180,11 @@ def _dense_legalize(attrs, inputs, arg_types): return None (dm, dk, dn) = _pad_to(M, K, N, (4, 4, 4)) extra_flops_ratio = _extra_flops(M, K, N, dm, dk, dn) / (M * K * N) + skip_pad = extra_flops_ratio > 2 - if extra_flops_ratio > 2: - return None + if skip_pad: + logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + return None logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 89ec57c3f8c8..0e3c171d87da 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -272,7 +272,7 @@ def expected(): _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4") _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4") _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4") - _test_legalize_dense((1, 16), (32, 16), (0, 0, 0), "int4", False) + _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False) @tvm.testing.uses_gpu
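
A minimal end-to-end sketch of the MatMulInteger import path added by this series; the graph name, shapes, zero-point values, opset choice, and the llvm/VM execution setup below are illustrative assumptions, not taken from the patches.

import numpy as np
import onnx
from onnx import TensorProto, helper

import tvm
from tvm import relay

# Build a tiny uint8 x uint8 -> int32 MatMulInteger model with explicit zero points.
node = helper.make_node("MatMulInteger", ["a", "b", "a_zp", "b_zp"], ["y"])
graph = helper.make_graph(
    [node],
    "matmulinteger_sketch",
    inputs=[
        helper.make_tensor_value_info("a", TensorProto.UINT8, [4, 8]),
        helper.make_tensor_value_info("b", TensorProto.UINT8, [8, 16]),
    ],
    outputs=[helper.make_tensor_value_info("y", TensorProto.INT32, [4, 16])],
    initializer=[
        helper.make_tensor("a_zp", TensorProto.UINT8, [], [113]),
        helper.make_tensor("b_zp", TensorProto.UINT8, [], [128]),
    ],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])

# The frontend lowers this node by reusing the QLinearMatMul converter with unit
# scales and an int32 output zero point, returning the integer matmul result
# directly (no requantize step).
mod, params = relay.frontend.from_onnx(
    model, shape={"a": (4, 8), "b": (8, 16)}, freeze_params=True
)

a = np.random.randint(0, 256, size=(4, 8), dtype="uint8")
b = np.random.randint(0, 256, size=(8, 16), dtype="uint8")
y = relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm").evaluate()(a, b)
print(y.numpy().dtype)  # int32

With a_scale = b_scale = 1.0 and an int32 output, the QLinearMatMul lowering reduces to the integer matmul itself, which is why the converter can skip the requantization step when "int32" appears in expected_out_dtypes.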